Compare commits

...

22 Commits

Author SHA1 Message Date
Simon Vieille a7d23b9027
fix ci syntax
ci/woodpecker/push/woodpecker Pipeline was successful Details
2023-09-29 16:37:56 +02:00
Simon Vieille 013fa584fa
remove jenkins stuff
ci/woodpecker/push/woodpecker Pipeline was successful Details
2023-03-31 21:34:27 +02:00
Simon Vieille 77a2b1bca7
update ci
ci/woodpecker/push/woodpecker Pipeline was successful Details
2023-03-31 17:57:16 +02:00
Simon Vieille edeb7024d0
add ci
ci/woodpecker/push/woodpecker Pipeline was successful Details
2023-03-31 17:55:52 +02:00
Simon Vieille 01a2b9bc0c
apply linter 2023-03-31 17:53:50 +02:00
Simon Vieille cf9dd3218e
update dependencies 2023-03-31 17:53:40 +02:00
Simon Vieille 5130a96edc
apply linter 2023-03-31 17:53:24 +02:00
Simon Vieille e9cb85c695
release v1.3.0
Gitnet/scraper/pipeline/head This commit looks good Details
2020-11-10 15:11:41 +01:00
Simon Vieille b680003549
add option to get multiple results
Gitnet/scraper/pipeline/head This commit looks good Details
2020-11-10 13:30:38 +01:00
Simon Vieille bdd90b0e09
add documentation
Gitnet/scraper/pipeline/head This commit looks good Details
2020-09-02 10:56:51 +02:00
Simon Vieille a56c6367e9
release v1.2.0
Gitnet/scraper/pipeline/head This commit looks good Details
2020-09-02 10:56:24 +02:00
Simon Vieille 92b3c5cac8
add documentation
Gitnet/scraper/pipeline/head This commit looks good Details
2020-09-02 10:55:15 +02:00
Simon Vieille 2d4b6ad2e3
add tests 2020-09-02 10:52:59 +02:00
Simon Vieille 458593e454
add tests
Gitnet/scraper/pipeline/head This commit looks good Details
2020-09-02 10:49:00 +02:00
Simon Vieille 62da111aca
add tests
Gitnet/scraper/pipeline/head There was a failure building this commit Details
2020-09-02 10:44:33 +02:00
Simon Vieille 5619191a00
add documentation 2020-04-15 10:34:47 +02:00
Simon Vieille 229242ed11
packaging 2020-04-14 18:18:04 +02:00
Simon Vieille 208088416e
refactoring: index.js is now a module, add cli.js 2020-04-14 18:16:19 +02:00
Simon Vieille 3bdd7f8950
add package keywords 2020-04-14 17:34:50 +02:00
Simon Vieille 32e9ce0edb
add package keywords 2020-04-14 17:31:13 +02:00
Simon Vieille 83cddecec6
fix dependency (request) 2020-04-14 16:50:35 +02:00
Simon Vieille 202c25f1ed
fix dependency (request) 2020-04-14 16:50:23 +02:00
17 changed files with 4249 additions and 371 deletions

18
.woodpecker.yml Normal file
View File

@ -0,0 +1,18 @@
steps:
dependencies:
image: node:16
pull: true
commands:
- npm i
osv-detector:
image: gitnet.fr/deblan/osv-detector:v0.10
commands:
- osv-detector package-lock.json
failure: ignore
tests:
image: node:16
pull: true
commands:
- npm run test

View File

@ -1,13 +1,18 @@
Scraper
=======
[![Build Status](https://ci.gitnet.fr/api/badges/deblan/scraper/status.svg)](https://ci.gitnet.fr/deblan/scraper)
This project is a basic tool to scrape data from a website
using a CSS selector.
For example, you are able to retrieve the number of a project's releases hosted on github:
For example, if you want to retrieve the number of a project's releases hosted on github:
With CLI
---
```
node src/index.js \
node src/cli.js \
--url https://github.com/foo/bar \
--selector '.repository-content .numbers-summary li:nth-child(4) a' \
--tags \
@ -19,7 +24,44 @@ node src/index.js \
...will show `XXX releases`.
More help with `node src/index.js --help`.
More help with `node src/cli.js --help`.
With code
---------
```
const scraper = require('deblan-scraper')
const options = {
url: 'https://github.com/foo/bar',
acceptAllStatus: false, // Optional, default is `false`
method: 'GET', // Optional, default is `GET`
}
const isMultiple = false // get the first result, `true` to get an array of results
const selector = '.repository-content .numbers-summary li:nth-child(4) a'
const filters = {
tags: null, // Removes tags. You can specify the tags to remove (separated by comma)
breaks: null, // Removes breaks (\n, \r)
spaces: null, // Replaces 2 successive spaces by 1, except breaks
trim: null, // Strips whitespaces from the beginning and end of the value
}
scraper(
options,
selector,
filters,
function(value) {
console.log(value)
},
function(error) {
console.log(error)
},
isMultiple
)
```
Installation
------------

2921
package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,26 +1,29 @@
{
"name": "deblan-scraper",
"description": "Web scraper using CSS selector",
"version": "1.0.0",
"version": "1.3.0",
"main": "src/index.js",
"devDependencies": {},
"devDependencies": {
"mocha": "^10.2.0"
},
"dependencies": {
"cheerio": "^1.0.0-rc.3",
"extends-classes": "^1.0.5",
"minimist": "^1.2.5",
"request-promise": "^4.2.5",
"request": "^2.88.2",
"request-promise": "^4.2.6",
"striptags": "^3.1.1",
"trim": "^0.0.1"
"trim": "^1.0.1"
},
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
"test": "mocha"
},
"repository": {
"type": "git",
"url": "https://gitnet.fr/deblan/scraper.git"
},
"keywords": [
"scraper"
"scraper selector htmlparser cli source-scraper extractor"
],
"author": "Simon Vieille <contact@deblan.fr> (https://www.deblan.io)",
"license": "ISC"

110
src/cli.js Normal file
View File

@ -0,0 +1,110 @@
const Output = require('./console/output')
const Input = require('./console/input')
const scraper = require('./index')
// Console helpers shared by the whole script: `output` is used for every
// message written below, `input` exposes the command line arguments
// (`has`, `get`, `args`) as well as the node binary and script paths.
const output = new Output()
const input = new Input(process)
// Names of the command line flags that are treated as filters when the
// arguments are scanned; any other flag is ignored by the filter collection.
const filters = [
'breaks',
'spaces',
'tags',
'trim',
]
// Help text. It is printed when `--help` is given and after the error message
// when a mandatory parameter is missing. The first line is built from the node
// binary and script paths so it can be copy-pasted as is.
const usage = `
${input.node} ${input.script} [params]
Parameters
--url [URL] URL to scrap
--selector [Selector] CSS selector
Optional parameters
Filters
Order has a meaning.
--breaks Removes breaks (\\n, \\r)
--trim Strips whitespaces from the beginning and end of the value
--spaces Replaces 2 successive spaces by 1, except breaks
--tags, --tags [TAGS] Removes tags. You can specify the tags to remove (separated by comma)
HTTP
--method [METHOD] HTTP Method
--accept-http-error Accepts all status codes (like 404)
--verbose, -v Show message of error
--multiple, -m The output must contain all the selector targets
`
// `--help` wins over everything else: show the usage and exit successfully.
if (input.has('help')) {
  output.write(usage)
  process.exit(0)
}

// Mandatory parameters, checked in order. The first missing one prints its
// message followed by the usage, then the process exits with status 1.
for (const [name, message] of [
  ['url', 'You must specify --url'],
  ['selector', 'You must specify --selector'],
]) {
  if (input.has(name)) {
    continue
  }

  output.write(message)
  output.write(usage)
  process.exit(1)
}
// Values read from the command line.
const url = input.get('url')
const selector = input.get('selector')
// `get` returns its second argument when the flag is absent.
const method = input.get('method', 'GET')
const acceptAllStatus = input.has('accept-http-error')
const verbose = input.has('verbose') || input.has('v')
const isMultiple = input.has('multiple') || input.has('m')

// Filters requested on the command line, keyed by filter name and kept in the
// order they were typed (the order in which filters run matters).
//
// The argument parser does not only produce strings:
//   --tags            -> true          (filter without argument => null)
//   --tags a,b        -> 'a,b'         (=> ['a', 'b'])
//   --tags 5          -> 5             (number, has no `split`)
//   --tags a --tags b -> ['a', 'b']    (array, has no `split`)
//   --no-tags         -> false         (filter explicitly disabled => skipped)
// Calling `split` blindly on anything but `true` used to crash with a
// TypeError for the last three shapes.
const filtersToApply = {}

for (const [param, rawValue] of Object.entries(input.args)) {
  if (!filters.includes(param)) {
    continue
  }

  if (rawValue === false) {
    continue
  }

  if (rawValue === true) {
    filtersToApply[param] = null
    continue
  }

  filtersToApply[param] = [].concat(rawValue)
    // A repeated bare flag yields booleans inside the array: nothing to split.
    .filter((item) => typeof item !== 'boolean')
    .flatMap((item) => String(item).split(','))
}

// Request options handed over to the scraper.
const options = {
  url,
  method,
  acceptAllStatus,
}
/**
 * Success callback: prints the scraped result.
 *
 * In multiple mode an array result is printed one entry per write; anything
 * else (including an array when multiple mode is off) is printed in a single
 * write.
 *
 * @param mixed value
 */
const onSuccess = function(value) {
  const printable = (isMultiple && value instanceof Array) ? value : [value]

  for (const entry of printable) {
    output.write(entry)
  }
}
/**
 * Error callback: reports the failure on the error stream and stops the
 * process with status 1. The error name is always printed; the message is
 * only added in verbose mode.
 *
 * @param object error
 */
const onError = function(error) {
  const report = (text) => output.write(text, 'error')

  report(error.name)

  if (verbose === true) {
    report(error.message)
  }

  process.exit(1)
}
scraper(options, selector, filtersToApply, onSuccess, onError, isMultiple)

View File

@ -3,56 +3,56 @@
var Minimist = require('minimist');
class Input {
/**
* Constructor.
*
* @param object process
*/
constructor(process) {
this.args = Minimist(process.argv.slice(2))
this.node = process.argv[0]
this.script = process.argv[1]
/**
* Constructor.
*
* @param object process
*/
constructor(process) {
this.args = Minimist(process.argv.slice(2))
this.node = process.argv[0]
this.script = process.argv[1]
}
/**
* Return the value of the given name argument.
*
* @param string name
* @param mixed default
*
* @return mixed
*/
get(name, defaultValue) {
if (this.has(name)) {
return this.args[name]
}
/**
* Return the value of the given name argument.
*
* @param string name
* @param mixed default
*
* @return mixed
*/
get(name, defaultValue) {
if (this.has(name)) {
return this.args[name]
}
if (defaultValue !== undefined) {
return defaultValue
}
return null;
if (defaultValue !== undefined) {
return defaultValue
}
/**
* Check the given argument name exists.
*
* @param string name
*
* @return boolean
*/
has(name) {
return this.args.hasOwnProperty(name)
}
return null;
}
/**
* Return if args is empty.
*
* @return boolean
*/
empty() {
return Object.keys(this.args).length === 1
}
/**
* Check the given argument name exists.
*
* @param string name
*
* @return boolean
*/
has(name) {
return this.args.hasOwnProperty(name)
}
/**
* Return if args is empty.
*
* @return boolean
*/
empty() {
return Object.keys(this.args).length === 1
}
}
module.exports = Input

View File

@ -1,37 +1,37 @@
"use strict";
class Output {
/**
* Convert and print data to json.
*
* @param mixed data
*/
json(data, pretty) {
data = JSON.stringify(
data,
function(key, value) {
if (value === undefined) {
return null
}
/**
* Convert and print data to json.
*
* @param mixed data
*/
json(data, pretty) {
data = JSON.stringify(
data,
function(key, value) {
if (value === undefined) {
return null
}
return value
},
pretty ? 2 : null
);
return value
},
pretty ? 2 : null
);
return this.write(data)
}
return this.write(data)
}
/**
* Print data.
*
* @param mixed data
*/
write(data, level) {
level = level || 'log'
/**
* Print data.
*
* @param mixed data
*/
write(data, level) {
level = level || 'log'
console[level](data)
}
console[level](data)
}
}
module.exports = Output

View File

@ -1,5 +1,5 @@
const filter = function(value) {
return value.replace(/(\n|\r)/g, '')
return value.replace(/(\n|\r)/g, '')
}
module.exports = filter

View File

@ -1,5 +1,5 @@
const filter = function(value) {
return value.replace(/\s{2,}/g, ' ')
return value.replace(/\s{2,}/g, ' ')
}
module.exports = filter

View File

@ -1,7 +1,7 @@
const striptags = require('striptags')
const filter = function(value, tags) {
return striptags(value, tags)
return striptags(value, tags)
}
module.exports = filter

View File

@ -1,7 +1,7 @@
const trim = require('trim')
const filter = function(value) {
return trim(value)
return trim(value)
}
module.exports = filter

View File

@ -1,8 +1,5 @@
const Output = require('./console/output')
const Input = require('./console/input')
const rq = require('request-promise')
const cheerio = require('cheerio')
const filters = {
breaks: require('./filter/breaks'),
spaces: require('./filter/spaces'),
@ -10,99 +7,47 @@ const filters = {
trim: require('./filter/trim'),
}
const output = new Output()
const input = new Input(process)
const scraper = function(options, selector, filtersToApply, callbackSuccess, callbackError, isMultiple) {
filtersToApply = filtersToApply || {}
const usage = `
${input.node} ${input.script} [params]
Parameters
--url [URL] URL to scrap
--selector [Selector] CSS selector
Optional parameters
Filters
Order has a meaning.
--breaks Removes breaks (\\n, \\r)
--trim Strips whitespaces from the beginning and end of the value
--spaces Replaces 2 successive spaces by 1, except breaks
--tags, --tags [TAGS] Removes tags. You can specify the tags to remove (separated by comma)
HTTP
--method [METHOD] HTTP Method
--accept-http-error Accepts all status codes (like 404)
--verbose, -v Show message of error
`
if (input.has('help')) {
output.write(usage)
process.exit(0)
}
if (!input.has('url')) {
output.write('You must specify --url')
output.write(usage)
process.exit(1)
}
if (!input.has('selector')) {
output.write('You must specify --selector')
output.write(usage)
process.exit(1)
}
const url = input.get('url')
const selector = input.get('selector')
const method = input.has('method') ? input.get('method') : 'GET'
const acceptAllStatus = input.has('accept-http-error')
const verbose = input.has('verbose') || input.has('v')
let filtersToApply = {}
for (let param in input.args) {
if (filters.hasOwnProperty(param)) {
let value = input.args[param]
if (value !== true) {
value = value.split(',')
} else {
value = null
}
filtersToApply[param] = value
}
}
rq({
uri: url,
method: method,
simple: !acceptAllStatus
})
.then(function(body) {
const $ = cheerio.load(body)
let value = $(selector).html()
for (let filter in filtersToApply) {
value = filters[filter](value, filtersToApply[filter])
}
output.write(value)
rq({
uri: options.url,
method: options.method || 'GET',
simple: !(options.acceptAllStatus || false)
})
.catch(function(error) {
output.write(error.name, 'error')
.then(function(body) {
const $ = cheerio.load(body)
let value = []
if (verbose) {
output.write(error.message, 'error')
}
if (isMultiple) {
let nodes = $(selector)
process.exit(1)
})
nodes.each(function(i, node) {
value.push($(node).html())
})
} else {
value = $(selector).html()
}
for (let filter in filtersToApply) {
if (isMultiple) {
for (let i in value) {
value[i] = filters[filter](value[i], filtersToApply[filter])
}
} else {
value = filters[filter](value, filtersToApply[filter])
}
}
if (callbackSuccess) {
callbackSuccess(value)
}
})
.catch(function(error) {
if (callbackError) {
callbackError(error)
}
})
}
module.exports = scraper

20
test/breaks.test.js Normal file
View File

@ -0,0 +1,20 @@
const filter = require('../src/filter/breaks')
const assert = require('assert')
// Strict assertions are used on purpose: with the legacy loose `assert.equal`
// a filter returning `[]`, `0` or `false` would still compare equal to "".
describe('Check "breaks"', () => {
  it('Should return empty string for a line feed', () => {
    assert.strictEqual(filter("\n"), "")
  })

  it('Should return empty string for a carriage return', () => {
    assert.strictEqual(filter("\r"), "")
  })

  it('Should return "foo"', () => {
    assert.strictEqual(filter("foo"), "foo")
  })

  it('Should return "foobar"', () => {
    assert.strictEqual(filter("\r\nfoo\nbar\r\n"), "foobar")
  })
});

16
test/spaces.test.js Normal file
View File

@ -0,0 +1,16 @@
const filter = require('../src/filter/spaces')
const assert = require('assert')
// Strict assertions are used on purpose: with the legacy loose `assert.equal`
// a filter returning `[]`, `0` or `false` would still compare equal to "".
describe('Check "spaces"', () => {
  it('Should return ""', () => {
    assert.strictEqual(filter(""), "")
  })

  it('Should return "foo bar" when words are separated by spaces only', () => {
    assert.strictEqual(filter("foo bar"), "foo bar")
  })

  it('Should return "foo bar" when a space is followed by a break', () => {
    assert.strictEqual(filter("foo \nbar"), "foo bar")
  })
});

5
test/tags.test.js Normal file
View File

@ -0,0 +1,5 @@
const filter = require('../src/filter/tags')
const assert = require('assert')
// The suite used to be empty, leaving the tags filter without any coverage.
describe('Check "tags"', () => {
  it('Should return ""', () => {
    assert.strictEqual(filter(""), "")
  })

  it('Should return "foo" when the value has no tag', () => {
    assert.strictEqual(filter("foo"), "foo")
  })

  it('Should remove every tag when no tag list is given', () => {
    assert.strictEqual(filter("<p>foo <b>bar</b></p>"), "foo bar")
  })

  // The command line passes `null` when `--tags` is used without argument.
  it('Should remove every tag when the tag list is null', () => {
    assert.strictEqual(filter("<p>foo <b>bar</b></p>", null), "foo bar")
  })

  // NOTE(review): striptags documents its second argument as the list of
  // tags to keep — confirm this matches the behaviour advertised by the CLI.
  it('Should keep the tags given in the list', () => {
    assert.strictEqual(filter("<p>foo <b>bar</b></p>", ["b"]), "foo <b>bar</b>")
  })
});

20
test/trim.test.js Normal file
View File

@ -0,0 +1,20 @@
const filter = require('../src/filter/trim')
const assert = require('assert')
// Strict assertions are used on purpose: with the legacy loose `assert.equal`
// a filter returning `[]`, `0` or `false` would still compare equal to "".
describe('Check "trim"', () => {
  it('Should return "" for an empty value', () => {
    assert.strictEqual(filter(""), "")
  })

  it('Should return "" for a blank value', () => {
    assert.strictEqual(filter(" "), "")
  })

  it('Should return "foo"', () => {
    assert.strictEqual(filter(" foo "), "foo")
  })

  it('Should return "foo bar"', () => {
    assert.strictEqual(filter(" foo bar \n"), "foo bar")
  })
});

1160
yarn.lock

File diff suppressed because it is too large Load Diff