Compare commits
16 commits
Author | SHA1 | Date | |
---|---|---|---|
a7d23b9027 | |||
013fa584fa | |||
77a2b1bca7 | |||
edeb7024d0 | |||
01a2b9bc0c | |||
cf9dd3218e | |||
5130a96edc | |||
Simon Vieille | e9cb85c695 | ||
Simon Vieille | b680003549 | ||
Simon Vieille | bdd90b0e09 | ||
Simon Vieille | a56c6367e9 | ||
Simon Vieille | 92b3c5cac8 | ||
Simon Vieille | 2d4b6ad2e3 | ||
Simon Vieille | 458593e454 | ||
Simon Vieille | 62da111aca | ||
Simon Vieille | 5619191a00 |
18
.woodpecker.yml
Normal file
18
.woodpecker.yml
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
steps:
|
||||||
|
dependencies:
|
||||||
|
image: node:16
|
||||||
|
pull: true
|
||||||
|
commands:
|
||||||
|
- npm i
|
||||||
|
|
||||||
|
osv-detector:
|
||||||
|
image: gitnet.fr/deblan/osv-detector:v0.10
|
||||||
|
commands:
|
||||||
|
- osv-detector package-lock.json
|
||||||
|
failure: ignore
|
||||||
|
|
||||||
|
tests:
|
||||||
|
image: node:16
|
||||||
|
pull: true
|
||||||
|
commands:
|
||||||
|
- npm run test
|
19
README.md
19
README.md
|
@ -1,6 +1,8 @@
|
||||||
Scraper
|
Scraper
|
||||||
=======
|
=======
|
||||||
|
|
||||||
|
[![Build Status](https://ci.gitnet.fr/api/badges/deblan/scraper/status.svg)](https://ci.gitnet.fr/deblan/scraper)
|
||||||
|
|
||||||
This project is a basic tool to scrape data from a website
|
This project is a basic tool to scrape data from a website
|
||||||
using a CSS selector.
|
using a CSS selector.
|
||||||
|
|
||||||
|
@ -32,17 +34,19 @@ const scraper = require('deblan-scraper')
|
||||||
|
|
||||||
const options = {
|
const options = {
|
||||||
url: 'https://github.com/foo/bar',
|
url: 'https://github.com/foo/bar',
|
||||||
acceptAllStatus: false, // Optional
|
acceptAllStatus: false, // Optional, default is `false`
|
||||||
method: 'GET', // Optional
|
method: 'GET', // Optional, default is `GET`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const isMultiple = false // get the first result, `true` to get an array of results
|
||||||
|
|
||||||
const selector = '.repository-content .numbers-summary li:nth-child(4) a'
|
const selector = '.repository-content .numbers-summary li:nth-child(4) a'
|
||||||
|
|
||||||
const filters = {
|
const filters = {
|
||||||
tags: null,
|
tags: null, // Removes tags. You can specify the tags to remove (separated by comma)
|
||||||
breaks: null,
|
breaks: null, // Removes breaks (\n, \r)
|
||||||
spaces: null,
|
spaces: null, // Replaces 2 successive spaces by 1, except breaks
|
||||||
trim: null,
|
trim: null, // Strips whitespace from the beginning and end of the value
|
||||||
}
|
}
|
||||||
|
|
||||||
scraper(
|
scraper(
|
||||||
|
@ -54,7 +58,8 @@ scraper(
|
||||||
},
|
},
|
||||||
function(error) {
|
function(error) {
|
||||||
console.log(error)
|
console.log(error)
|
||||||
}
|
},
|
||||||
|
isMultiple
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
2921
package-lock.json
generated
Normal file
2921
package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load diff
12
package.json
12
package.json
|
@ -1,20 +1,22 @@
|
||||||
{
|
{
|
||||||
"name": "deblan-scraper",
|
"name": "deblan-scraper",
|
||||||
"description": "Web scraper using CSS selector",
|
"description": "Web scraper using CSS selector",
|
||||||
"version": "1.1.0",
|
"version": "1.3.0",
|
||||||
"main": "src/index.js",
|
"main": "src/index.js",
|
||||||
"devDependencies": {},
|
"devDependencies": {
|
||||||
|
"mocha": "^10.2.0"
|
||||||
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"cheerio": "^1.0.0-rc.3",
|
"cheerio": "^1.0.0-rc.3",
|
||||||
"extends-classes": "^1.0.5",
|
"extends-classes": "^1.0.5",
|
||||||
"minimist": "^1.2.5",
|
"minimist": "^1.2.5",
|
||||||
"request": "^2.88.2",
|
"request": "^2.88.2",
|
||||||
"request-promise": "^4.2.5",
|
"request-promise": "^4.2.6",
|
||||||
"striptags": "^3.1.1",
|
"striptags": "^3.1.1",
|
||||||
"trim": "^0.0.1"
|
"trim": "^1.0.1"
|
||||||
},
|
},
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"test": "echo \"Error: no test specified\" && exit 1"
|
"test": "mocha"
|
||||||
},
|
},
|
||||||
"repository": {
|
"repository": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
|
|
10
src/cli.js
10
src/cli.js
|
@ -35,6 +35,7 @@ Optional parameters
|
||||||
--accept-http-error Accepts all status codes (like 404)
|
--accept-http-error Accepts all status codes (like 404)
|
||||||
|
|
||||||
--verbose, -v Show message of error
|
--verbose, -v Show message of error
|
||||||
|
--multiple, -m The output must contain all the selector targets
|
||||||
`
|
`
|
||||||
|
|
||||||
if (input.has('help')) {
|
if (input.has('help')) {
|
||||||
|
@ -62,6 +63,7 @@ const selector = input.get('selector')
|
||||||
const method = input.has('method') ? input.get('method') : 'GET'
|
const method = input.has('method') ? input.get('method') : 'GET'
|
||||||
const acceptAllStatus = input.has('accept-http-error')
|
const acceptAllStatus = input.has('accept-http-error')
|
||||||
const verbose = input.has('verbose') || input.has('v')
|
const verbose = input.has('verbose') || input.has('v')
|
||||||
|
const isMultiple = input.has('multiple') || input.has('m')
|
||||||
|
|
||||||
let filtersToApply = {}
|
let filtersToApply = {}
|
||||||
|
|
||||||
|
@ -86,8 +88,14 @@ const options = {
|
||||||
}
|
}
|
||||||
|
|
||||||
const onSuccess = function(value) {
|
const onSuccess = function(value) {
|
||||||
|
if (isMultiple && value instanceof Array) {
|
||||||
|
for (let item of value) {
|
||||||
|
output.write(item)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
output.write(value)
|
output.write(value)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const onError = function(error) {
|
const onError = function(error) {
|
||||||
output.write(error.name, 'error')
|
output.write(error.name, 'error')
|
||||||
|
@ -99,4 +107,4 @@ const onError = function(error) {
|
||||||
process.exit(1)
|
process.exit(1)
|
||||||
}
|
}
|
||||||
|
|
||||||
scraper(options, selector, filtersToApply, onSuccess, onError)
|
scraper(options, selector, filtersToApply, onSuccess, onError, isMultiple)
|
||||||
|
|
20
src/index.js
20
src/index.js
|
@ -7,7 +7,7 @@ const filters = {
|
||||||
trim: require('./filter/trim'),
|
trim: require('./filter/trim'),
|
||||||
}
|
}
|
||||||
|
|
||||||
const scraper = function(options, selector, filtersToApply, callbackSuccess, callbackError) {
|
const scraper = function(options, selector, filtersToApply, callbackSuccess, callbackError, isMultiple) {
|
||||||
filtersToApply = filtersToApply || {}
|
filtersToApply = filtersToApply || {}
|
||||||
|
|
||||||
rq({
|
rq({
|
||||||
|
@ -17,11 +17,27 @@ const scraper = function(options, selector, filtersToApply, callbackSuccess, cal
|
||||||
})
|
})
|
||||||
.then(function(body) {
|
.then(function(body) {
|
||||||
const $ = cheerio.load(body)
|
const $ = cheerio.load(body)
|
||||||
let value = $(selector).html()
|
let value = []
|
||||||
|
|
||||||
|
if (isMultiple) {
|
||||||
|
let nodes = $(selector)
|
||||||
|
|
||||||
|
nodes.each(function(i, node) {
|
||||||
|
value.push($(node).html())
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
value = $(selector).html()
|
||||||
|
}
|
||||||
|
|
||||||
for (let filter in filtersToApply) {
|
for (let filter in filtersToApply) {
|
||||||
|
if (isMultiple) {
|
||||||
|
for (let i in value) {
|
||||||
|
value[i] = filters[filter](value[i], filtersToApply[filter])
|
||||||
|
}
|
||||||
|
} else {
|
||||||
value = filters[filter](value, filtersToApply[filter])
|
value = filters[filter](value, filtersToApply[filter])
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (callbackSuccess) {
|
if (callbackSuccess) {
|
||||||
callbackSuccess(value)
|
callbackSuccess(value)
|
||||||
|
|
20
test/breaks.test.js
Normal file
20
test/breaks.test.js
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
const filter = require('../src/filter/breaks')
|
||||||
|
const assert = require('assert')
|
||||||
|
|
||||||
|
describe('Check "breaks"', () => {
|
||||||
|
it('Should return empty string', () => {
|
||||||
|
assert.equal(filter("\n"), "")
|
||||||
|
})
|
||||||
|
|
||||||
|
it('Should return empty string', () => {
|
||||||
|
assert.equal(filter("\r"), "")
|
||||||
|
})
|
||||||
|
|
||||||
|
it('Should return "foo"', () => {
|
||||||
|
assert.equal(filter("foo"), "foo")
|
||||||
|
})
|
||||||
|
|
||||||
|
it('Should return "foobar"', () => {
|
||||||
|
assert.equal(filter("\r\nfoo\nbar\r\n"), "foobar")
|
||||||
|
})
|
||||||
|
});
|
16
test/spaces.test.js
Normal file
16
test/spaces.test.js
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
const filter = require('../src/filter/spaces')
|
||||||
|
const assert = require('assert')
|
||||||
|
|
||||||
|
describe('Check "spaces"', () => {
|
||||||
|
it('Should return ""', () => {
|
||||||
|
assert.equal(filter(""), "")
|
||||||
|
})
|
||||||
|
|
||||||
|
it('Should return "foo bar"', () => {
|
||||||
|
assert.equal(filter("foo bar"), "foo bar")
|
||||||
|
})
|
||||||
|
|
||||||
|
it('Should return "foo bar"', () => {
|
||||||
|
assert.equal(filter("foo \nbar"), "foo bar")
|
||||||
|
})
|
||||||
|
});
|
5
test/tags.test.js
Normal file
5
test/tags.test.js
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
const filter = require('../src/filter/tags')
|
||||||
|
const assert = require('assert')
|
||||||
|
|
||||||
|
describe('Check "tags"', () => {
|
||||||
|
});
|
20
test/trim.test.js
Normal file
20
test/trim.test.js
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
const filter = require('../src/filter/trim')
|
||||||
|
const assert = require('assert')
|
||||||
|
|
||||||
|
describe('Check "trim"', () => {
|
||||||
|
it('Should return ""', () => {
|
||||||
|
assert.equal(filter(""), "")
|
||||||
|
})
|
||||||
|
|
||||||
|
it('Should return ""', () => {
|
||||||
|
assert.equal(filter(" "), "")
|
||||||
|
})
|
||||||
|
|
||||||
|
it('Should return "foo"', () => {
|
||||||
|
assert.equal(filter(" foo "), "foo")
|
||||||
|
})
|
||||||
|
|
||||||
|
it('Should return "foo bar"', () => {
|
||||||
|
assert.equal(filter(" foo bar \n"), "foo bar")
|
||||||
|
})
|
||||||
|
});
|
Loading…
Reference in a new issue