Compare commits
No commits in common. "master" and "v1.0.0" have entirely different histories.
|
@ -1,18 +0,0 @@
|
|||
steps:
|
||||
dependencies:
|
||||
image: node:16
|
||||
pull: true
|
||||
commands:
|
||||
- npm i
|
||||
|
||||
osv-detector:
|
||||
image: gitnet.fr/deblan/osv-detector:v0.10
|
||||
commands:
|
||||
- osv-detector package-lock.json
|
||||
failure: ignore
|
||||
|
||||
tests:
|
||||
image: node:16
|
||||
pull: true
|
||||
commands:
|
||||
- npm run test
|
48
README.md
48
README.md
|
@ -1,18 +1,13 @@
|
|||
Scraper
|
||||
=======
|
||||
|
||||
[![Build Status](https://ci.gitnet.fr/api/badges/deblan/scraper/status.svg)](https://ci.gitnet.fr/deblan/scraper)
|
||||
|
||||
This project is a basic tool to scrape data from a website
|
||||
using a CSS selector.
|
||||
|
||||
For example, if you want to retrieve the number of releases of a project hosted on GitHub:
|
||||
|
||||
With CLI
|
||||
---
|
||||
For example, you can retrieve the number of releases of a project hosted on GitHub:
|
||||
|
||||
```
|
||||
node src/cli.js \
|
||||
node src/index.js \
|
||||
--url https://github.com/foo/bar \
|
||||
--selector '.repository-content .numbers-summary li:nth-child(4) a' \
|
||||
--tags \
|
||||
|
@ -24,44 +19,7 @@ node src/cli.js \
|
|||
|
||||
...will show `XXX releases`.
|
||||
|
||||
For more help, run `node src/cli.js --help`.
|
||||
|
||||
With code
|
||||
---------
|
||||
|
||||
```
|
||||
const scraper = require('deblan-scraper')
|
||||
|
||||
const options = {
|
||||
url: 'https://github.com/foo/bar',
|
||||
acceptAllStatus: false, // Optional, default is `false`
|
||||
method: 'GET', // Optional, default is `GET`
|
||||
}
|
||||
|
||||
const isMultiple = false // get the first result, `true` to get an array of results
|
||||
|
||||
const selector = '.repository-content .numbers-summary li:nth-child(4) a'
|
||||
|
||||
const filters = {
|
||||
tags: null, // Removes tags. You can specify the tags to remove (separated by comma)
|
||||
breaks: null, // Removes breaks (\n, \r)
|
||||
spaces: null, // Replaces 2 successive spaces by 1, except breaks
|
||||
trim: null, // Strips whitespaces from the beginning and end of the value
|
||||
}
|
||||
|
||||
scraper(
|
||||
options,
|
||||
selector,
|
||||
filters,
|
||||
function(value) {
|
||||
console.log(value)
|
||||
},
|
||||
function(error) {
|
||||
console.log(error)
|
||||
},
|
||||
isMultiple
|
||||
)
|
||||
```
|
||||
For more help, run `node src/index.js --help`.
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
|
2921
package-lock.json
generated
2921
package-lock.json
generated
File diff suppressed because it is too large
Load diff
15
package.json
15
package.json
|
@ -1,29 +1,26 @@
|
|||
{
|
||||
"name": "deblan-scraper",
|
||||
"description": "Web scraper using CSS selector",
|
||||
"version": "1.3.0",
|
||||
"version": "1.0.0",
|
||||
"main": "src/index.js",
|
||||
"devDependencies": {
|
||||
"mocha": "^10.2.0"
|
||||
},
|
||||
"devDependencies": {},
|
||||
"dependencies": {
|
||||
"cheerio": "^1.0.0-rc.3",
|
||||
"extends-classes": "^1.0.5",
|
||||
"minimist": "^1.2.5",
|
||||
"request": "^2.88.2",
|
||||
"request-promise": "^4.2.6",
|
||||
"request-promise": "^4.2.5",
|
||||
"striptags": "^3.1.1",
|
||||
"trim": "^1.0.1"
|
||||
"trim": "^0.0.1"
|
||||
},
|
||||
"scripts": {
|
||||
"test": "mocha"
|
||||
"test": "echo \"Error: no test specified\" && exit 1"
|
||||
},
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "https://gitnet.fr/deblan/scraper.git"
|
||||
},
|
||||
"keywords": [
|
||||
"scraper selector htmlparser cli source-scraper extractor"
|
||||
"scraper"
|
||||
],
|
||||
"author": "Simon Vieille <contact@deblan.fr> (https://www.deblan.io)",
|
||||
"license": "ISC"
|
||||
|
|
110
src/cli.js
110
src/cli.js
|
@ -1,110 +0,0 @@
|
|||
const Output = require('./console/output')
|
||||
const Input = require('./console/input')
|
||||
const scraper = require('./index')
|
||||
const output = new Output()
|
||||
const input = new Input(process)
|
||||
const filters = [
|
||||
'breaks',
|
||||
'spaces',
|
||||
'tags',
|
||||
'trim',
|
||||
]
|
||||
|
||||
const usage = `
|
||||
${input.node} ${input.script} [params]
|
||||
|
||||
Parameters
|
||||
|
||||
--url [URL] URL to scrap
|
||||
--selector [Selector] CSS selector
|
||||
|
||||
Optional parameters
|
||||
|
||||
Filters
|
||||
|
||||
Order has a meaning.
|
||||
|
||||
--breaks Removes breaks (\\n, \\r)
|
||||
--trim Strips whitespaces from the beginning and end of the value
|
||||
--spaces Replaces 2 successive spaces by 1, except breaks
|
||||
--tags, --tags [TAGS] Removes tags. You can specify the tags to remove (separated by comma)
|
||||
|
||||
HTTP
|
||||
|
||||
--method [METHOD] HTTP Method
|
||||
--accept-http-error Accepts all status codes (like 404)
|
||||
|
||||
--verbose, -v Show message of error
|
||||
--multiple, -m The output must contain all the selector targets
|
||||
`
|
||||
|
||||
if (input.has('help')) {
|
||||
output.write(usage)
|
||||
|
||||
process.exit(0)
|
||||
}
|
||||
|
||||
if (!input.has('url')) {
|
||||
output.write('You must specify --url')
|
||||
output.write(usage)
|
||||
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
if (!input.has('selector')) {
|
||||
output.write('You must specify --selector')
|
||||
output.write(usage)
|
||||
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
const url = input.get('url')
|
||||
const selector = input.get('selector')
|
||||
const method = input.has('method') ? input.get('method') : 'GET'
|
||||
const acceptAllStatus = input.has('accept-http-error')
|
||||
const verbose = input.has('verbose') || input.has('v')
|
||||
const isMultiple = input.has('multiple') || input.has('m')
|
||||
|
||||
let filtersToApply = {}
|
||||
|
||||
for (let param in input.args) {
|
||||
if (filters.includes(param)) {
|
||||
let value = input.args[param]
|
||||
|
||||
if (value !== true) {
|
||||
value = value.split(',')
|
||||
} else {
|
||||
value = null
|
||||
}
|
||||
|
||||
filtersToApply[param] = value
|
||||
}
|
||||
}
|
||||
|
||||
const options = {
|
||||
url: url,
|
||||
method: method,
|
||||
acceptAllStatus: acceptAllStatus,
|
||||
}
|
||||
|
||||
const onSuccess = function(value) {
|
||||
if (isMultiple && value instanceof Array) {
|
||||
for (let item of value) {
|
||||
output.write(item)
|
||||
}
|
||||
} else {
|
||||
output.write(value)
|
||||
}
|
||||
}
|
||||
|
||||
const onError = function(error) {
|
||||
output.write(error.name, 'error')
|
||||
|
||||
if (verbose) {
|
||||
output.write(error.message, 'error')
|
||||
}
|
||||
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
scraper(options, selector, filtersToApply, onSuccess, onError, isMultiple)
|
129
src/index.js
129
src/index.js
|
@ -1,5 +1,8 @@
|
|||
const Output = require('./console/output')
|
||||
const Input = require('./console/input')
|
||||
const rq = require('request-promise')
|
||||
const cheerio = require('cheerio')
|
||||
|
||||
const filters = {
|
||||
breaks: require('./filter/breaks'),
|
||||
spaces: require('./filter/spaces'),
|
||||
|
@ -7,47 +10,99 @@ const filters = {
|
|||
trim: require('./filter/trim'),
|
||||
}
|
||||
|
||||
const scraper = function(options, selector, filtersToApply, callbackSuccess, callbackError, isMultiple) {
|
||||
filtersToApply = filtersToApply || {}
|
||||
const output = new Output()
|
||||
const input = new Input(process)
|
||||
|
||||
rq({
|
||||
uri: options.url,
|
||||
method: options.method || 'GET',
|
||||
simple: !(options.acceptAllStatus || false)
|
||||
})
|
||||
.then(function(body) {
|
||||
const $ = cheerio.load(body)
|
||||
let value = []
|
||||
const usage = `
|
||||
${input.node} ${input.script} [params]
|
||||
|
||||
if (isMultiple) {
|
||||
let nodes = $(selector)
|
||||
Parameters
|
||||
|
||||
nodes.each(function(i, node) {
|
||||
value.push($(node).html())
|
||||
})
|
||||
} else {
|
||||
value = $(selector).html()
|
||||
}
|
||||
--url [URL] URL to scrap
|
||||
--selector [Selector] CSS selector
|
||||
|
||||
for (let filter in filtersToApply) {
|
||||
if (isMultiple) {
|
||||
for (let i in value) {
|
||||
value[i] = filters[filter](value[i], filtersToApply[filter])
|
||||
}
|
||||
} else {
|
||||
value = filters[filter](value, filtersToApply[filter])
|
||||
}
|
||||
}
|
||||
Optional parameters
|
||||
|
||||
if (callbackSuccess) {
|
||||
callbackSuccess(value)
|
||||
}
|
||||
})
|
||||
.catch(function(error) {
|
||||
if (callbackError) {
|
||||
callbackError(error)
|
||||
}
|
||||
})
|
||||
Filters
|
||||
|
||||
Order has a meaning.
|
||||
|
||||
--breaks Removes breaks (\\n, \\r)
|
||||
--trim Strips whitespaces from the beginning and end of the value
|
||||
--spaces Replaces 2 successive spaces by 1, except breaks
|
||||
--tags, --tags [TAGS] Removes tags. You can specify the tags to remove (separated by comma)
|
||||
|
||||
HTTP
|
||||
|
||||
--method [METHOD] HTTP Method
|
||||
--accept-http-error Accepts all status codes (like 404)
|
||||
|
||||
--verbose, -v Show message of error
|
||||
`
|
||||
|
||||
if (input.has('help')) {
|
||||
output.write(usage)
|
||||
|
||||
process.exit(0)
|
||||
}
|
||||
|
||||
module.exports = scraper
|
||||
if (!input.has('url')) {
|
||||
output.write('You must specify --url')
|
||||
output.write(usage)
|
||||
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
if (!input.has('selector')) {
|
||||
output.write('You must specify --selector')
|
||||
output.write(usage)
|
||||
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
const url = input.get('url')
|
||||
const selector = input.get('selector')
|
||||
const method = input.has('method') ? input.get('method') : 'GET'
|
||||
const acceptAllStatus = input.has('accept-http-error')
|
||||
const verbose = input.has('verbose') || input.has('v')
|
||||
|
||||
let filtersToApply = {}
|
||||
|
||||
for (let param in input.args) {
|
||||
if (filters.hasOwnProperty(param)) {
|
||||
let value = input.args[param]
|
||||
|
||||
if (value !== true) {
|
||||
value = value.split(',')
|
||||
} else {
|
||||
value = null
|
||||
}
|
||||
|
||||
filtersToApply[param] = value
|
||||
}
|
||||
}
|
||||
|
||||
rq({
|
||||
uri: url,
|
||||
method: method,
|
||||
simple: !acceptAllStatus
|
||||
})
|
||||
.then(function(body) {
|
||||
const $ = cheerio.load(body)
|
||||
let value = $(selector).html()
|
||||
|
||||
for (let filter in filtersToApply) {
|
||||
value = filters[filter](value, filtersToApply[filter])
|
||||
}
|
||||
|
||||
output.write(value)
|
||||
})
|
||||
.catch(function(error) {
|
||||
output.write(error.name, 'error')
|
||||
|
||||
if (verbose) {
|
||||
output.write(error.message, 'error')
|
||||
}
|
||||
|
||||
process.exit(1)
|
||||
})
|
||||
|
|
|
@ -1,20 +0,0 @@
|
|||
const filter = require('../src/filter/breaks')
|
||||
const assert = require('assert')
|
||||
|
||||
describe('Check "breaks"', () => {
|
||||
it('Should return empty string', () => {
|
||||
assert.equal(filter("\n"), "")
|
||||
})
|
||||
|
||||
it('Should return empty string', () => {
|
||||
assert.equal(filter("\r"), "")
|
||||
})
|
||||
|
||||
it('Should return "foo"', () => {
|
||||
assert.equal(filter("foo"), "foo")
|
||||
})
|
||||
|
||||
it('Should return "foobar"', () => {
|
||||
assert.equal(filter("\r\nfoo\nbar\r\n"), "foobar")
|
||||
})
|
||||
});
|
|
@ -1,16 +0,0 @@
|
|||
const filter = require('../src/filter/spaces')
|
||||
const assert = require('assert')
|
||||
|
||||
describe('Check "spaces"', () => {
|
||||
it('Should return ""', () => {
|
||||
assert.equal(filter(""), "")
|
||||
})
|
||||
|
||||
it('Should return "foo bar"', () => {
|
||||
assert.equal(filter("foo bar"), "foo bar")
|
||||
})
|
||||
|
||||
it('Should return "foo bar"', () => {
|
||||
assert.equal(filter("foo \nbar"), "foo bar")
|
||||
})
|
||||
});
|
|
@ -1,5 +0,0 @@
|
|||
const filter = require('../src/filter/tags')
|
||||
const assert = require('assert')
|
||||
|
||||
describe('Check "tags"', () => {
|
||||
});
|
|
@ -1,20 +0,0 @@
|
|||
const filter = require('../src/filter/trim')
|
||||
const assert = require('assert')
|
||||
|
||||
describe('Check "trim"', () => {
|
||||
it('Should return ""', () => {
|
||||
assert.equal(filter(""), "")
|
||||
})
|
||||
|
||||
it('Should return ""', () => {
|
||||
assert.equal(filter(" "), "")
|
||||
})
|
||||
|
||||
it('Should return "foo"', () => {
|
||||
assert.equal(filter(" foo "), "foo")
|
||||
})
|
||||
|
||||
it('Should return "foo bar"', () => {
|
||||
assert.equal(filter(" foo bar \n"), "foo bar")
|
||||
})
|
||||
});
|
Loading…
Reference in a new issue