Browse Source

refactoring: index.js is now a module, add cli.js

tags/v1.1.0
Simon Vieille 1 month ago
parent
commit
208088416e
Signed by: deblan <simon@deblan.fr> GPG Key ID: 03383D15A1D31745
3 changed files with 167 additions and 99 deletions
  1. +40
    -3
      README.md
  2. +102
    -0
      src/cli.js
  3. +25
    -96
      src/index.js

+ 40
- 3
README.md View File

@@ -4,10 +4,13 @@ Scraper
This project is a basic tool to scrape data from a website
using a CSS selector.

For example, you are able to retrieve the number of a project's releases hosted on github:
For example, if you want to retrieve the number of releases of a project hosted on GitHub:

With CLI
---

```
node src/index.js \
node src/cli.js \
--url https://github.com/foo/bar \
--selector '.repository-content .numbers-summary li:nth-child(4) a' \
--tags \
@@ -19,7 +22,41 @@ node src/index.js \

...will show `XXX releases`.

More help with `node src/index.js --help`.
More help with `node src/cli.js --help`.

With code
---------

```
const scraper = require('deblan-scraper')

const options = {
url: 'https://github.com/foo/bar',
acceptAllStatus: false, // Optional
method: 'GET', // Optional
}

const selector = '.repository-content .numbers-summary li:nth-child(4) a'

const filters = {
tags: null,
breaks: null,
spaces: null,
trim: null,
}

scraper(
options,
selector,
filters,
function(value) {
console.log(value)
},
function(error) {
console.log(error)
}
)
```

Installation
------------


+ 102
- 0
src/cli.js View File

@@ -0,0 +1,102 @@
const Output = require('./console/output')
const Input = require('./console/input')
const scraper = require('./index')
// Console helpers: `output` is used to print messages, `input` is built from
// the current process and exposes the parameters through has(), get() and args.
const output = new Output()
const input = new Input(process)
// Names of the command-line parameters that are treated as filters; any other
// parameter is ignored when the filter list is collected.
const filters = [
'breaks',
'spaces',
'tags',
'trim',
]

// Help text, printed for --help and whenever a required parameter is missing.
// Its first line is built from input.node and input.script.
const usage = `
${input.node} ${input.script} [params]

Parameters

--url [URL] URL to scrap
--selector [Selector] CSS selector

Optional parameters

Filters

Order has a meaning.

--breaks Removes breaks (\\n, \\r)
--trim Strips whitespaces from the beginning and end of the value
--spaces Replaces 2 successive spaces by 1, except breaks
--tags, --tags [TAGS] Removes tags. You can specify the tags to remove (separated by comma)

HTTP

--method [METHOD] HTTP Method
--accept-http-error Accepts all status codes (like 404)

--verbose, -v Show message of error
`

// --help takes precedence over everything else: print the usage text and
// stop with a success status.
if (input.has('help')) {
  output.write(usage)
  process.exit(0)
}

// Mandatory parameters, checked in this order. Each entry pairs the parameter
// name with the message printed when it is missing.
const requiredParams = [
  ['url', 'You must specify --url'],
  ['selector', 'You must specify --selector'],
]

for (const [name, message] of requiredParams) {
  if (input.has(name)) {
    continue
  }

  // Missing parameter: explain, show the usage text and stop with a failure status.
  output.write(message)
  output.write(usage)
  process.exit(1)
}

// Values read from the command line.
const url = input.get('url')
const selector = input.get('selector')

// HTTP settings: the method falls back to GET when --method is not given.
const method = input.has('method') ? input.get('method') : 'GET'
const acceptAllStatus = input.has('accept-http-error')

// Both --verbose and -v enable the detailed error message.
const verbose = input.has('verbose') || input.has('v')

// Filters requested by the user, keyed by filter name. Keys are added in the
// enumeration order of input.args (the usage text states that order has a meaning).
const filtersToApply = {}

for (const name in input.args) {
  if (!filters.includes(name)) {
    continue
  }

  const raw = input.args[name]

  // A bare flag (value `true`) carries no argument and is stored as null;
  // any other value is split on commas into a list.
  filtersToApply[name] = raw === true ? null : raw.split(',')
}

// Request settings handed to the scraper.
const options = { url, method, acceptAllStatus }

// Success callback: prints the value handed over by the scraper.
const onSuccess = (value) => {
  output.write(value)
}

// Failure callback: always prints the error name, adds the error message in
// verbose mode, then ends the process with exit code 1.
const onError = (error) => {
  output.write(error.name, 'error')

  if (verbose) {
    output.write(error.message, 'error')
  }

  process.exit(1)
}

// Start scraping; the outcome is reported through onSuccess or onError.
scraper(options, selector, filtersToApply, onSuccess, onError)

+ 25
- 96
src/index.js View File

@@ -1,8 +1,5 @@
const Output = require('./console/output')
const Input = require('./console/input')
const rq = require('request-promise')
const cheerio = require('cheerio')

const filters = {
breaks: require('./filter/breaks'),
spaces: require('./filter/spaces'),
@@ -10,99 +7,31 @@ const filters = {
trim: require('./filter/trim'),
}

const output = new Output()
const input = new Input(process)

const usage = `
${input.node} ${input.script} [params]

Parameters

--url [URL] URL to scrap
--selector [Selector] CSS selector

Optional parameters

Filters

Order has a meaning.

--breaks Removes breaks (\\n, \\r)
--trim Strips whitespaces from the beginning and end of the value
--spaces Replaces 2 successive spaces by 1, except breaks
--tags, --tags [TAGS] Removes tags. You can specify the tags to remove (separated by comma)

HTTP

--method [METHOD] HTTP Method
--accept-http-error Accepts all status codes (like 404)

--verbose, -v Show message of error
`

if (input.has('help')) {
output.write(usage)

process.exit(0)
}

if (!input.has('url')) {
output.write('You must specify --url')
output.write(usage)
const scraper = function(options, selector, filtersToApply, callbackSuccess, callbackError) {
filtersToApply = filtersToApply || {}

process.exit(1)
}

if (!input.has('selector')) {
output.write('You must specify --selector')
output.write(usage)

process.exit(1)
}

const url = input.get('url')
const selector = input.get('selector')
const method = input.has('method') ? input.get('method') : 'GET'
const acceptAllStatus = input.has('accept-http-error')
const verbose = input.has('verbose') || input.has('v')

let filtersToApply = {}

for (let param in input.args) {
if (filters.hasOwnProperty(param)) {
let value = input.args[param]

if (value !== true) {
value = value.split(',')
} else {
value = null
}

filtersToApply[param] = value
}
}

rq({
uri: url,
method: method,
simple: !acceptAllStatus
})
.then(function(body) {
const $ = cheerio.load(body)
let value = $(selector).html()

for (let filter in filtersToApply) {
value = filters[filter](value, filtersToApply[filter])
}

output.write(value)
rq({
uri: options.url,
method: options.method || 'GET',
simple: !(options.acceptAllStatus || false)
})
.catch(function(error) {
output.write(error.name, 'error')

if (verbose) {
output.write(error.message, 'error')
}
.then(function(body) {
const $ = cheerio.load(body)
let value = $(selector).html()

for (let filter in filtersToApply) {
value = filters[filter](value, filtersToApply[filter])
}

if (callbackSuccess) {
callbackSuccess(value)
}
})
.catch(function(error) {
if (callbackError) {
callbackError(error)
}
})
}

process.exit(1)
})
module.exports = scraper

Loading…
Cancel
Save