From 208088416ee81665e962c3344bb68b15536ca050 Mon Sep 17 00:00:00 2001 From: Simon Vieille Date: Tue, 14 Apr 2020 18:16:19 +0200 Subject: [PATCH] refactoring: index.js is now a module, add cli.js --- README.md | 43 +++++++++++++++++-- src/cli.js | 102 +++++++++++++++++++++++++++++++++++++++++++ src/index.js | 119 +++++++++++---------------------------------------- 3 files changed, 166 insertions(+), 98 deletions(-) create mode 100644 src/cli.js diff --git a/README.md b/README.md index f2e4669..cbb9377 100644 --- a/README.md +++ b/README.md @@ -4,10 +4,13 @@ Scraper This project is a basic tool to scrap a data from a website using a CSS selector. -For example, you are able to retrieve the number of a project's releases hosted on github: +For example, to retrieve the number of releases of a project hosted on GitHub: + +With CLI +--- ``` -node src/index.js \ +node src/cli.js \ --url https://github.com/foo/bar \ --selector '.repository-content .numbers-summary li:nth-child(4) a' \ --tags \ @@ -19,7 +22,41 @@ node src/index.js \ ...will show `XXX releases`. -More help with `node src/index.js --help`. +More help with `node src/cli.js --help`.
+ +With code +--------- + +``` +const scraper = require('deblan-scraper') + +const options = { + url: 'https://github.com/foo/bar', + acceptAllStatus: false, // Optional + method: 'GET', // Optional +} + +const selector = '.repository-content .numbers-summary li:nth-child(4) a' + +const filters = { + tags: null, + breaks: null, + spaces: null, + trim: null, +} + +scraper( + options, + selector, + filters, + function(value) { + console.log(value) + }, + function(error) { + console.log(error) + } +) +``` Installation ------------ diff --git a/src/cli.js b/src/cli.js new file mode 100644 index 0000000..1b147fa --- /dev/null +++ b/src/cli.js @@ -0,0 +1,102 @@ +const Output = require('./console/output') +const Input = require('./console/input') +const scraper = require('./index') +const output = new Output() +const input = new Input(process) +const filters = [ + 'breaks', + 'spaces', + 'tags', + 'trim', +] + +const usage = ` +${input.node} ${input.script} [params] + +Parameters + + --url [URL] URL to scrap + --selector [Selector] CSS selector + +Optional parameters + + Filters + + Order has a meaning. + + --breaks Removes breaks (\\n, \\r) + --trim Strips whitespaces from the beginning and end of the value + --spaces Replaces 2 successive spaces by 1, except breaks + --tags, --tags [TAGS] Removes tags. You can specify the tags to remove (separated by comma) + + HTTP + + --method [METHOD] HTTP Method + --accept-http-error Accepts all status codes (like 404) + + --verbose, -v Show message of error +` + +if (input.has('help')) { + output.write(usage) + + process.exit(0) +} + +if (!input.has('url')) { + output.write('You must specify --url') + output.write(usage) + + process.exit(1) +} + +if (!input.has('selector')) { + output.write('You must specify --selector') + output.write(usage) + + process.exit(1) +} + +const url = input.get('url') +const selector = input.get('selector') +const method = input.has('method') ? 
input.get('method') : 'GET' +const acceptAllStatus = input.has('accept-http-error') +const verbose = input.has('verbose') || input.has('v') + +let filtersToApply = {} + +for (let param in input.args) { + if (filters.includes(param)) { + let value = input.args[param] + + if (value !== true) { + value = value.split(',') + } else { + value = null + } + + filtersToApply[param] = value + } +} + +const options = { + url: url, + method: method, + acceptAllStatus: acceptAllStatus, +} + +const onSuccess = function(value) { + output.write(value) +} + +const onError = function(error) { + output.write(error.name, 'error') + + if (verbose) { + output.write(error.message, 'error') + } + + process.exit(1) +} + +scraper(options, selector, filtersToApply, onSuccess, onError) diff --git a/src/index.js b/src/index.js index 1344208..7544659 100644 --- a/src/index.js +++ b/src/index.js @@ -1,8 +1,5 @@ -const Output = require('./console/output') -const Input = require('./console/input') const rq = require('request-promise') const cheerio = require('cheerio') - const filters = { breaks: require('./filter/breaks'), spaces: require('./filter/spaces'), @@ -10,99 +7,31 @@ const filters = { trim: require('./filter/trim'), } -const output = new Output() -const input = new Input(process) +const scraper = function(options, selector, filtersToApply, callbackSuccess, callbackError) { + filtersToApply = filtersToApply || {} -const usage = ` -${input.node} ${input.script} [params] - -Parameters - - --url [URL] URL to scrap - --selector [Selector] CSS selector - -Optional parameters - - Filters - - Order has a meaning. - - --breaks Removes breaks (\\n, \\r) - --trim Strips whitespaces from the beginning and end of the value - --spaces Replaces 2 successive spaces by 1, except breaks - --tags, --tags [TAGS] Removes tags. 
You can specify the tags to remove (separated by comma) - - HTTP - - --method [METHOD] HTTP Method - --accept-http-error Accepts all status codes (like 404) - - --verbose, -v Show message of error -` - -if (input.has('help')) { - output.write(usage) - - process.exit(0) -} - -if (!input.has('url')) { - output.write('You must specify --url') - output.write(usage) - - process.exit(1) -} - -if (!input.has('selector')) { - output.write('You must specify --selector') - output.write(usage) - - process.exit(1) -} - -const url = input.get('url') -const selector = input.get('selector') -const method = input.has('method') ? input.get('method') : 'GET' -const acceptAllStatus = input.has('accept-http-error') -const verbose = input.has('verbose') || input.has('v') - -let filtersToApply = {} - -for (let param in input.args) { - if (filters.hasOwnProperty(param)) { - let value = input.args[param] - - if (value !== true) { - value = value.split(',') - } else { - value = null - } - - filtersToApply[param] = value - } -} - -rq({ - uri: url, - method: method, - simple: !acceptAllStatus -}) - .then(function(body) { - const $ = cheerio.load(body) - let value = $(selector).html() - - for (let filter in filtersToApply) { - value = filters[filter](value, filtersToApply[filter]) - } - - output.write(value) + rq({ + uri: options.url, + method: options.method || 'GET', + simple: !(options.acceptAllStatus || false) }) - .catch(function(error) { - output.write(error.name, 'error') + .then(function(body) { + const $ = cheerio.load(body) + let value = $(selector).html() - if (verbose) { - output.write(error.message, 'error') - } + for (let filter in filtersToApply) { + value = filters[filter](value, filtersToApply[filter]) + } - process.exit(1) - }) + if (callbackSuccess) { + callbackSuccess(value) + } + }) + .catch(function(error) { + if (callbackError) { + callbackError(error) + } + }) +} + +module.exports = scraper