Browse Source

add scraper

tags/v1.0.0
Simon Vieille 1 month ago
parent
commit
661b5c90f5
8 changed files with 228 additions and 0 deletions
  1. +1
    -0
      .gitignore
  2. +58
    -0
      src/console/input.js
  3. +37
    -0
      src/console/output.js
  4. +5
    -0
      src/filter/breaks.js
  5. +5
    -0
      src/filter/spaces.js
  6. +7
    -0
      src/filter/tags.js
  7. +7
    -0
      src/filter/trim.js
  8. +108
    -0
      src/index.js

+ 1
- 0
.gitignore View File

@@ -0,0 +1 @@
/node_modules

+ 58
- 0
src/console/input.js View File

@@ -0,0 +1,58 @@
"use strict";

var Minimist = require('minimist');

/**
 * Thin wrapper around the command line arguments of a process.
 *
 * Arguments are parsed once, in the constructor, with minimist and then
 * queried through `get`, `has` and `empty`.
 */
class Input {
  /**
   * Constructor.
   *
   * @param {object} process Process-like object; only `argv` is read.
   */
  constructor(process) {
    // argv[0] is the node binary and argv[1] the script path; the real
    // arguments start at index 2.
    this.args = Minimist(process.argv.slice(2));
    this.node = process.argv[0];
    this.script = process.argv[1];
  }

  /**
   * Return the value of the given argument.
   *
   * @param {string} name
   * @param {*} [defaultValue] Returned when the argument is absent.
   *
   * @return {*} The argument value, else `defaultValue`, else `null`.
   */
  get(name, defaultValue) {
    if (this.has(name)) {
      return this.args[name];
    }

    return defaultValue !== undefined ? defaultValue : null;
  }

  /**
   * Check that the given argument name exists.
   *
   * The check goes through Object.prototype rather than `this.args` itself:
   * the parsed arguments are user controlled, so an argument literally named
   * `hasOwnProperty` would otherwise replace the method and make every
   * subsequent lookup throw.
   *
   * @param {string} name
   *
   * @return {boolean}
   */
  has(name) {
    return Object.prototype.hasOwnProperty.call(this.args, name);
  }

  /**
   * Return whether no named argument was given.
   *
   * NOTE(review): the comparison with 1 presumably accounts for the `_` key
   * under which minimist stores positional arguments — confirm.
   *
   * @return {boolean}
   */
  empty() {
    return Object.keys(this.args).length === 1;
  }
}

module.exports = Input

+ 37
- 0
src/console/output.js View File

@@ -0,0 +1,37 @@
"use strict";

/**
 * Console writer used by the command line tool.
 */
class Output {
  /**
   * Serialize data to JSON and print it.
   *
   * `undefined` values are emitted as `null` so that the corresponding keys
   * are kept in the serialized output.
   *
   * @param {*} data
   * @param {boolean} [pretty] Indent the output with 2 spaces when truthy.
   */
  json(data, pretty) {
    const indent = pretty ? 2 : null;
    const undefinedToNull = (key, value) => (value === undefined ? null : value);

    return this.write(JSON.stringify(data, undefinedToNull, indent));
  }

  /**
   * Print data.
   *
   * @param {*} data
   * @param {string} [level] Name of the console method to call; any falsy
   *                         value falls back to 'log'.
   */
  write(data, level) {
    const method = level ? level : 'log';

    console[method](data);
  }
}

module.exports = Output

+ 5
- 0
src/filter/breaks.js View File

@@ -0,0 +1,5 @@
/**
 * Remove every line feed and carriage return from the value.
 *
 * @param {string} value
 *
 * @return {string}
 */
const filter = (value) => {
  const lineBreaks = /(\n|\r)/g;

  return value.replace(lineBreaks, '');
};

module.exports = filter

+ 5
- 0
src/filter/spaces.js View File

@@ -0,0 +1,5 @@
/**
 * Collapse every run of two or more whitespace characters into one space,
 * leaving line breaks untouched.
 *
 * `\s` would also match `\n` and `\r`, so blank lines were swallowed by this
 * filter; the character class below is "whitespace that is not a line
 * break". Removing breaks is the job of the dedicated breaks filter.
 *
 * @param {string} value
 *
 * @return {string}
 */
const filter = function (value) {
  return value.replace(/[^\S\r\n]{2,}/g, ' ');
};

module.exports = filter

+ 7
- 0
src/filter/tags.js View File

@@ -0,0 +1,7 @@
const striptags = require('striptags')

/**
 * Strip HTML tags from the value by delegating to striptags.
 *
 * NOTE(review): `tags` is forwarded unchanged as the second argument of
 * striptags, which that library documents as the tags to allow (keep) —
 * confirm this is the intended meaning for callers.
 *
 * @param {string} value
 * @param {string[]|null} [tags]
 *
 * @return {string}
 */
const filter = (value, tags) => striptags(value, tags);

module.exports = filter

+ 7
- 0
src/filter/trim.js View File

@@ -0,0 +1,7 @@
const trim = require('trim')

/**
 * Strip whitespace from the beginning and the end of the value.
 *
 * Uses the native String.prototype.trim, so this filter no longer depends
 * on a third-party package for something the language provides.
 *
 * @param {string} value
 *
 * @return {string}
 */
const filter = function (value) {
  return value.trim();
};

module.exports = filter

+ 108
- 0
src/index.js View File

@@ -0,0 +1,108 @@
const Output = require('./console/output')
const Input = require('./console/input')
const rq = require('request-promise')
const cheerio = require('cheerio')

// Available value filters, keyed by the name of the command line flag that
// enables them.
const filters = {
  breaks: require('./filter/breaks'),
  spaces: require('./filter/spaces'),
  tags: require('./filter/tags'),
  trim: require('./filter/trim'),
};

const output = new Output();
const input = new Input(process);

// Help text. The --tags list is handed to striptags as its allowed-tags
// argument, so the listed tags are the ones that are kept, not removed.
const usage = `
${input.node} ${input.script} [params]

Parameters

--url [URL] URL to scrap
--selector [Selector] CSS selector

Optional parameters

Filters

Order has a meaning.

--breaks Removes breaks (\\n, \\r)
--trim Strips whitespaces from the beginning and end of the value
--spaces Replaces 2 successive spaces by 1, except breaks
--tags, --tags [TAGS] Removes tags. You can specify the tags to keep (separated by comma)

HTTP

--method [METHOD] HTTP Method
--accept-http-error Accepts all status code (like 404)

--verbose, -v Show message of error
`;

// --help wins over everything else: print the usage and stop successfully.
if (input.has('help')) {
  output.write(usage);

  process.exit(0);
}

// Mandatory parameters, checked in this order; the first missing one aborts
// the program with its message followed by the usage.
const requiredParams = [
  ['url', 'You must specify --url'],
  ['selector', 'You must specify --selector'],
];

for (const [paramName, missingMessage] of requiredParams) {
  if (input.has(paramName)) {
    continue;
  }

  output.write(missingMessage);
  output.write(usage);

  process.exit(1);
}

/**
 * Normalize the raw command line value of a filter flag.
 *
 * The argument parser may yield `true` (bare flag), `false` (negated flag),
 * a string, a number, or an array when the flag is repeated; calling
 * `.split` directly on anything but a string throws.
 *
 * @param {*} raw Value found in the parsed arguments.
 *
 * @return {string[]|null|undefined} List of comma separated options, `null`
 *                                   when the flag carries no option, or
 *                                   `undefined` when the filter is disabled.
 */
function parseFilterOption(raw) {
  if (raw === false) {
    return undefined;
  }

  const items = (Array.isArray(raw) ? raw : [raw]).filter(function (item) {
    return typeof item !== 'boolean';
  });

  if (items.length === 0) {
    return null;
  }

  return items.map(String).join(',').split(',');
}

const url = input.get('url');
const selector = input.get('selector');
const method = input.get('method', 'GET');
const acceptAllStatus = input.has('accept-http-error');
const verbose = input.has('verbose') || input.has('v');

// Filters are collected in the order the flags were given, because they are
// later applied by iterating over this object.
const filtersToApply = {};

for (const param of Object.keys(input.args)) {
  if (!Object.prototype.hasOwnProperty.call(filters, param)) {
    continue;
  }

  const option = parseFilterOption(input.args[param]);

  if (option !== undefined) {
    filtersToApply[param] = option;
  }
}

// Fetch the page, extract the inner HTML of the first element matching the
// selector, run it through the requested filters and print the result.
rq({
  uri: url,
  method: method,
  // When `simple` is false, non-2xx responses are not turned into errors.
  simple: !acceptAllStatus,
})
  .then((body) => {
    const $ = cheerio.load(body);
    let value = $(selector).html();

    // html() yields null when nothing matches the selector; without this
    // guard the filters would crash on it (or "null" would be printed with
    // a success exit code), hiding the real cause from the user.
    if (value === null) {
      output.write(`No element matches the selector "${selector}"`, 'error');

      process.exit(1);
    }

    for (const name of Object.keys(filtersToApply)) {
      value = filters[name](value, filtersToApply[name]);
    }

    output.write(value);
  })
  .catch((error) => {
    output.write(error.name, 'error');

    if (verbose) {
      output.write(error.message, 'error');
    }

    process.exit(1);
  });

Loading…
Cancel
Save