2020-04-11 22:31:23 +02:00
|
|
|
Scraper
|
|
|
|
=======
|
|
|
|
|
2023-03-31 21:34:27 +02:00
|
|
|
[![Build Status](https://ci.gitnet.fr/api/badges/deblan/scraper/status.svg)](https://ci.gitnet.fr/deblan/scraper)
|
2020-09-02 10:55:15 +02:00
|
|
|
|
2020-04-11 22:31:23 +02:00
|
|
|
This project is a basic tool to scrape data from a website
|
|
|
|
using a CSS selector.
|
|
|
|
|
2020-04-14 18:16:19 +02:00
|
|
|
For example, if you want to retrieve the number of releases of a project hosted on GitHub:
|
|
|
|
|
|
|
|
With CLI
|
|
|
|
---
|
2020-04-11 22:31:23 +02:00
|
|
|
|
|
|
|
```
|
2020-04-14 18:16:19 +02:00
|
|
|
node src/cli.js \
|
2020-04-11 22:45:15 +02:00
|
|
|
--url https://github.com/foo/bar \
|
|
|
|
--selector '.repository-content .numbers-summary li:nth-child(4) a' \
|
|
|
|
--tags \
|
|
|
|
--breaks \
|
|
|
|
--spaces \
|
|
|
|
--breaks \
|
|
|
|
--trim
|
2020-04-11 22:31:23 +02:00
|
|
|
```
|
|
|
|
|
|
|
|
...will show `XXX releases`.
|
|
|
|
|
2020-04-14 18:16:19 +02:00
|
|
|
For more help, run `node src/cli.js --help`.
|
|
|
|
|
|
|
|
With code
|
|
|
|
---------
|
|
|
|
|
|
|
|
```
|
|
|
|
const scraper = require('deblan-scraper')
|
|
|
|
|
|
|
|
const options = {
|
2020-04-15 10:34:47 +02:00
|
|
|
url: 'https://github.com/foo/bar',
|
|
|
|
acceptAllStatus: false, // Optional, default is `false`
|
|
|
|
method: 'GET', // Optional, default is `GET`
|
2020-04-14 18:16:19 +02:00
|
|
|
}
|
|
|
|
|
2020-11-10 13:30:38 +01:00
|
|
|
const isMultiple = false // get the first result, `true` to get an array of results
|
|
|
|
|
2020-04-14 18:16:19 +02:00
|
|
|
const selector = '.repository-content .numbers-summary li:nth-child(4) a'
|
|
|
|
|
|
|
|
const filters = {
|
2020-04-15 10:34:47 +02:00
|
|
|
tags: null, // Removes tags. You can specify the tags to remove (separated by comma)
|
|
|
|
breaks: null, // Removes breaks (\n, \r)
|
|
|
|
spaces: null, // Replaces 2 successive spaces with 1, except line breaks
|
|
|
|
trim: null, // Strips whitespace from the beginning and end of the value
|
2020-04-14 18:16:19 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
scraper(
|
2020-04-15 10:34:47 +02:00
|
|
|
options,
|
|
|
|
selector,
|
|
|
|
filters,
|
|
|
|
function(value) {
|
|
|
|
console.log(value)
|
|
|
|
},
|
|
|
|
function(error) {
|
|
|
|
console.log(error)
|
2020-11-10 13:30:38 +01:00
|
|
|
},
|
|
|
|
isMultiple
|
2020-04-14 18:16:19 +02:00
|
|
|
)
|
|
|
|
```
|
2020-04-11 22:35:30 +02:00
|
|
|
|
|
|
|
Installation
|
|
|
|
------------
|
|
|
|
|
|
|
|
Requirements:
|
|
|
|
|
|
|
|
* node >= 10
|
|
|
|
* yarn
|
|
|
|
|
|
|
|
```
|
|
|
|
$ git clone https://gitnet.fr/deblan/scraper.git
|
|
|
|
$ cd scraper
|
|
|
|
$ yarn
|
|
|
|
```
|