From d132b54db46ef64a8c7e300b79283747c79b3b9d Mon Sep 17 00:00:00 2001
From: Barbedouce
Date: Sat, 2 May 2020 18:02:46 +0200
Subject: [PATCH] initial commit

---
 w3spider.py | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 w3spider.py

diff --git a/w3spider.py b/w3spider.py
new file mode 100644
index 0000000..51413ac
--- /dev/null
+++ b/w3spider.py
@@ -0,0 +1,79 @@
+import sys
+import tempfile
+from urllib.parse import urljoin
+
+import requests
+from bs4 import BeautifulSoup
+
+version = "0.1a"
+
+HTML_VALIDATOR_URL = "https://validator.w3.org/nu/?out=json"
+#CSS_VALIDATOR_URL = "http://jigsaw.w3.org/css-validator/validator"
+HELP_TEXT = ("w3spider " + version + "\n Usage : " + sys.argv[0] + " [TARGET URL] [0-9]\n"
+             " This can take a while with a high recursion level, or if the target has many errors.")
+
+
+def validate(filename):  # send a page to the W3C validator and print its messages
+    is_remote = filename.startswith("http://") or filename.startswith("https://")
+    # remote pages are downloaded into a temporary file, local paths are opened directly
+    with (tempfile.TemporaryFile() if is_remote else open(filename, "rb")) as f:
+        if is_remote:
+            r = requests.get(filename, verify=False)  # certificate verification is disabled
+            f.write(r.content)
+            f.seek(0)
+        r = requests.post(HTML_VALIDATOR_URL,
+                          files={"file": (filename, f, "text/html")},
+                          data={"out": "json", "showsource": "yes"},
+                          verify=False)
+        result = r.json()
+        for msg in result['messages']:
+            # some messages (e.g. non-document errors) carry no line number
+            line = msg.get('lastLine', '?')
+            print("Line : " + str(line) + " : " + msg['type'] + " " + msg['message'])
+
+
+def spider(url):  # collect links from a page and add the new ones to the waitlist
+    req = requests.get(url)
+    soup = BeautifulSoup(req.text, "lxml")
+    for link in soup.find_all('a', {"href": True}):
+        href = link['href']
+        if href.startswith(('mailto:', '#')):
+            continue  # skip mail links and in-page anchors
+        # resolve relative links against the current page
+        href = urljoin(url, href)
+        if not href.startswith(arg):
+            continue  # stay inside the target site
+        if href.endswith(('.pdf', '.jpeg', '.png', '.jpg')):
+            continue  # excluded extensions
+        if href not in waitlist:
+            print('new page found : ', href)
+            waitlist.append(href)
+
+
+def validthemall():  # submit every page of the waitlist to the W3C validator
+    for x in waitlist:
+        print('/' * 14, 'w3c :', x, '/' * 14)
+        validate(x)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print(HELP_TEXT)
+    else:
+        arg = sys.argv[1]
+        lvl = int(sys.argv[2]) if len(sys.argv) == 3 else 0
+        waitlist = [arg]
+        if lvl > 0:
+            print('/' * 14, 'Crawl :', arg, 'with', lvl, 'level(s) of recursion.', '/' * 14)
+
+        crawled = set()
+        for i in range(lvl):
+            # iterate over a snapshot so pages found at this level are crawled at the next one
+            for url in list(waitlist):
+                if url in crawled:
+                    continue
+                print('Searching new pages in :', url)
+                spider(url)
+                crawled.add(url)
+
+        validthemall()
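
A usage sketch for the script above; the target URL and depth are illustrative placeholders, not values from the commit:

    # validate only the start page
    python w3spider.py https://example.org

    # crawl two levels of links inside example.org, then validate every page found
    python w3spider.py https://example.org 2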
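
For reference, validate() relies only on the 'messages' list of the validator's JSON reply, reading the 'type', 'message' and optional 'lastLine' fields of each entry. A minimal sketch of that parsing, assuming the response shape implied by the code (the message values are made up):

    # sample shaped like what validate() expects; values are invented for illustration
    sample = {"messages": [
        {"type": "error", "lastLine": 12, "message": "Element not allowed here."},
        {"type": "info", "message": "Using the HTML schema."},  # no lastLine field
    ]}
    for msg in sample["messages"]:
        print("Line :", msg.get("lastLine", "?"), ":", msg["type"], msg["message"])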