#!/usr/bin/python3
import sys
import tempfile

import requests
from bs4 import BeautifulSoup

version = "0.1"
HTML_VALIDATOR_URL = "https://validator.w3.org/nu/?out=json"
#CSS_VALIDATOR_URL = "http://jigsaw.w3.org/css-validator/validator"

help = ("w3spider " + version + "\n"
        " Usage : " + sys.argv[0] + " [TARGET URL] [0-9]\n"
        " This can take a while if you ask for a high recursion level,"
        " or if the target contains many errors.")


def validate(filename, verbose=False):
    # Send a URL or a local file to the W3C validator and print its messages.
    is_remote = filename.startswith("http://") or filename.startswith("https://")
    # Remote pages are downloaded into a temporary file first; local files are opened directly.
    with tempfile.TemporaryFile() if is_remote else open(filename, "rb") as f:
        if is_remote:
            r = requests.get(filename, verify=False)  # TLS verification is deliberately disabled
            f.write(r.content)
            f.seek(0)
        r = requests.post(HTML_VALIDATOR_URL,
                          files={"file": (filename, f, "text/html")},
                          data={"out": "json", "showsource": "yes"},
                          verify=False)
    result = r.json()
    for msg in result['messages']:
        # Some validator messages carry no line number, so fall back to '?'.
        msginline = "Line : " + str(msg.get('lastLine', '?')) + " : " + msg['type'] + " " + msg['message']
        print(msginline)


def spider(url):
    # Search the page for links and queue new ones in the global waitlist.
    req = requests.get(url)
    soup = BeautifulSoup(req.text, "lxml")
    for link in soup.find_all('a', {"href": True}):
        if link['href'].startswith(arg):
            if link['href'] not in waitlist:
                if link['href'].endswith(('.pdf', '.jpeg', '.png', '.jpg')):
                    # excluded extensions
                    continue
                waitlist.append(link['href'])
                print('new page found : ', link['href'])
        if link['href'].startswith(('http', 'mailto', '#')):
            continue
        # Not an anchor, not a mailto, not absolute: it is probably a relative link,
        # so turn it into an absolute one.
        link['href'] = arg + '/' + link['href']
        if link['href'] not in waitlist:
            print('new page found : ', link['href'])
            waitlist.append(link['href'])


def validthemall():
    # Submit every queued page to the W3C validator.
    for x in waitlist:
        print('/' * 14, 'w3c :', x, '/' * 14)
        validate(x)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print(help)
    else:
        arg = sys.argv[1]
        if len(sys.argv) == 3:
            lvl = int(sys.argv[2])
        else:
            lvl = 0
        waitlist = [sys.argv[1]]
        if lvl > 0:
            print('/' * 14, 'Crawl :', arg, 'with', lvl, 'level(s) of recursion', '/' * 14)
            for i in range(lvl):
                # Iterate over a snapshot so pages discovered at this depth
                # are only crawled on the next pass.
                for url in list(waitlist):
                    print('Searching new pages in :', url)
                    spider(url)
        validthemall()
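
# Example invocations (illustrative only; the URL and file name below are placeholders):
#   ./w3spider.py https://example.org       # validate the single given page (lvl defaults to 0)
#   ./w3spider.py https://example.org 2     # crawl two levels deep, then validate every page found
# validate() also accepts a local file path, e.g. validate("index.html").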