#!/usr/bin/python3
import sys
import tempfile
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
version = "0.1"
HTML_VALIDATOR_URL = "https://validator.w3.org/nu/?out=json"
#CSS_VALIDATOR_URL = "http://jigsaw.w3.org/css-validator/validator"
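# A possible CSS counterpart, left here as a sketch only: the Jigsaw validator
# takes the page address in a 'uri' parameter and an 'output' format such as
# 'soap12'; the helper name and its exact parameters are assumptions.
#def validate_css(url):
#    r = requests.get(CSS_VALIDATOR_URL, params={"uri": url, "output": "soap12"})
#    print(r.text)  # raw SOAP response; parsing it is left for later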
help = ("w3spider " + version + "\nUsage: " + sys.argv[0] + " [TARGET URL] [0-9]\n"
        "This can take a while with a high recursion level, or if the target contains many errors.")
def validate(filename, verbose=False):  # send a file or URL to the W3C validator and display its messages
    is_remote = filename.startswith("http://") or filename.startswith("https://")
    with tempfile.TemporaryFile() if is_remote else open(filename, "rb") as f:
        if is_remote:
            r = requests.get(filename, verify=False)
            f.write(r.content)
            f.seek(0)
        r = requests.post(HTML_VALIDATOR_URL,
                          files={"file": (filename, f, "text/html")},
                          data={"out": "json", "showsource": "yes"},
                          verify=False)
        result = r.json()
        for msg in result['messages']:
            # non-document errors may lack a 'lastLine' field
            msginline = "Line " + str(msg.get('lastLine', '?')) + ": " + msg['type'] + " " + msg['message']
            print(msginline)
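# Usage sketch: validate("index.html") submits a local file directly, while
# validate("https://example.org/") (hypothetical URL) downloads the page into
# a temporary file first and submits that.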
def spider(url):  # search the page for links and queue them in the waitlist
    req = requests.get(url)
    soup = BeautifulSoup(req.text, "lxml")
    for link in soup.find_all('a', {"href": True}):
        href = link['href']
        if href.startswith(arg):
            if href not in waitlist:
                if href.endswith(('.pdf', '.jpeg', '.png', '.jpg')):  # excluded extensions
                    continue
                waitlist.append(href)
                print('new page found:', href)
        if href.startswith(('http', 'mailto', '#')):
            continue
        # not an anchor, not a mailto, not absolute: it is likely a relative link,
        # so resolve it against the current page (handles leading '/' and '../')
        href = urljoin(url, href)
        if href not in waitlist:
            print('new page found:', href)
            waitlist.append(href)
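# Example of the resolution above: while crawling https://example.org/blog/
# (hypothetical URL), a href of "post.html" is queued as
# https://example.org/blog/post.html, whereas "mailto:" and "#" links are skipped.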
def validthemall():  # submit every URL in the waitlist to the W3C validator
    for x in waitlist:
        print('/' * 14, 'w3c:', x, '/' * 14)
        validate(x)
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print(help)
    else:
        arg = sys.argv[1]
        if len(sys.argv) == 3:
            lvl = int(sys.argv[2])
        else:
            lvl = 0
        waitlist = [sys.argv[1]]
        if lvl > 0:
            print('/' * 14, 'Crawl:', arg, 'with', lvl, 'levels of recursion.', '/' * 14)
        crawled = set()
        for i in range(lvl):
            # iterate over a snapshot: spider() appends to waitlist while we loop,
            # and pages already crawled at an earlier level are skipped
            for url in list(waitlist):
                if url in crawled:
                    continue
                crawled.add(url)
                print('Searching new pages in:', url)
                spider(url)
        validthemall()
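# Example invocation (hypothetical target):
#   ./w3spider.py https://example.org 2
# crawls https://example.org two levels deep, then validates every page found.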