#!/usr/bin/python3
import sys
import tempfile

import requests
from bs4 import BeautifulSoup

version = "0.1"
HTML_VALIDATOR_URL = "https://validator.w3.org/nu/?out=json"
#CSS_VALIDATOR_URL = "http://jigsaw.w3.org/css-validator/validator"

help = ("w3spider " + version + "\n"
        " Usage : " + sys.argv[0] + " [TARGET URL] [0-9]\n"
        " This can take a while if you ask for a high recursion level,"
        " or if the target contains many errors.")


def validate(filename, verbose=False):
    # Send a URL or a local file to the W3C validator and print its messages.
    is_remote = filename.startswith("http://") or filename.startswith("https://")
    # Remote pages are downloaded into a temporary file first; local files are opened directly.
    with tempfile.TemporaryFile() if is_remote else open(filename, "rb") as f:
        if is_remote:
            r = requests.get(filename, verify=False)  # TLS verification is deliberately disabled
            f.write(r.content)
            f.seek(0)
        r = requests.post(HTML_VALIDATOR_URL,
                          files={"file": (filename, f, "text/html")},
                          data={"out": "json", "showsource": "yes"},
                          verify=False)
    result = r.json()
    for msg in result['messages']:
        # Some validator messages carry no line number, so fall back to '?'.
        msginline = "Line : " + str(msg.get('lastLine', '?')) + " : " + msg['type'] + " " + msg['message']
        print(msginline)


def spider(url):
    # Search the page for links and queue new ones in the global waitlist.
    req = requests.get(url)
    soup = BeautifulSoup(req.text, "lxml")
    for link in soup.find_all('a', {"href": True}):
        if link['href'].startswith(arg):
            if link['href'] not in waitlist:
                if link['href'].endswith(('.pdf', '.jpeg', '.png', '.jpg')):
                    # excluded extensions
                    continue
                waitlist.append(link['href'])
                print('new page found : ', link['href'])
        if link['href'].startswith(('http', 'mailto', '#')):
            continue
        # Not an anchor, not a mailto, not absolute: it is probably a relative link,
        # so turn it into an absolute one.
        link['href'] = arg + '/' + link['href']
        if link['href'] not in waitlist:
            print('new page found : ', link['href'])
            waitlist.append(link['href'])


def validthemall():
    # Submit every queued page to the W3C validator.
    for x in waitlist:
        print('/' * 14, 'w3c :', x, '/' * 14)
        validate(x)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print(help)
    else:
        arg = sys.argv[1]
        if len(sys.argv) == 3:
            lvl = int(sys.argv[2])
        else:
            lvl = 0
        waitlist = [sys.argv[1]]
        if lvl > 0:
            print('/' * 14, 'Crawl :', arg, 'with', lvl, 'level(s) of recursion', '/' * 14)
            for i in range(lvl):
                # Iterate over a snapshot so pages discovered at this depth
                # are only crawled on the next pass.
                for url in list(waitlist):
                    print('Searching new pages in :', url)
                    spider(url)
        validthemall()
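
# Example invocations (illustrative only; the URL and file name below are placeholders):
#   ./w3spider.py https://example.org       # validate the single given page (lvl defaults to 0)
#   ./w3spider.py https://example.org 2     # crawl two levels deep, then validate every page found
# validate() also accepts a local file path, e.g. validate("index.html").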