#!/usr/bin/env python
#
# Copyright (C) 2004 Stefan Seefeld
# All rights reserved.
# Licensed to the public under the terms of the GNU LGPL (>= 2),
# see the file COPYING for details.
#


from xml.sax import saxexts, saxlib, saxutils
import sys, os, string, urllib, urlparse
import getopt

# When true, progress messages ('parsing ...', 'validating ...') are printed.
# Switched on by the -p/--print option in main().
verbose = False

class Reference:
    """A single link: the document it was found in, the line number it
    was found on, and the (possibly relative) target it points to."""

    def __init__(self, orig, line, ref):
        # orig: identifier of the referring document, used as the base for
        #       resolving `ref`
        # line: line number of the link inside the referring document
        # ref:  the link target as written in the document
        self.orig, self.line, self.ref = orig, line, ref

class DocumentHandler(saxlib.DocumentHandler):
    """Store urefs with the linenumbers they were encountered in,
    so we can either traverse them, too, or report errors with specific
    line numbers."""

    def __init__(self):

        # Maps each href value to the line number of its first occurrence.
        self.urefs = {}
        # Assigned in setDocumentLocator().
        self.locator = None

    def get_urefs(self):
        """Return the links collected so far as a list of Reference objects
        and reset the handler (collected links and locator) afterwards."""

        # The locator is deliberately only touched per collected link, so a
        # document without any links never needs one.
        urefs = [Reference(self.locator.getSystemId(), line, href)
                 for (href, line) in self.urefs.items()]
        self.urefs = {}
        self.locator = None
        return urefs

    def setDocumentLocator(self, locator):
        "Receive an object for locating the origin of SAX document events."
        self.locator = locator

    def startElement(self, name, attrs):
        "Look for anchors and store the links they carry."

        if name != 'a':
            return

        # An anchor need not carry an href (e.g. <a name="...">).
        # NOTE(review): whether getValue() raises KeyError or returns None
        # for a missing attribute presumably depends on the attribute list
        # implementation handed in by the parser driver; cope with both.
        try:
            href = attrs.getValue('href')
        except KeyError:
            href = None
        if href is None:
            return

        # Only the first occurrence of a given target is remembered.
        if href not in self.urefs:
            self.urefs[href] = self.locator.getLineNumber()

from xml.sax.drivers import drv_xmlproc
# A single handler / parser pair is created at import time; main() reuses
# both for every document it processes.
handler = DocumentHandler()

SAXparser = drv_xmlproc.SAX_XPParser()
SAXparser.setDocumentHandler(handler)
# main() catches saxlib.SAXParseException around SAXparser.parse();
# presumably ErrorRaiser is what turns reported errors into exceptions.
SAXparser.setErrorHandler(saxutils.ErrorRaiser())

def validate(url):
    """validate (x)html conformance using 'tidy'."""

    if verbose: print 'validating', url
    status = os.system('tidy -errors -quiet "%s"'%url)
    if os.WIFSIGNALED(status):
        print 'internal error:', os.WTERMSIG(status)
    elif os.WIFEXITED(status):
        if os.WEXITSTATUS(status) == 2:
            print 'validation failed'
        return
    else:
        print 'internal error !'
        
def usage():
   print 'Usage : %s [options] <input files>'%sys.argv[0]
   print """
List of options:

  -h, --help             help
  -p, --print            provide verbose feedback during validation
  -m, --maximum          maximum number of pages to validate
  -v, --validate         call http://validator.w3.org to validate html
  -e, --external         follow external links
"""

def main():
   global verbose

   max = 50
   external = False
   do_validate = False

   opts, args = getopt.getopt(sys.argv[1:],
                              'pm:evh',
                              ['print', 'maximum=', 'external', 'validate', 'help'])
   for o, a in opts:
      if o in ['-h', '--help']:
         usage()
         sys.exit(0)
      elif o in ['-p', '--print']: verbose = True
      elif o in ['-m', '--maximum']: max = int(a)
      elif o in ['-e', '--external']: external = True
      elif o in ['-v', '--validate']: do_validate = True

   if not args:
         usage()
         sys.exit(0)

   done = []
   urefs = [Reference('.', 0, args[0])]
   while urefs and (max == -1 or len(done) < max):

       uref = urefs.pop(0)
       url = urlparse.urljoin(uref.orig, uref.ref)
       scheme, location, path, query, fragment = urlparse.urlsplit(url)
       if not external and scheme and scheme != 'file': continue
       url = urlparse.urlunsplit((scheme, location, path, query, ''))
       if url in done: continue
       try:
           if verbose: print 'parsing', url
           SAXparser.parse(url)
           if do_validate: validate(url)
           done.append(url)
           urefs.extend(handler.get_urefs())
            
       except saxlib.SAXParseException, e:
           sys.stderr.write('%s; processing aborted\n'%e)
           break

# Only start crawling when run as a script, not when imported as a module.
if __name__ == "__main__":
    main()
