import java.io.*;
import java.net.*;
import java.util.*;

import javax.xml.parsers.*;
import org.w3c.dom.*;
import org.xml.sax.*;
import org.xml.sax.helpers.*;

/**
 * Checks an entire site for XHTML compliance.
 *
 * Starting from one page, every page is parsed as XML with a non-validating
 * JAXP parser; a page that parses is "good", a page the parser rejects is
 * "bad". The anchors of each good page are followed recursively as long as
 * they stay below {@link #domain} and fewer than {@link #checkNumberMax}
 * pages have been parsed successfully.
 *
 * The counters are static, so they are shared by every instance created in
 * the same JVM. The class keeps no synchronization of its own.
 *
 * NOTE(review): the parser is created with default settings apart from
 * validation being off, so DTDs/entities referenced by a page may still be
 * fetched while parsing - confirm before pointing this at untrusted sites.
 *
 * written by Mitch Fincher, www.fincher.org
 */
public class XhtmlChecker {
    DocumentBuilderFactory dbf = null;
    DocumentBuilder documentBuilder = null;
    /** URLs (as strings, without fragment) that have already been queued or checked. */
    ArrayList previousURLs = null;
    /** URL prefix a page must start with to be checked; derived from the first page when null. */
    String domain = null;
    /** Upper bound on the number of successfully parsed pages. */
    int checkNumberMax = 10;
    static int checkNumber = 0;
    static int goodPages = 0;
    static int badPages = 0;
    static int outsidePages = 0;

    /**
     * Initializes the visited-URL list and the DocumentBuilder.
     * A failure to create the parser is reported on the console and leaves
     * {@link #documentBuilder} null; {@link #check(String, int)} then refuses to run.
     **/
    public void init() {
        previousURLs = new ArrayList();
        dbf = DocumentBuilderFactory.newInstance();
        dbf.setValidating(false);

        try {
            documentBuilder = dbf.newDocumentBuilder();
            OutputStreamWriter errorWriter = new OutputStreamWriter(System.err, "UTF-8");
            documentBuilder.setErrorHandler(new MyErrorHandler(new PrintWriter(errorWriter, true)));
        } catch (ParserConfigurationException e) {
            reportInitFailure(e);
        } catch (UnsupportedEncodingException e) {
            reportInitFailure(e);
        }
    }

    /** Prints an initialization failure the same way for every cause. */
    private void reportInitFailure(Exception e) {
        e.printStackTrace();
        System.out.println("e: \"" + e + "\"");
    }

    /**
     * Simple constructor; delegates to {@link #init()}.
     **/
    public XhtmlChecker() {
        init();
    }


    /**
     * Checks one page and, if it parses, every not-yet-seen page it links to.
     *
     * @param startURL The current web page (must be an absolute URL)
     * @param level nesting level of this page below the first page
     * @throws java.net.MalformedURLException if startURL is not a valid absolute URL
     * @throws IllegalStateException if the XML parser could not be created by init()
     * @throws Exception declared for backward compatibility with existing callers
     */
    public void check(String startURL, int level) throws Exception {
        if (documentBuilder == null) {
            throw new IllegalStateException("XML parser is not available (init() failed)");
        }
        URL url = new URL(startURL);
        //System.out.println("startURL="+url.toExternalForm());
        if (checkNumber >= checkNumberMax) {
            System.out.println(checkNumber+" Exceeded maximum number of checks - "+checkNumberMax);
            return;
        }
        // The first page checked defines the site when no domain was set explicitly.
        if (domain == null) {
            domain = getBaseURL(url).toExternalForm();
        }
        // Anything outside the site is skipped, whatever its scheme (http, https, ftp, ...).
        if (!startURL.startsWith(domain)) {
            outsidePages++;
            System.out.println("    Skipping outside link: \"" + startURL + "\"");
            return;
        }
        // Remember the page before parsing so links back to it are not followed again.
        if (!previousURLs.contains(startURL)) {
            previousURLs.add(startURL);
        }

        System.out.print("Checking:: \"" + startURL + "\"");
        Document doc;
        try {
            doc = documentBuilder.parse(startURL);
        } catch (IOException ioe) {
            System.out.println(" - IOException.  skipping.");
            return;
        } catch (Exception e) {
            // Broad on purpose: any parser failure marks this page as bad
            // and must not stop the rest of the crawl.
            badPages++;
            System.out.println("\n************ Problem in \"" + startURL + "\"************");
            System.out.println("e: \"" + e + "\"");
            return;
        }
        System.out.println(" - its OK.");
        System.out.flush();
        checkNumber++;
        goodPages++;

        // Done outside the try block above so that a problem in a linked page
        // is never attributed to this (already good) page.
        followLinks(doc, url, level);
    }

    /**
     * Recursively checks every anchor target of a successfully parsed page
     * that has not been seen before.
     */
    private void followLinks(Document doc, URL pageURL, int level) throws Exception {
        org.w3c.dom.Element rootElement = doc.getDocumentElement();
        NodeList anchors = rootElement.getElementsByTagName("a");
        for (int i = 0; i < anchors.getLength(); i++) {
            String href = ((org.w3c.dom.Element) anchors.item(i)).getAttribute("href");
            if (isIgnoredLink(href)) {
                continue;
            }
            //System.out.println("  nodeList.item("+i+") \"" +  href + "\"");
            String target = resolveLink(pageURL, href);
            if (target == null) {
                continue;
            }
            if (!previousURLs.contains(target)) {
                previousURLs.add(target);
                check(target, level + 1);
            }
        }
    }

    /** Tells whether an href is one of the link kinds this checker never follows. */
    private static boolean isIgnoredLink(String href) {
        if (href.length() == 0) {
            return true;
        }
        if (href.startsWith("mailto:") || href.startsWith("#") || href.startsWith("..")) {
            return true;
        }
        return href.endsWith(".gif") || href.endsWith(".jpg")
            || href.endsWith(".txt") || href.endsWith(".pl") || href.endsWith(".java");
    }

    /**
     * Turns an href found on pageURL into an absolute URL string without
     * fragment, or returns null when the href cannot be expressed as a URL
     * (for example a scheme the JDK has no handler for).
     */
    private String resolveLink(URL pageURL, String href) {
        try {
            String base = getBaseURL(pageURL).toExternalForm();
            if (!base.endsWith("/")) {
                base = base + "/";
            }
            // java.net.URL does the relative/absolute resolution for us.
            String resolved = new URL(new URL(base), href).toExternalForm();
            // "page.html" and "page.html#top" are the same document.
            int fragment = resolved.indexOf('#');
            if (fragment >= 0) {
                resolved = resolved.substring(0, fragment);
            }
            return resolved;
        } catch (MalformedURLException ignored) {
            // Not a followable link; treated like the other ignored link kinds.
            return null;
        }
    }

    /**
     * Returns the URL that relative links on the given page are resolved
     * against: the page's directory when the path ends in .html or .shtml,
     * otherwise the path itself. Query string and fragment are dropped.
     */
    private URL getBaseURL(URL url) throws java.net.MalformedURLException {
        String path = url.getPath();
        if (path.endsWith(".html") || path.endsWith(".shtml")) {
            //trim off filename part
            int slash = path.lastIndexOf('/');
            path = (slash >= 0) ? path.substring(0, slash) : "";
        }
        return new URL(url.getProtocol(), url.getHost(), url.getPort(), path);
    }

    /**
     * @param args first argument is the starting web page, http://www.cnn.com
     * second argument is the maximum number of pages to search
     */
    public static void main(String [] args) {
        if (args.length < 1) {
            System.out.println("usage: java XhtmlChecker <startURL> [maxPages]");
            return;
        }
        XhtmlChecker xhtmlChecker = new XhtmlChecker();
        // domain is left unset on purpose: check() derives it from the start
        // page, so starting at ".../index.html" still covers its sibling pages.

        try {
            if (args.length > 1) {
                try {
                    xhtmlChecker.checkNumberMax = Integer.parseInt(args[1]);
                } catch (NumberFormatException nfe) {
                    System.out.println("maxPages must be an integer: \"" + args[1] + "\"");
                    return;
                }
                System.out.println("*checkNumberMax: \"" + xhtmlChecker.checkNumberMax + "\"");
            } else {
                System.out.println("setting checkNumberMax: to default: \"" + xhtmlChecker.checkNumberMax + "\"");
            }
            xhtmlChecker.check(args[0], 0);
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("e: \"" + e + "\"");
        }

        System.out.println("\n goodPages: \"" + goodPages + "\"");
        System.out.println(" badPages: \"" + badPages + "\"");
        System.out.println(" outsidePages/no permissions: \"" + outsidePages + "\"");

    }

    /***
     * Internal class for handling parsing errors: warnings are printed,
     * errors and fatal errors are turned into SAXExceptions so that the
     * page being parsed is rejected.
     **/
    private static class MyErrorHandler implements ErrorHandler {
        /** Error handler output goes here */
        private final PrintWriter out;

        MyErrorHandler(PrintWriter out) {
            this.out = out;
        }

        /** Formats the location and message of a parse exception. */
        private String getParseExceptionInfo(SAXParseException spe) {
            String systemId = spe.getSystemId();
            if (systemId == null) {
                systemId = "null";
            }
            return "URI=" + systemId
                + " Line=" + spe.getLineNumber()
                + ": " + spe.getMessage();
        }

        public void warning(SAXParseException spe) throws SAXException {
            String info = getParseExceptionInfo(spe);
            // Warnings containing this text are deliberately not printed.
            if (info.indexOf("Using original entity definition") == -1) {
                out.println("Warning: " + info);
            }
        }

        public void error(SAXParseException spe) throws SAXException {
            throw new SAXException("Error: " + getParseExceptionInfo(spe));
        }

        public void fatalError(SAXParseException spe) throws SAXException {
            throw new SAXException("Fatal Error: " + getParseExceptionInfo(spe));
        }
    } //inner class


}
