/**
A small site checker that spiders its way through your site, checking each page for XHTML compliance.
*/
import java.io.*;
import java.net.*;
import java.util.*;

import javax.xml.parsers.*;
import org.w3c.dom.*;
import org.xml.sax.*;
import org.xml.sax.helpers.*;


/**
 * Crawls a site starting from one URL, parsing each reachable page with a non-validating
 * XML parser and reporting pages that fail to parse.
 *
 * <p>Only URLs that start with {@link #domain} are fetched. Links whose file extension is
 * listed in {@link #skipThese} are not fetched. Progress and errors are written to
 * {@code System.out}; parser warnings are written to {@code System.err}.
 *
 * <p>Usage: {@code java SiteChecker <startURL> [maxChecks]}
 */
public class SiteChecker {
    DocumentBuilderFactory dbf = null;
    DocumentBuilder documentBuilder = null;
    /** Absolute URLs (fragment removed) that have already been queued or checked. */
    ArrayList previousURLs = new ArrayList();
    ArrayList URLs = new ArrayList();
    /** Prefix every checked URL must start with; anything else is an "outside link". */
    String domain = null;
    /** Maximum number of successfully parsed pages before the run is stopped. */
    int checkNumberMax = 100;
    /** Number of pages parsed successfully so far. */
    int checkNumber = 0;
    /** Lower-case file extensions that are never fetched. Sorted by {@link #init()}. */
    public static String[] skipThese = {"jpg","gif","txt","java","js","desc","pl","cs"};

    /**
     * Creates the non-validating parser, installs the error handler (warnings go to
     * {@code System.err} as UTF-8) and sorts {@link #skipThese} so it can be binary-searched.
     * Setup failures are reported through {@link #formatException} rather than thrown.
     */
    public void init() {
        dbf = DocumentBuilderFactory.newInstance();
        dbf.setValidating(false);

        try {
            documentBuilder = dbf.newDocumentBuilder();
            OutputStreamWriter errorWriter = new OutputStreamWriter(System.err, "UTF-8");
            documentBuilder.setErrorHandler(new MyErrorHandler(new PrintWriter(errorWriter, true)));
        } catch (Exception e) {
            formatException(null, e, "init()");
        }
        // Arrays.binarySearch in check() requires a sorted array.
        Arrays.sort(skipThese);
    }

    /** Builds a checker with its parser ready to use. */
    public SiteChecker() {
        init();
    }

    /**
     * Parses {@code startURL} and recursively checks every not-yet-seen {@code <a href>}
     * target found in it.
     *
     * <p>{@code startURL} must be an absolute URL and must start with {@link #domain},
     * e.g. {@code http://www.fincher.org/} or {@code http://fincher.org/index.shtml};
     * anything else is reported as an outside link and skipped.
     *
     * <p>Terminates the JVM with {@code System.exit(0)} once {@link #checkNumberMax} pages
     * have been parsed successfully.
     *
     * @param startURL absolute URL of the page to check
     * @param level    recursion depth of this page (0 for the start page)
     * @throws Exception declared for compatibility; parse and I/O problems are reported
     *                   on {@code System.out} instead of being propagated
     */
    public void check(String startURL, int level) throws Exception {
        // ">=" so that at most checkNumberMax pages are checked (was off by one with ">").
        if (checkNumber >= checkNumberMax) {
            System.out.println("Exceeded maximum number of checks - " + checkNumberMax);
            System.exit(0);
        }

        System.out.print("checking: \"" + startURL + "\"");
        if (!startURL.startsWith(domain)) {
            System.out.println(" - skipping outside link:");
            return;
        }
        // Remember every page we attempt, good or bad, so it is never fetched twice.
        if (!previousURLs.contains(startURL)) {
            previousURLs.add(startURL);
        }

        try {
            Document doc;
            try {
                doc = documentBuilder.parse(startURL);
            } catch (IOException ioe) {
                System.out.println(" - IOException.  skipping.");
                return;
            }
            System.out.println(" - its ok.");
            checkNumber++;

            URL base = new URL(startURL);
            NodeList anchors = doc.getDocumentElement().getElementsByTagName("a");
            for (int i = 0; i < anchors.getLength(); i++) {
                String href = ((org.w3c.dom.Element) anchors.item(i)).getAttribute("href");
                if (href == null || href.length() == 0) {
                    // Named anchors (<a name="...">) carry no link to follow.
                    continue;
                }

                String target;
                String path;
                try {
                    URL url = new URL(base, href);
                    // A fragment addresses a position inside a page, not a different page.
                    target = stripFragment(url.toString());
                    path = url.getPath();
                } catch (MalformedURLException mue) {
                    // One bad link (e.g. "javascript:...") must not abort the rest of the page.
                    System.out.println("skipping malformed link: \"" + href + "\"");
                    continue;
                }

                // De-duplicate on the resolved URL: the same relative href means different
                // pages depending on where it appears.
                if (previousURLs.contains(target)) {
                    continue;
                }
                previousURLs.add(target);

                String filename = fileNameOf(path);
                String extension = extensionOf(filename);
                if (extension != null && Arrays.binarySearch(skipThese, extension) >= 0) {
                    System.out.println("skipping extension: \"" + filename + "\"");
                    // Skip only this link; the remaining links on the page are still checked.
                    continue;
                }
                check(target, level + 1);
            }
        } catch (Exception e) {
            formatException(null, e, "check(\"" + startURL + "\")");
        }
    }

    /** Returns {@code url} without its {@code #fragment} part, if any. */
    private static String stripFragment(String url) {
        int hash = url.indexOf('#');
        if (hash < 0) {
            return url;
        }
        return url.substring(0, hash);
    }

    /** Returns the last segment of a URL path (text after the final '/'), never null. */
    private static String fileNameOf(String path) {
        if (path == null) {
            return "";
        }
        return path.substring(path.lastIndexOf('/') + 1);
    }

    /** Returns the lower-cased text after the last '.' of a file name, or null if it has no dot. */
    private static String extensionOf(String fileName) {
        int dot = fileName.lastIndexOf('.');
        if (dot < 0) {
            return null;
        }
        return fileName.substring(dot + 1).toLowerCase(Locale.ENGLISH);
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} is the start URL, which is also used as the domain prefix;
     *             optional {@code args[1]} overrides the maximum number of checks
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            System.err.println("usage: java SiteChecker <startURL> [maxChecks]");
            return;
        }
        SiteChecker linkChecker = new SiteChecker();
        linkChecker.domain = args[0];

        try {
            if (args.length > 1) {
                linkChecker.checkNumberMax = Integer.parseInt(args[1]);
                System.out.println("setting checkNumberMax to: " + linkChecker.checkNumberMax);
            }
            linkChecker.check(args[0], 0);
        } catch (Exception e) {
            formatException(null, e, "main()");
        }
    }

    /**
     * Parser error handler: prints warnings to the supplied writer and turns errors and
     * fatal errors into {@link SAXException}s so that parsing of the page stops.
     */
    private static class MyErrorHandler implements ErrorHandler {
        /** Error handler output goes here */
        private final PrintWriter out;

        MyErrorHandler(PrintWriter out) {
            this.out = out;
        }

        /** Formats the location and message of a parse exception on one line. */
        private String getParseExceptionInfo(SAXParseException spe) {
            String systemId = spe.getSystemId();
            if (systemId == null) {
                systemId = "null";
            }
            return "URI=" + systemId
                + " Line=" + spe.getLineNumber()
                + ": " + spe.getMessage();
        }

        /** Prints the warning, except those containing "Using original entity definition". */
        public void warning(SAXParseException spe) throws SAXException {
            String info = getParseExceptionInfo(spe);
            if (info.indexOf("Using original entity definition") == -1) {
                out.println("Warning: " + info);
            }
        }

        public void error(SAXParseException spe) throws SAXException {
            throw new SAXException("Error: " + getParseExceptionInfo(spe));
        }

        public void fatalError(SAXParseException spe) throws SAXException {
            throw new SAXException("Fatal Error: " + getParseExceptionInfo(spe));
        }
    } //inner class

    /**
     * Prints a timestamped report of an exception to {@code System.out}, including its
     * stack trace and, for {@link SAXParseException}s, the line, column and ids.
     *
     * @param obj     object whose class name is printed after the banner; may be null
     * @param e       the exception to report; may be null
     * @param message context describing where the exception was caught
     */
    public static void formatException(Object obj, Exception e, String message) {
        // Banner text is kept unchanged so existing output stays recognisable.
        System.out.print("IQUtil.catchException()*******");
        if (obj != null) {
            System.out.print(obj.getClass().getName());
        }
        System.out.println("" + new java.util.Date() + "\n  " + message + "\n     ****\n  " + e);
        if (e != null) {
            e.printStackTrace(System.out);
        }
        if (e instanceof SAXParseException) {
            SAXParseException spe = (SAXParseException) e;
            System.out.println("SAXParseException: error on line " + spe.getLineNumber() );
            System.out.println("SAXParseException: error in column " + spe.getColumnNumber() );
            System.out.println("getPublicId():" + spe.getPublicId() );
            System.out.println("getMessage():" + spe.getMessage() );
            System.out.println("getSystemId():" + spe.getSystemId() );
            System.out.println("getException() : " + spe.getException());
        }
    }
}
