/**
 * Author: Ted Guild <ted@w3.org> 
 * (c) COPYRIGHT W3C http://www.w3.org/Consortium/Legal/copyright-software
 * $Id: BlacklistChecker.java,v 1.5 2006/08/25 17:12:20 ted Exp $
 */

package org.w3c.app.util;

import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.regexp.RE;
import org.apache.regexp.RESyntaxException;

/**
 * Checks the hosts of every http/https URI found in a (possibly compound)
 * URI against a local whitelist, a local blacklist and the SURBL DNS list.
 *
 * <p>The three list files are optional; a file that cannot be read is
 * reported on stderr and leaves the corresponding list unchanged. Host names
 * are compared case-insensitively (list entries and looked-up names are
 * lower-cased and stripped of surrounding whitespace and one trailing dot).
 *
 * <p>The IPv4 pattern is an immutable, shared {@link Pattern}; each check
 * uses its own {@link Matcher}, so concurrent calls to
 * {@link #checkURI(String)} do not share regex match state.
 */
public class BlacklistChecker {

    private static final String WHITELIST = "/usr/local/etc/surbl.whitelist";
    private static final String BLACKLIST = "/usr/local/etc/xslt.blacklist";
    private static final String surblHostString = ".multi.surbl.org";
    //routinely update from http://spamcheck.freeapp.net/two-level-tlds
    private static final String TWOLEVELTLDS = "/usr/local/etc/two-level-tlds";

    // One decimal octet, 0-255 (one to three digits, no leading zero on three digits).
    private static final String OCTET = "(25[0-5]|2[0-4]\\d|1\\d\\d|\\d\\d|\\d)";
    // Used with Matcher.matches(), so the WHOLE host must be a dotted quad.
    private static final Pattern IPV4_ADDRESS =
        Pattern.compile(OCTET + "\\." + OCTET + "\\." + OCTET + "\\." + OCTET);

    private final Vector whitehats = new Vector();
    private final Vector blackhats = new Vector();
    private final Vector twoLevelTLDs = new Vector();

    /**
     * Creates a checker and reads the list files once. Construction touches
     * the file system, so create the instance once (e.g. in servlet init)
     * rather than per request.
     */
    public BlacklistChecker() {
        loadLists();
    }

    /**
     * (Re)loads the whitelist, blacklist and two-level-TLD files. Can be
     * called to pick up edited files without restarting the servlet engine.
     * Each list is replaced by the new file contents as a whole; if a file
     * is missing or unreadable the previous contents of that list are kept.
     */
    public void loadLists() {
        loadList(WHITELIST, whitehats);
        loadList(BLACKLIST, blackhats);
        loadList(TWOLEVELTLDS, twoLevelTLDs);
    }

    // Reads one list file (skipping blank lines and '#' comments) and swaps it into bucket.
    private void loadList(String listFile, Vector bucket) {
        Vector fresh = new Vector();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(listFile));
            String line;
            while ((line = br.readLine()) != null) {
                String entry = normalize(line);
                if (entry.length() > 0 && !entry.startsWith("#")) {
                    fresh.add(entry);
                }
            }
        }
        //the lists are optional, so a missing/unreadable file is only reported
        //(FileNotFoundException is an IOException)
        catch (IOException e) {
            System.err.println("exception " + e);
            return;
        }
        finally {
            closeQuietly(br);
        }
        // Vector's own methods lock on the Vector, so holding that lock makes
        // the clear+addAll pair atomic with respect to contains().
        synchronized (bucket) {
            bucket.clear();
            bucket.addAll(fresh);
        }
    }

    // Closes a reader, ignoring close failures (nothing useful can be done about them).
    private static void closeQuietly(Reader reader) {
        if (reader != null) {
            try {
                reader.close();
            }
            catch (IOException ignored) {
                // best effort
            }
        }
    }

    // Canonical form used for all list entries and lookups: trimmed, lower-case, no trailing dot.
    private static String normalize(String name) {
        String result = name.trim().toLowerCase(Locale.ENGLISH);
        if (result.endsWith(".")) {
            result = result.substring(0, result.length() - 1);
        }
        return result;
    }

    /**
     * Returns the octets of an IPv4 literal in reverse order (the form used
     * for DNS list lookups), or null when host is not exactly a dotted quad.
     * Package-private so it can be unit tested without network access.
     */
    static String reverseIPv4(String host) {
        Matcher ip = IPV4_ADDRESS.matcher(host);
        if (!ip.matches()) {
            return null;
        }
        return ip.group(4) + "." + ip.group(3) + "." + ip.group(2) + "." + ip.group(1);
    }

    /**
     * Unravels URIs contained in query strings: URL-decodes the input once
     * and returns, for every occurrence of "http:" or "https:" (any case),
     * the remainder of the decoded string starting at that occurrence.
     *
     * @param uri the URI to scan; null yields an empty result
     * @return Vector of String, in order of appearance, possibly empty
     * @throws UnsupportedEncodingException declared by URLDecoder for the
     *         encoding name; callers treat it as a reason to reject
     * @throws IllegalArgumentException if uri contains malformed
     *         percent-escapes (thrown by URLDecoder)
     */
    public Vector parseURIs(String uri) throws UnsupportedEncodingException {
        Vector URIs = new Vector();
        if (uri == null) {
            return URIs;
        }
        String compoundURI = URLDecoder.decode(uri, "UTF-8");
        int length = compoundURI.length();
        for (int i = 0; i < length; i++) {
            // regionMatches(ignoreCase) avoids lower-casing the whole string,
            // which could shift indexes for some characters
            if (compoundURI.regionMatches(true, i, "http:", 0, 5)
                || compoundURI.regionMatches(true, i, "https:", 0, 6)) {
                URIs.add(compoundURI.substring(i));
            }
        }
        return URIs;
    }

    /**
     * Checks every URI found by {@link #parseURIs(String)}.
     *
     * @param uri the (possibly compound) URI to check
     * @return null if nothing objectionable was found; otherwise a short
     *         reason naming the offending host/domain and the list that
     *         matched, or describing why the URI could not be parsed
     */
    public String checkURI(String uri) {
        Vector URIs;
        try {
            URIs = parseURIs(uri);
        }
        catch (UnsupportedEncodingException e) {
            return "uri encoding scheme suspect, unsupported";
        }
        catch (IllegalArgumentException e) {
            // broken percent-escapes: reject instead of letting a runtime exception escape
            return "malformed uri";
        }
        try {
            for (Iterator walker = URIs.iterator(); walker.hasNext();) {
                String host = normalize(new URL((String) walker.next()).getHost());
                if (host.length() == 0) {
                    // no authority part, nothing to look up
                    continue;
                }
                if (isWhitelisted(host)) {
                    continue;
                }
                if (isBlacklisted(host)) {
                    return host + " is locally blacklisted";
                }
                String arpa = reverseIPv4(host);
                if (arpa != null) {
                    if (surblBlacklistedDomainIP(arpa + surblHostString)) {
                        return host + " blacklisted by http://www.surbl.org";
                    }
                }
                else {
                    String domain = getDomain(host);
                    if (isBlacklisted(domain)) {
                        return domain + " is locally blacklisted";
                    }
                    if (isWhitelisted(domain)) {
                        continue;
                    }
                    if (surblBlacklistedDomainIP(domain + surblHostString)) {
                        return domain + " blacklisted by http://www.surbl.org";
                    }
                }
            }
        }
        catch (MalformedURLException e) {
            return "malformed uri";
        }
        return null;
    }

    /**
     * Resolves the given name and treats any successful resolution as
     * "listed". It is recommended to have (properly configured) caching DNS
     * for efficiency.
     *
     * @param domain_or_ip the full name to query, list suffix included
     * @return true if the name resolves, false on UnknownHostException
     */
    public boolean surblBlacklistedDomainIP(String domain_or_ip) {
        try {
            InetAddress.getAllByName(domain_or_ip);
        }
        catch (UnknownHostException e) {
            return false;
        }
        return true;
    }

    /**
     * Reduces a host name to its last two labels, or its last three when the
     * last two form an entry of the two-level-TLD list (matched
     * case-insensitively). Hosts with fewer labels are returned unchanged.
     *
     * @param host host name without a trailing dot; must not be null
     * @return the trailing part of host as described above
     */
    public String getDomain(String host) {
        int lastDot = host.lastIndexOf('.');
        int secondDot = host.lastIndexOf('.', lastDot - 1);
        if (secondDot < 0) {
            //only one or two segments, as in example.org
            return host;
        }
        String domain = host.substring(secondDot + 1);
        if (twoLevelTLDs.contains(domain.toLowerCase(Locale.ENGLISH))) {
            // keep one more label; lastIndexOf yields -1 when there is none,
            // which makes substring(0) return the whole host
            domain = host.substring(host.lastIndexOf('.', secondDot - 1) + 1);
        }
        return domain;
    }

    /**
     * @param domain host or domain name; null is never whitelisted
     * @return true if the name (case-insensitively) is in the whitelist
     */
    public boolean isWhitelisted(String domain) {
        return domain != null && whitehats.contains(normalize(domain));
    }

    /**
     * @param domain host or domain name; null is never blacklisted
     * @return true if the name (case-insensitively) is in the local blacklist
     */
    public boolean isBlacklisted(String domain) {
        return domain != null && blackhats.contains(normalize(domain));
    }

    /**
     * Command line entry point: checks every non-comment line of the file
     * named by the first argument and prints the reason for each rejected URI.
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            System.out.println("usage: java org.w3c.app.util.BlacklistChecker file_of_uris");
            // stop here; there is no file to open
            return;
        }
        BufferedReader testURIs = null;
        try {
            BlacklistChecker blc = new BlacklistChecker();
            String line;
            System.out.println("Opening file");
            testURIs = new BufferedReader(new FileReader(args[0]));
            while ((line = testURIs.readLine()) != null) {
                if (!line.startsWith("#")) {
                    String result = blc.checkURI(line);
                    if (result != null) {
                        System.out.println(result);
                    }
                }
            }
        }
        catch (Exception e) {
            System.out.println("Exception " + e);
            e.printStackTrace();
        }
        finally {
            closeQuietly(testURIs);
        }
    }

}
