#!/usr/bin/env ruby
# $Id: foafwhite.rb,v 1.13 2002/06/22 15:15:49 danbri Exp $

# see end of doc for writeup

# 
# <foaf:NonSpamMailboxURI 
#  foaf:sha1Value="bd0e6edf5222a39534e55cc7486e95d4befc67dd"/>
#
# Todo: 
# - load rdf/xml from web
# - check GPG signature?
# - migrate to XSLT RDF parser or talk to Redland parser via real APIs
# - downcase domain names? in --check if not in actual data
#<danbri> I've just removed the downcasing in my code 
#<DanC> you may/should case-fold domain names, but not mailboxes
#<DanC> i.e. turn CONNOLLY@W3.ORG to CONNOLLY@w3.org but not to connolly@w3.org#
# - other databases (blacklist, mid)
# - rdf schema (refine, document)

mylib = ENV['RUBYRDF']
if mylib
  # puts "Using #{mylib} libdir" ## STDERR?
  require mylib+"/basicrdf"
else 
  require '../../basicrdf'	# fix this? (how? env variables?)	
end

require 'sdbm'			# lowest common denominator
require 'sha1'
require 'getoptlong'

# Command-line options understood by this script. Each entry lists the long
# name, an optional short alias, and whether the option takes an argument.
#
# Note: there must be no space between `new` and `(`; with a space, a
# multi-argument list is parsed as a parenthesised expression and rejected
# by current Ruby versions.
opt = GetoptLong.new(
    ['--import', '-i', GetoptLong::NO_ARGUMENT],
    ['--check', '-c', GetoptLong::REQUIRED_ARGUMENT],
    ['--test', '-t', GetoptLong::NO_ARGUMENT],
    ['--verbose', '-v', GetoptLong::NO_ARGUMENT],
    ['--collect', '-f', GetoptLong::OPTIONAL_ARGUMENT],
    ['--diskdb', '-d', GetoptLong::REQUIRED_ARGUMENT],
    ['--lowercase-domains', '-l', GetoptLong::NO_ARGUMENT],
    ['--realwhitelist', '-r', GetoptLong::REQUIRED_ARGUMENT],
    ['--scrape', '-s', GetoptLong::OPTIONAL_ARGUMENT],
    ['--export', '-x', GetoptLong::OPTIONAL_ARGUMENT],
    # The option loop below has a '--version' branch; it has to be declared
    # here or GetoptLong rejects it as an unknown option.
    ['--version', GetoptLong::NO_ARGUMENT])

#note: --lc is unused. should be used with --check
 
# A whitelist (including on-disk storage)
#
# A whitelist of mailboxes, including its on-disk storage.
#
# Mailboxes are looked up by the hex SHA1 digest of their URI
# ("mailto:" + address); the digest is the key in an SDBM file.
#
# Attributes:
#   verbose        - print diagnostics while working
#   diskdb         - path of the SDBM store
#   realwhitelist  - path of the plain-text whitelist (one mailbox per line)
#   lowercase      - when true, checkMailbox lowercases the mailbox before
#                    hashing; realToGarbledXML always hashes lowercased
#                    mailboxes, so this makes checks against exported data
#                    case-insensitive
#
class FOAFWhiteList

  # Opening of the RDF/XML document produced by realToGarbledXML.
  RDF_HEADER = '<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"' + "\n" +
               '  xmlns:foaf="http://xmlns.com/foaf/0.1/">' + "\n"

  # Closing of the RDF/XML document produced by realToGarbledXML.
  RDF_FOOTER = "\n</rdf:RDF>\n\n"

  attr_accessor :verbose, :diskdb, :realwhitelist, :lowercase

  def initialize
    @verbose = false
    @diskdb = './data/whitelist'
    # Fall back to the current directory when HOME is not set.
    @realwhitelist = (ENV['HOME'] || '.') + "/.whitelist" #todo
    @lowercase = false
  end

  # Check whether a mailbox is present in the on-disk store.
  #
  # mbox - mailbox address, with or without a leading "mailto:".
  #        The argument itself is never modified.
  #
  # Returns true when the SHA1 of the mailbox URI is a key in the store,
  # false otherwise.
  def checkMailbox(mbox)
    puts "Checking for mbox: #{mbox}" if @verbose
    mbox = mbox.downcase if @lowercase
    mbox = "mailto:" + mbox unless mbox =~ /^mailto:/ # too gentle?
    garbled = sha1_hex(mbox)
    puts "Garbled version of '#{mbox}' is: #{garbled}" if @verbose
    puts "Connecting to disk db: this.diskdb=#{@diskdb}" if @verbose
    disk = SDBM::new(@diskdb)
    begin
      disk[garbled] ? true : false
    ensure
      disk.close # always release the handle, even if the lookup fails
    end
  end

  # Build our on-disk store from RDF input.
  # todo:
  # - more flexibility about how we get the RDF
  # - GPG checking (where does that belong?)
  #
  # diskdb - path of the SDBM store to fill; defaults to the configured
  #          diskdb attribute.
  #
  # Every subject typed foaf:NonSpamMailboxURI contributes its first
  # foaf:sha1Value (double quotes removed) as a key with the value '1'.
  # Subjects without a sha1 value are reported and skipped.
  # Returns nil.
  def buildDiskDB(diskdb=@diskdb)
    db = Loader.nt2graph()		# N-Triples from STDIN (bit rough...)
    foaf = db.reg_xmlns 'http://xmlns.com/foaf/0.1/', 'foaf'
    rdf = db.reg_xmlns 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'rdf'
    foaf_NonSpamMailboxURI = Node.getResource(foaf+'NonSpamMailboxURI')
    rdf_type = Node.getResource rdf+"type"

    disk = SDBM::new(diskdb)
    begin
      # query: the RDF API needs beautifying
      query = Statement.new(nil, rdf_type, foaf_NonSpamMailboxURI)
      db.ask(query).subjects.each do |mb|
        puts "Got mb: #{mb} " #if @verbose

        # Non-destructive gsub: gsub! returns nil when nothing was replaced.
        garbled = mb.foaf_sha1Value()[0].to_s.gsub(/"/, "")
        if garbled.empty?
          puts "No sha1 value" ##todo: this shouldn't happen
          next
        end

        if disk[garbled]
          puts "#{garbled} already in file: #{diskdb}" if @verbose
        else
          disk[garbled] = '1' # DBM values are strings
          puts "stored #{garbled} in whitelist file: #{diskdb} " if @verbose
        end
      end
    ensure
      disk.close
    end
    nil
  end

  # Turn a text file whitelist into RDF.
  # todo: -more flexibility (eg. sources other than plain text,
  # such as sent-mail, textfiles, imap. (need this for --scrape)
  #
  # Reads the file named by the realwhitelist attribute, one mailbox per
  # line. Each line is stripped of surrounding whitespace and lowercased,
  # then hashed as "mailto:" + mailbox. Blank lines are ignored.
  #
  # Returns the RDF/XML document as a String.
  def realToGarbledXML
    entries = []
    File.open(@realwhitelist) do |f|
      f.each do |line|
        mbox = line.strip.downcase
        next if mbox.empty?
        garbled = sha1_hex("mailto:" + mbox)
        entries << "<foaf:NonSpamMailboxURI foaf:sha1Value=\"#{garbled}\"/>\n"
      end
    end
    RDF_HEADER + entries.join + RDF_FOOTER
  end

  # Collect every address written as <address> in a list of recipient lines.
  #
  # to_list - path of a text file to scan; nil scans the 'To:' lines of
  #           mail/sent-mail; an empty string reads the lines from standard
  #           input.
  #
  # Returns a Hash whose keys are the addresses found (each value is 1).
  def allRecipients(to_list)
    addresses = {}
    recipient_text(to_list).each_line do |line|
      line.chomp.scan(/<([^>]*)>/) { |captured| addresses[captured[0]] = 1 }
    end
    addresses
  end

  private

  # Hex-encoded SHA1 digest of text.
  def sha1_hex(text)
    SHA1::new(text).digest.unpack('H*')[0]
  end

  # Raw text that allRecipients scans. An unreadable file is reported on
  # standard error and treated as empty.
  def recipient_text(to_list)
    return `grep '^To:' mail/sent-mail| sort | uniq` if to_list.nil?
    return $stdin.read.to_s if to_list.empty?
    begin
      # Read the file directly; the name never passes through a shell.
      File.open(to_list) { |f| f.read }
    rescue SystemCallError => e
      $stderr.puts "#{to_list}: #{e.message}"
      ''
    end
  end

end

## Temporary testing stuff. 
## RUnit probably better bet.

# Tiny test helper: prints "pass" when bool is exactly false; otherwise
# prints "fail" followed by a line describing the failed assertion.
#
# testname - label shown in the failure line
# bool     - value under test
# msg      - explanation shown in the failure line
#
# Returns msg in either case.
def assert_false(testname, bool, msg)
  passed = (bool == false)
  puts(passed ? "pass" : "fail")
  puts "\tasserting false: #{testname} value: #{bool} msg: #{msg} \n" unless passed
  msg
end

# Tiny test helper: prints "pass" when bool is exactly true; otherwise
# prints "fail" followed by a line describing the failed assertion.
#
# testname - label shown in the failure line
# bool     - value under test
# msg      - explanation shown in the failure line
#
# Returns msg in either case.
def assert_true(testname, bool, msg)
  passed = (bool == true)
  puts(passed ? "pass" : "fail")
  puts "\tasserting true: #{testname} value: #{bool} msg: #{msg} \n" unless passed
  msg
end

#########################################################################
#
# scripting stuff here
# 
# FOAFWhite and the seven spammers...

wl = FOAFWhiteList::new

#puts "use --help for commandline options" if (opt.empty?)
#todo

# Options are handled one at a time, in command-line order, so settings
# such as --verbose, --diskdb and --realwhitelist only affect the actions
# that come after them.
opt.each_option do |name, arg|

  case name

  when '--verbose'
    wl.verbose = true
    puts "running in verbose mode"

  when '--diskdb'
    wl.diskdb = arg

  when '--realwhitelist'
    wl.realwhitelist = arg
    #puts "using real whitelist #{arg}"

  when '--import'
    # assumes stream of N-Triples on STDIN.
    # The store path is passed explicitly so that --diskdb is honoured.
    wl.buildDiskDB(wl.diskdb)

  # the ultimate goal! check the mailbox
  # (how do we exit true or false?)
  when '--check'
    wl.verbose = false

    puts "checking. currently:  wl.diskdb=#{wl.diskdb}"
    puts "#{wl.checkMailbox(arg)}"

  when '--export'
    puts wl.realToGarbledXML

  # this is ugly as hell, calls itself as a script, runs shell crap
  # instead of using APIs, and is dependent on Redland's rdfdump
  # parser.
  # However it does mean we can delete the Makefile and start to
  # treat this as a self-contained utility.
  when '--collect'
    puts "Collecting RDF from the Web"
    puts `rm -f foafwhite.xml`
    puts `rm -rf data/whitelist.*`
    puts `GET http://tux.w3.org/~danbri/rdfweb/foafwhite.xml>foafwhite.xml`
    puts `rdfdump -o ntriples file:foafwhite.xml | ./foafwhite.rb --import`

  when '--test'
    puts "Tests...\n\n"
    bogus_msg = "unknown mailboxes shouldn't match"
    known_msg = "known mailboxes should match"

    evil = "evil-danbri@rdfweb.org"
    danbri1 = "danbri@rdfweb.org"
    danbri_mailto = "mailto:danbri@rdfweb.org"
    as = "mailto:aswartz@swartzfam.com"
    dANBRI = "mailto:DANBRI@w3.ORG"

    assert_false(evil, wl.checkMailbox(evil), bogus_msg)
    assert_true(danbri1, wl.checkMailbox(danbri1), known_msg)
    assert_true(danbri_mailto, wl.checkMailbox(danbri_mailto), known_msg)
    assert_true(as, wl.checkMailbox(as), known_msg)
    assert_true(dANBRI, wl.checkMailbox(dANBRI), "case shouldn't matter")

    #see http://www.eng.dmu.ac.uk/~hgs/ruby/ruby-unit.html
    #...for RubyUnit docs:

    # should use this instead:
    #http://testunit.talbott.ws/doc/index.html

    # NOTE(review): nothing below uses RubyUnit yet; these requires raise
    # LoadError at the end of --test when it is not installed.
    require "runit/testcase"
    require 'runit/cui/testrunner'
    require 'runit/testsuite'

  when '--scrape'
    puts "Extracting mailboxes"
    all = wl.allRecipients(arg)
    all.each_key { |mb| puts mb } if all
    # puts "scraped: #{all.inspect}"

  when '--version'
    puts "version $Id: foafwhite.rb,v 1.13 2002/06/22 15:15:49 danbri Exp $ \n"

  end

end
## --sentmail-extract
## in progress (should use IMAP?)
##allRecipients.each_key do |mb| puts mb end   

########################################################################
#
# RDF API notes
#
# (things I learned about this so-called API by trying to use it for real)
#
# trim '"' from return values (in .to_s for node?)
# clarify genid behaviour (reminder to use nils)
# use iterators with method_missing; return single value otherwise
# can nodes have multiple URIrefs? (don't see why not... but complicates)


=begin

This is Ruby code to produce and consume RDF documents that describe 
'whitelist's of internet mailbox URIs that represent 'known senders'
of non-spam email. This data can be used to support spam filtering apps.
To offer some degree of privacy and spam-protection, the RDF/XML data
format used does not reveal the full mailboxes. Instead, a sha1 of the
(typically 'mailto:') mailbox URI is used. 

This technique does not (as far as I can see) help deal with spam that 
fakes a 'respectable' From: field. This may become increasingly
problematic.

The tools here provide pretty basic facilities for loading RDF/XML
from STDIN (in N-Triples format currently, so external RDF parser
needed). The un-scrambled whitelist file is typically stored as a list
of mailboxes in .whitelist. The on-disk store derived from the
RDF/XML uses SDBM, and should be fast to check from procmail-based
scripts 
(@@todo).

Example Usage 

Noting that this could be done more gracefully. But that's no excuse
for not documenting current workings:

First we might be starting with a file full of messy 'To:' lines 
grepped from a mail folder:

	./foafwhite.rb --scrape example1.txt > test.txt

This normalises lines such as: 
		 Dan Brickley <Daniel.Brickley@bristol.ac.uk>
     into:	 Daniel.Brickley@bristol.ac.uk
     (@@todo: should lowercase at this stage)

We might have other plain text whitelists (eg. from addressbooks, or Gerald's
'atw' utility). We treat them all as one-mailbox per line text files,
and currently can only convert these into RDF/XML for loading, rather
than use them directly.

So, next we convert this 'whitelist' file into RDF/XML:

	./foafwhite.rb --realwhitelist test.txt --export > test.xml

Eyes on the prize: we want to be able to check to see if the hash of 
some (lowercased) mailbox is known to our on disk store.

Sanity check:
        ./foafwhite.rb --check daniel.brickley@bristol.ac.uk
	false

(assuming the on-disk store is empty or at least hasn't heard of this 
mailbox yet)

We then import into our on-disk database (faster for lookups)

not yet:
   ./foafwhite.rb --collect test.xml
   (this calls an RDF parser and (@@fixme) invokes itself with --import)
   (***bug: no it doesn't, currently URI and filename is hardcoded)

but this works:
rdfdump -o ntriples file:localcopyofrdfwhitelist.xml | ./foafwhite.rb --import

Now we can run checks. 

Note that loading more RDF can be done at any time. Data is stored on
disk as key/value pairs in a DBM file. Currently the key is the sha1
hash of lowercased mailbox URI, the value is simply a '1'. An obvious
next step would be to track provenance information so the store can be
treated as a database instead of a melting pot, ie. we'd like it to
know which whitelist ack'd which mailbox. Maybe using a generic RDF
store would be worthwhile (though it might make lookups slower). Hiding
the optimised store behind an RDF API is probably worth doing.

Anyway, checks:

	./foafwhite.rb --check daniel.brickley@bristol.ac.UK
	true

	./foafwhite.rb --check
	daniel.brickley@trueloveandamilliondollars.spambot.example.com
	false

Next: fix up return values so this works for procmail etc



danbri@w3.org

=end

