#!/usr/bin/env ruby
# $Id: foafwhite.rb,v 1.13 2002/06/22 15:15:49 danbri Exp $
# see end of doc for writeup
#
# Todo:
#  - load rdf/xml from web
#  - check GPG signature?
#  - migrate to XSLT RDF parser or talk to Redland parser via real APIs
#  - downcase domain names? in --check if not in actual data
#      # I've just removed the downcasing in my code
#      # you may/should case-fold domain names, but not mailboxes
#      # i.e. turn CONNOLLY@W3.ORG to CONNOLLY@w3.org but not to connolly@w3.org
#  - other databases (blacklist, mid)
#  - rdf schema (refine, document)

require 'digest/sha1'
require 'getoptlong'

# The RDF library (basicrdf) and SDBM are loaded lazily by the methods that
# need them (see FOAFWhiteList#load_rdf_library and #with_disk_db), so that
# --scrape and --export keep working on machines without them.

# A whitelist (including on-disk storage)
#
# Mailboxes are never stored in the clear: the on-disk SDBM file is keyed by
# the hex SHA1 of the mailbox URI (typically "mailto:...").
class FOAFWhiteList
  FOAF_NS = 'http://xmlns.com/foaf/0.1/'
  RDF_NS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
  # Mail folder scanned by allRecipients when no file is given.
  DEFAULT_SENT_MAIL = 'mail/sent-mail'

  attr_accessor :verbose, :diskdb, :realwhitelist

  def initialize
    @verbose = false
    @diskdb = './data/whitelist'
    # Fall back to the current directory when HOME is not set instead of
    # crashing on nil + String.
    @realwhitelist = "#{ENV.fetch('HOME', '.')}/.whitelist"
  end

  # Is +mbox+ (with or without a leading "mailto:") in the on-disk whitelist?
  #
  # Both the mailbox exactly as given and its lowercased form are looked up:
  # --export hashes lowercased mailboxes, while whitelists from elsewhere may
  # not have been case-folded. The argument is not modified.
  #
  # Returns true or false.
  def checkMailbox(mbox)
    puts "Checking for mbox: #{mbox}" if @verbose
    uris = [mbox, mbox.downcase].map { |candidate| mailbox_uri(candidate) }.uniq
    keys = uris.map do |uri|
      garbled = Digest::SHA1.hexdigest(uri)
      puts "Garbled version of '#{uri}' is: #{garbled}" if @verbose
      garbled
    end
    puts "Connecting to disk db: this.diskdb=#{@diskdb}"
    with_disk_db(@diskdb) { |disk| keys.any? { |key| disk.key?(key) } }
  end

  # build our on-disk store from RDF input
  # todo:
  #  - more flexibility about how we get the RDF
  #  - GPG checking (where does that belong?)
  #
  # Reads N-Triples from STDIN and stores the foaf:sha1Value of every
  # resource typed foaf:NonSpamMailboxURI as a key in the SDBM file +diskdb+
  # (defaults to the instance's diskdb, so --diskdb is honoured by --import).
  def buildDiskDB(diskdb = @diskdb)
    load_rdf_library
    db = Loader.nt2graph() # N-Triples from STDIN (bit rough...)
    foaf = db.reg_xmlns FOAF_NS, 'foaf'
    rdf = db.reg_xmlns RDF_NS, 'rdf'
    foaf_NonSpamMailboxURI = Node.getResource(foaf + 'NonSpamMailboxURI')
    rdf_type = Node.getResource(rdf + 'type')

    with_disk_db(diskdb) do |disk|
      # query: the RDF API needs beautifying
      db.ask(Statement.new(nil, rdf_type, foaf_NonSpamMailboxURI)).subjects.each do |mb|
        # The API may hand literals back wrapped in double quotes (see the
        # API notes at the end of the file); a hex digest never contains one.
        garbled = mb.foaf_sha1Value()[0].to_s.delete('"')
        if garbled.empty?
          puts "No sha1 value" # todo: this shouldn't happen
          next
        end
        if disk.key?(garbled)
          puts "#{garbled} already in file: #{diskdb}" if @verbose
        else
          disk[garbled] = '1' # SDBM values must be strings
          puts "stored #{garbled} in whitelist file: #{diskdb} " if @verbose
        end
      end
    end
  end

  # turn a text file whitelist into RDF
  # todo: -more flexibility (eg. sources other than plain text,
  #        such as sent-mail, textfiles, imap. (need this for --scrape)
  #
  # Reads one mailbox per line from the file named by realwhitelist,
  # lowercases it and emits one element per mailbox carrying the hex SHA1 of
  # its mailto: URI. Blank lines are skipped. Returns the RDF/XML as a String.
  #
  # NOTE(review): the element and attribute names mirror what buildDiskDB
  # queries for (foaf:NonSpamMailboxURI / foaf:sha1Value); confirm them
  # against a published foafwhite.xml.
  def realToGarbledXML
    rdf = String.new(%(<rdf:RDF xmlns:rdf="#{RDF_NS}"\n         xmlns:foaf="#{FOAF_NS}">\n))
    File.foreach(@realwhitelist) do |line|
      mbox = line.chomp.downcase
      next if mbox.strip.empty?

      garbled = Digest::SHA1.hexdigest(mailbox_uri(mbox))
      rdf << %(  <foaf:NonSpamMailboxURI foaf:sha1Value="#{garbled}"/>\n)
    end
    rdf << "</rdf:RDF>\n\n\n"
    rdf
    # File.open(foafdata,'w') do |f|
    #   f.write rdf
    # end
  end

  # Collect every <address> found between angle brackets.
  #
  # to_list:: path of a text file to scan, or nil to scan the unique "To:"
  #           lines of mail/sent-mail.
  #
  # Returns a Hash whose keys are the addresses (values are always 1). An
  # unreadable source is reported on STDERR and yields an empty Hash.
  def allRecipients(to_list)
    addresses = {}
    recipient_lines(to_list).each do |line|
      line.scan(/<([^>]*)>/).each { |match| addresses[match.first] = 1 }
    end
    addresses
  end

  private

  # Prefix +mbox+ with "mailto:" unless it already has it. (too gentle?)
  def mailbox_uri(mbox)
    mbox.start_with?('mailto:') ? mbox : "mailto:#{mbox}"
  end

  # Open the SDBM file at +path+, yield it, and always close it again.
  # Returns the value of the block.
  def with_disk_db(path)
    require 'sdbm' # lowest common denominator
    disk = SDBM.new(path)
    begin
      yield disk
    ensure
      disk.close
    end
  end

  # Load the RDF library, from $RUBYRDF when set.
  def load_rdf_library
    mylib = ENV['RUBYRDF']
    # puts "Using #{mylib} libdir" ## STDERR?
    if mylib
      require mylib + '/basicrdf'
    else
      require '../../basicrdf' # fix this? (how? env variables?)
    end
  end

  # Lines to scan for addresses: the whole of +to_list+ when given, else the
  # sorted, de-duplicated "To:" lines of the sent-mail folder.
  def recipient_lines(to_list)
    path = to_list || DEFAULT_SENT_MAIL
    unless File.file?(path) && File.readable?(path)
      warn "#{path}: cannot read"
      return []
    end
    # scrub: mail folders routinely contain bytes that are not valid UTF-8.
    lines = File.readlines(path).map(&:scrub)
    to_list ? lines : lines.grep(/^To:/).sort.uniq
  end
end

## Temporary testing stuff.
## RUnit probably better bet.

# Print "pass" when +bool+ is exactly false, otherwise "fail" plus details.
# Returns +msg+.
def assert_false(testname, bool, msg)
  if bool == false
    puts "pass"
  else
    puts "fail"
    puts "\tasserting false: #{testname} value: #{bool} msg: #{msg} \n"
  end
  msg
end

# Print "pass" when +bool+ is exactly true, otherwise "fail" plus details.
# Returns +msg+.
def assert_true(testname, bool, msg)
  if bool == true
    puts "pass"
  else
    puts "fail"
    puts "\tasserting true: #{testname} value: #{bool} msg: #{msg} \n"
  end
  msg
end

#########################################################################
#
# scripting stuff here
#
# FOAFWhite and the seven spammers...

opt = GetoptLong.new(
  ['--import', '-i', GetoptLong::NO_ARGUMENT],
  ['--check', '-c', GetoptLong::REQUIRED_ARGUMENT],
  ['--test', '-t', GetoptLong::NO_ARGUMENT],
  ['--verbose', '-v', GetoptLong::NO_ARGUMENT],
  ['--collect', '-f', GetoptLong::OPTIONAL_ARGUMENT],
  ['--diskdb', '-d', GetoptLong::REQUIRED_ARGUMENT],
  ['--lowercase-domains', '-l', GetoptLong::NO_ARGUMENT],
  ['--realwhitelist', '-r', GetoptLong::REQUIRED_ARGUMENT],
  ['--scrape', '-s', GetoptLong::OPTIONAL_ARGUMENT],
  ['--export', '-x', GetoptLong::OPTIONAL_ARGUMENT],
  ['--version', GetoptLong::NO_ARGUMENT]
)
# note: --lowercase-domains is accepted but unused; --check already tries the
# lowercased form of the mailbox.

wl = FOAFWhiteList::new

#puts "use --help for commandline options" if (opt.empty?) #todo

opt.each_option do |name, arg|
  case name
  when '--verbose'
    wl.verbose = true
    puts "running in verbose mode"

  when '--diskdb'
    wl.diskdb = arg

  when '--realwhitelist'
    wl.realwhitelist = arg
    #puts "using real whitelist #{arg}"

  when '--import'
    wl.buildDiskDB # assumes stream of N-Triples on STDIN

  # the ultimate goal! check the mailbox
  # (how do we exit true or false?)
  when '--check'
    wl.verbose = false
    puts "checking. currently: wl.diskdb=#{wl.diskdb}"
    puts "#{wl.checkMailbox(arg)}"

  when '--export'
    puts wl.realToGarbledXML

  # this is ugly as hell, calls itself as a script, runs shell crap
  # instead of using APIs, and is dependent on Redland's rdfdump
  # parser.
  # However it does mean we can delete the Makefile and start to
  # treat this as a self-contained utility.
  when '--collect'
    puts "Collecting RDF from the Web"
    puts `rm -f foafwhite.xml`
    puts `rm -rf data/whitelist.*`
    puts `GET http://tux.w3.org/~danbri/rdfweb/foafwhite.xml>foafwhite.xml`
    puts `rdfdump -o ntriples file:foafwhite.xml | ./foafwhite.rb --import`

  when '--test'
    puts "Tests...\n\n"
    bogus_msg = "unknown mailboxes shouldn't match"
    known_msg = "known mailboxes should match"
    evil = "evil-danbri@rdfweb.org"
    danbri1 = "danbri@rdfweb.org"
    danbri_mailto = "mailto:danbri@rdfweb.org"
    as = "mailto:aswartz@swartzfam.com"
    dANBRI = "mailto:DANBRI@w3.ORG"
    assert_false(evil, wl.checkMailbox(evil), bogus_msg)
    assert_true(danbri1, wl.checkMailbox(danbri1), known_msg)
    assert_true(danbri_mailto, wl.checkMailbox(danbri_mailto), known_msg)
    assert_true(as, wl.checkMailbox(as), known_msg)
    assert_true(dANBRI, wl.checkMailbox(dANBRI), "case shouldn't matter")

    #see http://www.eng.dmu.ac.uk/~hgs/ruby/ruby-unit.html
    #...for RubyUnit docs:
    # should use this instead:
    #http://testunit.talbott.ws/doc/index.html
    begin
      require "runit/testcase"
      require 'runit/cui/testrunner'
      require 'runit/testsuite'
    rescue LoadError
      # RubyUnit is not used by the checks above; its absence must not
      # abort the run.
    end

  when '--scrape'
    puts "Extracting mailboxes"
    # GetoptLong passes "" (not nil) when the optional argument is omitted.
    all = wl.allRecipients(arg.to_s.empty? ? nil : arg)
    all.each_key { |mb| puts mb }
    # puts "scraped: #{all.inspect}"

  when '--version'
    puts "version $Id: foafwhite.rb,v 1.13 2002/06/22 15:15:49 danbri Exp $ \n"
  end
end

## --sentmail-extract
## in progress (should use IMAP?)
##allRecipients.each_key do |mb| puts mb end

########################################################################
#
# RDF API notes
#
# (things I learned about this so-called API by trying to use it for real)
#
# trim '"' from return values (in .to_s for node?)
# clarify genid behaviour (reminder to use nils)
# use iterators with method_missing; return single value otherwise
# can nodes have multiple URIrefs? (don't see why not... but complicates)

=begin

This is Ruby code to produce and consume RDF documents that describe
'whitelist's of internet mailbox URIs that represent 'known senders' of
non-spam email. This data can be used to support spam filtering apps.

To offer some degree of privacy and spam-protection, the RDF/XML data
format used does not reveal the full mailboxes. Instead, a sha1 of the
(typically 'mailto:') mailbox URI is used.

This technique does not (as far as I can see) help deal with spam that
fakes a 'respectable' From: field. This may become increasingly
problematic.

The tools here provide pretty basic facilities for loading RDF/XML from
STDIN (in N-Triples format currently, so external RDF parser needed). The
un-scrambled whitelist file is typically stored as a list of mailboxes in
.whitelist. The on-disk store derived from the RDF/XML uses SDBM, and
should be fast to check from procmail-based scripts (@@todo).


Example Usage

Noting that this could be done more gracefully. But that's no excuse for
not documenting current workings:

First we might be starting with a file full of messy 'To:' lines greped
from a mail folder:

  ./foafwhite.rb --scrape example1.txt > test.txt

(with no file argument, --scrape reads the 'To:' lines of mail/sent-mail)

This normalises lines such as:

  Dan Brickley <Daniel.Brickley@bristol.ac.uk>

into:

  Daniel.Brickley@bristol.ac.uk

(@@todo: should lowercase at this stage)

We might have other plain text whitelists (eg. from addressbooks, or
Gerald's 'atw' utility). We treat them all as one-mailbox per line text
files, and currently can only convert these into RDF/XML for loading,
rather than use them directly.

So, next we convert this 'whitelist' file into RDF/XML

  ./foafwhite.rb --realwhitelist test.txt --export > test.xml

Eyes on the prize: we want to be able to check to see if the hash of some
(lowercased) mailbox is known to our on disk store.

Sanity check:

  ./foafwhite.rb --check daniel.brickley@bristol.ac.uk
  false

(assuming the on-disk store is empty or at least hasn't heard of this
mailbox yet; --check also prints two status lines before the answer,
omitted in these examples)

We then import into our on-disk database (faster for lookups)

not yet:

  ./foafwhite.rb --collect test.xml

(this calls an RDF parser and (@@fixme) invokes itself with --import)
(***bug: no it doesn't, currently URI and filename is hardcoded)

but this works:

  rdfdump -o ntriples file:localcopyofrdfwhitelist.xml | ./foafwhite.rb --import

Now we can run checks. Note that loading more RDF can be done at any
time. Data is stored on disk as key/value pairs in a DBM file. Currently
the key is the sha1 hash of lowercased mailbox URI, the value is simply
a '1'. An obvious next step would be to track provenance information so
the store can be treated as a database instead of a melting pot, ie. we'd
like it to know which whitelist ack'd which mailbox. Maybe using a
generic RDF store would be worthwhile (though it might make lookups
slower). Hiding the optimised store behind an RDF API is probably worth
doing.

Anyway, checks:

  ./foafwhite.rb --check daniel.brickley@bristol.ac.UK
  true

  ./foafwhite.rb --check daniel.brickley@trueloveandamilliondollars.spambot.example.com
  false

Next: fix up return values so this works for procmail etc

danbri@w3.org

=end