#!/usr/bin/env ruby
# $Id: foafwhite.rb,v 1.13 2002/06/22 15:15:49 danbri Exp $
# see end of doc for writeup
#
#
#
# Todo:
# - load rdf/xml from web
# - check GPG signature?
# - migrate to XSLT RDF parser or talk to Redland parser via real APIs
# - downcase domain names? in --check if not in actual data
# I've just removed the downcasing in my code
# you may/should case-fold domain names, but not mailboxes
# i.e. turn CONNOLLY@W3.ORG to CONNOLLY@w3.org but not to connolly@w3.org
# - other databases (blacklist, mid)
# - rdf schema (refine, document)
mylib = ENV['RUBYRDF']
if mylib
# puts "Using #{mylib} libdir" ## STDERR?
require mylib+"/basicrdf"
else
require '../../basicrdf' # fix this? (how? env variables?)
end
require 'sdbm' # lowest common denominator
require 'sha1'
require 'getoptlong'
# Command-line option table.
#
# Each entry is [long name, optional short name, argument kind]. Options are
# handed to the dispatch loop in the order they appear on the command line,
# so configuration switches (--diskdb, --realwhitelist, --verbose) must come
# before the action they are meant to affect.
#
# No space between `new` and `(`: with a space, newer Rubies parse the
# parenthesis as a grouped expression and reject the comma-separated list.
opt = GetoptLong.new(
  ['--import', '-i', GetoptLong::NO_ARGUMENT],
  ['--check', '-c', GetoptLong::REQUIRED_ARGUMENT],
  ['--test', '-t', GetoptLong::NO_ARGUMENT],
  ['--verbose', '-v', GetoptLong::NO_ARGUMENT],
  ['--collect', '-f', GetoptLong::OPTIONAL_ARGUMENT],
  ['--diskdb', '-d', GetoptLong::REQUIRED_ARGUMENT],
  ['--lowercase-domains', '-l', GetoptLong::NO_ARGUMENT],
  ['--realwhitelist', '-r', GetoptLong::REQUIRED_ARGUMENT],
  ['--scrape', '-s', GetoptLong::OPTIONAL_ARGUMENT],
  ['--export', '-x', GetoptLong::OPTIONAL_ARGUMENT],
  # The dispatch loop has a '--version' branch; without this entry GetoptLong
  # rejects the switch as an invalid option before the branch can run.
  ['--version', GetoptLong::NO_ARGUMENT]
)
# note: --lowercase-domains (-l) is parsed but currently unused; it should be honoured by --check
# A whitelist (including on-disk storage)
#
# A whitelist of mailbox URIs that are only ever stored as the SHA1 hex
# digest of the (usually 'mailto:') URI, with an SDBM file as the on-disk
# lookup store.
#
# Attributes:
#   verbose       - when true, extra progress lines are printed
#   diskdb        - path (without extension) of the SDBM store
#   realwhitelist - path of the plain-text, one-mailbox-per-line whitelist
#   lowercase     - when true, checkMailbox lowercases the mailbox before
#                   hashing (realToGarbledXML always lowercases)
class FOAFWhiteList
  # Namespaces shared by the importer (buildDiskDB) and the exporter
  # (realToGarbledXML) so both sides agree on the vocabulary.
  FOAF_NS = 'http://xmlns.com/foaf/0.1/'
  RDF_NS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'

  attr_accessor :verbose, :diskdb, :realwhitelist, :lowercase

  def initialize
    @verbose = false
    @diskdb = './data/whitelist'
    # Fall back to the current directory when HOME is not set instead of
    # failing on nil + String.
    @realwhitelist = (ENV['HOME'] || '.') + "/.whitelist" #todo
    @lowercase = false
  end

  # Look a mailbox up in the on-disk store.
  #
  # mbox - mailbox as "user@host" or "mailto:user@host"; it is not modified.
  #
  # Returns true when the SHA1 of the mailto: URI is a key in the store,
  # false otherwise. The store handle is closed before returning.
  def checkMailbox(mbox)
    puts "Checking for mbox: #{mbox}" if @verbose
    # Non-destructive downcase: the caller's string is left untouched.
    mbox = mbox.downcase if @lowercase
    mbox = "mailto:" + mbox unless mbox =~ /\Amailto:/ # too gentle?
    garbled = garble(mbox)
    puts "Garbled version of '#{mbox}' is: #{garbled}" if @verbose
    # Printed unconditionally (not gated on verbose), as before.
    puts "Connecting to disk db: this.diskdb=#{@diskdb}"
    disk = SDBM::new(@diskdb)
    begin
      disk[garbled] ? true : false
    ensure
      disk.close
    end
  end

  # Build (or extend) the on-disk store from RDF read as N-Triples on STDIN.
  #
  # diskdb - path of the SDBM store; defaults to the instance's diskdb
  #          setting so a configured path is honoured.
  #
  # Every subject typed foaf:NonSpamMailboxURI contributes its first
  # foaf:sha1Value as a key; the stored value is the string '1'.
  #
  # todo:
  # - more flexibility about how we get the RDF
  # - GPG checking (where does that belong?)
  def buildDiskDB(diskdb = @diskdb)
    db = Loader.nt2graph() # N-Triples from STDIN (bit rough...)
    foaf = db.reg_xmlns FOAF_NS, 'foaf'
    rdf = db.reg_xmlns RDF_NS, 'rdf'
    foaf_NonSpamMailboxURI = Node.getResource(foaf + 'NonSpamMailboxURI')
    rdf_type = Node.getResource(rdf + "type")
    disk = SDBM::new(diskdb)
    begin
      # query: the RDF API needs beautifying
      query = Statement.new(nil, rdf_type, foaf_NonSpamMailboxURI)
      db.ask(query).subjects.each do |mb|
        puts "Got mb: #{mb} " #if @verbose
        # Drop any literal quote characters (see API notes at end of file);
        # this is a no-op when the value is already bare hex.
        garbled = mb.foaf_sha1Value()[0].to_s.delete('"')
        if garbled.empty?
          # Nothing usable to index on: report and move on rather than
          # storing a placeholder key.
          puts "No sha1 value"
          next
        end
        if disk[garbled]
          puts "#{garbled} already in file: #{diskdb}" if @verbose
        else
          disk[garbled] = '1' # DBM values are strings
          puts "stored #{garbled} in whitelist file: #{diskdb} " if @verbose
        end
      end
    ensure
      disk.close
    end
  end

  # Turn the plain-text whitelist (one mailbox per line) into RDF/XML.
  #
  # Each non-blank line is trimmed, lowercased, given a 'mailto:' prefix if
  # it lacks one, hashed, and written as a node typed foaf:NonSpamMailboxURI
  # carrying a foaf:sha1Value attribute -- the same type and property that
  # buildDiskDB queries for.
  #
  # Returns the RDF/XML document as a String.
  #
  # todo: more flexibility (eg. sources other than plain text, such as
  # sent-mail, textfiles, imap; need this for --scrape)
  def realToGarbledXML
    rdf = %(<rdf:RDF xmlns:rdf="#{RDF_NS}" xmlns:foaf="#{FOAF_NS}">\n)
    File.open(@realwhitelist) do |f|
      f.each_line do |line|
        mbox = line.strip.downcase
        next if mbox.empty? # blank lines are not mailboxes
        mbox = "mailto:" + mbox unless mbox =~ /\Amailto:/
        rdf << %(<foaf:NonSpamMailboxURI foaf:sha1Value="#{garble(mbox)}"/>\n)
      end
    end
    rdf << "</rdf:RDF>\n"
    rdf
  end

  # Collect the addresses that appear inside <...> in a set of header lines.
  #
  # to_list - path of a text file of lines to scan; when nil or empty the
  #           'To:' lines of mail/sent-mail are used instead.
  #
  # Returns a Hash whose keys are the addresses found (values are 1).
  # Addresses not wrapped in angle brackets are not picked up.
  def allRecipients(to_list)
    text =
      if to_list.nil? || to_list.to_s.empty?
        `grep '^To:' mail/sent-mail| sort | uniq`
      else
        read_text_file(to_list)
      end
    addresses = {}
    text.each_line do |line|
      line.scan(/<([^>]*)>/) { |captures| addresses[captures[0]] = 1 }
    end
    addresses
  end

  private

  # SHA1 of the given mailbox URI as a lowercase hex string.
  def garble(mailbox_uri)
    SHA1::new(mailbox_uri).digest.unpack('H*')[0]
  end

  # Read a file without going through the shell. An unreadable file is
  # reported on stderr and treated as empty.
  def read_text_file(path)
    File.read(path)
  rescue SystemCallError => e
    $stderr.puts "cannot read #{path}: #{e.message}"
    ''
  end
end
## Temporary testing stuff.
## RUnit probably better bet.
# Minimal test helper: reports "pass" when +bool+ is exactly false.
# Anything else prints "fail" followed by a detail line naming the test,
# the offending value and the message. Always returns +msg+.
def assert_false(testname, bool, msg)
  if bool == false
    puts "pass"
  else
    puts "fail"
    puts "\tasserting false: #{testname} value: #{bool} msg: #{msg} \n"
  end
  msg
end
# Minimal test helper: reports "pass" when +bool+ is exactly true.
# Anything else (including other truthy values) prints "fail" followed by a
# detail line naming the test, the value and the message. Always returns +msg+.
def assert_true(testname, bool, msg)
  if bool == true
    puts "pass"
  else
    puts "fail"
    puts "\tasserting true: #{testname} value: #{bool} msg: #{msg} \n"
  end
  msg
end
#########################################################################
#
# scripting stuff here
#
# FOAFWhite and the seven spammers...
# Command-line driver.
#
# Options are acted on one at a time, in command-line order, against a single
# FOAFWhiteList instance. Configuration switches (--verbose, --diskdb,
# --realwhitelist) therefore only affect actions that come after them.
wl = FOAFWhiteList::new
#puts "use --help for commandline options" if (opt.empty?)
#todo
opt.each_option do |name, arg|
  case name
  when '--verbose'
    wl.verbose = true
    puts "running in verbose mode"
  when '--diskdb'
    wl.diskdb = arg
  when '--realwhitelist'
    wl.realwhitelist = arg
    #puts "using real whitelist #{arg}"
  when '--import'
    # Assumes a stream of N-Triples on STDIN. The configured store path is
    # passed explicitly so that a preceding --diskdb is honoured.
    wl.buildDiskDB(wl.diskdb)
  # the ultimate goal! check the mailbox
  # (how do we exit true or false?)
  when '--check'
    wl.verbose = false
    puts "checking. currently: wl.diskdb=#{wl.diskdb}"
    puts "#{wl.checkMailbox(arg)}"
  when '--export'
    puts wl.realToGarbledXML
  # This shells out, re-invokes this script, and depends on Redland's
  # rdfdump parser instead of using APIs. The URL, the local file name and
  # the data/whitelist.* path are all hard-coded; the option's argument is
  # not used. However it does mean we can delete the Makefile and start to
  # treat this as a self-contained utility.
  when '--collect'
    puts "Collecting RDF from the Web"
    puts `rm -f foafwhite.xml`
    puts `rm -rf data/whitelist.*`
    puts `GET http://tux.w3.org/~danbri/rdfweb/foafwhite.xml>foafwhite.xml`
    puts `rdfdump -o ntriples file:foafwhite.xml | ./foafwhite.rb --import`
  when '--test'
    puts "Tests...\n\n"
    bogus_msg = "unknown mailboxes shouldn't match"
    known_msg = "known mailboxes should match"
    evil = "evil-danbri@rdfweb.org"
    danbri1 = "danbri@rdfweb.org"
    danbri_mailto = "mailto:danbri@rdfweb.org"
    as = "mailto:aswartz@swartzfam.com"
    dANBRI = "mailto:DANBRI@w3.ORG"
    assert_false(evil, wl.checkMailbox(evil), bogus_msg)
    assert_true(danbri1, wl.checkMailbox(danbri1), known_msg)
    assert_true(danbri_mailto, wl.checkMailbox(danbri_mailto), known_msg)
    assert_true(as, wl.checkMailbox(as), known_msg)
    assert_true(dANBRI, wl.checkMailbox(dANBRI), "case shouldn't matter")
    # TODO: move these onto a real unit-test framework.
    # see http://www.eng.dmu.ac.uk/~hgs/ruby/ruby-unit.html for RubyUnit docs;
    # should use this instead: http://testunit.talbott.ws/doc/index.html
    # (The runit requires that used to sit here were never used and made
    # --test abort with LoadError when RubyUnit was not installed.)
  when '--scrape'
    puts "Extracting mailboxes"
    # GetoptLong hands over '' when the optional argument is omitted; map
    # that to nil so the default source (sent-mail 'To:' lines) is used.
    all = wl.allRecipients(arg.to_s.empty? ? nil : arg)
    all.each_key { |mb| puts mb } if all
    # puts "scraped: #{all.inspect}"
  when '--version'
    puts "version $Id: foafwhite.rb,v 1.13 2002/06/22 15:15:49 danbri Exp $ \n"
  end
end
## --sentmail-extract
## in progress (should use IMAP?)
##allRecipients.each_key do |mb| puts mb end
########################################################################
#
# RDF API notes
#
# (things I learned about this so-called API by trying to use it for real)
#
# trim '"' from return values (in .to_s for node?)
# clarify genid behaviour (reminder to use nils)
# use iterators with method_missing; return single value otherwise
# can nodes have multiple URIrefs? (don't see why not... but complicates)
=begin
This is Ruby code to produce and consume RDF documents that describe
'whitelist's of internet mailbox URIs that represent 'known senders'
of non-spam email. This data can be used to support spam filtering apps.
To offer some degree of privacy and spam-protection, the RDF/XML data
format used does not reveal the full mailboxes. Instead, a sha1 of the
(typically 'mailto:') mailbox URI is used.
This technique does not (as far as I can see) help deal with spam that
fakes a 'respectable' From: field. This may become increasingly
problematic.
The tools here provide pretty basic facilities for loading RDF/XML
from STDIN (in N-Triples format currently, so external RDF parser
needed). The un-scrambled whitelist file is typically stored as a list
of mailboxes in .whitelist. The on-disk store derived from the
RDF/XML uses SDBM, and should be fast to check from procmail-based
scripts
(@@todo).
Example Usage
Noting that this could be done more gracefully. But that's no excuse
for not documenting current workings:
First we might be starting with a file full of messy 'To:' lines
greped from a mail folder:
./foafwhite.rb --scrape example1.txt > test.txt
This normalises lines such as:
To: Dan Brickley <Daniel.Brickley@bristol.ac.uk>
into: Daniel.Brickley@bristol.ac.uk
(@@todo: should lowercase at this stage)
We might have other plain text whitelists (e.g. from address books, or Gerald's
'atw' utility). We treat them all as one-mailbox-per-line text files,
and currently can only convert these into RDF/XML for loading, rather
than use them directly.
So,next we convert this 'whitelist' file into RDF/XML
./foafwhite.rb --realwhitelist test.txt --export > test.xml
Eyes on the prize: we want to be able to check to see if the hash of
some (lowercased) mailbox is known to our on disk store.
Sanity check:
./foafwhite.rb --check daniel.brickley@bristol.ac.uk
false
(assuming the on-disk store is empty or at least hasn't heard of this
mailbox yet)
We then import into our on-disk database (faster for lookups)
not yet:
./foafwhite.rb --collect test.xml
(this calls an RDF parser and (@@fixme) invokes itself with --import)
(***bug: no it doesn't, currently URI and filename is hardcoded)
but this works:
rdfdump -o ntriples file:localcopyofrdfwhitelist.xml | ./foafwhite.rb --import
Now we can run checks.
Note that loading more RDF can be done at any time. Data is stored on
disk as key/value pairs in a DBM file. Currently the key is the sha1
hash of lowercased mailbox URI, the value is simply a '1'. An obvious
next step would be to track provenance information so the store can be
treated as a database instead of a melting pot, i.e. we'd like it to
know which whitelist ack'd which mailbox. Maybe using a generic RDF
store would be worthwhile (though it might make lookups slower). Hiding
the optimised store behind an RDF API is probably worth doing.
Anyway, checks:
./foafwhite.rb --check daniel.brickley@bristol.ac.UK
true
./foafwhite.rb --check
daniel.brickley@trueloveandamilliondollars.spambot.example.com
false
Next: fix up return values so this works for procmail etc
danbri@w3.org
=end