#!/usr/bin/perl
# $Id: log2rdf.pl,v 1.2 2000/07/14 20:28:21 connolly Exp $
#
# This parses a delivery log that I keep, formatted ala:
#
#<81E4A2BC03CED111845100104B62AFB58245DF@stagecoach.bts.co.uk> Tue, 2 May 2000 12:04:28 +0100 \\ RE: SELECT OPTION how to control the fonts..?\\www-html-request@w3.org Tue May 2 07:14:54 2000 Dave J Woolley <DJW@bts.co.uk>\\ "'www-html@w3.org'" <www-html@w3.org>\\ "'cnc99r@ecs.soton.ac.uk'" <cnc99r@ecs.soton.ac.uk>
#
# and turns it into RDF something like: (this is out of date:)
#
#<f:Message rdf:about="mid:81E4A2BC03CED111845100104B62AFB58245DF@stagecoach.bts.co.uk"
#  f:date="Tue, 2 May 2000 12:04:28 +0100 "
#  f:dateISO="2000-05-02T11:04:28Z"
#  f:subject=" RE: SELECT OPTION how to control the fonts..?">
# <f:envelopeFrom>Tue May 2 07:14:54 2000 Dave J Woolley &lt;DJW@bts.co.uk></f:envelopeFrom>
#  <f:from rdf:resource="mailto:DJW@bts.co.uk" rdfs:label="Dave J Woolley"/>
#  <f:to rdf:resource="mailto:www-html@w3.org" rdfs:label="&apos;www-html@w3.org&apos;"/>
#  <f:cc rdf:resource="mailto:cnc99r@ecs.soton.ac.uk" rdfs:label="&apos;cnc99r@ecs.soton.ac.uk&apos;"/>
#
#</f:Message>
#
#
# It handles
# tricky address parsing stuff, thanks to D. J. Bernstein's notes.
# I hope to extend it to parse whole messages and mailboxes.
#
# It records each recipient in a separate statement, but it
# does *not* express in RDF, for example, the fact that there are no
# other recipients. Hmm... maybe I'll use my
# list schema (http://www.w3.org/2000/07/12-lists) to do that.
#
# This was originally released as
#  http://www.w3.org/2000/04/maillog2rdf/log2rdf.pl
# and I intend to use that address for future releases as well.
#
# This is Open Source. Share and enjoy.
# Copyright 2000 W3C (MIT, INRIA, Keio)
# For full terms and conditions, see
# W3C(R) SOFTWARE NOTICE 
# http://www.w3.org/Consortium/Legal/copyright-software-19980720

use strict;
#use POSIX; # for interpreting dates
use Time::Local;

# per http://cr.yp.to/immhf/token.html
my $Comment = 'COM';
my $Punctuation = 'PNC';
my $AddressingSymbol = 'ADR';
my $Atom = 'ATM';
my $QuotedString = 'QST';
my $DomainLiteral = 'DLT';


my(%Months);
%Months = ('Jan', 1,
	   'Feb', 2,
	   'Mar', 3,
	   'Apr', 4,
	   'May', 5,
	   'Jun', 6,
	   'Jul', 7,
	   'Aug', 8,
	   'Sep', 9,
	   'Oct', 10,
	   'Nov', 11,
	   'Dec', 12);

my(%TimeZones);
%TimeZones = ('BST', '+0100',
	      'CDT', '-0500',
	      'CET', '+0100',
	      'CST', '-0600',
	      'EDT', '-0400',
	      'EET', '+0200',
	      'EST', '-0500',
	      'GMT', '+0000',
	      'HKT', '+0800',
	      'IST', '+0200',
	      'JST', '+0900',
	      'MDT', '-0600',
	      'MET', '+0100',
	      'MET DST', '+0200', 
	      'METDST', '+0200', 
	      'MST', '-0700',
	      'PDT', '-0700',
	      'PST', '-0800',
	      'UT', '+0000');

if(0){ # unit test
    print asAttrLit("abc'def<>lkewj"), "\n";

    print tokens(<<EOF);
":sysmail"@  group. org, Muhammed.(the greatest) Ali @(the)Vegas.WBA
EOF
}

main();

sub main{
    print <<EOF;
<rdf:RDF
  xmlns:f="http://www.w3.org/2000/04/maillog2rdf/email-fields#"
  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
  >
EOF

    while(<>){
#	print "::::::", $_;
	chop;

	my($midDate, $subject, $envelopeFrom, $to, $cc) = split(/\\\\/, $_);

	my($mid, $date) = split(/ /, $midDate, 2);

	if($date =~ /^</){ # multiple Message-Id headers!?!?
	    (undef, $date) = split(/ /, $date, 2);
	}

	$mid =~ s/^<//;
	$mid =~ s/>$//;

	
	my($envSender, $envDate, $from);
	($envSender, $envelopeFrom) = split(/ /, $envelopeFrom, 2);
	($envDate, undef, undef, undef, undef, $from)
	    = split(/ /, $envelopeFrom, 6);

	my($sec,$min,$hour,$mday,$mon,$year) =
	    localtime(parseDate(tokens($date)));

	printf('
<f:Message rdf:about="mid:%s"
  f:date="%s"
  f:dateISO="%04d-%02d-%02dT%02d:%02d:%02dZ"
  f:subject="%s">
 <f:envelopeFrom>%s</f:envelopeFrom>
',
	       asAttrLit($mid), asAttrLit($date),
	       $year+1900, $mon+1, $mday, $hour, $min, $sec,
	       asAttrLit($subject), asContent($envelopeFrom));

	targetList('f:from', reverse(tokens($from)));
	addressList('f:to', reverse(tokens($to)));
	addressList('f:cc', reverse(tokens($cc)));
	
	print '
</f:Message>
';

    }

    print <<EOF;
</rdf:RDF>
EOF

}


sub asContent{
    local($_) = @_;

    s/\&/\&amp;/g;
    s/\</\&lt;/g;

    return $_;
}

sub asAttrLit{
    local($_) = @_;

    s/\&/\&amp;/g;
    s/\</\&lt;/g;
    s/\"/\&quot;/g;
    s/\'/\&apos;/g;

    return $_;
}

sub addressList{
    # recursive descent parser, *right to left*
    # per http://cr.yp.to/immhf/addrlist.html#address-list

    my($property, @tokens) = @_; # *reversed* list of tokens
    my($used) = 0;

    while($used <= $#tokens){
	if($tokens[$used] eq ';'){ # then it's an address group
	    $used += 2;

	    if($tokens[$used] eq ':'){
		$used += 2;
	    }else{
		$used += targetList($property, @tokens[$used..$#tokens]);

		die "expected colon in address group $used th token of @tokens"
		    unless $tokens[$used] eq ':';
		$used += 2;
	    }

	    my($u, $phrase) = phrase(@tokens[$used..$#tokens]);
	    $used += $u;

	    print "<!-- address group phrase: @@ $phrase -->";
	}
	elsif($tokens[$used] eq ','){
	    $used += 2;
	}
	else{
	    $used += targetList($property, @tokens[$used..$#tokens]);
	}
    }

    die "junk after address list @tokens" unless $used == $#tokens + 1;

}

sub targetList{
    my($property, @tokens) = @_;
    my($used) = 0;

    while($used <= $#tokens){
	my($u);

	if($tokens[$used] eq ','){
	    $used += 2;
	}
	elsif($u = target($property, @tokens[$used..$#tokens])){
	    $used += $u;
	}
    }

    return $used;
}


sub target{
    my($property, @tokens) = @_;

    my($addr, $phrase, $used);

    if($tokens[0] eq '>'){
	my($u);
	$used = 2;
	($u, $addr) = encodedAddress(@tokens[$used..$#tokens]);
	$used += $u;

	die "bad target; expected <: @tokens" unless $tokens[$used] eq '<';
	$used += 2;

	($u, $phrase) = phrase(@tokens[$used..$#tokens]);
	$used += $u;

	# hmm... should the subject of the label be the mailbox or
	# or the recipient? per section 6.2 SEMANTICS of
	# RFC822, clearly the recipient.
	printf("
  <%s>
    <rdf:Description f:phrase=\"%s\">
      <f:addr-spec rdf:resource=\"mailto:%s\"/>
    </rdf:Description>
  </%s>
",
	       $property,
	       asAttrLit($phrase),
	       asAttrLit($addr),
	       $property);
    }
    else{
	($used, $addr) = encodedAddress(@tokens);
	die "expected encoded address, got $used length $addr: @tokens" unless $used > 0;

	printf("  <%s rdf:resource=\"mailto:%s\"/>\n",
	       $property, asAttrLit($addr));

    }

    return $used;
}


sub phrase{
    my(@tokens) = @_;
    my($used) = 0;
    my($phrase);

    while($used <= $#tokens
	  && ($tokens[$used] eq '.'
	      || $tokens[$used] eq '@'
	      || $tokens[$used+1] eq $QuotedString
	      || $tokens[$used+1] eq $Atom)){

	# unquote the quotedstring
	if($tokens[$used+1] eq $QuotedString){
#	    print STDERR "unquoting $tokens[$used] ... ";
	    $tokens[$used] =~ s/^\"//;
	    $tokens[$used] =~ s/\"$//;
	    $tokens[$used] =~ s/\\(.)/$1/g;
#	    print STDERR "result: $tokens[$used]\n";
	}

	if(length($phrase)>0) { $phrase = " " . $phrase; }
	$phrase = $tokens[$used] . $phrase; #remember: backwards!
	$used += 2;
    }

    return ($used, $phrase);
}


sub encodedAddress{
    my(@tokens) = @_;
    my($box, $domain);
    my($i) = 0;

    $box = $domain = '';

    while($i <= $#tokens
	  && ($tokens[$i] eq '.'
	      || ($tokens[$i+1] eq $Atom)
	      || ($tokens[$i+1] eq $DomainLiteral)
	      )){
	$domain = $tokens[$i] . $domain; # backwards, recall
	$i += 2;
    }

    if($tokens[$i] eq '@'){
	# ok
	$i += 2;
    }else{
	die "bad encoded address; expected @ at $i th token of: @tokens";
    }

    while($i <= $#tokens
	  && ($tokens[$i] eq '.'
	      || $tokens[$i+1] eq $Atom
	      || $tokens[$i+1] eq $QuotedString)){
	$box = $tokens[$i] . $box; # remember, we're parsing backwards!
	$i += 2;
    }

    return ($i, $box . "@" . $domain);
}

sub parseDate{
    my(@tokens) = @_;
    my($i) = 0;
    my($t);
    my($year, $month, $dayOfMonth, $hour, $minute, $second, $tzoff);

    if($tokens[$i+3] eq ','){ # throw away day of week
	$i += 4;
    }

    $dayOfMonth = $tokens[$i+1];
    $i += 2;

    $month = $Months{$tokens[$i+1]};
    warn "bad month ", $tokens[$i+1], " in date $i th token of @tokens" unless $month;
    $i += 2;

    $year = $tokens[$i+1];
    $i += 2;
    if($year <= 40){ $year += 2000; }
    elsif($year >= 60 && $year < 140){ $year += 1900; }

    $hour = $tokens[$i+1];
    $i += 2;

    warn "expected colon at $i th token of @tokens"
	unless $tokens[$i+1] eq ':';
    $i += 2;

    $minute = $tokens[$i+1];
    $i += 2;

    if($tokens[$i+1] eq ':'){
	$i += 2;
	$second = $tokens[$i+1];
	$i += 2;
    }else{
	$second = 0;
    }


    if($tokens[$i+1] =~ m/(\+|\-)(\d\d)(\d\d)/){
	my($s, $hh, $mm) = ($1, $2, $3);
	$tzoff = 60 * ($mm + 60 * $hh);
	$tzoff *= -1 if ($s eq '-');

#	print ":::TZ offset: ", $tokens[$i+1], " => ", $tzoff, "\n";
    }else{
	my($tz) = $TimeZones{$tokens[$i+1]};
	warn "bad timezone at $i th token in @tokens" unless $tz;

	$tz =~ m/(\+|\-)(\d\d)(\d\d)/;
	my($s, $hh, $mm) = ($1, $2, $3);
	$tzoff = 60 * ($mm + 60 * $hh);
	$tzoff *= -1 if ($s eq '-');

#	print ":::TZ offset: ", $tokens[$i+1], " => ", $tzoff, "\n";

    }


    $t = timelocal($second, $minute, $hour,
		   $dayOfMonth, $month - 1, $year - 1900);

    warn "bad date: @tokens" unless $t;

#    print ":::time $hour $minute $second offset: ", ctime($t), ctime($t - $tzoff), "\n@tokens\n";

    return $t - $tzoff;
}



sub tokens{
# per http://cr.yp.to/immhf/token.html
    local($_) = @_; # field value
    my(@tokens);

    while(length($_) > 0){
#	print "tokenizing at:$_\n@tokens";

	if(s/^[ \t]+//){
	    # ignore space
	}

	elsif(/^\([^\)]*\(/){
	    die "nested comments not implemented: $_";
	}
	elsif(s/^\(([^\015\\\(\)]|\\.)*\)//){
	    #push(@tokens, $Comment, $&);
	    #@@ hmm... really throw away comments?
	    # a lot of mailers seem to use them for real info
	}
	elsif(s/^[\<\>\,\;\:]//){
	    push(@tokens, $Punctuation, $&);
	}
	elsif(s/^(@|\.)//){
	    push(@tokens, $AddressingSymbol, $&);
	}
	elsif(s/^[^ \t\@\<\>\[\(\,\;\:\.\"]+//){
	    push(@tokens, $Atom, $&);
	}
	elsif(s/^"([^\015\\\"]|\\.)*"//){
	    push(@tokens, $QuotedString, $&); # leave unquoting to the caller
	}
	elsif(s/^\[([^\015\\\[\]]|\\.)*\]//){
	    push(@tokens, $DomainLiteral, $&); # leave unquoting to the caller
	}
	else{
	    die "cannot tokenize: $_";
	}
    }

    return @tokens;
}

