#!/bin/env python
"""aboutMsg.py -- convert message metadata to RDF/XML

USAGE:
  export PYTHONPATH=$WWW/2000/10/swap
  python aboutMsg.py [options] mailbox
  python aboutMsg.py [options] --mh mhMailDir
  python aboutMsg.py [options] --msg msgfile
  python aboutMsg.py --test

  options:
    --from who@domain
    --since %Y-%m-%d

"""

__version__ = '$Id: aboutMsg.py,v 1.21 2006/03/20 23:03:13 connolly Exp $'

from string import split, join
import rfc822, mailbox, time

import notation3, toXML
from RDFSink import SYMBOL, LITERAL, FORMULA, ANONYMOUS, forSomeSym

import msgHeaderExt


def main(argv):
    """Write RDF/XML about the messages named on the command line.

    argv is a sys.argv-style list: argv[0] is ignored and the list
    itself is left unmodified.  The accepted forms are the ones given
    in the module docstring.  Output is written to sys.stdout.

    Notes:
      * --from and --since may appear in either order; when both are
        given the later one replaces the earlier one.
      * --msg reads the named message file, or stdin when no file
        name follows it.  No filter is applied to a single message.
      * any remaining arguments are opened as Unix mailbox files.
    """
    import sys

    # Work on a copy of the arguments so the caller's list (normally
    # sys.argv itself) is not edited behind its back.
    args = list(argv[1:])

    #hmm... command-line switch?
    #sink = notation3.ToN3(sys.stdout.write, "file://example/bogus")
    sink = toXML.ToRDF(sys.stdout, "file://example/bogus")
    sink.startDoc()
    sink.bind("email", SwMail.nsname)

    fmla = (FORMULA, "http://example/bogus#_formula")

    filt = None

    # Consume leading "--from X" / "--since X" pairs in whatever order
    # they were given; each needs a value after it.
    while args[1:] and args[0] in ('--from', '--since'):
        if args[0] == '--from':
            filt = FromFilter(args[1])
        else:
            filt = SinceFilter(args[1])
        del args[:2]

    if args and args[0] == '--msg':
        if args[1:]:
            # USAGE documents "--msg msgfile": read the named file.
            fp = open(args[1])
            try:
                aboutMsg(sink, rfc822.Message(fp), fmla)
            finally:
                fp.close()
        else:
            # No file named: keep reading the message from stdin.
            aboutMsg(sink, rfc822.Message(sys.stdin), fmla)
    elif args[1:] and args[0] == '--mh':
        box = mailbox.MHMailbox(args[1])
        aboutMailbox(sink, fmla, box, filt)
    else:
        for fn in args:
            fp = open(fn)
            try:
                aboutMailbox(sink, fmla, mailbox.UnixMailbox(fp), filt)
            finally:
                # don't leak one open file per mailbox
                fp.close()

    sink.endDoc()

def _test():
    """Run the doctests found in this module's docstrings."""
    import aboutMsg
    import doctest

    doctest.testmod(aboutMsg)

def test():
    """Convert the Unix mailbox on stdin to N3 on stdout.

    A manual smoke test: the --test command line flag runs _test()
    (the doctests), not this function.  No filter is applied.
    """
    import sys

    sink = notation3.ToN3(sys.stdout.write, "file://example/bogus")
    sink.startDoc()
    # Pass the namespace URI as a plain string, the same way main()
    # does; the (SYMBOL, uri) pair was the older bind() calling style.
    sink.bind("email", SwMail.nsname)

    fmla = (FORMULA, "http://example/bogus#_formula")

    box = mailbox.UnixMailbox(sys.stdin)
    aboutMailbox(sink, fmla, box)

    sink.endDoc()

class FromFilter:
    """Message filter: accept mail whose first From address matches.

    The comparison is an exact string match against the address part
    of the first entry returned by msg.getaddrlist('from').
    """

    def __init__(self, addr):
        # address that the first From entry has to equal
        self._addr = addr

    def test(self, msg):
        """Return 1 if msg passes the filter, 0 otherwise."""
        senders = msg.getaddrlist('from')
        if not senders:
            # no From address at all
            return 0
        first_addr = senders[0][1]
        if first_addr == self._addr:
            return 1
        return 0
        

class SinceFilter:
    """Message filter: accept mail dated on or after a given day.

    date is a "%Y-%m-%d" string; time.strptime raises ValueError if it
    does not match that format.
    """

    def __init__(self, date):
        # struct_time for 00:00:00 on the given day
        self._when = time.strptime(date, "%Y-%m-%d")

    def test(self, msg):
        """Return 1 if msg's Date is at or after the cutoff, else 0.

        Messages for which msg.getdate('date') yields nothing are
        rejected.  The clock fields are compared as given; no timezone
        adjustment is made here.
        """
        when = msg.getdate('date')

        # Compare only (year, month, day, hour, minute, second).  The
        # remaining fields (weekday, yearday, dst flag) are not part of
        # the instant and must not break ties: with them included, a
        # message stamped exactly midnight on the cutoff day could
        # compare as earlier than the cutoff.
        if when and tuple(when[:6]) >= tuple(self._when[:6]):
            return 1
        return 0
        
def aboutMailbox(sink, fmla, box, filter = None):
    """Describe every message in box, optionally filtered.

    box    -- object whose next() returns the next message, or None
              once there are no more messages
    filter -- optional object with a test(msg) method; when given,
              only messages for which test() returns true are passed
              to aboutMsg
    """
    while True:
        msg = box.next()
        # Only None marks the end of the mailbox.  A message object can
        # itself be false (e.g. one that reports zero length because it
        # has no header fields); that must not cut the run short.
        if msg is None:
            break
        if not filter or filter.test(msg):
            aboutMsg(sink, msg, fmla)

    
def aboutMsg(sink, m, fmla):
    """State what the header fields of message m say, in formula fmla.

    The subject of every statement is "mid:" followed by the
    Message-ID value with its first and last character removed.
    A message without a Message-ID is skipped silently.
    """
    msgid = m.getheader('message-id')
    if not msgid:
        return # no message-id??

    subj = "mid:%s" % (msgid[1:-1])

    # per-field statements, in a fixed order
    stringField(sink, fmla, m, subj, "date")
    dateField(sink, fmla, m, subj, "date")
    dateField(sink, fmla, m, subj, "received")
    textField(sink, fmla, m, subj, "subject")
    refField(sink, fmla, m, subj, "references")
    senders = whoField(sink, fmla, m, subj, "from")
    to_addrs = whoField(sink, fmla, m, subj, "to")
    cc_addrs = whoField(sink, fmla, m, subj, "cc")
    recips = to_addrs + cc_addrs

    # line-oriented summaries: one address per line
    #@@clash with future header fields?
    summaries = ((SwMail.senders, senders),
                 (SwMail.recipients, recips))
    say = sink.makeStatement
    for prop, addrs in summaries:
        lines = ''.join([addr + "\n" for addr in addrs])
        say((fmla,
             (SYMBOL, prop),
             (SYMBOL, subj),
             (LITERAL, lines) ))
    


_serial = 1
def something(fmla, hint):
    """Return a fresh name of the form <fmla[1]>_<hint>_<serial>.

    Each call bumps the module-wide counter _serial first, so the
    names handed out differ in their numeric suffix.
    """
    global _serial
    _serial += 1
    return "%s_%s_%s" % (fmla[1], hint, _serial)

_things = {} # memory leak. should use weak ref dictionary
def theThing(fmla, p, v, hint):
    """Return the name standing for "the thing whose p is v".

    The first request for a given (p, v) pair mints a name with
    something(fmla, hint) and remembers it in _things; later requests
    for the same pair get that same name back.
    """
    key = (p, v)
    name = _things.get(key)
    if name is None:
        name = something(fmla, hint)
        _things[key] = name
    return name


def stringField(sink, fmla, m, subj, fld):
    """State header field fld of m, verbatim, as a literal about subj.

    Nothing is stated when the field is missing or empty.
    """
    value = m.getheader(fld)
    if not value:
        return

    prop = (SYMBOL, SwMail.sym(fld))
    sink.makeStatement((fmla, prop, (SYMBOL, subj), (LITERAL, value)))

def textField(sink, fmla, m, subj, fld):
    """State header field fld of m, decoded, as a literal about subj.

    The raw value goes through msgHeaderExt.decode first.  Makes no
    statement when the field is missing or empty, nor when decoding
    fails (mangled text encoding).
    """
    raw = m.getheader(fld)
    if not raw:
        return

    try:
        text = msgHeaderExt.decode(raw)
    except (UnicodeError, LookupError):
        return

    prop = (SYMBOL, SwMail.sym(fld))
    sink.makeStatement((fmla, prop, (SYMBOL, subj), (LITERAL, text)))

def dateField(sink, fmla, m, subj, fld):
    """State the date in header field fld of m, reformatted by fmtdate.

    The property used is the field name with "_iso" appended.  Makes
    no statement when the field is missing or empty, nor when fmtdate
    raises ValueError.
    """
    raw = m.getheader(fld)
    if not raw:
        return

    try:
        stamp = fmtdate(raw)
    except ValueError:
        return

    prop = (SYMBOL, SwMail.sym(fld + "_iso")) #@@kludge
    sink.makeStatement((fmla, prop, (SYMBOL, subj), (LITERAL, stamp)))

def fmtdate(s):
    """convert rfc822 format to YYYY-MM-DDTHH:MM:SSZ format

    The result is the same instant expressed in UTC.

    >>> fmtdate("Tue, 17 Jun 2003 11:47:39 +0200")
    '2003-06-17T09:47:39Z'

    >>> fmtdate("Tue, 17 Jun 2003 09:04:18 -0400")
    '2003-06-17T13:04:18Z'

    >>> fmtdate("Jan 01 2000 01:23:45 +200")
    '1999-12-31T23:23:45Z'

    skip stuff before ';' e.g. in received headers

    >>> fmtdate("Received: from tux.w3.org ...; Thu, 7 Mar 2002 13:41:25 -0500 (EST)")
    '2002-03-07T18:41:25Z'

    raise ValueError if the date isn't parseable, has no usable
    timezone offset, or is out of the platform's time range

    >>> fmtdate("Wed, 25 Jun 2003 21:25:12 --0500")
    Traceback (most recent call last):
        ...
    ValueError: no usable timezone offset in date: 'Wed, 25 Jun 2003 21:25:12 --0500'

    """

    # Keep only what follows the first ';' (also when the ';' is the
    # very first character).
    i = s.find(';')
    if i >= 0: s = s[i+1:].strip()

    ttupz = rfc822.parsedate_tz(s)
    if not ttupz:
        raise ValueError("unparseable date: %r" % (s,))
    if ttupz[9] is None:
        raise ValueError("no usable timezone offset in date: %r" % (s,))

    try:
        # mktime() reads the fields as local time; taking time.timezone
        # back off leaves the fields read as if they were UTC, and
        # taking the parsed offset off moves that to real UTC.
        # NOTE(review): this presumes the parsed tuple's dst flag is 0
        # (standard time) -- confirm against rfc822.parsedate_tz.
        secs = time.mktime(ttupz[:9]) - ttupz[9] - time.timezone
        ttup = time.gmtime(secs)
    except OverflowError:
        # callers only trap ValueError for bad dates
        raise ValueError("date out of range: %r" % (s,))

    # Plain directives only: width flags such as %04Y are not among the
    # directives the time module documents, and %m %d %H %M %S are
    # already zero-padded to two digits.
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", ttup)


def refField(sink, fmla, m, subj, fld):
    """State one statement per <message-id> listed in header field fld.

    Every "<...>" in the field value becomes a "mid:" symbol that is
    the object of a statement about subj.  Nothing is stated when the
    field is missing or empty.
    """
    txt = m.getheader(fld)
    if not txt:
        return

    # Work around a bug in evolution 2.4.1, which hex-encodes the
    # angle brackets in the threading header fields:
    #   In-Reply-To: %3C43CB7B73.3090207@internetalchemy.org%3E
    #   References: %3C43CB7B73.3090207@internetalchemy.org%3E
    #   X-Mailer: Evolution 2.4.2.1
    # Reported as debian Bug#351087 (evolution: too much hex encoding
    # in in-reply-to and references threading header fields),
    # acknowledged Thu, 02 Feb 2006.
    if txt.startswith("%3C"):
        repaired = []
        for token in txt.split():
            if token.startswith("%3C") and token.endswith("%3E"):
                token = "<" + token[3:-3] + ">"
            repaired.append(token)
        txt = " ".join(repaired)

    # After splitting on '>', each reference is what follows a '<'.
    for piece in txt.split(">"):
        piece = piece.strip()
        if piece[:1] != '<':
            continue
        target = (SYMBOL, "mid:%s" % piece[1:])
        sink.makeStatement((fmla,
                            (SYMBOL, SwMail.sym(fld)),
                            (SYMBOL, subj),
                            target))




def whoField(sink, fmla, m, subj, fld):
    """Describe the parties in address field fld of m; return addresses.

    For each (phrase, address) entry of m.getaddrlist(fld) this states,
    in order: that some anonymous party exists, that subj has it as
    its fld, the party's phrase (only if there is one and it decodes),
    and the party's mailto: mbox.  The party's name comes from
    theThing, keyed on the mbox URI.

    Returns the list of bare addresses in field order.
    """
    entries = m.getaddrlist(fld)
    prop = SwMail.sym(fld)
    say = sink.makeStatement

    addrs = []

    for entry in entries:
        raw_phrase = entry[0]
        addr = entry[1]
        addrs.append(addr)

        mbox_uri = 'mailto:%s' % (addr,)
        party = (ANONYMOUS, theThing(fmla, SwMail.mbox, mbox_uri, "who"))

        say((fmla, (SYMBOL, forSomeSym), fmla, party))
        say((fmla, (SYMBOL, prop), (SYMBOL, subj), party))

        if raw_phrase:
            try:
                phrase = msgHeaderExt.decode(raw_phrase)
            except (LookupError, UnicodeError):
                # e.g.
                # From: "=?ks_c_5601-1987?B?wNPB9sjGXChuZXRzZ28uY29tXCk=?=" <maxim98@netsgo.com>
                # or From: Sebastian Mu<F1>iz <sjmuniz@...>
                # we just don't know what that phrase is
                pass
            else:
                say((fmla, (SYMBOL, SwMail.phrase), party,
                     (LITERAL, phrase) ))

        say((fmla, (SYMBOL, SwMail.mbox), party, (SYMBOL, mbox_uri) ))

    return addrs

    
class Namespace:
    """A collection of URIs with a common prefix.

    Attribute access builds a URI: ns.foo is ns.nsname + "foo".
    Names starting with two underscores are refused with
    AttributeError rather than turned into URIs.  sym() does the same
    concatenation for any local name given as a string.

    ACK: AaronSw / #rdfig
    http://cvs.plexdev.org/viewcvs/viewcvs.cgi/plex/plex/plexrdf/rdfapi.py?rev=1.6&content-type=text/vnd.viewcvs-markup
    """

    def __init__(self, nsname):
        self.nsname = nsname

    def sym(self, lname):
        """Return the URI for local name lname."""
        return self.nsname + lname

    def __getattr__(self, lname):
        # only reached for attributes not found the normal way
        if lname.startswith('__'):
            raise AttributeError
        return self.nsname + lname


SwMail = Namespace("http://www.w3.org/2000/10/swap/pim/email#")


import sys

if __name__ == "__main__":
    # "--test" anywhere on the command line runs the doctests instead
    # of converting any mail.
    if '--test' not in sys.argv:
        main(sys.argv)
    else:
        _test()


# $Log: aboutMsg.py,v $
# Revision 1.21  2006/03/20 23:03:13  connolly
# handle spaces between references
# ugh... not enough testing
#
# Revision 1.20  2006/02/09 16:52:06  connolly
# create just one bnode per mailbox
#
# Revision 1.19  2006/02/02 18:17:01  connolly
# worked around evolution threading bug (debian Bug#351087)
#
# Revision 1.18  2006/01/12 17:37:43  dom
# never mind, was using old version of cwm
#
# Revision 1.17  2006/01/12 17:01:27  dom
# bug fix (bind function wasn't called with proper args
#
# Revision 1.16  2005/09/02 17:50:10  connolly
# handle multiple mboxes on command line
#
# Revision 1.15  2004/10/29 13:28:08  connolly
# open first arg, not stdin
#
# Revision 1.14  2004/10/29 13:26:57  connolly
# cleaned up dead code with pychecker
# documented usage
#
# Revision 1.13  2003/07/17 17:15:38  connolly
# trap problems inside textField
#
# Revision 1.12  2003/07/03 21:23:07  connolly
# added a fmtdate test case for crossing year boundaries
#
# Revision 1.11  2003/06/27 13:15:30  connolly
# handled a bogus date format from ms outlook express
#
# Revision 1.10  2003/06/21 14:52:38  connolly
# took timezone issues seriously in date formatting
#
# Revision 1.9  2003/06/10 22:36:29  connolly
# added --since arg
#
# Revision 1.8  2003/06/10 22:01:33  connolly
# update bind API
# add --msg option
#
# Revision 1.7  2002/05/09 22:22:58  connolly
# un-hardcode from address
#
# Revision 1.6  2002/05/09 22:22:19  connolly
# un-hardcode from address
#
# Revision 1.5  2002/05/09 22:21:05  connolly
# un-hardcode from address
#
# Revision 1.4  2002/04/27 02:58:10  connolly
# 27Mar: restricted to mail from me
#
# Revision 1.3  2002/03/24 21:50:18  connolly
# - handle no subject
# - senders/recips: line oriented
#
# Revision 1.2  2002/03/24 20:45:55  connolly
# handles not just messages but mailboxes
# senders/recipients fields. hmm...
# references
#
# Revision 1.1  2002/03/24 16:07:37  connolly
# works for one message: PYTHONPATH=~/w3ccvs/WWW/2000/10/swap python aboutMsg.py <~/evolution/local/Sent/mbox
#
