#!/bin/env python
"""
A decoder for the encoded-words defined by RFC 1522,
"Message Header Extensions for Non-ASCII Text":
http://www.cis.ohio-state.edu/cgi-bin/rfc/rfc1522.html


Copyright (c) 2002 W3C (MIT, Keio, INRIA).
Share and Enjoy. For Open Source license details, see:
  http://www.w3.org/Consortium/Legal/copyright-software-19980720
It's OSI-approved
  http://www.opensource.org/licenses/index.html
"""

# Revision identifier of this file; the $Id$ keyword is presumably filled in
# by the revision-control system on check-in -- TODO confirm.
__version__ = '$Id: msgHeaderExt.py,v 1.5 2003/06/20 04:34:51 connolly Exp $'

import re, string
import base64, codecs

# From the spec:
#   encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
#
# Capture groups: 1 = charset, 2 = encoding, 3 = encoded-text.
# (Group 4 exists only to express the repetition inside group 3.)
# charset and encoding are each one or more characters other than "?";
# encoded-text may contain a "?" as long as it is not followed by "=".
EncodedWord = re.compile(r'=\?([^\?]+)\?([^\?]+)\?(([^\?]|\?[^=])+)\?=')

def decode(txt):
    """Replace every encoded-word in txt with the text it stands for.

    txt -- a header value; it must consist of US-ASCII characters only.

    Returns txt itself when it contains no encoded-word. Otherwise
    returns a new string in which each =?charset?encoding?encoded-text?=
    has been replaced by the output of the charset's decoder. Everything
    outside the encoded-words is copied through unchanged.

    NOTE: white space between two adjacent encoded-words is kept as is,
    although the RFC asks a displaying agent to ignore it.

    Raises UnicodeError if txt is not US-ASCII, LookupError if a charset
    is unknown to the codecs module, and NoSuchEncoding if an encoding
    is neither Q nor B (in either case).
    """
    txt.encode("us-ascii") # raise UnicodeError if there's bogus stuff.

    m = EncodedWord.search(txt)
    if not m:
        return txt

    # Walk the *original* text once, left to right. Searching the
    # already-rewritten string again (as an earlier version did) would
    # decode a second time any decoded output that happens to look like
    # an encoded-word, and costs quadratic time on long headers.
    pieces = []
    pos = 0
    while m:
        charset, encoding = m.group(1), m.group(2)
        payload = m.group(3).encode('us-ascii')
        # codecs.lookup() yields (encoder, decoder, reader, writer).
        decoder = codecs.lookup(charset)[1]
        # Compare against whole tokens: "in 'Qq'" is a substring test
        # and would also accept the two-letter encoding name "Qq".
        if encoding in ('Q', 'q'):
            raw = qdecode(payload)
        elif encoding in ('B', 'b'):
            raw = base64.decodestring(payload)
        else:
            raise NoSuchEncoding(encoding)

        pieces.append(txt[pos:m.start(0)])
        pieces.append(decoder(raw)[0])
        pos = m.end(0)
        m = EncodedWord.search(txt, pos)

    pieces.append(txt[pos:])
    return "".join(pieces)

class NoSuchEncoding(Exception):
    """An encoded-word named an encoding other than Q or B.

    decode() raises this with the offending encoding name as argument.
    """

def qdecode(str):
    """Decode the "Q"-encoded text of an encoded-word.

    Every "_" is turned into a space, and every "=" is replaced,
    together with the (up to) two characters that follow it, by the
    character whose code those characters spell in hexadecimal.

    str -- the encoded-text only, without the =?charset?Q? ... ?= frame.

    Returns the decoded string. The character codes are not interpreted
    in any charset here; that is left to the caller.

    Raises ValueError when the characters after an "=" are not a
    hexadecimal number (for example a trailing "=").
    """
    # thanks to Lars Marius Garshol
    # 09 Feb 2002
    # String methods and int() replace the deprecated string-module
    # functions; the pieces are joined once instead of being
    # concatenated on every escape.
    coded = str.replace("_", " ")
    pieces = []
    start = 0
    while 1:
        end = coded.find("=", start)
        if end == -1:
            break
        pieces.append(coded[start:end])
        pieces.append(chr(int(coded[end + 1:end + 3], 16)))
        start = end + 3

    pieces.append(coded[start:])
    return "".join(pieces)
                    
def test():
    """Self-test: run decode() over sample headers and print the results.

    Every case is a triple (input, expected, expExc): the text handed to
    decode(), the value it should produce, and a true value when a
    NoSuchEncoding or LookupError is expected instead. When one of those
    exceptions is caught, the result is recorded as the string
    'exception!'.

    Mismatches are reported on standard output as lines starting with
    "FAIL!"; nothing is returned or raised for them.
    """
    import codecs, sys
    dummy, dummy, dummy, encWriter = codecs.lookup('utf-8')
    uout = encWriter(sys.stdout)  # stream writer: results go out as UTF-8

    # Non-ASCII characters in the expected values are written as \x
    # escapes so that this source file stays pure ASCII.
    cases = (("=?iso-8859-1?Q?Bill_de_h=D3ra?=", u'Bill de h\xd3ra', None),
             ("abc=?iso-8859-1?Q?Bill_de_h=D3ra?=def", u'abcBill de h\xd3radef', None),
             ("=?windows-1252?Q?_Turismo_Rural_-_promo=E7=F5es_fant=E1sticas_/_promociones_fant=E1sticos?=", u' Turismo Rural - promo\xe7\xf5es fant\xe1sticas / promociones fant\xe1sticos', None),
             ("[closed] Re: Turismo Rural - =?iso-8859-1?Q?promo=E7=F5es?=  =?iso-8859-1?Q?_?= =?iso-8859-1?Q?fant=E1sticas?= / promociones   =?iso-8859-1?Q?fant=E1sticos?=", u'[closed] Re: Turismo Rural - promo\xe7\xf5es    fant\xe1sticas / promociones   fant\xe1sticos', None),
             # NOTE(review): this case expects the charset to be unknown
             # (LookupError); a Python that ships a codec for it will
             # decode the word instead and report FAIL -- confirm intent.
             ("=?ks_c_5601-1987?B?wNPB9sjGXChuZXRzZ28uY29tXCk=?=", 'exception!', 'X'),
             ("=?iso-8859-1?X?Bill_de_h=D3ra?=", 'exception!', 'X'),
             )

    for testIn, expected, expExc in cases:
        print("testing: %s" % testIn)
        exc = None
        res = None
        try:
            res = decode(testIn)
        except (NoSuchEncoding, LookupError):
            # Both expected failures are recorded the same way.
            exc = sys.exc_info()[1]
            res = 'exception!'

        print("result: %s" % repr(res))
        uout.write(res + "\n")
        if res != expected:
            print("FAIL! %s expected: %s" % (repr(res), repr(expected)))
        # Only whether an exception occurred is compared, not which one.
        if (not exc) != (not expExc):
            print("FAIL! [%s] <> [%s]" % (repr(exc), repr(expExc)))
            
# Run the self-test when this file is executed as a script
# (importing the module does not run it).
if __name__ == '__main__':
    test()


# $Log: msgHeaderExt.py,v $
# Revision 1.5  2003/06/20 04:34:51  connolly
# fixed non-ascci exception bug
#
# Revision 1.4  2002/02/12 14:42:50  connolly
# raise an exception on non-ascii input
#
# Revision 1.3  2002/02/09 21:58:59  connolly
# added unknown encoding test
#
# Revision 1.2  2002/02/09 21:36:55  connolly
# implemented charset decoding
#
# Revision 1.1  2002/02/09 20:34:58  connolly
# implmente RFC1522
#
