#!/usr/bin/env python

'''
Transform an annotated RelaxNG schema and valid XML instance into n-triples.
Caveats:
    1. Namespace support is a hack: I match on prefixes rather than namespace URIs.
    2. Parsing of rng schema is shallow: 
       a. simple dictionary with element/attribute type name and its 
       annotations. Consequently, it doesn't expect to find identical 
       element/attribute types with different semantics.
       b. annotate mode won't give you annotations for
       anonymous classes (i.e., rdf:Descriptions in the schema), 
       because I emit a simple bnode when I look ahead and see
       the range includes "prefix:_". This can be fixed.
    3. Need to add a feature for XMLLiterals.
       
'''

from xml.sax import make_parser
from xml.sax import saxutils
from xml.sax import handler
from xml.sax.handler import ContentHandler
import sys


def normalize_whitespace(text):
    """Collapse every run of whitespace in text to a single space.

    Leading and trailing whitespace is dropped entirely, so a string that
    contains only whitespace comes back empty.
    """
    words = text.split()        # split() with no separator eats all whitespace
    return ' '.join(words)

    
def rngExtract(rngfd, outfd, rng_nodes):
    """Parse the RNG schema and fill rng_nodes with its RDF annotations.

    Every <element> and <attribute> tag is recorded under the value of its
    'name' attribute, and every <rr:Description> under its 'rr:ID'; the
    stored value is a plain dictionary of all attributes found on that tag.
    The xmlns:* declarations on <grammar> are written to the module-level
    ns_dict, keyed by prefix.

    rngfd     -- the schema source handed to the SAX parser
    outfd     -- accepted for symmetry with mapNT(); not used here
    rng_nodes -- dictionary that is updated in place
    """

    class _SchemaScanner(ContentHandler):
        """Collect namespace prefixes and per-name annotations."""

        def startElement(self, name, attrs):
            if name == "grammar":
                # namespace prefix/URI pairs
                for qname, uri in attrs.items():
                    if qname.startswith("xmlns:"):
                        ns_dict[qname[len("xmlns:"):]] = uri
                return

            if name == "element" or name == "attribute":
                key = attrs.get('name')         # elem/attr annotations
            elif name == "rr:Description":
                key = attrs.get('rr:ID')        # anonymous class annotations
            else:
                return
            rng_nodes[key] = dict(attrs)

    scanner = make_parser()
    # Namespace processing stays off so names arrive with their prefixes.
    scanner.setFeature(handler.feature_namespaces, 0)
    scanner.setContentHandler(_SchemaScanner())
    scanner.parse(rngfd)

def mapNT(xmlfd, outfd, rng_nodes):
    """Parse the XML instance and emit n-triples according
    to the annotations extracted in rngExtract().

    xmlfd     -- the XML instance handed to the SAX parser
    outfd     -- accepted but unused; all output is written to sys.stdout
    rng_nodes -- dictionary mapping an element/attribute name to the
                 dictionary of attributes found on its schema definition

    Reads the module-level settings option_rdfs, option_short and ns_dict.
    Terminates the program (SystemExit, status 1) when an element or
    attribute of the instance has no entry in rng_nodes.
    """

    import random

    # Namespace of the schema annotations.  Its type/domain/range names are
    # rewritten to the RDF and RDFS vocabularies when QNames are expanded.
    RR_NS = "http://www.w3.org/2003/02/schema-annotation/"
    RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    RDFS_NS = "http://www.w3.org/2000/01/rdf-schema#"

    def _get_annotations(name, rng_nodes):
        """return a name,dictionary of annotations for an elem/attr type
             name corresponds to the rr:ID, if available
             dictionary holds the type, domain and range
               annotations for the name"""

        try:
            annotations = dict(rng_nodes[name])  # copy of attr/val annotations
        except KeyError:
            sys.stdout.write("Error, no RDF correspondance to %s\n" % name)
            sys.exit(1)     # failure must not look like success to the shell
        name = annotations.get('rr:ID', name)
        return name, annotations

    def _expand_qname(term):
        """return term with a known prefix replaced by its namespace URI;
           anything that is not exactly prefix:name is returned untouched"""

        try:
            prefix, local = term.split(':')
        except ValueError:      # zero or several colons: not a QName
            return term
        if prefix not in ns_dict:
            return term
        namespace = ns_dict[prefix]
        if namespace == RR_NS:
            if local == "type":
                return RDF_NS + local
            if local in ("domain", "range"):
                return RDFS_NS + local
        # Every other name, including other annotation names, keeps its own
        # namespace.  Always returning a value keeps the argument count in
        # step with the format string.
        return namespace + local

    def _emit_nt(format, *args):
        """emit as much of an ntriple sentence as possible"""

        if not option_short:    # Expand QNames to URIs
            args = tuple([_expand_qname(arg) for arg in args])
        sys.stdout.write(format % args)

    def _emit_rdfs(subject, annotations):
        """emit the rdf type, domain and range annotations
           for an elem/attr type"""

        if not option_rdfs:
            return
        for attr, val in annotations.items():
            if attr != 'rr:ID' and attr[:3] == 'rr:':
                _emit_nt('   <%s> <%s> <%s> .\n', subject, attr, val)

    def _new_bnode(qname):
        """return a bnode label built from qname and a random 4-digit number"""

        # qname[4:] drops the first four characters -- presumably a
        # three-letter prefix plus the colon; confirm against the schemas.
        # NOTE(review): labels are not checked for collisions, and callers
        # wrap them in <...> like URIs; confirm that is what consumers expect.
        return '_:' + qname[4:] + str(random.randint(1000, 9999))

    class xmlHandler(ContentHandler):
        """Parse the instance document for the rr:Literal values."""

        def startDocument(self):
            self.parent_stack = []      # (subject, kind) of the open ancestors
            # One flag per open element: did startElement push a frame for
            # it?  endElement pops parent_stack only when it did.
            self.frame_pushed = []
            self.inXMLLiteral = None    # name of the element being copied
            self.inElement = None       # element whose text is an object

        def startElement(self, name, attrs):
            """
            Algorithm:
            if I'm *not* inside an rdf:XMLLiteral element:
                if I'm a class element:
                    create a bnode corresponding to an instance of the class
                    emit subject(bnode) property(attr) value (attr-value)
                    place bnode on parent stack with type
                elif I'm a property element:
                    if my range includes a bnode class
                        create bnode
                        emit subject(parent) property(me) value(bnode)
                        place bnode on parent stack with type of rdfs:Class
                    else:
                        emit subject(parent), property(me), value(characters())
                        place me on parent stack with type
            else:
                copy my markup through unchanged
            """

            if self.inXMLLiteral:
                sys.stdout.write("<%s" % name)
                for att, val in attrs.items():
                    sys.stdout.write(' %s="%s"' % (att, val))
                sys.stdout.write(">")
                return

            self.inElement = name
            elem_name, elem_annotations = _get_annotations(name, rng_nodes)
            elem_type = elem_annotations['rr:type']
            pushed = False

            if elem_type == 'rdfs:Class':
                bname = _new_bnode(elem_name)
                for attr, val in attrs.items():
                    # Namespace declarations (prefixed or default) are not data.
                    if attr == "xmlns" or attr[:6] == "xmlns:":
                        continue
                    attr_name, attr_annotations = _get_annotations(attr, rng_nodes)
                    attr_type = attr_annotations['rr:type']
                    if attr_type == 'rdf:Resource':
                        _emit_nt('<%s> <%s> <%s> .\n', bname, attr_name, val)
                    elif attr_type == 'rdf:Property':
                        _emit_nt('<%s> <%s> "%s" .\n', bname, attr_name, val)
                    _emit_rdfs(attr_name, attr_annotations)
                self.parent_stack.append((bname, 'rdfs:Class'))
                pushed = True
            elif elem_type == 'rdf:Property':
                if self.parent_stack:
                    parent = self.parent_stack[-1]
                    value_range = elem_annotations['rr:range']
                    if value_range.find(":_") >= 0:
                        # Range is an anonymous class, emit bnode
                        bname = _new_bnode(value_range)
                        _emit_nt('<%s> <%s> <%s> .\n', parent[0], elem_name, bname)
                        self.parent_stack.append((bname, 'rdfs:Class'))
                    elif value_range == "rdf:XMLLiteral":
                        self.inXMLLiteral = name
                        _emit_nt('<%s> <%s> "', parent[0], elem_name)
                        self.parent_stack.append((name, 'rdfs:Property'))
                    else:
                        # The object is supplied by characters(); the
                        # trailing space separates it from the predicate.
                        _emit_nt('<%s> <%s> ', parent[0], elem_name)
                        self.parent_stack.append((name, 'rdfs:Property'))
                    pushed = True
                else:
                    # No subject to attach to, so nothing was emitted;
                    # make sure characters() emits no dangling object.
                    self.inElement = None
            self.frame_pushed.append(pushed)

        def endElement(self, name):
            """emit the rdfs annotations for that element"""

            if self.inXMLLiteral and name != self.inXMLLiteral:
                # Markup inside the literal is copied through as is.  It is
                # not looked up because it need not be in the schema.
                # NOTE(review): a nested element with the same name as the
                # literal element ends the literal early.
                sys.stdout.write("</%s>" % name)
                return

            elem_name, elem_annotations = _get_annotations(name, rng_nodes)
            if self.inXMLLiteral:
                _emit_nt('" .\n')
                _emit_rdfs(elem_name, elem_annotations)
                self.inXMLLiteral = None    # exit XMLLiteral
            else:                           # emit rdfs annotations
                _emit_rdfs(elem_name, elem_annotations)
            self.inElement = None
            # Undo only what this element's startElement actually pushed.
            if self.frame_pushed.pop():
                self.parent_stack.pop()

        def characters(self, ch):
            if self.inXMLLiteral:
                sys.stdout.write(ch)
                return
            chars = normalize_whitespace(ch)
            if not chars or not self.inElement:
                return
            # NOTE(review): SAX may deliver one text node in several calls;
            # each non-blank chunk is emitted as an object of its own.
            value_range = rng_nodes.get(self.inElement, {}).get('rr:range')
            if value_range == "rdfs:Literal":
                _emit_nt('"%s" .\n', chars)
            elif value_range == "rdf:Resource":
                _emit_nt('<%s> .\n', chars)

    parser = make_parser()
    xh = xmlHandler()
    parser.setContentHandler(xh)
    # Namespace processing stays off so names arrive with their prefixes.
    parser.setFeature(handler.feature_namespaces, 0)
    parser.parse(xmlfd)
    
    
if __name__ == "__main__":
    # Command-line driver: rng-rdf.py -[as] file.rng file.xml [output]
    #
    # Validates the instance against the schema with jing, extracts the
    # annotations from the schema, then writes the triples to the output
    # file (or to standard output when none is given).  Error messages go
    # to standard error and every failure exits with a non-zero status.

    import getopt
    import subprocess

    option_rdfs  = 0    # whether the triples should include the rr info as well
    option_short = 0    # whether to use QNames instead of URIs

    usage = ("Error: Unknown option or missing argument.\n"
             "Usage: rng-rdf.py -[as] file.rng file.xml [output]\n"
             "       -a(ll) emits rdfs statements as well\n"
             "       -s(hort) uses the QNames insteadof URIs\n")

    try:
        (options, files) = getopt.getopt(sys.argv[1:], "as")
    except getopt.error:
        sys.stderr.write(usage)
        sys.exit(2)     # nothing sensible can be done without the options
    for (option, value) in options:
        if option == '-a':
            option_rdfs = 1
        elif option == '-s':
            option_short = 1

    if len(files) < 2:  # both the schema and the instance are required
        sys.stderr.write(usage)
        sys.exit(2)

    try:
        rngfd = open(files[0])
    except IOError:
        sys.stderr.write("Error: cannot open the relaxNG file  %s\n" % files[0])
        sys.exit(1)
    try:
        xmlfd = open(files[1])
    except IOError:
        sys.stderr.write("Error: cannot open the XML instance file  %s\n" % files[1])
        sys.exit(1)

    # Validate instance.  The arguments are passed as a list, so file names
    # are never interpreted by a shell.
    # NOTE(review): the command name 'javax' and the jar path are kept
    # exactly as they were; confirm 'javax' is not a typo for 'java'.
    try:
        validator = subprocess.Popen(
            ['javax', '-jar', '/home/reagle/bin/jing.jar', files[0], files[1]],
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        result = validator.communicate()[0]
    except OSError:
        result = str(sys.exc_info()[1])     # the command could not be started
    if not isinstance(result, str):
        result = result.decode('utf-8', 'replace')
    if result[-1:] == '\n':
        result = result[:-1]
    if result != '':
        sys.stderr.write("Error: jing isn't installed or found a problem.\n")
        sys.stderr.write(result + "\n")
        sys.exit(1)

    # Open the output only after validation, so that a failed run does not
    # truncate an existing file.
    redirected = len(files) > 2
    if redirected:
        outfd = open(files[2], 'w')
        sys.stdout = outfd
    else:
        outfd = sys.stdout

    rng_nodes = {}  # elem/attr element types defined by the RNG
    ns_dict = {}    # prefix/NS pairs from the RNG's grammar element

    try:
        rngExtract(rngfd, outfd, rng_nodes)
        mapNT(xmlfd, outfd, rng_nodes)
    finally:
        rngfd.close()
        xmlfd.close()
        if redirected:
            outfd.close()
