"""Use ply to tokenize a SPARQL expression


"""

import re
import sys
#import lex
import time

# Not referenced by any code in this part of the file.
bufsiz = 3
# Read by Lexer.match: when true every token pattern is tried and the
# highest-scoring (longest) match is kept; when false the first pattern that
# matches is returned immediately.
tryLongest = True

import cPickle as pickle

def abbr(prodURI):
    """Return the text that follows the last '#' in *prodURI*.

    A value containing no '#' is returned whole, and None is passed
    through as None.
    """
    if prodURI is not None:
        return prodURI.split('#')[-1]
    return None

# True when a character outside the Basic Multilingual Plane has length 1,
# i.e. it is not stored as a surrogate pair.
wide_build = len(u"\U00012345") == 1


def smartCompile(pattern, flags=0):
    """Compile *pattern* with *flags*, adjusting it first on narrow builds.

    When wide_build is true the pattern is compiled unchanged.  Otherwise
    every occurrence of the U+10000..U+EFFFF range text is replaced by the
    two surrogate ranges before compiling.
    """
    if wide_build:
        return re.compile(pattern, flags)
    adapted = pattern.replace(u"\U00010000-\U000effff", u"\ud800-\udb7f\udc00-\udfff")
    return re.compile(adapted, flags)

def importTokens():
    """Populate the module-level ``tokens`` list and the ``Tokens`` class.

    Does nothing when ``tokens`` is already set.  Otherwise the generated
    ``sparql_tokens_table`` module is tried first: its ``tokens`` value is
    adopted and every entry of its ``regexps`` mapping becomes an attribute
    of ``Tokens``.  When that module cannot be imported, the token names and
    their patterns are read from the SPARQL grammar through ``swap.myStore``,
    installed on ``Tokens`` as ``t_<name>`` (pattern text) and ``c_<name>``
    (compiled, case-insensitive), and a new ``sparql_tokens_table.py`` is
    written with mkmodule().

    Raises:
        RuntimeError: the grammar does not contain exactly one bnf:tokens
            statement.
    """
    global tokens
    if tokens is not None:
        return

    try:
        t0 = time.time()
        from sparql_tokens_table import tokens as ts, regexps as rs
        t1 = time.time()
    except ImportError:
        from swap import myStore
        # The return value was never used; the call itself is kept.
        # NOTE(review): presumably this makes sure a store exists - confirm.
        myStore._checkStore()
        F = myStore.load('http://www.w3.org/2000/10/swap/grammar/sparql')
        BNF = myStore.Namespace("http://www.w3.org/2000/10/swap/grammar/bnf#")
        regexps = {}
        k = F.statementsMatching(pred=BNF.tokens)
        if len(k) != 1:
            raise RuntimeError("Expected 1 occurrence of bnf:tokens, got %i: %s" % (len(k), repr(k)))
        # k holds exactly one statement here, so this binds tokens once.
        for triple in k:
            tokens = [x.uriref() for x in triple.object()]
        tokens.append(BNF.PASSED_TOKENS.uriref())
        for triple in F.statementsMatching(pred=BNF.matches):
            s, p, o = triple.spo()
            key = s.uriref()
            val = o.value()
            if key in tokens:
                # Compile once and share the object between both tables.
                compiled = smartCompile(val, re.I)
                setattr(Tokens, 't_' + key, val)
                regexps['t_' + key] = val
                setattr(Tokens, 'c_' + key, compiled)
                regexps['c_' + key] = compiled
        pklVal = {'tokens': tokens, 'regexps': regexps}

        import imp, os.path
        try:
            path = imp.find_module('sparql')[1]
        except ImportError:
            # No 'sparql' module found: write relative to the current directory.
            path = ''
        f = open(os.path.join(path, 'sparql_tokens_table.py'), 'w')
        try:
            mkmodule(pklVal, f)
        finally:
            # Close the cache file even when mkmodule() fails.
            f.close()
    else:
        sys.stderr.write('loaded from file  %s\n' % (t1 - t0))
        tokens = ts
        for k, v in rs.items():
            setattr(Tokens, k, v)

def mkmodule(result, out):
    """Write the source of a ``sparql_tokens_table`` module to *out*.

    The generated module defines ``wide_build``, ``smartCompile``, a
    ``tokens`` value and a ``regexps`` dictionary.

    Args:
        result: mapping with two keys.  ``'tokens'`` is pretty-printed as
            the value of ``tokens``.  ``'regexps'`` maps names to either a
            pattern string, written as its repr, or a compiled pattern,
            written as a ``smartCompile(<pattern>, re.I)`` call.
        out: object with a ``write`` method that receives the text.
    """
    import pprint

    tokens = result['tokens']
    regexps = result['regexps']

    # Raw strings keep the escape sequences as literal text in the output.
    lines = [
        '#!/usr/bin/env python',
        '"""sparql_tokens_table - For use with sparql_tokens.py."""',
        '# Automatically generated by sparql_tokens.py',
        '',
        'import re',
        '',
        r'wide_build = (len(u"\U00012345") == 1)',
        'def smartCompile(pattern, flags=0):',
        '    if not wide_build:',
        r'        pattern = pattern.replace(u"\U00010000-\U000effff", u"\ud800-\udb7f\udc00-\udfff")',
        '    return re.compile(pattern, flags)',
        '',
        'tokens = %s' % pprint.PrettyPrinter().pformat(tokens),
        'regexps = {',
    ]
    # Sorted keys make the generated file reproducible from run to run.
    for key in sorted(regexps):
        regexp = regexps[key]
        if hasattr(regexp, 'pattern'):
            # Compiled pattern: emit code that recompiles it on import.
            lines.append('   %r: smartCompile(%r, re.I), ' % (key, regexp.pattern))
        else:
            # Any plain string value is written out as a literal.
            lines.append('   %r: %r, ' % (key, regexp))
    lines.extend([
        '}',
        '',
        'if __name__=="__main__": ',
        '   print __doc__',
    ])
    out.write('\n'.join(lines) + '\n')

# Token names used by Lexer.match; None until importTokens() has run.
tokens = None


class Tokens(object):
    """Namespace whose attributes hold the token patterns.

    importTokens() attaches the attributes with setattr(); Lexer.match
    looks up the ones whose names start with 'c_'.
    """

##class Tokens(object):
##    #literal strings
##    t_IT_SELECT = u'SELECT'
##    t_IT_DISTINCT = u'DISTINCT'
##    t_IT_CONSTRUCT = u'CONSTRUCT'
##    t_IT_DESCRIBE = u'DESCRIBE'
##    t_IT_ASK = u'ASK'
##    t_IT_BASE = u'BASE'
##    t_IT_PREFIX = u'PREFIX'
##    t_FROM_NAMED = u'FROM(?:(?:\u0008)|(?:(?:\\n)|(?:(?:\\r)|(?:(?: )|(?:(?:\u00A0)|(?:(?:[\u2000-\u200B])|(?:(?:\u202F)|(?:(?:\u205F)|(?:\u3000)))))))))+NAMED'
##    t_IT_FROM = u'FROM'
##    t_IT_WHERE = u'WHERE'
##    t_IT_ORDER = u'ORDER'
##    t_IT_BY = u'BY'
##    t_IT_ASC = u'ASC'
##    t_IT_DESC = u'DESC'
##    t_IT_LIMIT = u'LIMIT'
##    t_IT_OFFSET = u'OFFSET'
##    t_IT_UNION = u'UNION'
##    t_IT_OPTIONAL = u'OPTIONAL'
##    t_IT_GRAPH = u'GRAPH'
##    t_IT_FILTER = u'FILTER'
##    t_IT_STR = u'STR'
##    t_IT_LANG = u'LANG'
##    t_IT_DATATYPE = u'DATATYPE'
##    t_IT_REGEX = u'REGEX'
##    t_IT_BOUND = u'BOUND'
##    t_IT_isURI = u'isURI'
##    t_IT_isBLANK = u'isBLANK'
##    t_IT_isLITERAL = u'isLITERAL'
##    t_IT_true = u'true'
##    t_IT_false = u'false'
##
##
##    t_QuotedIRIref = u'<[^> ]*>'
##
##
##    t_FLOATING_POINT = u'(?:[0-9]+\\.[0-9]*(?:[eE][\\+-]?[0-9]+)?)|(?:(?:\\.[0-9]+(?:[eE][\\+-]?[0-9]+)?)|(?:[0-9]+(?:[eE][\\+-]?[0-9]+)))'
##    t_DECIMAL = u'(?:[0-9]+\\.[0-9]*)|(?:\\.[0-9]+)'
##    t_INTEGER = u'[0-9]+'
##    t_STRING_LITERAL_LONG1 = u'"""(?:(?:[^"\\\\])|(?:(?:(?:\\\\[^\\n\\r]))|(?:(?:(?:"[^"]))|(?:(?:""[^"])))))*"""'
##    t_STRING_LITERAL_LONG2 = u'\'\'\'(?:(?:[^\'\\\\])|(?:(?:(?:\\\\[^\\n\\r]))|(?:(?:(?:\'[^\']))|(?:(?:\'\'[^\'])))))*\'\'\''
##    t_STRING_LITERAL1 = u'\'(?:(?:[^\'\\\\\\n\\r])|(?:(?:\\\\[^\\n\\r])))*\''
##    t_STRING_LITERAL2 = u'"(?:(?:[^"\\\\\\n\\r])|(?:(?:\\\\[^\\n\\r])))*"'
##    t_LANGTAG = u'@[a-zA-Z]+(?:-[a-zA-Z0-9]+)*'
##    t_BNODE_LABEL = u'_:(?:(?:(?:_)|(?:(?:(?:[A-Z])|(?:(?:[a-z])|(?:(?:[\u00C0-\u00D6])|(?:(?:[\u00D8-\u00F6])|(?:(?:[\u00F8-\u02FF])|(?:(?:[\u0370-\u037D])|(?:(?:[\u037F-\u1FFF])|(?:(?:[\u200C-\u200D])|(?:(?:[\u2070-\u218F])|(?:(?:[\u2C00-\u2FEF])|(?:(?:[\u3001-\uD7FF])|(?:[\uF900-\uFFFE]))))))))))))))(?:(?:(?:(?:[A-Z])|(?:(?:[a-z])|(?:(?:[\u00C0-\u00D6])|(?:(?:[\u00D8-\u00F6])|(?:(?:[\u00F8-\u02FF])|(?:(?:[\u0370-\u037D])|(?:(?:[\u037F-\u1FFF])|(?:(?:[\u200C-\u200D])|(?:(?:[\u2070-\u218F])|(?:(?:[\u2C00-\u2FEF])|(?:(?:[\u3001-\uD7FF])|(?:[\uF900-\uFFFE])))))))))))))|(?:(?:_)|(?:(?:-)|(?:(?:\\.)|(?:(?:[0-9])|(?:\u00B7))))))*)'
##    t_QNAME = u'(?:(?:(?:[A-Z])|(?:(?:[a-z])|(?:(?:[\u00C0-\u00D6])|(?:(?:[\u00D8-\u00F6])|(?:(?:[\u00F8-\u02FF])|(?:(?:[\u0370-\u037D])|(?:(?:[\u037F-\u1FFF])|(?:(?:[\u200C-\u200D])|(?:(?:[\u2070-\u218F])|(?:(?:[\u2C00-\u2FEF])|(?:(?:[\u3001-\uD7FF])|(?:[\uF900-\uFFFE]))))))))))))(?:(?:(?:(?:[A-Z])|(?:(?:[a-z])|(?:(?:[\u00C0-\u00D6])|(?:(?:[\u00D8-\u00F6])|(?:(?:[\u00F8-\u02FF])|(?:(?:[\u0370-\u037D])|(?:(?:[\u037F-\u1FFF])|(?:(?:[\u200C-\u200D])|(?:(?:[\u2070-\u218F])|(?:(?:[\u2C00-\u2FEF])|(?:(?:[\u3001-\uD7FF])|(?:[\uF900-\uFFFE])))))))))))))|(?:(?:_)|(?:(?:-)|(?:(?:\\.)|(?:(?:[0-9])|(?:\u00B7))))))*)?:(?:(?:(?:_)|(?:(?:(?:[A-Z])|(?:(?:[a-z])|(?:(?:[\u00C0-\u00D6])|(?:(?:[\u00D8-\u00F6])|(?:(?:[\u00F8-\u02FF])|(?:(?:[\u0370-\u037D])|(?:(?:[\u037F-\u1FFF])|(?:(?:[\u200C-\u200D])|(?:(?:[\u2070-\u218F])|(?:(?:[\u2C00-\u2FEF])|(?:(?:[\u3001-\uD7FF])|(?:[\uF900-\uFFFE]))))))))))))))(?:(?:(?:(?:[A-Z])|(?:(?:[a-z])|(?:(?:[\u00C0-\u00D6])|(?:(?:[\u00D8-\u00F6])|(?:(?:[\u00F8-\u02FF])|(?:(?:[\u0370-\u037D])|(?:(?:[\u037F-\u1FFF])|(?:(?:[\u200C-\u200D])|(?:(?:[\u2070-\u218F])|(?:(?:[\u2C00-\u2FEF])|(?:(?:[\u3001-\uD7FF])|(?:[\uF900-\uFFFE])))))))))))))|(?:(?:_)|(?:(?:-)|(?:(?:\\.)|(?:(?:[0-9])|(?:\u00B7))))))*)'
##    t_QNAME_NS = u'(?:(?:(?:[A-Z])|(?:(?:[a-z])|(?:(?:[\u00C0-\u00D6])|(?:(?:[\u00D8-\u00F6])|(?:(?:[\u00F8-\u02FF])|(?:(?:[\u0370-\u037D])|(?:(?:[\u037F-\u1FFF])|(?:(?:[\u200C-\u200D])|(?:(?:[\u2070-\u218F])|(?:(?:[\u2C00-\u2FEF])|(?:(?:[\u3001-\uD7FF])|(?:[\uF900-\uFFFE]))))))))))))(?:(?:(?:(?:[A-Z])|(?:(?:[a-z])|(?:(?:[\u00C0-\u00D6])|(?:(?:[\u00D8-\u00F6])|(?:(?:[\u00F8-\u02FF])|(?:(?:[\u0370-\u037D])|(?:(?:[\u037F-\u1FFF])|(?:(?:[\u200C-\u200D])|(?:(?:[\u2070-\u218F])|(?:(?:[\u2C00-\u2FEF])|(?:(?:[\u3001-\uD7FF])|(?:[\uF900-\uFFFE])))))))))))))|(?:(?:_)|(?:(?:-)|(?:(?:\\.)|(?:(?:[0-9])|(?:\u00B7))))))*)?:'
##    t_VAR2 = u'\\$(?:(?:(?:(?:(?:[A-Z])|(?:(?:[a-z])|(?:(?:[\u00C0-\u00D6])|(?:(?:[\u00D8-\u00F6])|(?:(?:[\u00F8-\u02FF])|(?:(?:[\u0370-\u037D])|(?:(?:[\u037F-\u1FFF])|(?:(?:[\u200C-\u200D])|(?:(?:[\u2070-\u218F])|(?:(?:[\u2C00-\u2FEF])|(?:(?:[\u3001-\uD7FF])|(?:[\uF900-\uFFFE])))))))))))))|(?:(?:_)|(?:(?:[0-9])|(?:\u00B7))))*)'
##    t_VAR1 = u'\\?(?:(?:(?:(?:(?:[A-Z])|(?:(?:[a-z])|(?:(?:[\u00C0-\u00D6])|(?:(?:[\u00D8-\u00F6])|(?:(?:[\u00F8-\u02FF])|(?:(?:[\u0370-\u037D])|(?:(?:[\u037F-\u1FFF])|(?:(?:[\u200C-\u200D])|(?:(?:[\u2070-\u218F])|(?:(?:[\u2C00-\u2FEF])|(?:(?:[\u3001-\uD7FF])|(?:[\uF900-\uFFFE])))))))))))))|(?:(?:_)|(?:(?:[0-9])|(?:\u00B7))))*)'
##    t_CloseSquare = u',?(?:(?:\u0008)|(?:(?:\\n)|(?:(?:\\r)|(?:(?: )|(?:(?:\u00A0)|(?:(?:[\u2000-\u200B])|(?:(?:\u202F)|(?:(?:\u205F)|(?:\u3000)))))))))*;?(?:(?:\u0008)|(?:(?:\\n)|(?:(?:\\r)|(?:(?: )|(?:(?:\u00A0)|(?:(?:[\u2000-\u200B])|(?:(?:\u202F)|(?:(?:\u205F)|(?:\u3000)))))))))*\\]'
##    t_CloseCurly = u',?(?:(?:\u0008)|(?:(?:\\n)|(?:(?:\\r)|(?:(?: )|(?:(?:\u00A0)|(?:(?:[\u2000-\u200B])|(?:(?:\u202F)|(?:(?:\u205F)|(?:\u3000)))))))))*;?(?:(?:\u0008)|(?:(?:\\n)|(?:(?:\\r)|(?:(?: )|(?:(?:\u00A0)|(?:(?:[\u2000-\u200B])|(?:(?:\u202F)|(?:(?:\u205F)|(?:\u3000)))))))))*\\.?(?:(?:\u0008)|(?:(?:\\n)|(?:(?:\\r)|(?:(?: )|(?:(?:\u00A0)|(?:(?:[\u2000-\u200B])|(?:(?:\u202F)|(?:(?:\u205F)|(?:\u3000)))))))))*\\}'
##    t_OptDot = u'\\.(?=((?:(?:(?:\u0008)|(?:(?:\\n)|(?:(?:\\r)|(?:(?: )|(?:(?:\u00A0)|(?:(?:[\u2000-\u200B])|(?:(?:\u202F)|(?:(?:\u205F)|(?:\u3000)))))))))*(?:(?:(?:(?:UNION)|(?:(?:OPTIONAL)|(?:(?:GRAPH)|(?:FILTER))))[^a-z])|(?:\\{)))))'
##    t_EmptyPattern = u'\\{(?:(?:\u0008)|(?:(?:\\n)|(?:(?:\\r)|(?:(?: )|(?:(?:\u00A0)|(?:(?:[\u2000-\u200B])|(?:(?:\u202F)|(?:(?:\u205F)|(?:\u3000)))))))))*\\}'
##
##    #ignored
##    t_PASSED_TOKENS = u'(?:(?:(?:\u0008)|(?:(?:\\n)|(?:(?:\\r)|(?:(?: )|(?:(?:\u00A0)|(?:(?:[\u2000-\u200B])|(?:(?:\u202F)|(?:(?:\u205F)|(?:\u3000)))))))))+)|(?:(?:#[^\\n]*\\n)|(?:/\\*(?:(?:/[^\\*])|(?:[^/\\*]))*\\*/))'
##
##    #two characters
##    t_GT_OR = u'\\|\\|'
##    t_GT_AND = u'&&'
##    t_GT_NEQUAL = u'!='
##    t_GT_LE = u'<='
##    t_GT_GE = u'>='
##    t_GT_DTYPE = u'\\^\\^'
##
##    #single character
##    t_GT_TIMES = u'\\*'
##    t_GT_LPAREN = u'\\('
##    t_GT_RPAREN = u'\\)'
##    t_GT_SEMI = u';'
##    t_GT_COMMA = u','
##    t_IT_a = u'a'
##    t_GT_EQUAL = u'='
##    t_GT_LT = u'<'
##    t_GT_GT = u'>'
##    t_GT_PLUS = u'\\+'
##    t_GT_MINUS = u'-'
##    t_GT_DIVIDE = u'/'
##    t_GT_NOT = u'!'
##    t_Dot = u'\\.'
##    t_OpenCurly = u'\\{'
##    t_OpenSquare = u'\\['
##
##
##tokens = ('IT_SELECT',
##        'IT_DISTINCT',
##        'IT_CONSTRUCT',
##        'IT_DESCRIBE',
##        'IT_ASK',
##        'IT_BASE',
##        'IT_PREFIX',
##        'FROM_NAMED',
##        'IT_FROM',
##        'IT_WHERE',
##        'IT_ORDER',
##        'IT_BY',
##        'IT_ASC',
##        'IT_DESC',
##        'IT_LIMIT',
##        'IT_OFFSET',
##        'IT_UNION',
##        'IT_OPTIONAL',
##        'IT_GRAPH',
##        'IT_FILTER',
##        'IT_STR',
##        'IT_LANG',
##        'IT_DATATYPE',
##        'IT_REGEX',
##        'IT_BOUND',
##        'IT_isURI',
##        'IT_isBLANK',
##        'IT_isLITERAL',
##        'IT_true',
##        'IT_false',
##        'QuotedIRIref',
##        'FLOATING_POINT',
##        'DECIMAL',
##        'INTEGER',
##        'STRING_LITERAL_LONG1',
##        'STRING_LITERAL_LONG2',
##        'STRING_LITERAL1',
##        'STRING_LITERAL2',
##        'LANGTAG',
##        'BNODE_LABEL',
##        'QNAME',
##        'QNAME_NS',
##        'VAR2',
##        'VAR1',
##        'CloseSquare',
##        'CloseCurly',
##        'OptDot',
##        'EmptyPattern',
##        'PASSED_TOKENS',
##        'GT_OR',
##        'GT_AND',
##        'GT_NEQUAL',
##        'GT_LE',
##        'GT_GE',
##        'GT_DTYPE',
##        'GT_TIMES',
##        'GT_LPAREN',
##        'GT_RPAREN',
##        'GT_SEMI',
##        'GT_COMMA',
##        'IT_a',
##        'GT_EQUAL',
##        'GT_LT',
##        'GT_GT',
##        'GT_PLUS',
##        'GT_MINUS',
##        'GT_DIVIDE',
##        'GT_NOT',
##        'Dot',
##        'OpenCurly',
##        'OpenSquare',
##        'eof')

class Lexer(object):
    """Regular-expression tokenizer driven by the module-level token table.

    Hand a readable stream to input(), then call token() repeatedly until
    it returns None.
    """

    def __init__(self):
        self.uri = None          # not used by any method of this class
        self.buffer = None       # stream most recently passed to input()
        self.tokenStream = None  # generator created by parse()
        self.chunk = ''          # text read but not yet tokenized
        self.was = ''            # text of the previous match (left context)
        self.line = 0            # newlines seen in the tokens matched so far
        self.fixTokens()

    def input(self, buf):
        """Start tokenizing the stream *buf*.

        Text and context left over from an earlier input are discarded so
        that they cannot influence matching in the new one.
        """
        self.buffer = buf
        self.chunk = ''
        self.was = ''
        self.line = 0
        self.tokenStream = self.parse()

    def parse(self):
        """Generate the token tuples for everything readable from the buffer.

        Data that is not already text is decoded as UTF-8.
        """
        while True:
            data = self.buffer.read()
            if not isinstance(data, type(u'')):
                data = data.decode('utf_8')
            self.chunk += data
            if not self.chunk:
                break
            for token in self.tokenize():
                yield token

    def tokenize(self):
        """Tokenize the current chunk.

        Yields (name, text, line) for every match whose abbreviated name is
        not PASSED_TOKENS.  ``line`` is the newline count after the token's
        own newlines have been added.

        Raises:
            ValueError: a pattern matched without consuming any text.
        """
        while self.chunk:
            waslen = len(self.was)
            # The previous token is prepended so patterns can see it as
            # context; matching itself starts right after it.
            text = self.was + self.chunk
            (name, m) = self.match(text, waslen)
            if not m:
                break
            consumed = m.end() - waslen
            if not consumed:
                # Checked before anything is yielded or updated, so an
                # empty token never reaches the caller.
                raise ValueError("Got zero-length token")
            token = text[m.start():m.end()]
            self.was = token
            self.line += token.count('\n')
            if abbr(name) != "PASSED_TOKENS":
                yield (name, token, self.line)
            self.chunk = self.chunk[consumed:]

    def token(self):
        """Return the next token tuple.

        Returns None when no input is active.  When the stream runs out, the
        bnf#eof tuple is returned once and later calls return None.
        """
        if not self.tokenStream:
            return None
        # Leaving the loop on its first pass takes exactly one item; falling
        # out of it means the stream is exhausted.
        for tok in self.tokenStream:
            return tok
        self.tokenStream = None
        return ('http://www.w3.org/2000/10/swap/grammar/bnf#eof', '', -1)

    def fixTokens(self):
        """Make sure the module-level token table has been loaded."""
        importTokens()

    def match(self, string, offset):
        """Try the patterns named in ``tokens`` against *string* at *offset*.

        Names without a matching ``c_<name>`` attribute on Tokens are
        skipped.  When tryLongest is false the first hit is returned.
        Otherwise each hit is scored by its end position, plus the length of
        its first capture group when a group took part in the match; the
        highest score wins and on a tie the earlier name is kept.
        If the instance has a ``sayAll`` attribute, every attempt is
        reported on standard output.

        Returns:
            A (name, match object) tuple, or None when nothing matched and
            *offset* is at or past the end of *string*.

        Raises:
            SyntaxError: nothing matched although input remains.
        """
        sayAll = hasattr(self, 'sayAll')
        length = -1
        retVal = None
        for name in tokens:
            pattern = getattr(Tokens, 'c_' + name, None)
            if pattern is None:
                continue
            if sayAll:
                sys.stdout.write("Trying to match  %s  to  %s\n"
                                 % (name, string[offset:offset + 10]))
            r = pattern.match(string, offset)
            if not r:
                continue
            if not tryLongest:
                return (name, r)
            extra = 0
            if r.lastindex:
                # NOTE(review): looks intended for patterns whose group sits
                # inside a lookahead, so the peeked text counts - confirm.
                extra = r.end(1) - r.start(1)
            if retVal is None or length < r.end() + extra:
                retVal = (name, r)
                length = r.end() + extra

        if not retVal and offset < len(string):
            # [-1] keeps the message builder from failing on a name that
            # has no '#'.
            raise SyntaxError("found %s when expecting one of %s" %
                              (string[offset:],
                               [str(t).split("#", 1)[-1] for t in tokens]))
        return retVal

def runLexer():
    """Tokenize the file named by sys.argv[1] and write one token per line.

    Each token tuple returned by Lexer.token() is written to standard
    output until token() returns a false value.  The input file is closed
    on the way out, including when tokenizing raises.
    """
    lexer = Lexer()
    source = open(sys.argv[1])
    try:
        lexer.input(source)
        while True:
            tok = lexer.token()
            if not tok:
                break      # No more input
            sys.stdout.write('%s\n' % (tok,))
    finally:
        source.close()

# Run the command-line token dump only when this file is executed as a script.
if __name__ == '__main__':
    runLexer()

