# $Id: sgmllib.py,v 1.4 1996/09/17 05:13:23 connolly Exp $
# A lexer, parser for SGML, using the derived class as static DTD.
# This only supports those SGML features used by HTML.
# See W3C tech report: "A lexical analyzer for HTML and Basic SGML"
# http://www.w3.org/pub/WWW/MarkUp/SGML/sgml-lex/sgml-lex.html

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).

import sgmllex # compiled flex scanner
import string


def sgml_lex_attrval(v):
    """Return attribute literal *v* without its first and last character."""
    # @@ this should go in sgml_lex API
    #@@ deal with spaces, entity/char references
    return v[1:-1] # strip quotes


# SGML lexer base class -- find tags and call handler functions.
# Usage: p = SGMLLexer(); p.feed(data); ...; p.close().
# The data between tags is passed to the parser by calling
# self.handle_data() with some data as argument (the data may be split
# up in arbitrary chunks).  Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.

class SGMLLexer:

    def __init__(self):
        self.l = sgmllex.scanner()

    # Interface -- feed some data to the parser.  Call this as
    # often as you want, with as little or as much text as you
    # want (may include '\n').
    def feed(self, data):
        self.l.scan(data, self.structure, self.aux, self.err)

    # Delegates to the scanner's own line() method.
    def line(self):
        return self.l.line()

    # Interface -- scan an empty string with the same three callbacks
    # that feed() uses.
    def close(self):
        self.l.scan("", self.structure, self.aux, self.err)

    # Scanner callback.  types and strings are parallel sequences; the
    # first entry selects the handler, and for a start tag the entries
    # after it are consumed as (attribute name, attribute value) pairs.
    def structure(self, types, strings):
        kind = types[0]
        if kind is sgmllex.data:
            self.handle_data(strings[0])
        elif kind is sgmllex.generalEntity:
            # strip leading &
            self.handle_entityref(strings[0][1:])
        elif kind is sgmllex.numCharRef:
            # strip leading &#, convert to char
            self.handle_data(chr(int(strings[0][2:])))
        elif kind is sgmllex.startTag:
            # strip leading <
            gi = strings[0][1:]
            attrs = []
            # A trailing unpaired entry, if any, is ignored.
            for i in range(1, len(strings) - 1, 2):
                n, v = strings[i], strings[i + 1]
                # HACK for unquoted literals...
                if types[i + 1] is sgmllex.literal and v[0] == '"':
                    v = sgml_lex_attrval(v)
                attrs.append((n, v))
            self.startTag(gi, attrs)
        elif kind is sgmllex.endTag:
            # strip leading </
            # NOTE(review): this branch used to call handle_pi(), so end
            # tags never reached endTag().  The [2:] slice mirrors the
            # start-tag branch -- confirm against the scanner's output.
            self.endTag(strings[0][2:])

    # Scanner callback for auxiliary markup.  feed() and close() hand
    # this method to the scanner, so it must exist.
    # NOTE(review): the scanner's token-type names for comments and
    # processing instructions are not visible in this file, so
    # processing instructions are recognized by their '<?' prefix and
    # everything else is ignored -- confirm against sgmllex.
    def aux(self, types, strings):
        text = None
        if strings:
            text = strings[0]
        if text and text[:2] == '<?':
            # strip <? and >
            self.handle_pi(text[2:-1])
        else:
            #XXX markup declarations, etc.
            pass

    # Scanner callback for errors -- ignored by default.
    def err(self, types, strings):
        pass


# SGML parser class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.)
# XXX what about periods, hyphens in tag names?

class SGMLParser(SGMLLexer):

    # Interface -- initialize and reset this instance
    def __init__(self, verbose = 0):
        self.verbose = verbose
        SGMLLexer.__init__(self)
        self.reset()

    # Interface -- reset this instance.  Loses all unprocessed data
    def reset(self):
        self.stack = []
        self.cdata = 0

    # For derived classes only -- enter literal mode (CDATA)
    def setliteral(self, *args):
        self.cdata = 1 #@@ finish implementing this...

    # Dispatch a start tag: start_<tag> (tag is pushed on the stack),
    # else do_<tag> (stack untouched), else unknown_starttag().
    def startTag(self, tag, attrs):
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return
            method(attrs)
            return
        self.stack.append(tag)
        method(attrs)

    # Dispatch an end tag to end_<tag>, or to unknown_endtag() when no
    # such method exists.  The stack is unwound before the handler runs.
    def endTag(self, tag):
        try:
            method = getattr(self, 'end_' + tag)
        except AttributeError:
            self.unknown_endtag(tag)
            return
        if self.stack and self.stack[-1] == tag:
            del self.stack[-1]
        else:
            self.report_unbalanced(tag)
            # Now repair it: drop everything from the innermost
            # matching open tag upwards; leave the stack alone when
            # the tag is not open at all.
            for i in range(len(self.stack) - 1, -1, -1):
                if self.stack[i] == tag:
                    del self.stack[i:]
                    break
        method()

    # Example -- report an unbalanced tag.
    def report_unbalanced(self, tag):
        if self.verbose:
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack: %s' % (self.stack,))

    # Definition of entities -- derived classes may override
    entitydefs = \
        {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"'}

    # Example -- handle entity reference, no need to override
    def handle_entityref(self, name):
        table = self.entitydefs
        if name in table:
            self.handle_data(table[name])
        else:
            self.unknown_entityref(name)
            return

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle processing instruction, could be overridden
    def handle_pi(self, data):
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass

    def unknown_entityref(self, ref):
        pass


# Test parser -- prints every event and counts the scanner callbacks,
# tokens and token characters it sees.

class TestSGML(SGMLParser):

    def __init__(self):
        SGMLParser.__init__(self)
        self.l.normalize(1)
        # performance measurements
        self.tokQty = 0
        self.tokTot = 0
        self.calls = 0

    def handle_data(self, data):
        r = repr(data)
        if len(r) > 72:
            r = r[:35] + '...' + r[-35:]
        print('data: ' + r)

    def handle_comment(self, data):
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment: ' + r)

    def unknown_starttag(self, tag, attrs):
        # Build the whole line first, then print it once; the pieces
        # are separated by single spaces.
        parts = ['start tag: <' + tag]
        for name, value in attrs:
            if name:
                parts.append(name + '=' + '"' + value + '"')
            else:
                parts.append(str(value))
        parts.append('>')
        print(' '.join(parts))

    def unknown_endtag(self, tag):
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        print('*** unknown entity ref: &' + ref + ';')

    def structure(self, types, strings):
        self.calls = self.calls + 1
        self.tokQty = self.tokQty + len(types)
        self.tokTot = self.tokTot + lenstrings(strings)
        SGMLParser.structure(self, types, strings)

    def aux(self, types, strings):
        self.calls = self.calls + 1
        self.tokQty = self.tokQty + len(types)
        self.tokTot = self.tokTot + lenstrings(strings)
        SGMLParser.aux(self, types, strings)

    def err(self, types, strings):
        self.calls = self.calls + 1
        self.tokQty = self.tokQty + len(types)
        self.tokTot = self.tokTot + lenstrings(strings)
        SGMLParser.err(self, types, strings)

    # Print the collected statistics.
    # NOTE(review): this override does not call SGMLParser.close(), so
    # the scanner is never given the final empty string -- confirm that
    # this is intended.
    def close(self):
        # No tokens at all (e.g. empty input): report 0 rather than
        # dividing by zero.
        if self.tokQty:
            average = self.tokTot // self.tokQty
        else:
            average = 0
        print('Calls:  %s Tokens: %s Ave TokLen: %s'
              % (self.calls, self.tokQty, average))


def lenstrings(strings):
    """Return the total length of the entries of *strings*.

    Empty and None entries are skipped.
    """
    total = 0
    for s in strings:
        if s:
            total = total + len(s)
    return total


def test():
    """Feed standard input to a TestSGML parser, one line at a time."""
    import sys
    f = sys.stdin
    x = TestSGML()
    while 1:
        line = f.readline()
        if not line:
            x.close()
            break
        x.feed(line)


if __name__ == '__main__':
    test()