/* $Id: sgml.l,v 1.1 1995/01/16 20:54:01 connolly Exp $ */
/*
 * Lex scanner for SGML/HTML markup.
 *
 * Recognizes start tags, end tags, attribute names/values, quoted
 * literals, entity references and character data, and hands the parser
 * the token codes declared in html.tab.h.  Names are mapped to tokens by
 * lookup_name() using the table pulled in from tokens.c.
 */

%{
/*
 * NOTE(review): the three system header names were missing from the copy
 * of this file that was reviewed.  The ones below are the headers needed
 * by the library calls made in this file; confirm against the original.
 */
#include <ctype.h>
#include <stdlib.h>
#include <string.h>

#include "html.tab.h"

typedef enum {
  content_element,
  content_EMPTY,
  content_MIXED,
  content_CDATA,
  content_RCDATA
} SGML_Content;

/* Index into the per-name token array of the table in lookup_name(). */
typedef enum {
  start_tags,
  end_tags,
  attribute_names,
  value_names,
  ns_max
} Namespaces;

static int lookup_name(const char* name,
		       int namespace, SGML_Content *content);

%}

%s MIXED
%x ATTR
%x ATTRVAL
%x CDATA
%x RCDATA
%x LIT

/* INITIAL state is ELEMENT content */

FNMCHAR	[a-zA-Z]
NMCHAR	[a-zA-Z0-9\.-]
DIGIT	[0-9]
ps	[ \t\r\n\f]
RS	\r
RE	\n

%%
	static SGML_Content content = content_element;

	/*
	 * NOTE(review): the <STATE> prefixes on the rules below, and the
	 * end-tag pattern, were missing from the reviewed copy.  They have
	 * been reconstructed from the BEGIN() transitions and from the
	 * yytext offsets used in the actions; confirm against the original.
	 *
	 * The `ps' class used to contain `\l'.  That is not a lex escape,
	 * so it matched the letter `l' and made e.g. the {ps}* after "="
	 * swallow the first letter of a value such as `lower'.
	 */

"<"{FNMCHAR}{NMCHAR}*{ps}*	{
		/* start tag: skip the "<" before looking the name up */
		int tok = lookup_name(yytext+1, start_tags, &content);
		if (tok >= 0){
		  BEGIN(ATTR);
		  return tok;
		}
		/* @@ else go into "junk tag" state? */
	}

"</"{FNMCHAR}{NMCHAR}*{ps}*[>]?	{
		/* end tag: skip the "</" before looking the name up */
		int tok = lookup_name(yytext+2, end_tags, &content);
		/* @# report syntax error if yytext doesn't end with > */
		if(tok >= 0){
		  BEGIN(MIXED); /* @@ CDATA mode for XMP/LISTING? */
		  return tok;
		}
		/* else: report unknown GI */
	}

"&"{FNMCHAR}{NMCHAR}*";"?	{
		/* @# assume all entities are CDATA -- or at least
		   that they have no non-data markup */
		/* @# return the data somehow */
		return DATA;
	}

	/* @@ comments, marked sections, pi's, etc. are not handled yet
	   (one more item in this list was lost from the reviewed copy) */

<ATTR>{ps}+	{ /* skip space */ }

<ATTR>{FNMCHAR}{NMCHAR}*{ps}*"="{ps}*	{
		/* attribute name */
		int tok = lookup_name(yytext, attribute_names, &content);
		BEGIN(ATTRVAL);
		if(tok >= 0){
		  return tok;
		}
		/* @# else report error: no such attribute */
	}

<ATTR>{FNMCHAR}{NMCHAR}*{ps}*	{
		/* attribute value given without a name */
		int tok = lookup_name(yytext, value_names, &content);
		if(tok >= 0){
		  return tok;
		}
		else{
		  return NAME; /* generic NAME token: store yytext somewhere? @@ */
		}
	}

<ATTR>">"	{
		BEGIN(MIXED); /* @@ switch(content){ ... } */
		return TAGC;
	}

<ATTR>.	{ /* @# report error: bad char in markup */ }

<ATTRVAL>{DIGIT}+{ps}*	{ BEGIN(ATTR); return NUMBER; }

<ATTRVAL>{NMCHAR}+{ps}*	{ BEGIN(ATTR); return NMTOKEN; }

<ATTRVAL>"\""	{ BEGIN(LIT); }

<LIT>"\""	{ BEGIN(ATTR); return LITERAL; /* @@single q*/ }

<LIT>"&"{FNMCHAR}{NMCHAR}*";"?	{
		/* @# assume all entities are CDATA -- or at least
		   that they have no non-data markup */
		/* append data to current LITERAL */
	}

<LIT>{RE}	{ /* @# should be converted to a space; currently dropped */ }

<LIT>{RS}	{ /* @# should be converted to a space; currently dropped */ }

<LIT>[^\n\r&"]+	{ /* append to current LITERAL */ }

<LIT>.	{ /* append to current LITERAL */ }

<MIXED>{RS}+	{ return DATA; /* @# ignore these? */ }

<MIXED>{RE}	{ return DATA; /* @# ignore these sometimes? */ }

<MIXED>[^<&\n\t]+	{ return DATA; }

<MIXED>.	{ return DATA; }

.	{ /* report error: bad char */ }

%%

#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

/* Nonzero if c (an unsigned char value) may appear in a name; this is
   the same set the NMCHAR definition above accepts. */
static int
is_name_char(int c)
{
  return isalnum(c) || c == '.' || c == '-';
}

/* Nonzero if the NUL-terminated table entry gi is exactly the first len
   characters of name once those characters are folded to lower case. */
static int
gi_equals(const char *gi, const char *name, size_t len)
{
  size_t k;

  for(k = 0; k < len; k++){
    /* a NUL in gi can never equal a name character, so a short gi
       fails here before anything past its terminator is read */
    if((unsigned char)gi[k] != tolower((unsigned char)name[k]))
      return 0;
  }
  return gi[len] == '\0';
}

/*
 * lookup_name -- map a name from the input to a token code.
 *
 *   name       text beginning with the name; the name ends at the first
 *              character that is not a letter, digit, '.' or '-', so the
 *              trailing space, "=" or ">" still in yytext is ignored.
 *   namespace  one of the Namespaces values; selects which token of the
 *              matching table entry is returned.
 *   content    currently unused.
 *
 * The name is compared case-insensitively against the `gi' field of
 * each entry of the table included from tokens.c (the entries are
 * compared as-is, so they are presumably stored in lower case).
 *
 * Returns the entry's token for the requested namespace, or NAME when
 * no entry matches or the arguments are invalid.
 */
static int
lookup_name(const char* name,
	    int namespace, SGML_Content *content)
{
  static const struct{
    const char *gi;
    int tok[ns_max];
  } dtd[] = {
#include "tokens.c"
  };

  size_t len = 0;
  size_t i;

  (void)content;

  /* never index tok[] outside its bounds */
  if(name == NULL || namespace < 0 || namespace >= ns_max)
    return NAME;

  /* ctype functions require an unsigned char value (or EOF) */
  while(is_name_char((unsigned char)name[len]))
    len++;

  for(i = 0; i < ARRAY_SIZE(dtd); i++){
    if(gi_equals(dtd[i].gi, name, len))
      return dtd[i].tok[namespace];
  }

  return NAME;
}