/* SGML_stream.c
 * $Id: SGMLstream.c,v 1.3 93/01/06 18:40:28 connolly Exp Locker: connolly $
 */

/* implements... */
#include "SGML.h"

/* uses ... */
#include "object.h"

/*
 * NOTE(review): the names of the system headers were lost from this copy
 * of the file (the text between '<' and '>' was stripped).  The headers
 * below are the ones required by the library calls made in this file:
 * isalpha/isalnum/isdigit/isspace/toupper, EOF, atoi/free, and
 * strchr/strcmp/strlen/memcpy.  Confirm against the upstream revision.
 */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>


/*
 * SGML_is_name_char -- nonzero when c may continue an SGML name.
 *
 * EOF and NUL are rejected explicitly: strchr() also matches the
 * terminating NUL of its string argument, so a '\0' in the input would
 * otherwise be accepted as a name character.
 */
static int
SGML_is_name_char(c)
     int c;
{
  if(c == EOF || c == '\0') return 0;
  return isalnum(c) || strchr(SGML_UCNMCHAR, c) != NULL;
}


/*
 * SGML_parseInstance -- drive the lexer over a document instance.
 *
 * Repeatedly calls SGML_read() and dispatches on what it returns:
 *   SGML_start_tag   read the element name and attribute list, then call
 *                    docclass->startTag; its return value selects the
 *                    content mode used for subsequent SGML_read() calls
 *                    (SGML_EMPTY leaves the mode unchanged).
 *   SGML_end_tag     read the element name and call docclass->endTag.
 *   SGML_entity      look the name up with docclass->entityText and pass
 *                    any replacement text to docclass->data.
 *   SGML_record_end  record-end bookkeeping (see below).
 *   anything else    a count of data characters, passed to docclass->data.
 *
 * Record ends are not passed through directly.  eat_next_RE causes the
 * next RE to be discarded (set at the start of the instance and after a
 * non-EMPTY start tag); otherwise an RE is held in RE_pending and only
 * delivered once something other than an end tag follows it.
 *
 * stream/getch:  input handle and its character-fetch procedure.
 * document:      opaque handle passed back to every docclass callback.
 * docclass:      callbacks: data, startTag, endTag, entityText.
 */
VOID
SGML_parseInstance(stream, getch, document, docclass)
     HMStream stream;
     HMGetcProc *getch;
     HMDoc* document;
     CONST HMDoc_Class *docclass;
{
  static char RE[] = "\n";
  /* REbuffer[0] holds an RE so that "pending RE + data" can be delivered
   * in a single data() call; the lexer writes at buffer = REbuffer + 1. */
  char REbuffer[1 + SGML_LITLEN + SGML_NAMELEN + 4];
  char *buffer = REbuffer + 1;
  int content = SGML_MIXED;
  int lookahead = EOF;
  int len, read;
  char gi[SGML_NAMELEN+1];
  HMBinding attributes[SGML_ATTCNT];
  int attrqty;
  char eat_next_RE = 1, RE_pending = 0;

  REbuffer[0] = '\n'; /*@@ should be 13, not 10! */

  while( (read = SGML_read(stream, getch,
			   buffer, (int)(sizeof(REbuffer) - 2),
			   content, &lookahead)) != EOF){
    switch(read){

    case SGML_start_tag:
      if (RE_pending){
	(docclass->data)(document, RE, 1);
	/* the RE has been delivered; do not deliver it a second time */
	RE_pending = 0;
      }

      len = SGML_read_name(stream, getch, gi, &lookahead);
      gi[len] = 0;

      attrqty = 0;
      while(isalpha(lookahead)){ /* iterate over attributes */
	len = SGML_read_name(stream, getch, buffer, &lookahead);
	buffer[len] = 0;

	if(lookahead == '='){
	  int offset = len + 1;	/* value starts after the name's NUL */
	  int vlen;

	  lookahead = EOF;
	  /* @@ entity references in attribute value */
	  vlen = SGML_read_value(stream, getch, buffer + offset, &lookahead);
	  if(vlen < 0)
	    vlen = 0;	/* EOF instead of a value: treat as empty so that
			 * attr->value stays inside the copied block */
	  len = offset + vlen;
	  buffer[len++] = '\0';

	  /* Attributes beyond SGML_ATTCNT are parsed (to consume the
	   * input) but dropped rather than written past the table. */
	  if(attrqty < SGML_ATTCNT){
	    char *copy = NEW(char, len);

	    if(copy){
	      HMBinding* attr = &attributes[attrqty++];

	      /* name and value share one block: "NAME\0value\0" */
	      memcpy(copy, buffer, len);
	      attr->name = copy;
	      attr->value = copy + offset;
	    }
	  }
	}
      }

      /* look for tag close */
      while(isspace(lookahead)) lookahead = (getch)(stream);
      lookahead = EOF;

      {
	int i;
	int c;

	c = (docclass->startTag)(document, gi, attributes, attrqty);
	if(c == SGML_EMPTY){
	  eat_next_RE = 0;
	}else{
	  content = c;
	  eat_next_RE = 1;
	}

	/*
	 * NOTE(review): from "i<" in this loop header up to the endTag
	 * call below, the original text was lost from this copy of the
	 * file.  The span is reconstructed on the assumption that the
	 * loop released the per-attribute blocks allocated above and that
	 * NEW() is malloc-based -- confirm against the upstream revision.
	 */
	for(i = 0; i < attrqty; i++){
	  free(attributes[i].name);
	}
      }
      break;

    case SGML_end_tag:
      /* NOTE(review): reconstructed (see above), mirroring the start-tag
       * handling: read the name, skip to and discard the tag close.
       * A pending RE is presumed not to be delivered before an end tag,
       * since RE_pending is simply cleared below -- confirm. */
      len = SGML_read_name(stream, getch, gi, &lookahead);
      gi[len] = 0;

      /* look for tag close */
      while(isspace(lookahead)) lookahead = (getch)(stream);
      lookahead = EOF;

      (docclass->endTag)(document, gi);
      content = SGML_MIXED; /* @@ could be element */
      eat_next_RE = 0; RE_pending = 0;
      break;

    case SGML_entity:
      if (RE_pending){
	(docclass->data)(document, RE, 1);
      }
      eat_next_RE = 0; RE_pending = 0;

      {
	/* buffer holds the NUL-terminated entity name */
	CONST char* text = (docclass->entityText)(document, buffer);
	if(text)
	  (docclass->data)(document, text, strlen(text));
      }
      break;

    case SGML_record_end:
      if(eat_next_RE){
	eat_next_RE = 0; RE_pending = 0;
      }
      else if (RE_pending){
	/* deliver the earlier RE; this one becomes the pending one */
	(docclass->data)(document, RE, 1);
      }
      else
	RE_pending = 1;
      break;

    default: /* read is a count of data characters in buffer */
      buffer[read] = 0;
      if(RE_pending)
	/* REbuffer[0] is the RE, immediately followed by the data */
	(docclass->data)(document, REbuffer, read + 1);
      else
	(docclass->data)(document, buffer, read);
      RE_pending = 0; eat_next_RE = 0;
      break;
    }
  }
  /* A stray "while(read != EOF);" used to follow the loop; read is always
   * EOF at this point, so it had no effect and has been removed. */
}


/*****
 * lexical analysis
 *****/

/*
 * SGML_read -- read the next lexical item from the stream.
 *
 * buf/nbytes:       where data characters (or an entity name) are written.
 * content:          SGML_MIXED, SGML_ELEMENT, SGML_CDATA or SGML_RCDATA;
 *                   selects which delimiters are recognized.
 * inout_lookahead:  on entry, a character already read from the stream or
 *                   EOF for "none"; on return, the same for the next call.
 *
 * Returns:
 *   EOF              end of input (lookahead is left untouched)
 *   SGML_record_end  a '\n' was read
 *   SGML_start_tag   "<" + letter seen; lookahead is that letter
 *   SGML_end_tag     "</" + letter seen; lookahead is that letter
 *   SGML_entity      "&name" seen; buf holds the NUL-terminated name
 *   otherwise        the number of data characters stored in buf (the
 *                    caller is expected to NUL-terminate)
 *
 * Processing instructions, comments, markup declarations and marked
 * sections are skipped.
 *
 * REDUCE consumes the current character and changes state; SHIFT changes
 * state and re-examines the same character.  Both rely on break/continue
 * binding to the enclosing switch/while, so they must stay plain blocks.
 */
int
SGML_read(stream, getch, buf, nbytes, content, inout_lookahead)
     HMStream stream;
     HMGetcProc* getch;
     char* buf;
     int nbytes;
     int content;
     int* inout_lookahead;
{
  int c; /* state machine input character */
  enum { /* state machine states */
    start, data, cdata, rcdata, pcdata,
    and, and_hash, cref, entity,
    lt, lt_slash, tag, pi,
    lt_bang, lt_bang_dash, comment, comment_dash, ps
    } state = start;

  /* auxiliary state: */
  int end_tag = 0; /* saw '/' after '<' */
  char name[SGML_NAMELEN + 1]; /* function character name */
  int name_chars = 0;

  int ret = 0; /* number of characters read */

#define LOOKAHEAD(n) (ret + n < nbytes)
#define REDUCE(s) { state = (s); break; }
#define SHIFT(s) { state = (s); continue; }
#define DONE(c) { *inout_lookahead = (c); return ret; }
#define WRITE(c) { *buf++ = (c); ret++; }

  /* prime the pump */
  if((c = *inout_lookahead) == EOF)
    c = (getch)(stream);

  /* state machine...*/
  while(ret < nbytes){
    switch(state){

    case start:
      if(c == EOF) return EOF;
      else if(c == '\n') { ret = SGML_record_end; DONE(EOF); }
      else if(c == '<'){
	if(LOOKAHEAD(3)) { REDUCE(lt); }
	else { DONE(c); } /* no room for lookahead */
      }else if(c == '&'){
	if(LOOKAHEAD(2)) { REDUCE(and); }
	/* no room: fall into the data state below */
      }else if(content == SGML_ELEMENT && isspace(c)){
	break; /* ignore whitespace in ELEMENT content */
      }else { SHIFT(data); }
      /* fallthrough */

    case data:
      if(content == SGML_ELEMENT){
	if(isspace(c)){
	  break;
	}else{
	  *buf = 0;
	  ret = 0;
	  /* Delimiters, RE and EOF are handed back for the next call.
	   * Any other character is illegal here and is dropped: handing
	   * it back would make the next call return it again, forever. */
	  if(c == EOF || c == '\n' || c == '<' || c == '&') { DONE(c); }
	  else { DONE(EOF); }
	}
      }else if(content == SGML_CDATA){ SHIFT(cdata); }
      else if(content == SGML_RCDATA){ SHIFT(rcdata); }
      else /* assume SGML_MIXED */ { SHIFT(pcdata); }

    case cdata:
      if(c == EOF || c == '<' || c == '\n') { DONE(c); }
      else{ WRITE(c); break; }

    case rcdata:
    case pcdata:
      if(c == EOF || c == '<' || c == '&' || c == '\n') { DONE(c); }
      else{ WRITE(c); break; }

    case and:
      if(c == '#') { REDUCE(and_hash); }
      else if(isalpha(c)) {
	if(LOOKAHEAD(SGML_NAMELEN+1)){
	  name_chars = 0;
	  SHIFT(entity);
	}else{
	  DONE(c); /* error: no room for entity name */
	}
      }
      else{ WRITE('&'); SHIFT(data); }

    case entity:
      if(SGML_is_name_char(c)){
	WRITE(c);
	break;
      }
      else{
	WRITE('\0');
	ret = SGML_entity;
	if(c == ';' || c == '\n'){
	  DONE(EOF); /* eat ; */
	}
	else{
	  DONE(c); /* ended char ref with other char */
	}
      }

    case and_hash:
      if(isalnum(c)){
	name_chars = 0;
	SHIFT(cref);
      }
      else{ WRITE('&'); WRITE('#'); SHIFT(data); }

    case cref:
      /* auxiliary state: name_chars */
      if(SGML_is_name_char(c)){
	if(name_chars < SGML_NAMELEN)
	  name[name_chars++] = c;
	/* else markup error: name too long */
	break;
      }
      else{
	int nc = 0;

	name[name_chars] = '\0';
	if(isdigit(name[0])){
	  nc = atoi(name);
	}else if(!strcmp(name, "SPACE")){
	  nc = 32;
	}else if(!strcmp(name, "RS")){
	  nc = 10;
	}else if(!strcmp(name, "RE")){
	  nc = 13;
	}
	if(nc) WRITE(nc);
	/* else error: bad character reference */

	if(c == ';') { REDUCE(data); }
	else /* terminate entity reference w/space or something */
	  { SHIFT(data); }
      }

    case lt:
      if(c == '/') { REDUCE(lt_slash); }
      /* only end tags are recognized in CDATA/RCDATA content */
      if(content == SGML_MIXED || content == SGML_ELEMENT){
	if(c == '?') { REDUCE(pi); }
	else if(c == '!') { REDUCE(lt_bang); }
	else if(isalpha(c)) { end_tag = 0; SHIFT(tag); }
      }
      WRITE('<'); SHIFT(data);

    case lt_slash:
      if(isalpha(c)) { end_tag = 1; SHIFT(tag); }
      else { WRITE('<'); WRITE('/'); SHIFT(data); }

    case tag:
      /* auxiliary state: end_tag */
      ret = end_tag ? SGML_end_tag : SGML_start_tag;
      DONE(c);

    case pi: /* processing instruction (or markup declaration) */
      if(c == '>') { REDUCE(start); }
      else if(c == EOF) { SHIFT(start); } /* error: EOF in pic */
      else break;

    case lt_bang:
      if(c == '-') { REDUCE(lt_bang_dash); }
      /*
       * *** NON CONFORMING IMPLEMENTATION ***
       * a letter here starts a markup declaration, which isn't supported
       * a [ starts a marked section, which isn't supported.
       * treat them like processing instructions.
       */
      else if(c == '[' || isalpha(c)) { REDUCE(pi); }
      else if(c == '>') { REDUCE(start); }
      else{ WRITE('<'); WRITE('!'); SHIFT(data); }

    case lt_bang_dash:
      if(c == '-') { REDUCE(comment); }
      else{ WRITE('<'); WRITE('!'); WRITE('-'); SHIFT(data); }

    case comment:
      if(c == '-') { REDUCE(comment_dash); }
      else if(c == EOF) { DONE(c); } /* error: eof in comment */
      else break;

    case comment_dash:
      if(c == '-') { REDUCE(ps); }
      else if(c == EOF) { DONE(c); } /* error: eof in comment */
      else { REDUCE(comment); }

    case ps: /* parameter separator between -- and > */
      if(c == EOF) { DONE(c); }
      else if(isspace(c)) break;
      else { REDUCE(start); } /* error if c !='>' */
    }

    c = (getch)(stream);
  }

  DONE(c); /* set up lookahead for next call */

#undef LOOKAHEAD
#undef REDUCE
#undef SHIFT
#undef DONE
#undef WRITE
}


/*
 * SGML_read_name -- read a name, folded to upper case, into buf.
 *
 * *inout_lookahead must hold the first character of the name.  If it is
 * not a letter, nothing is consumed and 0 is returned.  Otherwise name
 * characters are read from the stream, whitespace after the name is
 * skipped, and *inout_lookahead is set to the first character after it.
 *
 * At most SGML_NAMELEN characters are stored; the rest of an over-long
 * name is consumed and dropped.  buf is NOT NUL-terminated: callers store
 * the terminator at buf[return value], so buf needs SGML_NAMELEN + 1
 * bytes.  (The bound used to be "<=", which let the terminator land one
 * byte past such a buffer.)
 *
 * Returns the number of characters stored.
 */
int
SGML_read_name(stream, getch, buf, inout_lookahead)
     HMStream stream;
     HMGetcProc* getch;
     char* buf;
     int* inout_lookahead;
{
  int name_chars = 0;
  int c = *inout_lookahead;

  if(!isalpha(c)) return 0;

  do{
    if(name_chars < SGML_NAMELEN)
      buf[name_chars++] = toupper(c);
    /* else error: name too long */
    c = (getch)(stream);
  }while(SGML_is_name_char(c));

  while(isspace(c)) c = (getch)(stream);

  *inout_lookahead = c;
  return name_chars;
}


/*
 * SGML_read_value -- read an attribute value into buf.
 *
 * Leading whitespace is skipped.  A value delimited by " or ' is read up
 * to the matching quote; newlines and tabs inside it are stored as
 * spaces, and character references (&#n; &#SPACE; &#RS; &#RE;) are
 * replaced by the character they name.  Unquoted values are accepted only
 * when GROK_UNQUOTED_LITERALS or SGML_SHORTTAG is defined.  Whitespace
 * after the value is skipped.
 *
 * buf is NOT NUL-terminated.  Reading stops once SGML_LITLEN characters
 * have been stored; one step may store two characters, so buf needs room
 * for SGML_LITLEN + 1 characters plus the caller's terminator.
 *
 * Returns the number of characters stored, with *inout_lookahead set to
 * the first character after the value (EOF if the value was cut off for
 * length).  Returns EOF, leaving *inout_lookahead untouched, if the input
 * ends before any value starts.
 */
int
SGML_read_value (stream, getch, buf, inout_lookahead)
     HMStream stream;
     HMGetcProc* getch;
     char* buf;
     int* inout_lookahead;
{
  int c; /* state machine input character */
  enum { /* state machine states */
    start, literal,
    and, and_hash, cref,
#if defined(SGML_SHORTTAG) || defined(GROK_UNQUOTED_LITERALS)
    value,
#endif
    ps
    } state = start;

  /* auxiliary state: */
  char quote = 0; /* which kind of quote */

  int ret = 0; /* number of characters read */
  char name[SGML_NAMELEN + 1]; /* entity name */
  int name_chars = 0;

  /* break/continue must bind to the enclosing switch/while: keep these
   * as plain blocks, not do{}while(0). */
#define REDUCE(s) { state = (s); break; }
#define SHIFT(s) { state = (s); continue; }
#define DONE(c) { *inout_lookahead = (c); return ret; }
#define WRITE(c) { *buf++ = (c); ret++; }

  /* prime the pump */
  if((c = *inout_lookahead) == EOF)
    c = (getch)(stream);

  /* state machine...*/
  while(ret < SGML_LITLEN){
    switch(state){

    case start:
      if(c == EOF) return EOF;
      else if(c == '"') { quote = c; REDUCE(literal); }
      else if(c == '\'') { quote = c; REDUCE(literal); }
      else if(isspace(c)) break;
#ifdef GROK_UNQUOTED_LITERALS
      else if(!(c == '>')){ SHIFT(value); }
#else
#ifdef SGML_SHORTTAG
      else if(SGML_is_name_char(c)){ SHIFT(value); }
#else
      else { DONE(c); } /* error: illegal char in markup */
#endif
#endif
      /* with an unquoted-value mode compiled in, any remaining character
       * falls into the value state, which ends the (empty) value */

#ifdef GROK_UNQUOTED_LITERALS
    case value:
      if(c == EOF) { DONE(c); }
      else if(isspace(c) || c == '>'){ SHIFT(ps); }
      else{ WRITE(c); break; }
#else
#ifdef SGML_SHORTTAG
    case value:
      if(c == EOF) { DONE(c); }
      else if(SGML_is_name_char(c)){ WRITE(c); break; }
      else{ SHIFT(ps); }
#endif
#endif

    case literal:
      if(c == EOF) { DONE(c); }
      else if(c == quote) { REDUCE(ps); }
      else if(c == '&'){ REDUCE(and); }
      else if(c == '\n' || c == '\t'){ WRITE(' '); break; }
      else{ WRITE(c); break; }

    case and:
      if(c == '#') { REDUCE(and_hash); }
      /*@@ else if(isalpha(c)) ... process entity reference */
      else{ WRITE('&'); SHIFT(literal); }

    case and_hash:
      if(isalnum(c)){
	name_chars = 0;
	SHIFT(cref);
      }
      else{ WRITE('&'); WRITE('#'); SHIFT(literal); }

    case cref:
      /*@@ in case of xyz, this throws out xyz as error,
	when it should only throw out x */
      if(SGML_is_name_char(c)){
	if(name_chars < SGML_NAMELEN)
	  name[name_chars++] = c;
	/* else markup error: name too long */
	break;
      }
      else{
	int nc = 0;

	name[name_chars] = '\0';
	if(isdigit(name[0])){
	  nc = atoi(name);
	}else if(!strcmp(name, "SPACE")){
	  nc = 32;
	}else if(!strcmp(name, "RS")){
	  nc = 10;
	}else if(!strcmp(name, "RE")){
	  nc = 13;
	}
	/* An unrecognized name used to "break" here, which left the
	 * machine in this state with the same bad name: every later
	 * character was swallowed and the loop never ended at EOF.
	 * Now the bad reference is dropped and the literal resumes. */
	if(nc) WRITE(nc);
	/* else error: bad character reference */

	if(c == ';') { REDUCE(literal); }
	else /* terminate entity reference w/space or something */
	  { SHIFT(literal); }
      }

    case ps: /* parameter separator between attributes */
      if(isspace(c)) break;
      else { DONE(c); }
    }

    c = (getch)(stream);
  }

  /* error: attribute value too long */
  DONE(EOF); /* set lookahead to EOF for next call */

#undef REDUCE
#undef SHIFT
#undef DONE
#undef WRITE
}