#!/usr/local/bin/perl package sgml; # Content Modes $CDATA = 1; $RCDATA = 2; $Mixed = 3; $Element = 4; chop($Name = <') # # Auxiliary types are: # # ATTR -- attribute name (case folded) ('ATTR', 'HREF') # NAME -- SGML name (case folded) ('NAME', 'HTML') # NMTOKEN -- unquoted attribute value ('NMTOKEN', '.9z') # LITERAL -- quoted literal (quotes included in return value) # ('LITERAL', '"ab&xyz;cde"') # !IDS -- internal declaration subset* ('!IDS', '[]>') # # *types beginning with ! are not correctly supported. # They're parsed in a quick-and-dirty hack fashion. # # Values have punctuation included; except for LITERAL, trailing # space and punctuation is discarded # # Return array is either a DATA/SPACE item, a token item with # auxiliary items, or both. sub lex{ local($mode) = @_; $mode = $Mixed unless $mode; local(@ret, $data, $tok, $rest); while($_){ # Markup Declarations: if($mode >= $Mixed && s/^(//){ return @ret; } elsif(s/^\s+//){ # skip } elsif(s/^($Name)//){ ($tok = $1) =~ tr/a-z/A-Z/; push(@ret, 'NAME', $tok); } elsif(s/^(--([^-]|-[^-])*--)\s*//){ push(@ret, 'COM', $1); } elsif(s/^($NMTOKEN)//){ push(@ret, 'NMTOKEN', $1); } elsif(s/^($Literal)//){ push(@ret, 'LITERAL', $1); } elsif(s/^\[//){ $tok = $&; if(/\]>/){ #@# space/comments allowed between ] > $rest = $` . $&; #` perl-mode fix-up substr($_, 0, length($rest)) = ''; } push(@ret, '!IDS', $tok . $rest); return @ret; } else{ s/^[^>]*>// || s/.*\n?//; # if no >, delete a line. warn "$0: Syntax error in declaration. Skipping: `$&'\n"; return @ret; } } warn "$0: EOF in declaration.\n"; return @ret; } # Comments if($mode >= $Mixed && s/^)/$1/){ &flush; while($_){ if(s/^>//){ return @ret; } elsif(s/^(--([^-]|-[^-])*--)\s*//){ push(@ret, 'COM', $1); } else{ s/^[^>]*>// || s/.*\n?//; # if no >, delete a line. warn "$0: Syntax error in comment. Skipping: `$&'\n"; return @ret; } } die "$0: Fatal error in comment parsing"; } # Marked sections hack elsif($mode >= $Mixed && s/^/){ $rest = $` . $&; #` perl-mode fix-up substr($_, 0, length($rest)) = ''; } push(@ret, '!MS', $tok . $rest); return @ret; } # Start Tags elsif($mode >= $Mixed && s/^(<$Name)\s*// || s/^(<)>/>/){ # Empty start tag local($attr_pending); $tok = $1; &flush; $tok =~ tr/a-z/A-Z/; #@# support NAMECASE = NO? push(@ret, 'START', $tok); while($_){ # end of start tag if(s/^>// || /^]*>// || s/.*\n?//; # if no >, delete a line. warn "$0: Syntax error in start tag. Skipping: `$&'\n"; return @ret; } } warn "$0: EOF in start tag.\n"; return @ret; } # End Tags elsif(s,^(,>,){ # empty end tag ($tok = $1) =~ tr/a-z/A-Z/; #@# support NAMECASE = NO? &flush; push(@ret, 'END', $tok); if(s/^>//){ return @ret; } else{ s/^[^>]*>// || s/.*\n?//; # if no >, delete a line. warn "$0: Syntax error or EOF in end tag. Skipping: `$&'\n"; return @ret; } } # Processing Instructions elsif($mode >= $Mixed && s/^(<\?[^>]*)//){ $tok = $1; &flush; push(@ret, 'PI', $tok); if(s/^>//){ return @ret; }else{ s/.*\n?//; # no : delete a line. warn "$0: Syntax error or EOF in processing instruction. Skipping: `$&'\n"; return @ret; } } # Entity Reference elsif($mode >= $RCDATA && s/^(&$Name);?//){ $tok = $1; &flush; push(@ret, 'ENT', $tok); return (@ret); } # Numeric Character Reference elsif($mode >= $RCDATA && s/^$CREF//){ $tok = $1; $data .= &cref($tok); } # Data Characters elsif(s/^[^<&]+// || s/.//){ $data .= $&; } } &flush; return @ret; } sub flush{ # PERL has dynamic scope. Nasty, but nifty! if(length($data) > 0){ if($data =~ /\S/){ # any non space characters? push(@ret, 'DATA', $data); }else{ # no... if($mode < $Element){ # don't report SPACE in $Element mode push(@ret, 'SPACE', $data); } } $data=''; } } sub cref{ local($tok) = @_; if($tok =~ /\D/){ warn "$0: non-numeric character reference not supported: $tok\n"; }elsif($tok > 255){ warn "$0: numeric character reference out of range: $tok\n"; }else{ return pack("C", $tok); } return undef; } # # dump -- pretty-print output of lex # sub dump{ local(*entities, @stuff) = @_; local($indent) = 1; while(@stuff){ local($t, $d); $t = shift(@stuff); $d = shift(@stuff); $d =~ s/\\/\\\\/g; # escape \'s $d =~ s/\n/\\n/g; # don't split output over lines print " " x $indent, "$t: `$d'"; print " => `", &sgml'literal($d, *entities), "'" if $t eq 'LITERAL'; print "\n"; # first part of stuff can be DATA/SPACE token if($t eq 'DATA' || $t eq 'SPACE'){ print "\n"; }else{ # indent "rest" of token info $indent = 2; } } print "\n"; } sub literal{ local($_, *entities) = @_; local($before, $match, $n); s/^.//; s/.$//; # remove quotes s/[\n\r \t]+/ /g; # compress whitespace as per SGML (section ???) while(1){ if(/&($Name);?|$CREF/){ $n = $1; $before .= $`; $match = $&; $_ = $'; if('&#' eq substr($match, 0, 2)){ $match =~ s/$CREF/&cref($1)/e; }else{ $n = $entities{$n}; $match =~ s/&($Name);?/$n/e; } $before .= $match; } else{ last; } } return $before . $_; } 1;