#!/usr/local/bin/perl

package sgml;

# Content Modes
$CDATA = 1;
$RCDATA = 2;
$Mixed = 3;
$Element = 4;

chop($Name = <<EOF);
[A-Za-z][A-Za-z0-9\.-]*
EOF

chop($NMTOKEN = <<EOF);
[A-Za-z0-9\.-]+
EOF

chop($Literal = <<EOF);
(\"[^\"]*\")|(\'[^\']*\')
EOF

chop($CREF = <<EOF);
&\#([A-Za-z0-9][A-Za-z0-9\.-]*);?
EOF

#
# sgml`lex -- return next lexical token from SGML document
#
# Input data in $_. Characters are deleted from the from of $_
# as they are consumed.
#
# First arg (if present) is parsing mode: $CDATA, $RCDATA, $Mixed, or $Element.
# Defaults to $Mixed. SPACE is not reported in $Element mode.
#
# Return value is an array of the form (type, val, type, val, ...)
#
# Token types are:
#        DATA    -- data characters, e.g.                ('DATA', 'ab cde')
#        SPACE   -- like data, but only space characters ('SPACE', " \n")
#        DECL    -- markup declaration                   ('DECL', '<!DOCTYPE')
#        START   -- start tag                            ('START', '<HTML')
#        END     -- end tag                              ('END', '</UL')
#        ENT     -- entity reference                     ('ENT', '&amp')
#        COM     -- comment                              ('COM', '--xxx--')
#        PI      -- processing instruction               ('PI', '<?foo')
#        !MS     -- marked section    ('!MS', '<![IGNORE[xxx]]>')
#
# Auxiliary types are:
#
#        ATTR    -- attribute name (case folded)         ('ATTR', 'HREF')
#        NAME    -- SGML name (case folded)              ('NAME', 'HTML')
#        NMTOKEN -- unquoted attribute value             ('NMTOKEN', '.9z')
#        LITERAL -- quoted literal (quotes included in return value)
#                                                 ('LITERAL', '"ab&xyz;cde"')
#        !IDS    -- internal declaration subset* ('!IDS', '[<!-- xxx-->]>')
#
#   *types beginning with ! are not correctly supported.
#     They're parsed in a quick-and-dirty hack fashion.
#
# Values have punctuation included; except for LITERAL, trailing
# space and punctuation is discarded
#
# Return array is either a DATA/SPACE item, a token item with
# auxiliary items, or both.

sub lex{
    local($mode) = @_;

    $mode = $Mixed unless $mode;

    local(@ret, $data, $tok, $rest);

    while($_){

	# Markup Declarations:
	if($mode >= $Mixed && s/^(<!$Name)//){
	    ($tok = $1) =~ tr/a-z/A-Z/;

	    &flush;

	    push (@ret, 'DECL', $tok);

	    while($_){
		if(s/^>//){
		    return @ret;
		}
		elsif(s/^\s+//){
		    # skip
		}
		elsif(s/^($Name)//){
		    ($tok = $1) =~ tr/a-z/A-Z/;
		    push(@ret, 'NAME', $tok);
		}
		elsif(s/^(--([^-]|-[^-])*--)\s*//){
		    push(@ret, 'COM', $1);
		}
		elsif(s/^($NMTOKEN)//){
		    push(@ret, 'NMTOKEN', $1);
		}
		elsif(s/^($Literal)//){
		    push(@ret, 'LITERAL', $1);
		}

		elsif(s/^\[//){
		    $tok = $&;
		    if(/\]>/){ #@# space/comments allowed between ] >
			$rest = $` . $&; #` perl-mode fix-up
			substr($_, 0, length($rest)) = '';
		    }
		    push(@ret, '!IDS', $tok . $rest);
		    return @ret;
		}

		else{
		    s/^[^>]*>//
			|| s/.*\n?//; # if no >, delete a line.
		    warn "$0: Syntax error in declaration. Skipping: `$&'\n";
		    return @ret;
		}
	    }
	    warn "$0: EOF in declaration.\n";
	    return @ret;
	}

	# Comments
	if($mode >= $Mixed && s/^<!(--|>)/$1/){
	    &flush;

	    while($_){
		if(s/^>//){
		    return @ret;
		}
		elsif(s/^(--([^-]|-[^-])*--)\s*//){
		    push(@ret, 'COM', $1);
		}
		else{
		    s/^[^>]*>//
			|| s/.*\n?//; # if no >, delete a line.
		    warn "$0: Syntax error in comment. Skipping: `$&'\n";
		    return @ret;
		}
	    }
	    die "$0: Fatal error in  comment parsing";
	}

	# Marked sections hack
	elsif($mode >= $Mixed && s/^<!\[\s*//){
	    $tok = $&;
	    &flush;
	    if(/\]\]>/){
		$rest = $` . $&; #` perl-mode fix-up
		substr($_, 0, length($rest)) = '';
	    }
	    push(@ret, '!MS', $tok . $rest);
	    return @ret;
	}

	# Start Tags
	elsif($mode >= $Mixed && s/^(<$Name)\s*//
	      || s/^(<)>/>/){ # Empty start tag
	    local($attr_pending);
	    $tok = $1;

	    &flush;

	    $tok =~ tr/a-z/A-Z/; #@# support NAMECASE = NO?
	    push(@ret, 'START', $tok);

	    while($_){
		# end of start tag
		if(s/^>//
		   || /^</){ # unclosed start tag
		    return @ret;
		}
		elsif(s,^/,,){
		    warn "$0: NET tags not supported.\n";
		    return @ret;
		}

		# attribute values following attribute name=
		elsif($attr_pending && s/^($NMTOKEN)\s*//){
		    push(@ret, 'NMTOKEN', $1);
		    $attr_pending = 0;
		}
		elsif($attr_pending && s/^($Literal)\s*//){
		    push(@ret, 'LITERAL', $1);
		    $attr_pending = 0;
		}

		# attribute name
		elsif(s/^($Name)\s*=\s*//){
		    ($tok = $1) =~ tr/a-z/A-Z/; #@# support NAMECASE = NO?
		    push(@ret, 'ATTR', $tok);
		    $attr_pending = 1;
		}

		# attribute value without attribute name
		elsif(s/^($Name)\s*//){
		    ($tok = $1) =~ tr/a-z/A-Z/; #@# support NAMECASE = NO?
		    push(@ret, 'NAME', $tok);
		    $attr_pending = 0;
		}

		else{
		    s/^[^>]*>//
			|| s/.*\n?//; # if no >, delete a line.
		    warn "$0: Syntax error in start tag. Skipping: `$&'\n";
		    return @ret;
		}
	    }
	    warn "$0: EOF in start tag.\n";
	    return @ret;
	}

	# End Tags
	elsif(s,^(</$Name)\s*,, ||
	      s,^(</)>,>,){ # empty end tag
	    ($tok = $1) =~ tr/a-z/A-Z/; #@# support NAMECASE = NO?

	    &flush;

	    push(@ret, 'END', $tok);

	    if(s/^>//){
		return @ret;
	    }
	    else{
		s/^[^>]*>//
		    || s/.*\n?//; # if no >, delete a line.
		warn "$0: Syntax error or EOF in end tag. Skipping: `$&'\n";
		return @ret;
	    }
	}

	# Processing Instructions
	elsif($mode >= $Mixed && s/^(<\?[^>]*)//){
	    $tok = $1;

	    &flush;

	    push(@ret, 'PI', $tok);

	    if(s/^>//){
		return @ret;
	    }else{
		s/.*\n?//;	# no : delete a line.
		warn "$0: Syntax error or EOF in processing instruction. Skipping: `$&'\n";
		return @ret;
	    }
	}

	# Entity Reference
	elsif($mode >= $RCDATA && s/^(&$Name);?//){
	    $tok = $1;

	    &flush;

	    push(@ret, 'ENT', $tok);
	    return (@ret);
	}

	# Numeric Character Reference
	elsif($mode >= $RCDATA && s/^$CREF//){
	    $tok = $1;

	    $data .= &cref($tok);
	}

	# Data Characters
	elsif(s/^[^<&]+// || s/.//){
	    $data .= $&;
	}
    }

    &flush;
    return @ret;
}


sub flush{
    # PERL has dynamic scope. Nasty, but nifty!
    if(length($data) > 0){
	if($data =~ /\S/){ # any non space characters?
	    push(@ret, 'DATA', $data);
	}else{    # no...
	    if($mode < $Element){ # don't report SPACE in $Element mode
		push(@ret, 'SPACE', $data);
	    }
	}
	$data='';
    }
}


sub cref{
    local($tok) = @_;

    if($tok =~ /\D/){
	warn "$0: non-numeric character reference not supported: $tok\n";
    }elsif($tok > 255){
	warn "$0: numeric character reference out of range: $tok\n";
    }else{
	return pack("C", $tok);
    }

    return undef;
}


#
# dump -- pretty-print output of lex
#
sub dump{
    local(*entities, @stuff) = @_;
    local($indent) = 1;

    while(@stuff){
	local($t, $d);
	$t = shift(@stuff); $d = shift(@stuff);

	$d =~ s/\\/\\\\/g; # escape \'s
	$d =~ s/\n/\\n/g;  # don't split output over lines

	print " " x $indent, "$t: `$d'";
	print "  => `", &sgml'literal($d, *entities), "'" if $t eq 'LITERAL';

	print "\n";

	# first part of stuff can be DATA/SPACE token
	if($t eq 'DATA' || $t eq 'SPACE'){
	    print "\n";
	}else{
	    # indent "rest" of token info
	    $indent = 2;
	}
    }
    print "\n";
}

sub literal{
    local($_, *entities) = @_;
    local($before, $match, $n);

    s/^.//; s/.$//;		# remove quotes
    s/[\n\r \t]+/ /g;		# compress whitespace as per SGML (section ???)

    while(1){
	if(/&($Name);?|$CREF/){
	    $n = $1;
	    $before .= $`; $match = $&; $_ = $';

	    if('&#' eq substr($match, 0, 2)){
		$match =~ s/$CREF/&cref($1)/e;
	    }else{
		$n = $entities{$n};
		$match =~ s/&($Name);?/$n/e;
	    }
	    $before .= $match;
	}
	else{
	    last;
	}
    }

    return $before . $_;
}

1;
