#!/usr/local/bin/perl5
#
# Bert Bos <bert@w3.org>
# Created: 17 Mar 1998
# $Id: dtd2bnf,v 1.4 2002/05/21 19:36:06 bbos Exp $
#

my $linelen = 72;

my $PROG = substr($0, rindex($0, "/") + 1);
my $USAGE = "Usage: $PROG file\n";

my $string = "(?:\"([^\"]*)\"|\'([^\']*)\')";
my %attributes;			# Attribute lists
my @element;			# Elements in source order
my %model;			# Content models


# Parse a string into an array of "words".
# Words are whitespace-separated sequences of non-whitespace characters,
# or quoted strings ("" or ''), with the quotes removed.
sub parsewords {
    my $line = $_[0];
    my @words = ();
    while ($line ne '') {
	if ($line =~ /^\s+/) {
	    # Skip whitespace
	} elsif ($line =~ /^\"((?:[^\"]|\\\")*)\"/) {
	    push(@words, $1);
	} elsif ($line =~ /^\'((?:[^\']|\\\')*)\'/) {
	    push(@words, $1);
	} elsif ($line =~ /^\S+/) {
	    push(@words, $&);
	} else {
	    die "Cannot happen\n";
	}
	$line = $';
    }
    return @words;
}

# break lines at or before $linelen, indent continuation lines $indent
sub break
{
    my ($linelen, $indent, $line) = @_;
    my $result = '';
    $line =~ s/\s+$//o;		# Remove trailing whitespace
    while (length($line) > $linelen) {
	my $i = $linelen;
	BREAK: while (1) {
	    if (substr($line, $i, 1) =~ /\s/so) {
		# found a space
		last BREAK;
	    }
	    if ($i <= $linelen) {$i--;} else {$i++;}
	    if ($i == $indent) {
		# no space found to the left, try to the right
		$i = $linelen + 1;
	    }
	    if ($i == length($line)) {
		# no space found anywhere
		last BREAK;
	    }
	}
	my $part = substr($line, 0, $i);
	$part =~ s/\s+$//o;	# Remove trailing spaces
	$result .= $part;	# Add to result
	$line = substr($line, $i + 1);
	$line =~ s/^\s+//o;	# Remove leading spaces
	if (length($line) != 0) {
	    $result .= "\n";
	    $line = (' ' x $indent) . $line;
	}
    }
    if (length($line) != 0) {$result .= $line;}
    return $result;
}


# Store content model, return empty string
sub store_elt
{
    my ($name, $model) = @_;
    $model =~ s/\#PCDATA/TEXT*/gio;
    $model =~ s/\s+/ /gso;
    push(@element, $name);
    $element{$name} = $model;
    return '';
}


# Store attribute list, return empty string
sub store_att
{
    my ($element, $atts) = @_;
    $atts =~ s/\#FIXED//gio;	# Remove #FIXED
    my @words = parsewords($atts);
    $attributes{$element} = [ @words ];
    return '';
}


# Return maximum value of an array of numbers
sub max
{
    my $max = $_[0];
    foreach my $i (@_) {
	if ($i > $max) {$max = $i;}
    }
    return $max;
}


# Main

# read file into $buf
open(INPUT, $ARGV[0]) || die "$PROG: cannot read file $ARGV[0]\n";
$/ = undef;
my $buf = <INPUT>;
close(INPUT);

# remove comments
$buf =~ s/<!--.*?-->\s*//gso;

# remove processing instructions
$buf =~ s/<\?.*?>\s+//gso;

# loop until parameter entities fully expanded and no more conditional sections
while ($buf =~ s/<!ENTITY\s+%\s+(\S+)\s+$string\s*>\s*//so) {
  my $entity = $1;
  my $expansion = (defined $2?$2:"").(defined $3?$3:"").(defined $4?$4:"");
  # expand this entity everywhere
  $buf =~ s/%\Q$entity\E\b;?/$expansion/g;
  # remove any ignorable conditional sections that might have produced
  $buf =~ s/<!\[\s*IGNORE\s*\[.*?\]\]>\s*//gso;
  $buf =~ s/<!\[\s*INCLUDE\s*\[(.*?)\]\]>\s*/$1/gso;
}

# remove all other entities
$buf =~ s/<!ENTITY.*?>\s*//gsio;

# store attribute lists
$buf =~ s/<!ATTLIST\s+(\S+)\s+(.*?)>\s*/store_att($1, $2)/gsioe;

# store content models
$buf =~ s/<!ELEMENT\s+(\S+)\s+(.+?)>\s*/store_elt($1, $2)/gsioe;

# At this moment, the DTD should have nothing left but whitespace...
$buf =~ /^\s*$/ or warn "Warning: some unregnized text left over:\n$buf";

# find maximum length of non-terminals
my $maxlen = max(map(length, @element)) + 4;

# loop over elements, writing EBNF
foreach $e (@element) {
    my $model = $element{$e};
    my $has_attribs = defined $attributes{$e};
    my $s;

    # print rule for element $e
    $s = sprintf("%-${maxlen}s = ", $e);
    $s .= "\"<$e\" ";
    $s .= "${e}_att* " if ($has_attribs);
    $s .= "( \"/>\" | \">\" ${e}_cnt \"</$e>\" )";
    print break($linelen, $maxlen + 3, $s) . "\n";

    # print rule for $e's attributes
    if ($has_attribs) {
        printf("%-${maxlen}s = ", "${e}_att");
	my $h = $attributes{$e};
	my @atts = @$h;
	for (my $i = 0; $i <= $#atts; $i += 3) {
	    if ($i != 0) {print ' ' x $maxlen . " | ";}
	    # use only name, ignore type and default
	    print "\"$atts[$i]\" \"=\" STRING\n";
	}
    }
    $s = sprintf("%-${maxlen}s = ", "${e}_cnt");

    # print rule for $e's content model
    if ($model eq "EMPTY") {
	$s .= ";empty";
    } elsif ($model eq "ANY") {
	$s .= "( " . join(" | ", sort @element) . " )*";
    } else {
	$model =~ s/\s+$//o;	# remove trailing spaces
	if ($model =~ /\)$/o) {
	    chop($model);	# remove final ')'
	    $model = substr($model, 1); # remove initial '('
	}
	$model =~ s/\(/ \( /go; # one space around '('
	$model =~ s/\)/ \) /go; # one space around ')'
	$model =~ s/\|/ \| /go; # one space around '|'
	$model =~ s/,/ /go;	# one space instead of ','
	$model =~ s/\s+\?/\?/go; # no spaces before '?'
	$model =~ s/\s+\*/\*/go; # no spaces before '*'
	$model =~ s/\s+\+/\+/go; # no spaces before '+'
	$model =~ s/\s+/ /go;	# no multiple spaces
	$model =~ s/^ //o;	# no initial space
	$s .= "$model";
    }
    print break($linelen, $maxlen + 3, $s) . "\n\n";
}

# print auxiliary tokens
printf "%-${maxlen}s = \"\'\" TEXT* \"\'\" | \'\"\' TEXT* \'\"\'\n", "STRING";
printf "%-${maxlen}s = (any legal XML character)\n", "TEXT";
