#!/usr/local/bin/perl5 # # Bert Bos # Created: 17 Mar 1998 # $Id: dtd2bnf,v 1.4 2002/05/21 19:36:06 bbos Exp $ # my $linelen = 72; my $PROG = substr($0, rindex($0, "/") + 1); my $USAGE = "Usage: $PROG file\n"; my $string = "(?:\"([^\"]*)\"|\'([^\']*)\')"; my %attributes; # Attribute lists my @element; # Elements in source order my %model; # Content models # Parse a string into an array of "words". # Words are whitespace-separated sequences of non-whitespace characters, # or quoted strings ("" or ''), with the quotes removed. sub parsewords { my $line = $_[0]; my @words = (); while ($line ne '') { if ($line =~ /^\s+/) { # Skip whitespace } elsif ($line =~ /^\"((?:[^\"]|\\\")*)\"/) { push(@words, $1); } elsif ($line =~ /^\'((?:[^\']|\\\')*)\'/) { push(@words, $1); } elsif ($line =~ /^\S+/) { push(@words, $&); } else { die "Cannot happen\n"; } $line = $'; } return @words; } # break lines at or before $linelen, indent continuation lines $indent sub break { my ($linelen, $indent, $line) = @_; my $result = ''; $line =~ s/\s+$//o; # Remove trailing whitespace while (length($line) > $linelen) { my $i = $linelen; BREAK: while (1) { if (substr($line, $i, 1) =~ /\s/so) { # found a space last BREAK; } if ($i <= $linelen) {$i--;} else {$i++;} if ($i == $indent) { # no space found to the left, try to the right $i = $linelen + 1; } if ($i == length($line)) { # no space found anywhere last BREAK; } } my $part = substr($line, 0, $i); $part =~ s/\s+$//o; # Remove trailing spaces $result .= $part; # Add to result $line = substr($line, $i + 1); $line =~ s/^\s+//o; # Remove leading spaces if (length($line) != 0) { $result .= "\n"; $line = (' ' x $indent) . $line; } } if (length($line) != 0) {$result .= $line;} return $result; } # Store content model, return empty string sub store_elt { my ($name, $model) = @_; $model =~ s/\#PCDATA/TEXT*/gio; $model =~ s/\s+/ /gso; push(@element, $name); $element{$name} = $model; return ''; } # Store attribute list, return empty string sub store_att { my ($element, $atts) = @_; $atts =~ s/\#FIXED//gio; # Remove #FIXED my @words = parsewords($atts); $attributes{$element} = [ @words ]; return ''; } # Return maximum value of an array of numbers sub max { my $max = $_[0]; foreach my $i (@_) { if ($i > $max) {$max = $i;} } return $max; } # Main # read file into $buf open(INPUT, $ARGV[0]) || die "$PROG: cannot read file $ARGV[0]\n"; $/ = undef; my $buf = ; close(INPUT); # remove comments $buf =~ s/\s*//gso; # remove processing instructions $buf =~ s/<\?.*?>\s+//gso; # loop until parameter entities fully expanded and no more conditional sections while ($buf =~ s/\s*//so) { my $entity = $1; my $expansion = (defined $2?$2:"").(defined $3?$3:"").(defined $4?$4:""); # expand this entity everywhere $buf =~ s/%\Q$entity\E\b;?/$expansion/g; # remove any ignorable conditional sections that might have produced $buf =~ s/\s*//gso; $buf =~ s/\s*/$1/gso; } # remove all other entities $buf =~ s/\s*//gsio; # store attribute lists $buf =~ s/\s*/store_att($1, $2)/gsioe; # store content models $buf =~ s/\s*/store_elt($1, $2)/gsioe; # At this moment, the DTD should have nothing left but whitespace... $buf =~ /^\s*$/ or warn "Warning: some unregnized text left over:\n$buf"; # find maximum length of non-terminals my $maxlen = max(map(length, @element)) + 4; # loop over elements, writing EBNF foreach $e (@element) { my $model = $element{$e}; my $has_attribs = defined $attributes{$e}; my $s; # print rule for element $e $s = sprintf("%-${maxlen}s = ", $e); $s .= "\"<$e\" "; $s .= "${e}_att* " if ($has_attribs); $s .= "( \"/>\" | \">\" ${e}_cnt \"\" )"; print break($linelen, $maxlen + 3, $s) . "\n"; # print rule for $e's attributes if ($has_attribs) { printf("%-${maxlen}s = ", "${e}_att"); my $h = $attributes{$e}; my @atts = @$h; for (my $i = 0; $i <= $#atts; $i += 3) { if ($i != 0) {print ' ' x $maxlen . " | ";} # use only name, ignore type and default print "\"$atts[$i]\" \"=\" STRING\n"; } } $s = sprintf("%-${maxlen}s = ", "${e}_cnt"); # print rule for $e's content model if ($model eq "EMPTY") { $s .= ";empty"; } elsif ($model eq "ANY") { $s .= "( " . join(" | ", sort @element) . " )*"; } else { $model =~ s/\s+$//o; # remove trailing spaces if ($model =~ /\)$/o) { chop($model); # remove final ')' $model = substr($model, 1); # remove initial '(' } $model =~ s/\(/ \( /go; # one space around '(' $model =~ s/\)/ \) /go; # one space around ')' $model =~ s/\|/ \| /go; # one space around '|' $model =~ s/,/ /go; # one space instead of ',' $model =~ s/\s+\?/\?/go; # no spaces before '?' $model =~ s/\s+\*/\*/go; # no spaces before '*' $model =~ s/\s+\+/\+/go; # no spaces before '+' $model =~ s/\s+/ /go; # no multiple spaces $model =~ s/^ //o; # no initial space $s .= "$model"; } print break($linelen, $maxlen + 3, $s) . "\n\n"; } # print auxiliary tokens printf "%-${maxlen}s = \"\'\" TEXT* \"\'\" | \'\"\' TEXT* \'\"\'\n", "STRING"; printf "%-${maxlen}s = (any legal XML character)\n", "TEXT";