#!/usr/local/bin/perl5
#
# Dan Connolly
# derived from
#
# Id: dtd2bnf,v 1.3 1998/03/17 18:48:44 bbos Exp 
# Bert Bos <bert@w3.org>
# Created: 17 Mar 1998
#

use strict;

my $targetNS = shift(@ARGV);

my $linelen = 72;

my $PROG = substr($0, rindex($0, "/") + 1);
my $USAGE = "Usage: $PROG file\n";

my $string = "(?:\"([^\"]*)\"|\'([^\']*)\')";
my %pent;			# Parameter entities
my %attributes;			# Attribute lists
my @element;			# Elements in source order
my %model;			# Content models


# Parse a string into an array of "words".
# Words are whitespace-separated sequences of non-whitespace characters,
# or quoted strings ("" or ''), with the quotes removed.
# HACK: added () stuff for attlist stuff

sub parsewords {
    my $line = $_[0];
    my @words = ();
    while ($line ne '') {
	if ($line =~ /^\s+/) {
	    # Skip whitespace
	} elsif ($line =~ /^\"((?:[^\"]|\\\")*)\"/) {
	    push(@words, $1);
	} elsif ($line =~ /^\'((?:[^\']|\\\')*)\'/) {
	    push(@words, $1);
	} elsif ($line =~ /^\(((?:[^\)]|\\\))*)\)/) {
	    push(@words, $1);
	} elsif ($line =~ /^\S+/) {
	    push(@words, $&);
	} else {
	    die "Cannot happen\n";
	}
	$line = $';
    }
    return @words;
}

# break lines at or before $linelen, indent continuation lines $indent
sub break
{
    my ($linelen, $indent, $line) = @_;
    my $result = '';
    $line =~ s/\s+$//o;		# Remove trailing whitespace
    while (length($line) > $linelen) {
	my $i = $linelen;
	BREAK: while (1) {
	    if (substr($line, $i, 1) =~ /\s/so) {
		# found a space
		last BREAK;
	    }
	    if ($i <= $linelen) {$i--;} else {$i++;}
	    if ($i == $indent) {
		# no space found to the left, try to the right
		$i = $linelen + 1;
	    }
	    if ($i == length($line)) {
		# no space found anywhere
		last BREAK;
	    }
	}
	my $part = substr($line, 0, $i);
	$part =~ s/\s+$//o;	# Remove trailing spaces
	$result .= $part;	# Add to result
	$line = substr($line, $i + 1);
	$line =~ s/^\s+//o;	# Remove leading spaces
	if (length($line) != 0) {
	    $result .= "\n";
	    $line = (' ' x $indent) . $line;
	}
    }
    if (length($line) != 0) {$result .= $line;}
    return $result;
}


# Store content model, return empty string
sub store_elt
{
    my ($name, $model) = @_;
    $model =~ s/\s+/ /gso;
    push(@element, $name);

    my @words;
    while($model =~ s/^\s*(\(|\)|,|\+|\?|\||[\w_\.-]+|\#\w+|\*)//){
	push(@words, $1);
    };
    $model{$name} = [ @words ];
    return '';
}


# Store attribute list, return empty string
sub store_att
{
    my ($element, $atts) = @_;
#    $atts =~ s/\#FIXED//gio;	# Remove #FIXED
    my @words = parsewords($atts);
    $attributes{$element} = [ @words ];
    return '';
}


# Return maximum value of an array of numbers
sub max
{
    my $max = $_[0];
    foreach my $i (@_) {
	if ($i > $max) {$max = $i;}
    }
    return $max;
}


# Main

$/ = undef;
my $buf = <>;

# remove comments
$buf =~ s/<!--.*?-->\s+//gso;

# remove processing instructions
$buf =~ s/<\?.*?>\s+//gso;

# loop until parameter entities fully expanded
my $i;
do {
    # store parameter entities
    $buf =~ s/<!ENTITY\s+%\s+(\S+)\s+$string/($pent{$1}=$2.$3.$4),$&/gsioe;
    # count # of substitutions
    $i = 0;
    # expand parameter entities
    $buf =~ s/%([a-z0-9.-]+);?/$i++,$pent{$1}/gsioe;
} while ($i != 0);

# remove all entities (parameter and other)
$buf =~ s/<!ENTITY.*?>\s+//gsio;

# store attribute lists
$buf =~ s/<!ATTLIST\s+(\S+)\s+(.*?)>/store_att($1, $2)/gsioe;

# store content models
$buf =~ s/<!ELEMENT\s+(\S+)\s+(.+?)>/store_elt($1, $2)/gsioe;

print "<schema
xmlns='http://www.w3.org/1999/XMLSchema'
targetNamespace='$targetNS'
xmlns:t='$targetNS'
><annotation
><documentation
>This schema was automatically generated
from a DTD by a perl script. The script is designed
to produce a schema for a language that it some
superset of the language generated by the DTD: all
content models are reduced to repeating choice
groups, and all attributes are just given type 'string'.
TODO: a closer mapping of ? | + ,
and attribute types: enumeration, etc.

About the indentation of the XML that is produced, I
use the 'line-oriented approach' approach
suggested in:

Comments on the WD - A proposed alternative Arjun Ray (Sun, Feb 20 2000) 
http://lists.w3.org/Archives/Public/www-xml-canonicalization-comments/2000Feb/0005.html


</documentation
></annotation
>";



# find maximum length of non-terminals
#my $maxlen = max(map(length, @element)) + 4;

my($e);

# loop over elements, writing EBNF
foreach $e (@element) {
    my $h = $model{$e};
    my @model = @$h;

    # print rule for element $e
    print "<element
name='$e'
>";

    # print rule for $e's content model
    print "<complexType
";

    if ($model[0] eq 'EMPTY') {
	print "content='empty'
>";
    } else {
	if (grep($_ eq '#PCDATA', @model)){
	    print "content='mixed'
>";
	}else{
	    print "content='elementOnly'
>";
	}


	print "<annotation
><appinfo
>@model</appinfo
></annotation
>";

	my @children = grep(/^[\w_\.-]+$/, @model);
	print STDERR "MODEL $e: ", join ('#', @model), "\n";
	print STDERR "CHILDREN $e: ", join ('#', @children), "\n";
	if (@children){
	    my ($n);
	    print "<sequence
minOccurs='0'
maxOccurs='*'
><choice
>";

	    foreach $n (@children){
		print "<element
ref='t:$n'
/>";
	    }
	    print "</choice
></sequence
>";
	}

    }

    # print rule for $e's attributes
    my $h = $attributes{$e};
    if (! $h) {
	# nothing
    } else {
        my @atts = @$h;

        for (my $i = 0; $i <= $#atts; $i += 3) {
            # @@use only name, ignore type and default
            print "<attribute
name='$atts[$i]'
type='string'
/>";
	}
    }


    print "</complexType
></element
>";
}

print <<EOF;
</schema
>
EOF

