#!/bin/sh
exec perl -x $0 ${1+"$@"}
#!perl -w

# Program to convert a meta-bnf grammar to an XML Schema (for a
# slightly larger language). See schemas.html and
# http://www.w3.org/People/Bos/meta-bnf for more info. This
# implementation differs from the two syntaxes in those document in
# the fact that attributes use '@' and that the '@' must be used both
# in the rhs and the lhs, as follows:
#
#   foo: @bar;
#   @bar: STRING;
#
# ToDo: complete the list of predefined types, use Schema's datatypes.
# ToDo: replace \w with the correct value for name characters.
# ToDo: make the is_*() functions more efficient for large grammars.
#
# Author: Bert Bos <bert@w3.org>
# Created: 7 April 2000
# Version: $Id: mbnftosch.pl,v 1.1 2001/04/05 17:57:08 bbos Exp $

# The "magic number" of a schema file (update this when Schema becomes a Rec):
# sub XSD {"http://www.w3.org/2001/XMLSchema"}
sub XSD {"http://www.w3.org/2000/10/XMLSchema"}
sub VERSION {'$Id: mbnftosch.pl,v 1.1 2001/04/05 17:57:08 bbos Exp $'}

# Regexp for a token on the rhs, excluding ( ) * + ? |
# $1 = @-sign, $2 = name, $3 = string, $4 is string, $5 is regexp
my $TERM = '(?:(@)?(\w+)|(\"[^\"]*\")|(\'[^\']*\')|(/[^/]*/))';

my %rhs;			# RHS for each non-terminal
my %is_attr;			# Whether a non-terminal wants to be attribute
my %tagname;			# Tag name for each non-terminal
my %is_tagless;			# True if non-terminal should have no tags
my $startsym;			# First non terminal of the grammar
my $source;			# The original in MBNF


# --------------------------------------------------------------------------
# is_empty -- return true if there is nothing or only attributes in nonterm
# --------------------------------------------------------------------------
sub is_empty {

  # ToDo: deal with recursive definition
  my $nonterm = shift;
  my $done = shift;		# Reference to hash of already checked

  my $h = $rhs{$nonterm};
  my $save = $h;

  $done->{$nonterm} = 1;	# Mark as checked

  while ($h =~ /^(?:\s|[()*+?|])*$TERM/s) {
    $h = $';
    return 0 if (! defined $1);	# Something without an '@'
  }
  die "Error: malformed rhs: $save\n" if ($h !~ /^(?:\s|[()*+?|])*$/s);
  return 1;
}


# --------------------------------------------------------------------------
# is_predefined -- return true if $_ is one of the predefined terminals
# --------------------------------------------------------------------------
sub is_predefined {

  return $_[0] =~ /^(STRING|BOOLEAN|FLOAT|DOUBLE|DECIMAL|INTEGER|DATE|URI)$/;
  # ToDo: This list is incomplete
}


# --------------------------------------------------------------------------
# is_mixed -- return true if there is content other than elements/attributes
# --------------------------------------------------------------------------
sub is_mixed {

  my $nonterm = shift;
  my $done = shift;		# Reference to hash of already checked nonterms

  my $h = $rhs{$nonterm};

  $done->{$nonterm} = 1;	# Mark as checked

  while ($h =~ /^(?:\s|[()*+?|])*$TERM/s) {
    $h = $';
    if (defined $1) {
      ;				# Attribute
    } elsif (defined $3 || defined $4 || defined $5) {
      return 1;			# Literal string or regexp
    } elsif (is_predefined($2)) {
      return 1;			# Predefined type
    } elsif ($is_tagless{$2} && ! defined $done->{$2} && is_mixed($2, $done)) {
      return 1;			# Contains mixed tagless subelement
    }
  }
  die "Error: malformed rhs for $nonterm\n" if ($h !~ /^(?:\s|[()*+?|])*$/s);
  return 0;
}


# --------------------------------------------------------------------------
# is_textonly -- return true if content contains no child elements
# --------------------------------------------------------------------------
sub is_textonly {

  # ToDo: deal with recursive definition
  my $nonterm = shift;
  my $done = shift;		# Reference to hash of already checked nonterms

  my $h = $rhs{$nonterm};

  $done->{$nonterm} = 1;	# Mark as checked

  while ($h =~ /^(?:\s|[()*+?|])*$TERM/s) {
    $h = $';
    if (defined $1) {
      ;				# Attribute
    } elsif (defined $3 || defined $4 || defined $5) {
      ;				# Fixed text or regexp
    } elsif (is_predefined($2)) {
      ;				# Predefined type
    } elsif (! $is_tagless{$2}) {
      return 0;			# Child element found
    } elsif (! defined $done->{$2} && ! is_textonly($2)) {
      return 0;			# Contains elements in sub-branch
    }
  }
  return 1;
}


# --------------------------------------------------------------------------
# print_attribute -- print the declaration of an attribute
# --------------------------------------------------------------------------
sub print_attribute {

  my $attr = shift;

  my $name = $tagname{$attr};

  die "Error: undefined nonterminal: @$attr\n" if (! defined $rhs{$attr});

  my $h = $rhs{$attr};
  $h =~ s/^\s+//s;		# Remove leading whitespace
  $h =~ s/\s+$//s;		# Remove trailing whitespace

  if (is_predefined($h)) {	# Attribute has predefined data type
    print "    <x:attribute name=\"$name\" type=\"x:".lc($h)."\" />\n";
  } elsif ($h =~ /^\"[^\"]*\"$/ || $h =~ /^\'[^\']*\'$/) { # Fixed string
    print "    <x:attribute name=\"$name\" type=\"x:string\" value=$h />\n";
  } elsif ($h =~ /^\/([^\/]*)\/$/) { # Regular expression
    print "    <x:attribute name=\"$name\" type=\"x:string\">\n";
    print "    <x:pattern value=\"$1\" />\n";
    print "    </x:attribute>\n";
  } else {			# It has some other structure...
    print "    <x:attribute name=\"$name\" type=\"x:string\" />\n";
  }
}


# --------------------------------------------------------------------------
# print_attributes -- find all attributes contained in rhs and declare them
# --------------------------------------------------------------------------
sub print_attributes {

  my $nonterm = shift;
  my $done = shift;		# Reference to hash of already checked nonterms
  my $attrs = shift;		# Reference to hash of already printed attrs
  
  my $h = $rhs{$nonterm};

  $done->{$nonterm} = 1;	# Mark as done

  # XML Schemas don't allow relations among attributes or
  # structure inside an attribute. We just list all possible
  # attributes.

  while ($h =~ /^(?:\s|[()*+?|])*$TERM/s) {
    $h = $';

    if (defined $1) {

      # Token starts with '@'
      print_attribute($2) if (! $attrs->{$2});
      $attrs->{$2} = 1;

    } elsif (defined $2 && $is_tagless{$2} && ! defined $done->{$2}) {

      # Token is a nonterminal without a tag; print attributes recursively
      print_attributes($2, $done, $attrs);

    }
  }
}


# --------------------------------------------------------------------------
# derive_rhs -- recursively construct the content model
# --------------------------------------------------------------------------
sub derive_rhs {

  my $h = shift;		# Reference to a string

  my $result = "";

  $$h =~ s/^\s+//s;		# Remove leading whitespace

  while ($$h =~ /^$TERM|(\()|(\|)/) {

    my ($at, $term, $s1, $s2, $re, $lpar, $bar) = ($1, $2, $3, $4, $5, $6, $7);

    $$h = $';			# Remove the token...
    $$h =~ s/^\s+//s;		#  and any subsequent whitespace

    if (defined $at) {

      # An attribute: already handled
      ;

    } elsif (defined $s1 || defined $s2) {

      # Literal text, treat as a pattern. (i.e., use a comment)

      my $pat = defined $s1 ? "$s1" : "$s2";
      $pat =~ s/\(/\\\(/g;	# Escape special regexp characters
      $pat =~ s/\)/\\\)/g;
      $pat =~ s/\|/\\\|/g;
      $pat =~ s/\?/\\\?/g;
      $pat =~ s/\+/\\\+/g;
      $pat =~ s/\*/\\\*/g;

      $result .= "    <!-- x:pattern value=$pat -->\n";

    } elsif (defined $re) {

      # Regular expression. Use a comment.

      $result .= "    <!-- x:pattern value=\"";
      $result .= substr($re, 1, length($re) - 2);
      $result .= "\" -->\n";

    } elsif (defined $term && is_predefined($term)) {

      # Predefined type. Put in a comment instead
      $result .= "    <!-- x:" . lc($term) . " -->\n";

    } elsif (defined $term && ! $is_tagless{$term}) {

      # An element

      die "Error: undefined token: $term\n" if (! defined $rhs{$term});

      $result .= "    <x:element name=\"$tagname{$term}\" type=\"$term\"";

      if ($$h =~ /^(\*|\+|\?)/) {
	if ($1 eq '*') {	# Repeated and optional element
	  $result .= " minOccurs=\"0\" maxOccurs=\"unbounded\"";
	} elsif ($1 eq '+') {	# Repeated element
	  $result .= " maxOccurs=\"unbounded\"";
	} elsif ($1 eq '?') {	# Optional element
	  $result .= " minOccurs=\"0\"";
	}
	$$h = $';		# Remove the token...
	$$h =~ s/^\s+//s;	#  and any subsequent whitespace
      }

      $result .= " />\n";

    } elsif (defined $term) {

      # A tagless element

      die "Error: undefined token: $term\n" if (! defined $rhs{$term});

      $result .= "    <x:group ref=\"$term\"";

      if ($$h =~ /^(\*|\+|\?)/) {
	if ($1 eq '*') {	# Repeated and optional element
	  $result .= " minOccurs=\"0\" maxOccurs=\"unbounded\"";
	} elsif ($1 eq '+') {	# Repeated element
	  $result .= " maxOccurs=\"unbounded\"";
	} elsif ($1 eq '?') {	# Optional element
	  $result .= " minOccurs=\"0\"";
	}
	$$h = $';		# Remove the token...
	$$h =~ s/^\s+//s;	#  and any subsequent whitespace
      }

      $result .= " />\n";

    } elsif (defined $lpar) {

      # '(' starts a group

      my $result1 = derive_rhs($h);

      if ($$h =~ /^\)\?/) {	# Group is optional
	$result .= "    <x:sequence minOccurs=\"0\">\n";
	$result .= $result1;
	$result .= "    </x:sequence>\n";
      } elsif ($$h =~ /^\)\*/) { # Group is repeated and optional
	$result .= "    <x:sequence minOccurs=\"0\" maxOccurs=\"unbounded\">\n";
	$result .= $result1;
	$result .= "    </x:sequence>\n";
      } elsif ($$h =~ /^\)\+/) { # Group is repeated
	$result .= "    <x:sequence minOccurs=\"1\" maxOccurs=\"unbounded\">\n";
	$result .= $result1;
	$result .= "    </x:sequence>\n";
      } elsif ($$h =~ /^\)/) {	# Probably a choice group, don't add extra tags
	$result .= $result1;
      } else {
	die "Error: unbalanced parentheses\n";
      }
      $$h = $';			# Remove the token...
      $$h =~ s/^\s+//s;		#  and any subsequent whitespace

    } elsif (defined $bar) {

      # '|' creates a choice

      my $result1 = derive_rhs($h);

      # Wrap what we had in a group, put this group and the derived one
      # in a choice element.

      $result = "    <!-- empty -->\n" if ($result eq '');
      $result = "    <x:choice>\n    <x:sequence>\n$result    </x:sequence>\n";
      $result .= "    <x:sequence>\n$result1    </x:sequence>\n";
      $result .= "    </x:choice>\n";

    } else {

      die "Bug: cannot happen\n";

    }

  }

  $result = "    <!-- empty -->\n" if ($result eq '');
  return $result;
}
  

# --------------------------------------------------------------------------
# print_rhs -- print the content model of a non-terminal
# --------------------------------------------------------------------------
sub print_rhs {

  my $nonterm = shift;

  my $h = $rhs{$nonterm};
  print derive_rhs(\$h);
}


# --------------------------------------------------------------------------
# main
# --------------------------------------------------------------------------

$/ = undef;			# Read whole files into memory

while (<>) {

  $source .= $_ . "\n";		# Keep the original for logging purposes

  s/\/\*.*?\*\///sgo;		# Remove /*...*/ comments

  my @rules = split(';');	# Split at ';'
  $#rules--;			# Remove part after last ';'


  # Loop over rules and analyze them

  foreach (@rules) {

    my ($head, $tail) = split(':');

    $head =~ /\s*(@)?(\w+)\s*(?:\(\s*(\w*)\s*\))?\s*/s
      or die "Error: malformed rule: $_\n";

    # $1 = "@", or undefined
    # $2 = non-terminal being defined
    # $3 = name to use in XML or empty or undefined

    $rhs{$2} = "$tail";
    $is_attr{$2} = defined $1;	# '@' is present
    $is_tagless{$2} = defined $3 && "$3" eq "";
    $tagname{$2} = $is_tagless{$2} ? "$2" : defined $3 ? "$3" : "$2";

    # Remember the start symbol (i.e., the first non-terminal)

    $startsym = $2 if (! defined $startsym);
  }
}

# Sanity check: start symbol must be an element:

if ($is_attr{$startsym} or $is_tagless{$startsym}) {
  warn "Warning: start symbol is not an element; proceeding as if it is.\n"
}

# Write standard XML Schema header.
# It seems XML Schemas cannot be used without XML Namespaces :-(
#print "<!DOCTYPE x:schema SYSTEM\n";
#print "  \"http://www.w3.org/XML/2000/xmlschema/structures.dtd\" [\n";
#print "  <!ENTITY % p \"x:\">\n";
#print "  <!ENTITY % nds \"xmlns:x\">\n";
#print "  ]>\n\n";
print "<x:schema xmlns:x=\"", XSD, "\">\n\n";
#print "  xmlns=\"http://default\"\n";
#print "  targetNamespace=\"http://default\"";
#print "<x:schema xmlns:x=\"http://www.w3.org/1999/XMLSchema\">\n\n";

print "<!--\n";
print "  Generated by ", VERSION, "\n";
print "  on " . gmtime() . " GMT\n";
print "  from the following MNBF source:\n\n";
print $source;
print "-->\n\n";

# Write the start symbol

print "<!-- The start (root) symbol -->\n\n";
print "  <x:element name=\"$tagname{$startsym}\" type=\"$startsym\" />\n\n";

# Loop over rules and write declarations for all types

print "<!-- The types of all elements -->\n\n";

foreach (keys %rhs) {

  if (! $is_attr{$_}) {

    # Declare the type named $_. It will be referred from element
    # declarations inside content models. The content model, if any,
    # is a pointer to a group that will be declared later.

    print "  <x:complexType name=\"$_\"";
    if (is_empty($_)) {
      ;
    } elsif (is_textonly($_)) {
      print " mixed=\"true\">\n";
    } elsif (is_mixed($_)) {
      print " mixed=\"true\">\n";
      print "    <x:group ref=\"$_\" />\n";
    } else {
      print ">\n";
      print "    <x:group ref=\"$_\" />\n";
    }
    print_attributes($_, {}, {});
    print "  </x:complexType>\n\n";
  }

  # Content model for attributes cannot be translated, since Schemas
  # seem to only allow regular expressions to define them.

}

# Loop over rules and define the content models

print "<!-- The content models of all elements -->\n\n";

foreach (keys %rhs) {

  if (! $is_attr{$_}) {

    # Define a group with the same name as the corresponding
    # complexType, so that tagless elements can refer to that. It
    # seems a group cannot have a type and can only refer to another
    # group.

    print "  <x:group name=\"$_\">\n";
    print "    <x:sequence>\n";
    print_rhs($_);
    print "    </x:sequence>\n";
    print "  </x:group>\n\n";
  }
}

# Write standard XML Schema footer

print "</x:schema>\n";
