/*

   Prolog syntax rules for turning EBNF+annotations into 
   RDF statements using a grammar vocabulary.

   The EBNF dialect here is that used by the XML 1.0 Recommendation,
   as extracted by:
       lynx -source http://www.w3.org/TR/2000/REC-xml-20001006 > REC-xml.html
       
       lynx -dump -nolist -width=65535 http://www.w3.org/TR/2000/REC-xml-20001006 | grep -v '^symbol' | grep '::=' | cut -b11- | sed 's;^ *;;' > xml-grammar
        OOPS -- this pipeline seems to miss some productions that
                contain explicit <br>s?
	     --> extraction should start on a ::= line and continue until
	         a blank line or another ::= line.   Do it in perl?
 
   When this works, you should be able to feed it that xml-grammar
   file and it will return a large set of RDF triples which can in
   turn be used to parse XML files.

   Like yacc, etc, we extend the EBNF with annotations.  Any rule may
   be followed by a logical expression in curly-braces.  That
   expression should be in language [...which one?...] and refers to
   elements in the rule by number -- but it can refer to the value or
   the text.   

   Also, the default rules for things like X* are reasonable, making
   it a list.

        a ::= b{2,4}
    ==
        a : b b      { $0 = ($1, $2) }
          | b b b    { $0 = ($1, $2, $3) }
          | b b b b  { $0 = ($1, $2, $3, $4) }
   

*/

% Tabling declarations.
%
% The arities are those of the translated DCG predicates (two extra
% list arguments), e.g. pattern/3 is pattern//1 and word/2 is the
% word//0 recogniser, not the hand-written word/3.
%
% opgap//0 calls itself first, and pattern//1 reaches itself first
% through expr//1 (seq, alt, except and repeat all start by calling
% pattern//1); such left recursion does not terminate under plain
% depth-first resolution, hence the tabling.  The other tabled
% predicates are right-recursive.
% NOTE(review): presumably those are tabled only to avoid repeated
% work while backtracking -- confirm.
%
% auto_table breaks IO!
% :- auto_table.
:- table ruleset/3.
:- table opgap/2.
:- table gap/2.
:- table word/2.
:- table string_contents1/2.
:- table string_contents2/2.
:- table charset_parts/2.
:- table not_star_slash/2.
:- table pattern/3.

:- import append/3 from basics.
:- import member/2 from basics.

% ruleset(-Rules)//
%
% Zero or more rules, each optionally surrounded by whitespace.
% Each rule is appended *after* the rules that follow it in the input,
% so Rules lists the parsed rules in reverse order of appearance.
%
% Earlier sketch, kept for reference:
%   ruleset --> opgap; rule, opgap, [10], opgap, ruleset.

ruleset([]) -->
	opgap.
ruleset(Rules) -->
	opgap,
	rule(Rule),
	opgap,
	ruleset(Following),
	{ append(Following, [Rule], Rules) }.

% rule(-R)//
%
% One production of the form  Name ::= Pattern.
% R is rule(NameText, Pattern), where NameText is the text matched by
% word//1.  A trace line naming the rule is printed once the pattern
% has been parsed.
rule(R) -->
	word(NameText),
	opgap, "::=", opgap,
	pattern(Body),
	{ R = rule(NameText, Body),
	  write('Parsed Rule: '),
	  name(NameAtom, NameText),	% text -> atom, for printing only
	  writeln(NameAtom)
	}.

% pattern(-P)//
% A pattern is either a single atom or a compound expression.
pattern(P) --> atom(P).
pattern(P) --> expr(P).

% atom(-A)//
%
% The indivisible pattern forms:
%   - a name, giving class(Text);
%   - a #x character reference, a quoted string or a [...] charset
%     (these three are only recognised: they leave A unbound);
%   - a parenthesised pattern, whose value is passed through.
atom(class(Text)) --> word(Text).
atom(_)           --> char.
atom(_)           --> string.
atom(_)           --> charset.
atom(A)           --> "(", opgap, pattern(A), opgap, ")".

% expr(-E)//
% The compound pattern forms, tried in this order:
% sequence, alternation, exception (difference) and repetition.
expr(E) --> seq(E).
expr(E) --> alt(E).
expr(E) --> except(E).
expr(E) --> repeat(E).

% opgap//0      : zero or more whitespace characters (left-recursive).
% gap//0        : one or more whitespace characters.
% whitespace//0 : exactly one character accepted by isspace/1
%                 (isspace/1 is not defined in this part of the file).
%
% Alternative formulations that were tried:
%   opgap --> [] ; gap.
%   whitespace --> [10]; [8]; [13]; " ".
opgap --> [].
opgap --> opgap, whitespace.

gap --> whitespace.
gap --> whitespace, gap.

whitespace([Code|Rest], Rest) :-
	isspace(Code).

% Single-character classes.  Each nonterminal consumes exactly one
% character.
%
% normal_punctuation and special_punctuation are meant to be disjoint:
% "/" is classed as special, so it is not listed under
% normal_punctuation.  Every consumer that wants "/" (not_apos,
% not_quote, setchar) adds it explicitly, while not_star_or_slash
% relies on not_special *not* matching it.
%
% NOTE(review): the literal "\" relies on the reader not treating a
% backslash as an escape inside double-quoted strings -- confirm for
% the Prolog system in use.
lower --> "a"; "b"; "c"; "d"; "e"; "f"; "g"; "h"; "i"; "j"; "k"; "l"; "m"; "n"; "o"; "p"; "q"; "r"; "s"; "t"; "u"; "v"; "w"; "x"; "y"; "z".
upper --> "A"; "B"; "C"; "D"; "E"; "F"; "G"; "H"; "I"; "J"; "K"; "L"; "M"; "N"; "O"; "P"; "Q"; "R"; "S"; "T"; "U"; "V"; "W"; "X"; "Y"; "Z".
letter --> lower ; upper.
digit --> "1"; "2"; "3"; "4"; "5"; "6"; "7"; "8"; "9"; "0".
% Upper-case hex digits only.
hexdigit --> "1"; "2"; "3"; "4"; "5"; "6"; "7"; "8"; "9"; "0"; "A"; "B"; "C"; "D"; "E"; "F".
normal_punctuation --> " "; "`";  "="; "~"; "!"; "@"; "$"; "%"; "&"; "("; ")"; "_"; "+"; "|"; "["; "{"; "}"; ";"; ","; "."; "<"; ">"; "?".
% Characters with a meaning of their own in the EBNF notation.
% Every alternative is a single character; ":" and "'" are separate
% alternatives (";"), not the two-character sequence ":'".
special_punctuation --> ":"; "'"; "\"; """"; "-"; "]"; "^"; "#"; "*"; "/".
not_special --> letter; digit; normal_punctuation.
% Any character except an apostrophe / except a double quote.
not_apos --> not_special;  ":";      "\"; """"; "-"; "]"; "^"; "#"; "*"; "/".
not_quote --> not_special; ":"; "'"; "\";       "-"; "]"; "^"; "#"; "*"; "/".

% word//0 : one or more letters (recogniser only; this is the tabled
%           word/2).
word --> letter.
word --> letter, word.

% word(-Text)// : like word//0, but also returns the matched text,
% i.e. the prefix of the input list that word//0 consumed.
word(Text, Input, Rest) :-
	word(Input, Rest),
	append(Text, Rest, Input).

% char//0 : a character reference "#x" followed by one to six hex
% digits.
%
% Six digits are needed because the XML 1.0 grammar mentions
% #x10000 and #x10FFFF; with at most four digits such a reference
% could only be consumed as a shorter reference plus stray digits.
char --> "#x", hexdigit.
char --> "#x", hexdigit, hexdigit.
char --> "#x", hexdigit, hexdigit, hexdigit.
char --> "#x", hexdigit, hexdigit, hexdigit, hexdigit.
char --> "#x", hexdigit, hexdigit, hexdigit, hexdigit, hexdigit.
char --> "#x", hexdigit, hexdigit, hexdigit, hexdigit, hexdigit, hexdigit.

% string//0 : a quoted literal, delimited either by apostrophes or by
% double quotes.  The contents may be any characters other than the
% delimiter in use; there is no escape mechanism.
%
% The two string//0 clauses are kept together (single-quoted form
% first, as before) so that the predicate's clauses are contiguous.
string --> "'", string_contents1, "'".
string --> """", string_contents2, """".

% Contents of a '...' literal: anything but an apostrophe.
string_contents1 --> []; not_apos, string_contents1.

% Contents of a "..." literal: anything but a double quote.
string_contents2 --> []; not_quote, string_contents2.



% Binary pattern operators.  Each starts by calling pattern//1 and is
% itself reached from pattern//1 through expr//1, so all three are
% left-recursive.

% seq(-E)// : Left, whitespace, Right      ==> seq(Left, Right)
seq(seq(Left, Right)) -->
	pattern(Left), gap, pattern(Right).

% alt(-E)// : Left "|" Right               ==> or(Left, Right)
alt(or(Left, Right)) -->
	pattern(Left), opgap, "|", opgap, pattern(Right).

% except(-E)// : Left "-" Right            ==> and(Left, not(Right))
except(and(Left, not(Right))) -->
	pattern(Left), opgap, "-", opgap, pattern(Right).

% repeat(-E)// : a pattern followed by a postfix repetition operator.
% The result is repeat(Pattern, Min, Max), where Max is either a
% number or the atom max (no upper bound):
%     X+  ==>  repeat(X, 1, max)
%     X?  ==>  repeat(X, 0, 1)
%     X*  ==>  repeat(X, 0, max)
repeat(repeat(Inner, 1, max)) --> pattern(Inner), opgap, "+".
repeat(repeat(Inner, 0, 1))   --> pattern(Inner), opgap, "?".
repeat(repeat(Inner, 0, max)) --> pattern(Inner), opgap, "*".


% setchar//0 : what may occur as a single element inside the [...]
% char-set construct.
% Omitted: "-"; "]"; "^"; "#"; ":"  -- if you want one of them, use
% the #x construct.
setchar --> char
	;   not_special
	;   "'" ; "\" ; """" ; "*" ; "/".

% charset//0 : a bracketed character set, optionally inverted with a
% leading "^", and optionally containing a literal dash placed either
% first or last.
charset --> "[",   charset_parts, "]".		% normal
charset --> "[-",  charset_parts, "]".		% with dash
charset --> "[",   charset_parts, "-]".		% with dash
charset --> "[^",  charset_parts, "]".		% inverted
charset --> "[^-", charset_parts, "]".		% inverted with dash
charset --> "[^",  charset_parts, "-]".		% inverted with dash

% charset_parts//0 : a possibly empty run of single elements and
% Low-High ranges.
charset_parts --> [].
charset_parts --> setchar, charset_parts.
charset_parts --> setchar, "-", setchar, charset_parts.

% constraint//0 : a bracketed "VC:" or "WFC:" annotation.
% Disjoint from charset because of the ":", which setchar does not
% accept.
% constraint_expression//0 is not defined in this part of the file.
constraint -->
	"[", opgap, "VC:", constraint_expression, opgap, "]".
constraint -->
	"[", opgap, "WFC:", constraint_expression, opgap, "]".

% comment//0 : a  /* ... */  comment.
%
% This is so ugly because we're not using charset inverses.
% We're not doing nesting here.
comment    --> "/*", not_star_slash, "*/".

% not_star_or_slash//0 : one character that is neither "*" nor "/".
% Every alternative is a single character; ":" and "'" are separate
% alternatives (";"), not the two-character sequence ":'".
% This relies on not_special matching neither "*" nor "/" -- check
% normal_punctuation if comments are mis-parsed.
not_star_or_slash --> not_special; ":"; "'"; "\"; """"; "-"; "]"; "^"; "#".

% not_star_slash//0 : comment body.  A "/" may appear anywhere, but a
% "*" must be followed by a character that is neither "*" nor "/".
not_star_slash --> []
	       ;   not_star_or_slash, not_star_slash
	       ;   "*", not_star_or_slash, not_star_slash
	       ;   "/", not_star_slash.

:- import file_open/3, file_close/2, file_getbuf_list/4 from file_io.
:- import length/2 from basics.
% do(+Filename)
%
% Driver: reads the whole file, echoes it, parses it as a ruleset and
% prints the resulting list of rules.  The parse must consume the
% entire input (the remainder is required to be the empty list).
do(Filename) :-
	read_string(Filename, Codes),
	writeln(''),
	writeln('------Parsing-------'),
	name(Echo, Codes),		% text -> atom, for printing only
	writeln(Echo),
	writeln('------Results--------'),
	ruleset(Rules, Codes, ""),
	writeln(Rules),
	writeln('---------------------').




% xxread_string(+Filename, -Contents)
%
% Alternative reader built on see/1 and get0/1.  Nothing in this part
% of the file calls it.  Makes Filename the current input, reads
% everything with getall/1, then switches back to the previous input.
% Filename is not closed afterwards (there is no call to seen/0).
%
% An earlier experiment read just four characters with get0/1 and
% returned them as Contents = [A,B,C,D].
xxread_string(Filename, Contents) :-
	seeing(Saved),
	see(Filename),
	getall(Contents),
	see(Saved).

% getall(-L)
%
% Reads characters from the current input with get0/1 until it
% returns -1, and returns everything read before that as the list L.
% Progress is traced on the current output.
%
% The end-of-file test is an if-then-else.  Written as
% "..., !, (EofCase) ; (OtherCase)", the ";" binds more loosely than
% ",", so the cut belonged to the first branch and threw the second
% branch away: the predicate then failed on the first character that
% was not end-of-file.
getall(L) :-
	writeln('getall1'),
	get0(N),
	writeln('getall2'),
	(   N = -1
	->  L = [],
	    writeln('EOF.')
	;   write('Not done, read: '),
	    writeln(N),
	    getall(Rest),
	    L = [N|Rest]
	).

% read_string(+Filename, -Contents)
%
% Opens Filename with file_open/3 (mode 0) and returns everything
% getall/2 reads from it.  Commits to the first result.
%
% NOTE(review): the port is never closed -- the close call below is
% commented out, and sits after the clause's final "." anyway.
read_string(Filename, Contents) :-
	file_open(Filename, 0, Port),
	getall(Port, Contents),
	!.
	% file_close(Port).

% getall(+IOPort, -Contents)
%
% Reads lines from IOPort with file_read_line_list/2 until that call
% fails, and returns the concatenation of all lines read.
% NOTE(review): presumably file_read_line_list/2 fails at end of file
% and keeps the line terminator in each line -- confirm.
% NOTE(review): file_read_line_list/2 is not among the names this file
% imports from file_io -- confirm it is available without an import.
%
% If-then-else is used so that the predicate is deterministic.  With a
% plain disjunction every line read left a choice point behind, and
% backtracking into it produced truncated results, because the lines
% already consumed from the port cannot be read again.
getall(IOPort, Contents) :-
	(   file_read_line_list(IOPort, Line)
	->  getall(IOPort, Rest),
	    append(Line, Rest, Contents)
	;   Contents = []
	).


