/* ast.pl:  routines for working with the abstract syntax trees for
 * regular expressions returned by the grammar in regex.dcg.
 */

/* Copyright (c) 2008 World Wide Web Consortium, 
 * (Massachusetts Institute of Technology, European Research 
 * Consortium for Informatics and Mathematics, Keio University). 
 * All Rights Reserved. This work is distributed under the 
 * W3C(TM) Software License [1] in the hope that it will be 
 * useful, but WITHOUT ANY WARRANTY; without even the implied 
 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * [1] http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231
 */

/* This file is part of Xerophily, a parser for XSD regular expressions. */

/* Revisions:
 * 2008-03-27 : CMSMcQ : made file, beginning with cruft moved out of regex.dcg.
 */
:- module(ast,
	  [ ast_legal/2,
	    simplify_ast/2,
	    print_ast/1,
	    xml_ast/1,
	    ast_awf/2
	  ]).

simplify_ast(or(Expression0,dummy),Expression) :- 
	simplify_ast(Expression0,Expression).
simplify_ast(Expression,Expression) :- 
        not( Expression = or(_X,_Y) ).

/* I don't want to do ast_legal just now. */
ast_legal(_,[]).

/* Two forms of pretty printing.  ASCII ... */
pp_step(2).

print_ast(AST) :- pp(AST,0).
pp(or(L,R),N) :- 
	tab(N), print('or('), nl, 
	pp_step(Step),
	N1 is N + Step,
	pp(L,N1),
	tab(N), print(','), nl, 
	pp(R,N1),
	tab(N), print(')'), nl.
pp(seq(L),N) :- 
	tab(N), print('seq('), nl, 
	pp_step(Step),
	N1 is N + Step,
	ppitems(L,N,N1),
	tab(N), print(')'), nl.
pp(count(Min,Max,Atom),N) :- 
	tab(N), print('rep('), print(Min), print(','), print(Max), print(','), 
	(   simple(Atom)
	->  print(Atom)
	;   nl,
	    pp_step(Step),
	    N1 is N + Step + Step,
	    pp(Atom,N1),
	    tab(N)
	), print(')'), nl.
pp(char(C),N) :- 
	tab(N), print('char('), print(C), print(')'), nl.
pp(charClass(C),N) :- 
	not(C = diff(_,_)),
	tab(N), print('charClass('), print(C), print(')'), nl.
pp(any(C),N) :- 
	tab(N), print('any('), print(C), print(')'), nl.
pp(none(C),N) :- 
	tab(N), print('none('), print(C), print(')'), nl.
pp(charClass(diff(L,R)),N) :- 
	tab(N), print('charClass('), nl,
	pp_step(Step),
	N1 is N + Step,
	N2 is N + Step + Step,
	tab(N1), print('diff('), nl,
	tab(N2), print(L), nl,
	tab(N1), print(','), nl,
	pp(R,N2),
	tab(N1), print(')'), nl,
	tab(N), print(')'), nl.
pp(regexp(E),N) :- 
	tab(N), print('regexp('), nl,
	pp_step(Step),
	N1 is N + Step,
	pp(E,N1),
	tab(N), print(')'), nl.

	

ppitems([],_,N) :- tab(N), print([]), nl.
ppitems([X],_,N) :- pp(X,N), nl.
ppitems([H|T],N0,N1) :- 
	pp(H,N1), 
	tab(N0), print(','), nl,
	ppitems(T,N0,N1).

simple(char(_)).
simple(charClass(C)) :- C \= diff(_,_).
simple(regex(E)) :- E \= regex(_), simple(E).

/* ... and XML.  In contrast to the (first version of the) above, we
 * assume our parent has positioned us and will do a newline after us if
 * needed; N is used ONLY for internal newlines */
att(Name,Value,S) :-
	(   atomic(Value)
	->  concat_atom([Name,'="',Value,'"'],S)
	;   is_list(Value)
	->  concat_atom(Value,' ',AValue),
	    concat_atom([Name,'="',AValue,'"'],S)
	).
stag(GI,Lavs) :-
	start_or_sole(GI,Lavs,'>').
start_or_sole(GI,Lavs,TAGC) :-
	concat_atom(Lavs,' ',ALavs),
	print('<'),
	print(GI),
	(   Lavs = []
	->  print(TAGC)
	;   print(' '),
	    print(ALavs),
	    print(TAGC)
	).
etag(GI) :-
	print('</'),
	print(GI),
	print('>').
elem(GI,Lavs,Content) :-
	stag(GI,Lavs),
	print(Content),
	etag(GI).
e_elem(GI,Lavs) :-
	start_or_sole(GI,Lavs,'/>').

xml_ast(E) :- xml_ast(E,0).

xml_ast(parses(G,AST),N) :-
	xml_ast(parse(G,AST),N).
xml_ast(parse(G,AST),N) :-
	att(grammar, G, A),
	stag(ast,[A]), nl,
	pp_step(Step),
	N1 is N + Step,
	tab(N1),
	xml_ast(AST,N1),
	nl, tab(N), etag(ast), nl.

xml_ast(noparse,_N) :- print('<no-parse/>').
xml_ast(or(L,R),N) :- 
	stag(or,[]),
	pp_step(Step),
	N1 is N + Step,
	nl, tab(N1), xml_ast(L,N1),
	nl, tab(N1), xml_ast(R,N1),
	nl, tab(N), etag('or').

xml_ast(seq(L),N) :- 
	stag('seq',[]),
	pp_step(Step),
	N1 is N + Step,
	do_items(L,N1),
	nl, tab(N), etag('seq').

xml_ast(count(Min,Max,Atom),N) :- 
	pp_step(Step), 
	N1 is N + Step,
	att(min,Min,Amin),
	att(max,Max,Amax),
	stag(rep,[Amin,Amax]),
	(   simple(Atom)
	->  xml_ast(Atom,N1)
	;   nl, tab(N1),
	    xml_ast(Atom,N1),
	    nl, tab(N)
	),
	etag('rep').
xml_ast(char(C),_N) :- elem(c,[],C).

xml_ast(charClass(C),N) :- 
	not(C = diff(_,_)),
	pp_step(Step), N1 is N + Step,
	stag(charClass,[]),
	nl, tab(N1), xml_ast(C,N1),
	nl, tab(N), etag(charClass).
xml_ast(any(C),N) :- 
	pp_step(Step), N1 is N + Step,
	stag('any-of',[]),
	length(C,L),
	(   L > 1
	->  do_items(C,N1),
	    nl, tab(N)
	;   L = 1
	->  C = [C1],
	    xml_ast(C1,N1)
	;   print('<empty/>')
	),	
	etag('any-of').
xml_ast(none(C),N) :- 
	pp_step(Step), N1 is N + Step,
	stag('none-of',[]),
	length(C,L),
	(   L > 1
	->  do_items(C,N1),
	    nl, tab(N)
	;   L = 1
	->  C = [C1],
	    xml_ast(C1,N1)
	;   print('<empty/>')
	),	
	etag('none-of').
xml_ast(sce(C),_N) :- elem('sce',[],C).
xml_ast(category(C),_N) :- elem('category',[],C).
xml_ast(notcategory(C),_N) :- elem('not-category',[],C).
xml_ast(kw(KW),_N) :-
	not(KW = not(_)),
	att(kw,KW,A),
	e_elem('mce',[A]).
xml_ast(kw(not(KW)),_N) :-
	att('not-kw',KW,A),
	e_elem('mce',[A]).
xml_ast(range(From,To),N) :-
	(   atom(From), atom(To)
	->  simple_range(From,To)
	;   complex_range(From,To,N)
	).
xml_ast(category(Catcode),_N) :-
	not(Catcode=block(_Blockname)),
	att(code,Catcode,A),
	e_elem('category',[A]).
xml_ast(category(block(Blockname,Start,End)),_N) :-
	att(name,Blockname,Ab),
	att(start,Start,As),
	att(end,End,Ae),
	e_elem('block',[Ab,As,Ae]).
xml_ast(category(blockname(Blockname)),_N) :-
	att(name,Blockname,Ab),
	e_elem('block',[Ab]).
xml_ast(category(unknownblock(Blockname)),_N) :-
	att(name,Blockname,Ab),
	e_elem('unknown-block',[Ab]).
xml_ast(category(Catcode),_N) :-
	att(code,Catcode,A),
	e_elem('category',[A]).
xml_ast(charClass(diff(L,R)),N) :- 
	pp_step(Step), N1 is N + Step,
	stag(charClass,[]),
	stag(difference,[]),
	nl, tab(N1), xml_ast(L,N1),
	nl, tab(N1), xml_ast(R,N1),
	nl, tab(N), etag(diff), etag(charClass).
xml_ast(regexp(E),N) :- 
	stag(re,[]),
	pp_step(Step),
	N1 is N + Step,
	nl, tab(N1), xml_ast(E,N1),
	nl, tab(N), etag(re).
xml_ast(A,_N) :-
	atom(A),
	(   atom_chars(A,[A]) % single character
	->  stag('c',[]), print(A), etag('c')
	;   stag('what-on-earth',[]), print(A), etag('what-on-earth')
	).


do_items([],_N).
do_items([X],N) :- 
	nl, tab(N), xml_ast(X,N).
do_items([H|T],N) :- 
	nl, tab(N), xml_ast(H,N),
	do_items(T,N).


simple_range(From,To) :-
	att(from,From,A1),
	att(to,To,A2),
	e_elem('simple-range',[A1,A2]).
complex_range(From,To,N) :-
	stag('range',[]),
	xml_ast(From,N),
	xml_ast(To,N),
	etag('range').

/* ... and Antjewierden/Wielemaker Form.  Just write elements, in AWF,
 * and let xml_write take care of things. 
 */
iatt(Name,Value,(Name=Value)) :- atomic(Value).
iatt(Name,List,(Name=A)) :-
	is_list(List),
	concat_atom(List,' ',A).
ielem(GI,Lavs,Content,element(GI,Lavs,Content)).

ast_awf(parses(G,AST),AWF) :-
	ast_awf(parse(G,AST),AWF).
ast_awf(parse(G,AST),element(ast,
			     [A],
			     [AWF])) :-
	iatt(grammar, G, A),
	ast_awf(AST,AWF).

ast_awf(noparse,element(no_parse,[],[])).
ast_awf(or(L0,R0),element(or,[],[L,R])) :- 
	ast_awf(L0,L),
	ast_awf(R0,R).

ast_awf(seq(L),element(seq,[],Ch)) :- 
	awf_do_items(L,Ch).

ast_awf(count(Min,Max,Atom0),
       element(count,[min=Min,max=Max],[Atom])) :- 
	ast_awf(Atom0,Atom).

/* does this need to do something clever with control characters, etc.? */
ast_awf(char(C),element(c,[],[C])).

ast_awf(charClass(C0),element(charClass,[],[C])) :- 
	ast_awf(C0,C).

ast_awf(any(C0),element(any_of,[],Cs)) :- 
	length(C0,L),
	(   L > 1
	->  awf_do_items(C0,Cs)
	;   L = 1
	->  C0 = [E0],
	    ast_awf(E0,E),
	    Cs = [E]
	;   Cs = [element(empty,[],[])]
	).

ast_awf(none(C0),element(none_of,[],Cs)) :- 
	length(C0,L),
	(   L > 1
	->  awf_do_items(C0,Cs)
	;   L = 1
	->  C0 = [E0],
	    ast_awf(E0,E),
	    Cs = [E]
	;   Cs = [element(empty,[],[])]
	).


ast_awf(sce(C),element(sce,[],[C])).
ast_awf(category(C),element(category,[],[C])).
ast_awf(notcategory(C),element(not_category,[],[C])).
ast_awf(kw(KW),element(mce,[A],[])) :-
	not(KW = not(_)),
	iatt(kw,KW,A).
ast_awf(kw(not(KW)),element(mce,[A],[])) :-
	iatt('not_kw',KW,A).

ast_awf(range(From,To),AWF) :-
	(   atom(From), atom(To)
	->  awf_simple_range(From,To,AWF)
	;   awf_complex_range(From,To,AWF)
	).
ast_awf(category(Catcode),element(category,[A],[])) :-
	not(Catcode=block(_Blockname)),
	iatt(code,Catcode,A).
ast_awf(category(block(Blockname,Start,End)),
	element(block,[Ab,As,Ae],[])) :-
	iatt(name,Blockname,Ab),
	iatt(start,Start,As),
	iatt(end,End,Ae).

ast_awf(category(blockname(Blockname)),
	element(block,[Ab],[])) :-
	iatt(name,Blockname,Ab).

ast_awf(category(unknownblock(Blockname)),
	element(unknown_block,[Ab],[])) :-
	iatt(name,Blockname,Ab).

ast_awf(category(Catcode),element(category,[A],[])) :-
	iatt(code,Catcode,A).

ast_awf(diff(L0,R0),element(difference,[],[L,R])) :- 
	ast_awf(L0,L),
	ast_awf(R0,R).

ast_awf(regexp(E0),element(re,[],[E])) :- 
	ast_awf(E0,E).

ast_awf(A,AWF) :-
	atom(A),
	(   atom_chars(A,[A]) % single character
	->  AWF = element(c,[],[A])
	;   AWF = element(what_on_earth,[],[A])
	).


awf_do_items([],[]).
awf_do_items([X0],[X]) :- 
	ast_awf(X0,X).
awf_do_items([H0|T0],[H|T]) :- 
	ast_awf(H0,H),
	awf_do_items(T0,T).


awf_simple_range(From,To,element(simple_range,[A1,A2],[])) :-
	iatt(from,From,A1),
	iatt(to,To,A2).

awf_complex_range(From,To,element(range,[],[Start,End])) :-
	ast_awf(From,Start),
	ast_awf(To,End).

