/* testgenerator1.pl: Test generator for regexes */

/* Copyright (c) 2008 World Wide Web Consortium, 
 * (Massachusetts Institute of Technology, European Research 
 * Consortium for Informatics and Mathematics, Keio University). 
 * All Rights Reserved. This work is distributed under the 
 * W3C(TM) Software License [1] in the hope that it will be 
 * useful, but WITHOUT ANY WARRANTY; without even the implied 
 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * [1] http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231
 */

/* This file is part of Xerophily, a parser for XSD regular expressions. */

:- module(testgenerator1,
          [
            teststring/2,
            teststring/3
          ]).

/* experiment:  try suppressing this call.  
:- ensure_loaded('regex.dcg.pl').
*/
:- use_module(g_opts).

/* s(n,NT,s):  n is a number; will be used for random test generation.
 * NT is a non-terminal (used for making sure these are actually legal
 * strings against the non-terminals I think they are legal against).
 * s is a string; should be legal against the non-terminal.
 */

teststring(N,S) :- teststring(N,_,S).
teststring(N,A,S) :-
	/* generate n random numbers */
	random_numbers(N,Ns),
	/* concatenate the strings they select */
	numbers_strings(Ns,Ss),
	concat_atom(Ss,A),
	atom_chars(A,S).

random_numbers(0,[]).
random_numbers(N0,[R|Rs]) :-
	N0 > 0,
	N1 is N0 - 1,
	R is random(128),
	random_numbers(N1,Rs).

numbers_strings(Ns,Ss) :- maplist(n_s,Ns,Ss).
n_s(N,S) :- s(N,_,S).

/* verify:  check that the strings really are what I think they are.
 * Unfortunately, this doesn't work as written:  not all of the
 * nonterminals in the grammar take exactly two arguments.
 * So I'm skipping this verification step for now; it's not really
 * crucial. -MSS, 2008-05-27
 */
verify_once_failable(AST) :-
	default_grammar(G),
	get_grammar_options(G,Opts),
	s(_,NT,A),
	NT \= terminal,
	verify(NT,Opts,A,AST).

verify(NT,Opts,A,AST) :-
	Head =.. [NT, Opts, AST],
	atom_chars(A,S),
	phrase(Head,S).	
	

/* Do expressions for each non-terminal in the grammar, at least
 * the ones that appear on right-hand sides (here, all of them do).
 *
 * Imagine the approriate set of ATNs as a single large regex 
 * consisting of a choice over all the ATNs, and with an alphabet
 * containing (a) all the real terminal symbols (or at least
 * one representative of each equivalence class) and (b) pseudo-atoms
 * (here represented as strings) matching each non-terminal.
 * It's then clear (a) how many states there are and (b) what
 * the largest minimum path length is.
 */


/* DCG for XML Schema 1.0 regular expressions.
 * Parse the expression and return a structure.
 * 
 * Revisions:
 * 2008-05-24 : CMSMcQ : build a set of substrings for random string generation,
 *                       starting from regex.dcg.pl
 */


/* "A regular expression is composed from zero or more branches,
 * separated by | characters."
 *
 * [1] regExp ::= branch ( '|' branch )* 
 */
/* (Actually, the grammar requires at least one branch; the prose is wrong,
 * misguided perhaps by the fact that the branch can be empty. 
 */
s(0,regExp,'0').
s(1,regExp,'a').
s(2,regExp,'').
s(3,regExp,'[abc]').
s(4,regExp,'a|b|cde').
s(5,terminal,'|').

s(6,altbranches,'|xyz').
s(7,altbranches,'|[a-z]|[0-9]*').

/* [Definition:] A branch consists of zero or more pieces,
 * concatenated together.
 */
/* [2] branch ::= piece* */

s(8,branch,'x').
s(9,branch,'x*y+z{2,5}').

 
/* [Definition:] A piece is an atom, possibly followed by a
 * quantifier.
 *
 * [3] piece ::= atom quantifier?  
 */


/* [Definition:] A quantifier is one of ?, *, +, {n,m} or {n,}, which
 * have the meanings defined in the table above.
 *
 * [4] quantifier ::= [?*+] | ( '{' quantity '}' ) 
 */
/* [5] quantity ::= quantRange | quantMin | QuantExact */
/* [6] quantRange ::= QuantExact ',' QuantExact */
/* [7] quantMin ::= QuantExact ',' */
/* [8] QuantExact ::= [0-9]+ */

s(10,quantifier,'?').
s(11,quantifier,'+').
s(12,quantifier,'*').
s(13,terminal,'{').
s(14,terminal,'}').
s(15,terminal,',').
s(16,quantifier,'{1,9}').
s(17,quantifier,'{3,}').
s(18,quantifier,'{3}').
s(19,maxspec,',2').
s(20,maxspec,',').
s(21,quantexact,'2').
s(22,quantexact,'999').
s(23,digit,'3').


/* [Definition:] An atom is either a normal character, a character
 * class, or a parenthesized regular expression.
 * 
 * [9] atom ::= Char | charClass | ( '(' regExp ')' ) 
 */

s(24,re_atom,'3').
s(25,re_atom,'x').
s(26,re_atom,'[xyz]').
s(27,re_atom,'([xyz])').
s(28,re_atom,'(([xyz]))').
s(29,re_atom,'((([xyz])))').
s(30,terminal,'(').
s(31,terminal,')').

/* [Definition:] A metacharacter is either ., \, ?, *, +, {, } (, ), |, 
 * [, or ]. These characters have special meanings in regular
 * expressions, but can be escaped to form atoms that denote the sets
 * of strings containing only themselves, i.e., an escaped
 * metacharacter behaves like a normal character.
 */

s(32,terminal,'.').
s(33,terminal,'\\').
s(34,terminal,'?').
s(35,terminal,'*').
s(36,terminal,'+').
s(37,terminal,'{').
s(38,terminal,'}').
s(39,terminal,'(').
s(40,terminal,')').
s(41,terminal,'|').

/* N.B. XSD 1.0 omitted | from this list, as did drafts D4 and D5 
 * of XSD 1.1.  D6 and later include it.  The grammar rules have 
 * always had it, so it's unlikely that any parsers treat | as a 
 * normal character, but there's a grammar option for it, just in
 * case.
 */

/* [Definition:] A normal character is any XML character that is not a
 * metacharacter. In regular expressions, a normal character is an
 * atom that denotes the singleton set of strings containing only
 * itself.
 *
 * 1.0 1E, PER, 2E, D4, and D5 have:
 *
 *     [10] Char ::= [^.\?*+()|#x5B#x5D] 
 *
 * Later versions (D6, LC, D8, W) have:
 *
 *     [10] Char ::= [^.\?*+{}()|#x5B#x5D] 
 */

s(42,char,'.').
s(43,char,'\\').
s(44,char,'?').
s(45,char,'*').
s(46,char,'+').
s(47,char,'(').
s(48,char,')').
s(49,char,'[').
s(50,char,']').
s(51,char,'|').
s(52,char,'{').
s(53,char,'}').


/* Note that a normal character can be represented either as itself,
 * or with a character reference.
 */

/* F.1 Character Classes */

/* [Definition:] A character class is an atom R that identifies a set
 * of characters C(R). The set of strings L(R) denoted by a character
 * class R contains one single-character string "c" for each character
 * c in C(R).
 *
 * 1.0 1E has:
 * 
 *     [11] charClass ::= charClassEsc | charClassExpr
 * 
 * All other forms of the grammar have:
 *
 * [11] charClass ::= charClassEsc | charClassExpr | WildcardEsc
 *
 */

s(54,charClass,'\n').
s(55,charClass,'\\{').
s(56,charClass,'\\}').
s(57,charClass,'\\s').
s(58,charClass,'\\d').
s(59,charClass,'\\W').
s(60,charClass,'.').
s(61,charClass,'[aeiou]').
s(62,charClass,'[a-zA-Z]').
s(63,charClass,'[^a-zA-Z]').
s(64,charClass,'\\p{L}').
s(65,charClass,'\\p{IsBasicLatin}').
s(66,charClass,'\\p{IsBasicallyLatin}').

 
/* A character class is either a character class escape or a character
 * class expression or a wildcard escape.
 */
/* [Definition:] A character class expression is a character group
 * surrounded by [ and ] characters. For all character groups G, [G]
 * is a valid character class expression, identifying the set of
 * characters C([G]) = C(G).
 *
 * [12] charClassExpr ::= '[' charGroup ']' 
 */

s(67,charClassExpr,'[abc]').
s(68,charClassExpr,'[^abc]').
s(69,charClassExpr,'[\\p{IsBasicLatin}-[aeiou]]').
s(70,charClassExpr,'[^abc-[aeiou]]').

/* [Definition:] A character group is either a positive character
 * group, a negative character group, or a character class
 * subtraction.
 *
 * Grammars 1E through D8 have:
 *
 *     [13] charGroup ::= posCharGroup | negCharGroup | charClassSub 
 * 
 * W has:
 * 
 *     [76] charGroup ::= (posCharGroup | negCharGroup) ('-' charClassExpr)?
 *
 */
s(71,charGroup,'abc').
s(72,charGroup,'a-z').
s(73,charGroup,'a-zA-Z0-9').
s(74,charGroup,'^a-zA-Z0-9').
s(75,charGroup,'^a-zA-Z0-9[abc]').
s(76,charGroup,'^a-zA-Z0-9-[abc]').

/* [Definition:] A positive character group consists of one or more
 * character ranges or character class escapes, concatenated
 * together. A positive character group identifies the set of
 * characters containing all of the characters in all of the sets
 * identified by its constituent ranges or escapes.
 *
 * Most forms of the grammar have:
 *
 *     [14] posCharGroup ::= ( charRange | charClassEsc )+ 
 * 
 * W has:
 * 
 *     [77] posCharGroup ::= (charGroupPart)+
 *
 */

/* N.B. "The ^ character is only valid at the beginning of a positive 
 * character group if it is part of a negative character group".
 *
 * So we pass a parameter on the call to posCharGroup, to act as a flag: 
 *   - 'nolc' means no leading caret is allowed.
 *   - 'oklc' means leading caret is allowed.
 * We could check this in the AST, instead of here.  But for most
 * purposes, it's more convenient to impose that rule here instead
 * of generating multiple parse trees and then winnowing them.
 *
 */
s(77,posCharGroup,'xyz').
s(78,posCharGroup,'q-v').
s(79,posCharGroup,'\\+').
s(80,posCharGroup,'\\-').
s(81,posCharGroup,'\\[').
s(82,posCharGroup,'\\]').
s(83,posCharGroup,'\\^').

/* [Definition:] A negative character group is a positive character
 * group preceded by the ^ character. For all positive character
 * groups P, ^P is a valid negative character group, and C(^P)
 * contains all XML characters that are not in C(P).
 *
 * [15] negCharGroup ::= '^' posCharGroup 
 */
s(84,posCharGroup,'^xyz').
s(85,posCharGroup,'^q-v').
s(86,posCharGroup,'^\\+').
s(87,posCharGroup,'^\\-').
s(88,posCharGroup,'^\\^').
s(89,posCharGroup,'^^').


/* [Definition:] A character class subtraction is a character class
 * expression subtracted from a positive character group or negative
 * character group, using the - character.
 *
 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-'
 *                       charClassExpr 
 */

s(90,charClassSub,'a-z-[aeiou]').
s(91,charClassSub,'abcABC-[aeiou]').

/* [Definition:] A character group part (charGroupPart) is either a
 * single unescaped character (SingleCharNoEsc), a single escaped
 * character (SingleCharEsc), or a character range (charRange).
 *
 * 
 * [79] charGroupPart ::= singleChar | charRange
 * [80] singleChar ::= SingleCharEsc | SingleCharNoEsc 
 *
 * If a charGroupPart starts with a singleChar and this is immediately
 * followed by a hyphen, and if the hyphen is part of the character
 * group (that is, it is not being treated as a substraction operator
 * because it is followed by '['), then the hyphen must be followed by
 * another singleChar, and the sequence (singleChar, hyphen,
 * singleChar) is treated as a charRange. It is an error if either of
 * the two singleChars in a charRange is a SingleCharNoEsc comprising
 * an unescaped hyphen.
 *
 * (This seems to rule out groups like [a-], perhaps intentionally.)
 *
 * More formally, I read that paragraph as saying
 * if (lookahead('-') and not lookahead('-]'), then fail.
 * Reversed to be a condition of success, this becomes the guard
 * ( not lookahead('-') or lookahead('-[') ), or in Prolog as below.
 */

s(92,charGroupPart,'a').
s(93,charGroupPart,'x').
s(94,charGroupPart,'x-z').
s(95,charGroupPart,'\\t').

/* [Definition:] A character range R identifies a set of characters
 * C(R) containing all XML characters with UCS code points in a
 * specified range.
 *
 * Several variants.  charrange('1E') (1.0 1E) has:
 *
 *     [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash  
 *
 * charrange('PER') (1.0 2E PER) has:
 * 
 *     [17] charRange ::= seRange | XmlChar  
 *
 * charrange('2E') (2E, D4-D8) has:
 *
 *     [17] charRange ::= seRange | XmlCharIncDash
 *
 * charRange('W') (draft proposal for bug 1889) has:
 * 
 *     [82] charRange ::= singleChar '-' singleChar
 */  

s(96,charRange,'a-z').
s(97,charRange,'\\n-z').
s(98,charRange,'-').


/* [18] seRange ::= charOrEsc '-' charOrEsc */
/* not reachable if we have grammar option charrange('W'). */


/* [19] XmlCharRef ::= ( '&#' [0-9]+ ';' ) 
 *                   | (' &#x' [0-9a-fA-F]+ ';' ) 
 */
/* Only reachable in 1E */


/* [20] charOrEsc ::= XmlChar | SingleCharEsc */
/* not reachable if we have grammar option charrange('W'). */


/* [21] XmlChar ::= [^\#x2D#x5B#x5D] 
 * i.e. any character but "\", "-",  "[", or "]".
 * If the carethack is on and the CaretFlag argument is 'nolc',
 * then also exclude '^'.
 */
s(99,terminal,'^').
s(100,terminal,'-').

/* XmlCharIncDash is present in 1E, 2E, D4-D8, but not in PER or W. */
/* [22] XmlCharIncDash ::= [^\#x5B#x5D] 
 * i.e. any but \ [ ]
 */


/* 1E through D8 impose some form or other of the following constraints
 * in prose; W omits all of them.
 *
 * "A single XML character is a character range that identifies the set
 * of characters containing only itself. All XML characters are valid
 * character ranges, except as follows:"
 *
 *   "The [, ], - and \ characters are not valid character ranges;"
 *
 * Imposed in this program by a guard on charRange (not checked for 
 * grammar version W).
 *
 *   "The ^ character is only valid at the beginning of a positive
 *   character group if it is part of a negative character group"
 *
 * Imposed here by the caret hack (CaretFlag argument).
 * W has a similar rule elsewhere, so this is checked pretty much
 * always.
 *
 *   "The - character is a valid character range only at the
 *   beginning or end of a positive character group."
 *
 * Not in PER or W.  Constraint imposed here by call to hyphen_ok
 * with the content of any positive character group.
 *
 *   A character range may also be written in the form s-e,
 *   identifying the set that contains all XML characters with UCS
 *   code points greater than or equal to the code point of s, but not
 *   greater than the code point of e.
 */
/* All versions up to W say:
 * "s-e is a valid character range iff:
 *
 *   "s is a single character escape, or an XML character; 
 *   "s is not \ 
 *   "If s is the first character in a character class expression, 
 *      [??? surely you mean in a positive character group?!]
 *      then s is not ^ 
 *   "e is a single character escape, or an XML character; 
 *   "e is not \ or [; and 
 *   "The code point of e is greater than or equal to the code point of
 *      s."
 */
/* Of these, 1 and 4 are enforced by the grammar,
 * 3 by the carethack flag, and 2, 5, and 6 are enforced 
 * by a guard in the rule for serange.
 */

/* W adds:
 * 
 * [87] SingleCharNoEsc ::= [^\#x5B#x5D] 
 * 
 * A single unescaped character (SingleCharNoEsc) is any character
 * except '[' or ']'. There are special rules, described earlier, that
 * constraint the use of the characters '-' and '^' in order to
 * disambiguate the syntax.
 */


/* N.B. The prose of proposal W does not mention \, so the correct
 * rule may be 	not(member(Atom, [ '[',']' ])) }.
 * Not something to make a grammar option for, though.
 */


/* F.1.1 Character Class Escapes
 *
 * [Definition:] A character class escape is a short sequence of
 * characters that identifies predefined character class. The valid
 * character class escapes are the single character escapes, the
 * multi-character escapes, and the category escapes (including the
 * block escapes).
 *
 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc |
 *                         complEsc ) 
 */


/* [Definition:] A single character escape identifies a set containing
 * a only one character -- usually because that character is difficult
 * or impossible to write directly into a regular expression.
 */
/* [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E] */
/*                             nrt\|.?*+(){} -   [   ]   ^    */


/* The valid single character escapes are: Identifying the set of
 * characters C(R) containing:
 *
 *   \n the newline character (#xA) 
 *   \r the return character (#xD) 
 *   \t the tab character (#x9) 
 *   \\ \ 
 *   \| | 
 *   \. . 
 *   \- - 
 *   \^ ^ 
 *   \? ? 
 *   \* * 
 *   \+ + 
 *   \{ { 
 *   \} } 
 *   \( ( 
 *   \) ) 
 *   \[ [ 
 *   \] ] 
 */

s(101,terminal,'\\n').
s(102,terminal,'\\r').
s(103,terminal,'\\t').
s(104,terminal,'\\\\').
s(105,terminal,'\\|').
s(106,terminal,'\\.').
s(107,terminal,'\\-').
s(108,terminal,'\\^').
s(109,terminal,'\\?').
s(110,terminal,'\\*').
s(111,terminal,'\\+').
s(112,terminal,'\\{').
s(113,terminal,'\\}').
s(114,terminal,'\\(').
s(115,terminal,'\\)').
s(116,terminal,'\\[').
s(117,terminal,'\\]').

/* [Definition:] [Unicode Database] specifies a number of possible
 * values for the "General Category" property and provides mappings
 * from code points to specific character properties. The set
 * containing all characters that have property X, can be identified
 * with a category escape \p{X}. The complement of this set is
 * specified with the category escape \P{X}. ([\P{X}] = [^\p{X}]).
 */
/* [25] catEsc ::= '\p{' charProp '}' */

s(118,terminal,'\\p{').
s(119,terminal,'\\P{').
s(120,terminal,'}').
s(121,terminal,'}').

/* [26] complEsc ::= '\P{' charProp '}' */


/* [27] charProp ::= IsCategory | IsBlock */


/* Note: [Unicode Database] is subject to future revision. For
 * example, the mapping from code points to character properties might
 * be updated. All minimally conforming processors must support the
 * character properties defined in the version of [Unicode Database]
 * that is current at the time this specification became a W3C
 * Recommendation. However, implementors are encouraged to support the
 * character properties defined in any future version.
 */
/* The following table specifies the recognized values of the "General
 * Category" property.
 *
 *   Category Property Meaning 
 *   Letters     L  All Letters 
 *               Lu uppercase 
 *               Ll lowercase 
 *               Lt titlecase 
 *               Lm modifier 
 *               Lo other 
 *
 *   Marks       M  All Marks 
 *               Mn nonspacing 
 *               Mc spacing combining 
 *               Me enclosing 
 *
 *   Numbers     N  All Numbers 
 *               Nd decimal digit 
 *               Nl letter 
 *               No other 
 *
 *   Punctuation P  All Punctuation 
 *               Pc connector 
 *               Pd dash 
 *               Ps open 
 *               Pe close 
 *               Pi initial quote (may behave like Ps or Pe 
 *                  depending on usage) 
 *               Pf final quote (may behave like Ps or Pe 
 *                  depending on usage) 
 *               Po other 
 *
 *   Separators  Z  All Separators 
 *               Zs space 
 *               Zl line 
 *               Zp paragraph 
 *
 *   Symbols     S All Symbols 
 *               Sm math 
 *               Sc currency 
 *               Sk modifier 
 *               So other 
 *
 *   Other       C  All Others 
 *               Cc control 
 *               Cf format 
 *               Co private use 
 *               Cn not assigned 
 */

/* [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
 *                     Separators | Symbols | Others */


/* [29] Letters ::= 'L' [ultmo]?  */
/* [30] Marks ::= 'M' [nce]?  */
/* [31] Numbers ::= 'N' [dlo]?  */
/* [32] Punctuation ::= 'P' [cdseifo]?  */
/* [33] Separators ::= 'Z' [slp]?  */
/* [34] Symbols ::= 'S' [mcko]?  */
/* [35] Others ::= 'C' [cfon]?  */

s(122,terminal,'Lu').
s(123,terminal,'Sk').
s(124,terminal,'Cn').

/* Note: The properties mentioned above exclude the Cs property. The
 * Cs property identifies "surrogate" characters, which do not occur
 * at the level of the "character abstraction" that XML instance
 * documents operate on.
 */
/* [Definition:] [Unicode Database] groups code points into a number
 * of blocks such as Basic Latin (i.e., ASCII), Latin-1 Supplement,
 * Hangul Jamo, CJK Compatibility, etc. The set containing all
 * characters that have block name X (with all white space stripped
 * out), can be identified with a block escape \p{IsX}. The complement
 * of this set is specified with the block escape \P{IsX}. ([\P{IsX}]
 * = [^\p{IsX}]).
 */
/* [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+ */
s(125,'IsBlock','abracadabra').
s(126,'IsBlock','IsGreek').
s(126,'IsBlock','IsGreek').
s(127,'IsBlock','IsCJKCompatibilityIdeographsSupplement').

/* [Definition:] A multi-character escape provides a simple way to
 * identify a commonly used set of characters:
 *
 * 1E has:
 *
 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])  
 *
 * All later versions have:
 *
 * [37] MultiCharEsc ::= '\' [sSiIcCdDwW]  
 */


/* ? [37a] WildcardEsc ::= '.' ?  */


/*   Character sequence Equivalent character class 
 *
 *   .  [^\n\r] 
 *   \s [#x20\t\n\r] 
 *   \S [^\s] 
 *   \i the set of initial name characters, 
 *      those matched by Letter | '_' | ':' 
 *   \I [^\i] 
 *   \c the set of name characters, those matched by NameChar 
 *   \C [^\c] 
 *   \d \p{Nd} 
 *   \D [^\d] 
 *   \w [#x0000-#x10FFFF]-[\p{P}\p{Z}\p{C}] 
 *      (all characters except the set of "punctuation", 
 *      "separator" and "other" characters)  
 *   \W [^\w] 
 */
/* Note: The regular expression language defined here does not attempt
 * to provide a general solution to "regular expressions" over UCS
 * character sequences. In particular, it does not easily provide for
 * matching sequences of base characters and combining marks. The
 * language is targeted at support of "Level 1" features as defined in
 * [Unicode Regular Expression Guidelines]. It is hoped that future
 * versions of this specification will provide support for "Level 2"
 * features.
 */