#!/usr/bin/perl
use strict;
# $Id: extract,v 1.8 2006/04/09 17:20:52 eric Exp $

use XML::DOM;

my $BASE = 'http://www.w3.org/2001/sw/DataAccess/rq23/';
my @EXTRAS = (#'dawgt:approval dawgt:Approved', 
	      #'dawgt:approvedBy <http://lists.w3.org/Archives/Public/public-rdf-dawg/2005JulSep/0175>'
	      );

print "\@prefix rdf:    <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
\@prefix rdfs:	<http://www.w3.org/2000/01/rdf-schema#> .
\@prefix mf:     <http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#> .
\@prefix qt:     <http://www.w3.org/2001/sw/DataAccess/tests/test-query#> .
\@prefix dawgt:   <http://www.w3.org/2001/sw/DataAccess/tests/test-dawg#> .

<>  rdf:type mf:Manifest ;
    rdfs:comment \"examples extracted from the SPARQL Query specification\" ;
    mf:entries
    ( 
";
&extract('../../../rq23/examples.html', '.');
print "    ).
";

# extract -- parse an HTML file, generate data, query and results
# files, and print turtle manifest entries to standard out.
sub extract {
    my ($source, $target) = @_;
    if (! -e $source) {
	die "source file \"$source\" does not exist\n";
    }
    my $sourceDoc = (new XML::DOM::Parser)->parsefile($source);
    my $exampleGroups = $sourceDoc->getElementsByTagName ('div');
    my $nExampleGroups = $exampleGroups->getLength;
    for (my $iExampleGroup = 0; $iExampleGroup < $nExampleGroups; $iExampleGroup++) {
	my $exampleGroup = $exampleGroups->item($iExampleGroup);
	# $exampleGroups will have lower level divs mixed in. Only want exampleGroups.
	next if ($exampleGroup->getAttributeNode('class')->getValue ne 'exampleGroup');

	my ($name, $base, $suffix, $href, $data, $query, $result); # correspond to the test schema
	my $nodes = $exampleGroup->getElementsByTagName ('div');
	my $nNodes = $nodes->getLength;

	# The main loop of walks through the nodes and emulates a
	# reader's understanding of the specification. If a reader
	# sees a data set, a query and a result, they naturally group
	# them. If they see a dataset, query, result, query, result,
	# they assume the dataset applies to both queries. This script
	# turns those into two tests. A query with two (hopefully
	# equivalent) results produces two tests.

	# We keep track of the current (data, query, result) tuple and
	# dump a query whenever the current query group ends, or a
	# change to the tuple requires that we write down the last
	# test. Each test may have only one query or result so a new
	# one of either of these dumps the current test and records
	# the tuple for the next one.

	for (my $iNodes = 0; $iNodes < $nNodes; $iNodes++) {
	    my $node = $nodes->item($iNodes);
	    my $class = $node->getAttributeNode('class')->getValue;
	    if ($class eq 'dataBox') {
		($name, $base, $suffix, $href) = &getTitle($node->getElementsByTagName('h2')->item(0));
		my $filename = "data-$base$suffix.ttl";
		&createFile($filename, &extractPres($node, 'data'), $name);
		push (@$data, $filename);
	    } elsif ($class eq 'queryGroup') {
		if ($query) {
		    # Dump the previous query so we can record this one.
		    &renderEntry($name, $base, $suffix, $href, $data, $query, $result);
		    $result = undef;
		}
		($name, $base, $suffix, $href) = &getTitle($node->getElementsByTagName('h2')->item(0));
		my $filename = "query-$base$suffix.rq";
		&createFile($filename, &extractPres($node, 'query'), $name);
		$query = $filename;
	    } elsif ($class eq 'result') {
		if ($result) {
		    # Dump the previous query so we can record this one.
		    &renderEntry($name, $base, $suffix, $href, $data, $query, $result);
		}
		($name, $base, $suffix, $href) = &getTitle($node->getPreviousSibling);
		my $filename = "result-$base$suffix.ttl";
		if ($node->getElementsByTagName('table')->getLength == 1) {
		    &createFile($filename, &table2turtle($node->getElementsByTagName('table')->item(0), 'resultTurtle'), $name);
		} elsif ($node->getElementsByTagName('pre')->getLength == 1) {
		    my $pre = $node->getElementsByTagName('pre')->item(0);
		    if ($pre->getAttributeNode('class')->getValue eq 'resultAsk') {
			$filename = $pre->getFirstChild->getData eq 'yes' ? 'true.ttl' : 'false.ttl';
		    } else {
			&createFile($filename, &extractPres($node, 'resultTurtle'), $name);
		    }
		} else {
		    my $str = $node->toString;
		    die "what is $node ?";
		}
		$result = $filename;
	    } elsif ($class eq 'result') {
		# result is found in a resultsGroup
	    } else {
		&chattyWarning('skipping unknown object', $node);
	    }
	}

	&renderEntry($name, $base, $suffix, $href, $data, $query, $result);
    }
}

# renderEntry -- print a manifest test entry.
sub renderEntry {
    my ($name, $base, $suffix, $href, $data, $query, $result) = @_;
    my @actionStrs;
    if ($query) {
	push (@actionStrs, "qt:query  <$query>");
    }
    foreach my $datafile (@$data) {
	push (@actionStrs, "qt:data   <$datafile>");
    }
    my $actionStr = join(" ;\n              ", @actionStrs);
    my @strs;
    push (@strs, "mf:action
            [ $actionStr ] ");
    if ($result) {
	push (@strs, "mf:result  <$result> ");
    }
    $href =~ s/^\.\///;
    $href = $BASE.$href;
    my @top = ("[  mf:name    \"spec-$base$suffix\"", 
	       "rdfs:comment \"Extracted: $name <$href>\"");
    print "      ", join(" ;\n         ", @top, @strs, @EXTRAS), "\n      ]\n";
}

# extractPres -- utility function to extract the data in a <pre/>,
# which are used by the XSLT to record data, queries and
# turtleResults (obselete).
sub extractPres {
    my ($node, $class) = @_;
    my $ret;
    my $dataBoxes = $node->getElementsByTagName('pre');
    my $nDataBoxes = $dataBoxes->getLength;
    for (my $iDataBoxes = 0; $iDataBoxes < $nDataBoxes; $iDataBoxes++) {
	my $dataBox = $dataBoxes->item($iDataBoxes);
	next if ($dataBox->getAttributeNode('class')->getValue ne $class);
	&chattyWarning("already found a ${class}", $node) if $ret;
	$ret = join('', $dataBox->getFirstChild->getData);
	&utf8::decode($ret);
    }
#    my $count = @ret;
#    &chattyWarning("not enough ${class}es found ($count)", $node) if (defined $min && $count < $min);
#    &chattyWarning("to many ${class}es found ($count)", $node) if (defined $max && $count > $max);
    return $ret;
}

# getTitle -- extract title info from a DOM node with an appropriate
# schema.
sub getTitle {
    my ($title) = @_;
    my $base = $title->getElementsByTagName ('span')->item(1)->getFirstChild->getData;
    my $suffixEl = $title->getElementsByTagName ('span')->item(2)->getFirstChild;
    my $suffix = $suffixEl ? $suffixEl->getData : undef;
    my $a = $title->getElementsByTagName('a')->item(0);
    my $href = $a->getAttributeNode('href')->getValue;
    my $name = $a->getFirstChild->getData;
    $name =~ s/\"/\'/g;
    return ($name, $base, $suffix, $href);
}

# table2turtle -- convert results encoded in an HTML table into the
# SPARQL manifest turtle results format.
sub table2turtle {

    # pad -- formatting utility to make turtle results human-readable.
    sub pad {
	my ($prefix, @data) = @_;
	my $pad = ' ' x (length($prefix));
	map {s/\n/\n$pad/gm} @data;
	return $prefix.join(" ;\n$pad", @data);
    }

    my ($table) = @_;
    my @l0;
    push (@l0, "\@prefix rdf:  <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .");
    push (@l0, "\@prefix rs:   <http://www.w3.org/2001/sw/DataAccess/tests/result-set#> .");
    push (@l0, "\@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .");
    push (@l0, "");
    my @l1;
    push (@l1, "rdf:type rs:ResultSet");
    my $trs = $table->getElementsByTagName ('tr');
    if ($trs->getLength > 0) {
	my @variables;
	{
	    my $ths = $trs->item(0)->getElementsByTagName ('th');
	    for (my $i = 0; $i < $ths->getLength; $i++) {
		push (@variables,  $ths->item($i)->getFirstChild->getData);
	    }
	    my $varsStr = join("\", \"", @variables);
	    push (@l1, "rs:resultVariable \"$varsStr\"");
	}

	my @solutions;
	for (my $iSolutions = 1; $iSolutions < $trs->getLength; $iSolutions++) {
	    my @bindings;
	    my $tds = $trs->item($iSolutions)->getElementsByTagName ('td');
	    for (my $iBindings = 0; $iBindings < $tds->getLength; $iBindings++) {
		if (my $dataElement = $tds->item($iBindings)->getFirstChild) {
		    my $value = $dataElement->getData;
		    &utf8::decode($value);

		    # check if we are not dealing with an unbound variable
		    if (!($value eq " ")) { 
			push (@bindings, "[ rs:value    $value ;\n  rs:variable \"$variables[$iBindings]\"\n]");
		    } 
		}
	    }
	    
	    push (@solutions, &pad("[ ", map {"rs:binding $_"} @bindings).' ]');
	}
	push (@l1, &pad("", map {"rs:solution $_"} @solutions));
    }
    push (@l0, &pad("[] ", @l1));
    return join ("\n", @l0)." .\n";
}

sub createFile {
    my ($filename, $text, $title) = @_;
    open (F, '>:utf8', $filename) || die "unable to create \"$filename\"";
    print F "# $title\n\n";
    print F $text;
    close F;
}

sub chattyWarning {
    my ($msg, $node) = @_;
    my $str = $node->toString;
    warn "$msg in $str\n";
}

# $Log: extract,v $
# Revision 1.8  2006/04/09 17:20:52  eric
# ~ fixed encoding issues in values
#
# Revision 1.7  2006/04/09 15:27:15  eric
# + function comments
#
# Revision 1.6  2006/04/06 10:35:03  jbroekst
# fixed problem with unbounds in solutions (extract script now simply does not spit out a binding when this occurs).
#
# Revision 1.5  2006/04/04 15:34:29  eric
# added CVS keywords
#
#