#!/usr/bin/perl use strict; # $Id: extract,v 1.8 2006/04/09 17:20:52 eric Exp $ use XML::DOM; my $BASE = 'http://www.w3.org/2001/sw/DataAccess/rq23/'; my @EXTRAS = (#'dawgt:approval dawgt:Approved', #'dawgt:approvedBy ' ); print "\@prefix rdf: . \@prefix rdfs: . \@prefix mf: . \@prefix qt: . \@prefix dawgt: . <> rdf:type mf:Manifest ; rdfs:comment \"examples extracted from the SPARQL Query specification\" ; mf:entries ( "; &extract('../../../rq23/examples.html', '.'); print " ). "; # extract -- parse an HTML file, generate data, query and results # files, and print turtle manifest entries to standard out. sub extract { my ($source, $target) = @_; if (! -e $source) { die "source file \"$source\" does not exist\n"; } my $sourceDoc = (new XML::DOM::Parser)->parsefile($source); my $exampleGroups = $sourceDoc->getElementsByTagName ('div'); my $nExampleGroups = $exampleGroups->getLength; for (my $iExampleGroup = 0; $iExampleGroup < $nExampleGroups; $iExampleGroup++) { my $exampleGroup = $exampleGroups->item($iExampleGroup); # $exampleGroups will have lower level divs mixed in. Only want exampleGroups. next if ($exampleGroup->getAttributeNode('class')->getValue ne 'exampleGroup'); my ($name, $base, $suffix, $href, $data, $query, $result); # correspond to the test schema my $nodes = $exampleGroup->getElementsByTagName ('div'); my $nNodes = $nodes->getLength; # The main loop of walks through the nodes and emulates a # reader's understanding of the specification. If a reader # sees a data set, a query and a result, they naturally group # them. If they see a dataset, query, result, query, result, # they assume the dataset applies to both queries. This script # turns those into two tests. A query with two (hopefully # equivalent) results produces two tests. # We keep track of the current (data, query, result) tuple and # dump a query whenever the current query group ends, or a # change to the tuple requires that we write down the last # test. Each test may have only one query or result so a new # one of either of these dumps the current test and records # the tuple for the next one. for (my $iNodes = 0; $iNodes < $nNodes; $iNodes++) { my $node = $nodes->item($iNodes); my $class = $node->getAttributeNode('class')->getValue; if ($class eq 'dataBox') { ($name, $base, $suffix, $href) = &getTitle($node->getElementsByTagName('h2')->item(0)); my $filename = "data-$base$suffix.ttl"; &createFile($filename, &extractPres($node, 'data'), $name); push (@$data, $filename); } elsif ($class eq 'queryGroup') { if ($query) { # Dump the previous query so we can record this one. &renderEntry($name, $base, $suffix, $href, $data, $query, $result); $result = undef; } ($name, $base, $suffix, $href) = &getTitle($node->getElementsByTagName('h2')->item(0)); my $filename = "query-$base$suffix.rq"; &createFile($filename, &extractPres($node, 'query'), $name); $query = $filename; } elsif ($class eq 'result') { if ($result) { # Dump the previous query so we can record this one. &renderEntry($name, $base, $suffix, $href, $data, $query, $result); } ($name, $base, $suffix, $href) = &getTitle($node->getPreviousSibling); my $filename = "result-$base$suffix.ttl"; if ($node->getElementsByTagName('table')->getLength == 1) { &createFile($filename, &table2turtle($node->getElementsByTagName('table')->item(0), 'resultTurtle'), $name); } elsif ($node->getElementsByTagName('pre')->getLength == 1) { my $pre = $node->getElementsByTagName('pre')->item(0); if ($pre->getAttributeNode('class')->getValue eq 'resultAsk') { $filename = $pre->getFirstChild->getData eq 'yes' ? 'true.ttl' : 'false.ttl'; } else { &createFile($filename, &extractPres($node, 'resultTurtle'), $name); } } else { my $str = $node->toString; die "what is $node ?"; } $result = $filename; } elsif ($class eq 'result') { # result is found in a resultsGroup } else { &chattyWarning('skipping unknown object', $node); } } &renderEntry($name, $base, $suffix, $href, $data, $query, $result); } } # renderEntry -- print a manifest test entry. sub renderEntry { my ($name, $base, $suffix, $href, $data, $query, $result) = @_; my @actionStrs; if ($query) { push (@actionStrs, "qt:query <$query>"); } foreach my $datafile (@$data) { push (@actionStrs, "qt:data <$datafile>"); } my $actionStr = join(" ;\n ", @actionStrs); my @strs; push (@strs, "mf:action [ $actionStr ] "); if ($result) { push (@strs, "mf:result <$result> "); } $href =~ s/^\.\///; $href = $BASE.$href; my @top = ("[ mf:name \"spec-$base$suffix\"", "rdfs:comment \"Extracted: $name <$href>\""); print " ", join(" ;\n ", @top, @strs, @EXTRAS), "\n ]\n"; } # extractPres -- utility function to extract the data in a
,
# which are used by the XSLT to record data, queries and
# turtleResults (obselete).
sub extractPres {
    my ($node, $class) = @_;
    my $ret;
    my $dataBoxes = $node->getElementsByTagName('pre');
    my $nDataBoxes = $dataBoxes->getLength;
    for (my $iDataBoxes = 0; $iDataBoxes < $nDataBoxes; $iDataBoxes++) {
	my $dataBox = $dataBoxes->item($iDataBoxes);
	next if ($dataBox->getAttributeNode('class')->getValue ne $class);
	&chattyWarning("already found a ${class}", $node) if $ret;
	$ret = join('', $dataBox->getFirstChild->getData);
	&utf8::decode($ret);
    }
#    my $count = @ret;
#    &chattyWarning("not enough ${class}es found ($count)", $node) if (defined $min && $count < $min);
#    &chattyWarning("to many ${class}es found ($count)", $node) if (defined $max && $count > $max);
    return $ret;
}

# getTitle -- extract title info from a DOM node with an appropriate
# schema.
sub getTitle {
    my ($title) = @_;
    my $base = $title->getElementsByTagName ('span')->item(1)->getFirstChild->getData;
    my $suffixEl = $title->getElementsByTagName ('span')->item(2)->getFirstChild;
    my $suffix = $suffixEl ? $suffixEl->getData : undef;
    my $a = $title->getElementsByTagName('a')->item(0);
    my $href = $a->getAttributeNode('href')->getValue;
    my $name = $a->getFirstChild->getData;
    $name =~ s/\"/\'/g;
    return ($name, $base, $suffix, $href);
}

# table2turtle -- convert results encoded in an HTML table into the
# SPARQL manifest turtle results format.
sub table2turtle {

    # pad -- formatting utility to make turtle results human-readable.
    sub pad {
	my ($prefix, @data) = @_;
	my $pad = ' ' x (length($prefix));
	map {s/\n/\n$pad/gm} @data;
	return $prefix.join(" ;\n$pad", @data);
    }

    my ($table) = @_;
    my @l0;
    push (@l0, "\@prefix rdf:   .");
    push (@l0, "\@prefix rs:    .");
    push (@l0, "\@prefix xsd:  .");
    push (@l0, "");
    my @l1;
    push (@l1, "rdf:type rs:ResultSet");
    my $trs = $table->getElementsByTagName ('tr');
    if ($trs->getLength > 0) {
	my @variables;
	{
	    my $ths = $trs->item(0)->getElementsByTagName ('th');
	    for (my $i = 0; $i < $ths->getLength; $i++) {
		push (@variables,  $ths->item($i)->getFirstChild->getData);
	    }
	    my $varsStr = join("\", \"", @variables);
	    push (@l1, "rs:resultVariable \"$varsStr\"");
	}

	my @solutions;
	for (my $iSolutions = 1; $iSolutions < $trs->getLength; $iSolutions++) {
	    my @bindings;
	    my $tds = $trs->item($iSolutions)->getElementsByTagName ('td');
	    for (my $iBindings = 0; $iBindings < $tds->getLength; $iBindings++) {
		if (my $dataElement = $tds->item($iBindings)->getFirstChild) {
		    my $value = $dataElement->getData;
		    &utf8::decode($value);

		    # check if we are not dealing with an unbound variable
		    if (!($value eq " ")) { 
			push (@bindings, "[ rs:value    $value ;\n  rs:variable \"$variables[$iBindings]\"\n]");
		    } 
		}
	    }
	    
	    push (@solutions, &pad("[ ", map {"rs:binding $_"} @bindings).' ]');
	}
	push (@l1, &pad("", map {"rs:solution $_"} @solutions));
    }
    push (@l0, &pad("[] ", @l1));
    return join ("\n", @l0)." .\n";
}

sub createFile {
    my ($filename, $text, $title) = @_;
    open (F, '>:utf8', $filename) || die "unable to create \"$filename\"";
    print F "# $title\n\n";
    print F $text;
    close F;
}

sub chattyWarning {
    my ($msg, $node) = @_;
    my $str = $node->toString;
    warn "$msg in $str\n";
}

# $Log: extract,v $
# Revision 1.8  2006/04/09 17:20:52  eric
# ~ fixed encoding issues in values
#
# Revision 1.7  2006/04/09 15:27:15  eric
# + function comments
#
# Revision 1.6  2006/04/06 10:35:03  jbroekst
# fixed problem with unbounds in solutions (extract script now simply does not spit out a binding when this occurs).
#
# Revision 1.5  2006/04/04 15:34:29  eric
# added CVS keywords
#
#