#!/usr/bin/perl

use strict;

use XML::DOM qw(ELEMENT_NODE);
use XML::XQL;
use XML::XQL::DOM;

# Package-level lookup tables read by the subs defined further down.

# Format label captured from the "Full article in ..." link text => the
# value emitted in foo:contentType.
our %CTypes = ('PDF' => 'application/pdf');

# Tag names of inline elements whose content suckEntities folds into the
# surrounding text.
our %Inner = ('b' => 1, 'i' => 1);

# Script entry point: hand the command-line arguments to main by reference.
main(\@ARGV);

# Convert the table-of-contents document named by $argv->[0] into RDF/XML
# written to the file named by $argv->[1].
#
# Every element matched by html/body/table/tr/td/h3 is treated as a section
# heading.  Starting two siblings after the heading, and then stepping two
# siblings at a time, each <p> element found describes one article; the walk
# stops at the first node that is not a <p> element, at the end of the
# sibling list, or at an entry titled 'Author Index'.
#
# Arguments:
#   $argv - array ref; [0] is the source document path, [1] the output path.
#
# Dies when an argument is missing, when the output file cannot be opened or
# closed cleanly, or when an article entry does not have the expected shape
# (unparsable abstract/article links or link text, or abstract and article
# links that disagree on directory/number).
sub main {
    my ($argv) = @_;

    # Fail with a usage line instead of an obscure method-call-on-undef
    # error further down when an argument is missing.
    die "usage: $0 <toc-document> <output.rdf>\n"
        unless $argv && @$argv >= 2;
    my ($inPath, $outPath) = @$argv;

    # parse the source doc
    my $parser = XML::DOM::Parser->new;
    my $doc    = $parser->parsefile($inPath);
    my @nodes  = $doc->xql('html/body/table/tr/td/h3');

    # create the target doc
    # FileHandle is not loaded at the top of this file, so load it here
    # rather than rely on another module having pulled it in.
    require FileHandle;
    my $outHandle = FileHandle->new($outPath, 'w')
        or die "cannot open \"$outPath\" for writing: $!\n";
    my $outDoc = XML::DOM::Document->new;
    my $rdf    = $outDoc->createElement('rdf:RDF');
    $rdf->setAttribute('xmlns:rdf', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#');
    $rdf->setAttribute('xmlns:foo', 'http://www.w3.org/2002/06/09-ISWC/toc2rdf.pl#');
    pad($rdf, 2, $outDoc);

    foreach my $node (@nodes) {
        my $heading = $node->getFirstChild()->getData;

        # The heading doubles as the element name of each article record,
        # so spaces are replaced by underscores.
        (my $headingT = $heading) =~ s/ /_/g;

        for (my $p = _siblingAfterNext($node);
             $p && $p->isa('XML::DOM::Element') && $p->getTagName eq 'p';
             $p = _siblingAfterNext($p)) {
            # <p><strong>Semantic Web Enabled Web Services</strong><br />
            # Dieter Fensel, Christoph Bussler, and Alexander Maedche<br />
            # LNCS 2342, p. 1 ff.<br />
            # <a href="../bibs/2342/23420001.htm">Abstract</a> | <a
            # href="../papers/2342/23420001.pdf">Full article in PDF (50
            # KB)</a></p>

            # suckEntities advances the node variable it is handed, so after
            # each call that variable refers to the node following the text
            # run.  The sibling arithmetic below relies on this.
            my $strongEl    = $p->getFirstChild;
            my $strongInner = $strongEl->getFirstChild;
            my $title       = suckEntities(\$strongInner);
            last if ($title eq 'Author Index');

            my $authEl  = $strongEl->getNextSibling->getNextSibling;
            my $authors = suckEntities(\$authEl);
            my @authors = split(/(?:, *and +)|(?:, *)|(?: +and +)/, $authors);

            my $mysteryEl = $authEl->getNextSibling;
            my $mystery   = suckEntities(\$mysteryEl);

            my $abstractEl = $mysteryEl->getNextSibling->getNextSibling;
            my $abstract   = $abstractEl->getAttribute('href');
            my ($abstractDir, $abstractNum) =
                $abstract =~ m{^\.\./bibs/(.*?)/(.*?)\.htm$}
                or die "unparsable abstract: \"$abstract\"";
            $abstract =~ s{^\.\.}{http://link.springer.de/link/service/series/0558};

            my $articleEl = $abstractEl->getNextSibling->getNextSibling;
            my $article   = $articleEl->getAttribute('href');
            my ($articleDir, $articleNum) =
                $article =~ m{^\.\./papers/(.*?)/(.*?)\.pdf$}
                or die "unparsable article: \"$article\"";
            $article =~ s{^\.\.}{http://link.springer.de/link/service/series/0558};

            # Both links must point at the same volume directory and number.
            if ($articleDir ne $abstractDir || $articleNum ne $abstractNum) {
                die "($articleDir ne $abstractDir || $articleNum ne $abstractNum)";
            }

            my $metaEl = $articleEl->getFirstChild;
            my $meta   = suckEntities(\$metaEl);
            my ($type, $size) =
                $meta =~ m/^Full article in (.*?) \(([^\)]+)\)$/
                or die "unparsable meta: \"$meta\"";

            # create the RDF for this article
            my $description = $rdf->appendChild($outDoc->createElement("foo:$headingT"));
            pad($rdf, 2, $outDoc);
            $description->setAttribute('rdf:about', $article);
            pad($description, 4, $outDoc);

            # Append <$name>$text</$name> to the record, followed by the
            # indentation for whatever comes next.
            my $addText = sub {
                my ($name, $text) = @_;
                $description->appendChild($outDoc->createElement($name))
                            ->appendChild($outDoc->createTextNode($text));
                pad($description, 4, $outDoc);
            };

            # NOTE(review): a $type missing from %CTypes passes undef to
            # createTextNode here -- confirm whether that should be fatal.
            $addText->('foo:contentType', $CTypes{$type});
            $addText->('foo:size',        $size);
            $addText->('foo:title',       $title);
            $addText->('foo:authors',     $authors);
            foreach my $author (@authors) {
                $addText->('foo:author', $author);
            }
            $addText->('foo:mystery', $mystery);

            $description->appendChild($outDoc->createElement('foo:abstract'))
                        ->setAttribute('rdf:resource', $abstract);
            # Shallower indent so the record's closing tag lines up.
            pad($description, 2, $outDoc);
        }
    }

    $rdf->print($outHandle);

    # Buffered write errors only surface when the handle is closed.
    $outHandle->close
        or die "error writing \"$outPath\": $!\n";
}

# Return the node two siblings after $node, or undef when the sibling list
# ends before that.
sub _siblingAfterNext {
    my ($node) = @_;
    my $next = $node->getNextSibling
        or return;
    return $next->getNextSibling;
}

# Gather the text of a run of adjacent sibling nodes into one string.
#
# Arguments:
#   $pTextNode - reference to a scalar holding the first node of the run.
#
# Starting at that node, consecutive siblings are consumed for as long as
# they are text nodes, entity references, or elements whose tag name is a
# key of %Inner (the content of such elements is gathered recursively).
# The referenced scalar is advanced as nodes are consumed, so on return it
# holds the first node that was not consumed, or the value getNextSibling
# gave after the last consumed node.
#
# Returns the gathered text with every whitespace run collapsed to a single
# space and one leading / trailing space removed.
sub suckEntities {
    my ($pTextNode) = @_;
    my $collected = '';

    NODE:
    while (my $current = $$pTextNode) {
        if ($current->isa('XML::DOM::Text')
            || $current->isa('XML::DOM::EntityReference')) {
            # Both node kinds expose their content through getData.
            $collected .= $current->getData;
        }
        elsif ($current->isa('XML::DOM::Element')
               && defined $Inner{$current->getTagName}) {
            # Inline element: fold its children's text into the result.
            my $child = $current->getFirstChild;
            $collected .= suckEntities(\$child);
        }
        else {
            # Anything else ends the run; leave the caller's variable on it.
            last NODE;
        }
        $$pTextNode = $current->getNextSibling;
    }

    $collected =~ s/\s+/ /g;
    $collected =~ s/^ //g;
    $collected =~ s/ $//g;
    return $collected;
}
# Append a text node made of a newline followed by $spaces spaces to $el.
#
# Arguments:
#   $el     - node that receives the new child (via appendChild)
#   $spaces - number of spaces to put after the newline
#   $doc    - document used to create the text node (via createTextNode)
#
# Returns whatever $el->appendChild returns.
sub pad {
    my ($el, $spaces, $doc) = @_;
    my $indent   = "\n" . (' ' x $spaces);
    my $textNode = $doc->createTextNode($indent);
    return $el->appendChild($textNode);
}
