#!/usr/bin/perl
# $Id: rfcIndexGrok.pl,v 1.8 2001/08/16 22:28:39 connolly Exp $
#
# USAGE:
#
# GET http://www.ietf.org/iesg/1rfc_index.txt \
#   | perl rfcIndexGrok.pl >,rfc-index.rdf
#
# TODO:
#   - relate r:authors property to dc:creator better (in progress...)
#   - think about collections a bit (started; see makeSequence)
#   - parse the remaining properties (format, obsoletes, etc.)
#   - integrate a few queries? e.g. number, title, authors?
#   - run as conversion service?
#   - include more info about the conversion: last-modified of input,
#        Revision/Date of conversion script
#   - Ask IETF to publish the index in RDF/xml in the first place
# DONE (since 1.4):
#   - convert to (kludgy) makeStatement() API; produce RDF/xml
#   - string quoting, e.g. of title
# see also Log at end of file.
#

use strict;

my($RFCns) = 'http://www.w3.org/2001/02pd/rfc56#';
my($RDFns) = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
my($RDFSns) = 'http://www.w3.org/2000/01/rdf-schema#';
my($DCns) = 'http://purl.org/dc/elements/1.1/';
my($ContactNS) = 'http://www.w3.org/2000/10/swap/pim/contact#';

my($Out); # @@KLUDGE! copied from pdkb.pl...

&main();

sub main{
  my($minRFC, $maxRFC);
  $minRFC = shift @ARGV;
  $maxRFC = shift @ARGV;

  my($state, $headings, $entry);

  RDF_startDoc();

  &bind("rfc", $RFCns);
  &bind("rdfs", $RDFSns);
  &bind("dc", "http://purl.org/dc/elements/1.1/");
  &bind("c", $ContactNS);

  &makeStatement("", $DCns . 'source',
		 'http://www.ietf.org/iesg/1rfc_index.txt');
  # schema bits inline...

  &makeStatement($ContactNS . 'fullName', $RDFSns . 'subPropertyOf',
		 'http://www.w3.org/2001/vcard-rdf/3.0#FN');
  &makeStatement($ContactNS . 'name', $RDFSns . 'subPropertyOf',
		 'http://www.cyc.com/cyc-2-1/vocab/social-vocab.html#nameOfAgent');
  &makeStatement($ContactNS . 'name', $RDFSns . 'subPropertyOf',
		 'http://xmlns.com/foaf/0.1/name');
  &makeStatement($ContactNS . 'name', $RDFSns . 'subPropertyOf',
		 'http://dublincore.org/2000/03/13-dcagent#agentName');

  &makeStatement($RFCns . 'obsoletes', $RDFSns . 'subPropertyOf',
		 $DCns . 'relation');
  &makeStatement($RFCns . 'obsoleted_by', $RDFSns . 'subPropertyOf',
		 $DCns . 'relation');
  &makeStatement($RFCns . 'updates', $RDFSns . 'subPropertyOf',
		 $DCns . 'relation');
  &makeStatement($RFCns . 'updated_by', $RDFSns . 'subPropertyOf',
		 $DCns . 'relation');

  &makeStatement($RFCns . 'rfc', $RDFSns . 'subPropertyOf',
		 $DCns . 'identifier');
  &makeStatement($RFCns . 'std', $RDFSns . 'subPropertyOf',
		 $DCns . 'identifier');
  &makeStatement($RFCns . 'fyi', $RDFSns . 'subPropertyOf',
		 $DCns . 'identifier');
  &makeStatement($RFCns . 'bcp', $RDFSns . 'subPropertyOf',
		 $DCns . 'identifier');

  $state = 'start';
  $headings = 2;

  while(<>){
    if($state eq 'start'){
      if(/^\s*RFC INDEX\s*$/){
	<>; #skip ---- line
	$headings--;
	if($headings == 0){
	  $entry = '';
	  $state = 'slurp';
	}
      }
    }

    elsif($state eq 'slurp'){
      if(/^\s*$/){
	if($entry){
	  &doEntry($entry, $minRFC, $maxRFC);
	  $entry = '';
	}
      }
      else{
	$entry .= $_;
      }
    }
  }

  if($entry){
    &doEntry($entry, $minRFC, $maxRFC);
  }

  RDF_endDoc();

  print $Out;

}

sub doEntry{
  my($entry, $minRFC, $maxRFC) = @_;
  my($e) = $entry;

  my($num, $title, @authorNames, $date, $fmt, $bytes, $status);
  my(@props);

  $entry =~ s/\s+/ /g;
  if($entry =~ s/^(\d+)\s+//){
    $num = $1;
    return if $minRFC && $num < $minRFC;
    return if $maxRFC && $num > $maxRFC;
  }

  if($entry =~ /^Not Issued.\s*$/){
    $num = $1;

    return if $minRFC && $num < $minRFC;
    return if $maxRFC && $num > $maxRFC;

    my($doc);
    $doc = the($RFCns . 'rfc', $num, "rfc$num-");
    makeStatement($doc, $RDFns . 'type', $RFCns . 'NotIssued');
  }
  else{

    # matching titles was tricky... thanks to
    # Neil Hoggarth for sharing his technique...
    # http://www.kernighan.demon.co.uk/software/
    # http://www.kernighan.demon.co.uk/software/rfcindex-1.2
    # rfcindex,v 1.7 2000/02/20 16:11:54 njh Exp
    if($entry =~ s/^(.*?\.)([\s\n]+([A-Z]\.){1,2} )/$3/){
      $title = $1;
    }elsif($entry =~ s/^(.*?\.)([\s\n]+The[\s\n]+Internet[\s\n]+Society)/$3/){
      $title = $1;
    }elsif($entry =~ s/^(.*?\.\s+)//){
      $title = $1;
    }else{
      warn "bogus entry: $entry";
    }
    $title =~ s/\.$//;

    my($doc);
    $doc = the($RFCns . 'rfc', $num, "rfc$num-");
    makeStatement($doc, $DCns . 'title', '', $title);

    while($entry =~ s/\(([^\)]+)\)\s*$//){
      my($prop) = $1;
      if($prop =~ /Status: ([^\)]+)/){
	$status = $1;
      }
      elsif($prop =~ s/^obsoletes //i){
	&eachRelated($doc, $RFCns . 'obsoletes', $prop);
      }
      elsif($prop =~ s/^obsoleted by //i){
	&eachRelated($doc, $RFCns . 'obsoleted_by', $prop);
      }
      elsif($prop =~ s/^updates //i){
	&eachRelated($doc, $RFCns . 'updates', $prop);
      }
      elsif($prop =~ s/^updated by //i){
	&eachRelated($doc, $RFCns . 'updated_by', $prop);
      }
      elsif($prop =~ s/format: //i){
	while($prop =~ s/(TXT|PS|PDF)=(\d+)//){
	  my($fmt, $qty) = ($1, $2);
	  # these should be describe separate resources...
	  makeStatement($doc, $DCns . 'format', '',
			$fmt eq 'TXT' ? "text/plain" :
			($fmt eq 'PS' ? "application/postscript" :
			"application/pdf")
		       );
	  makeStatement($doc, $RFCns . 'bytes', '', $qty);
	}
      }
      elsif($prop =~ /not online/i){
	makeStatement($doc, $RDFSns . 'comment', '', $prop);
      }
      elsif($prop =~ /Also RFC(\d+)/i){ #@@ odd...
	makeStatement($doc, $RFCns . 'rfc', '', $1);
      }
      elsif($prop =~ /Also STD(\d+)/i){
	makeStatement($doc, $RFCns . 'std', '', $1);
      }
      elsif($prop =~ /Also FYI(\d+)/i){
	makeStatement($doc, $RFCns . 'fyi', '', $1);
      }
      elsif($prop =~ /Also BCP(\d+)/i){
	makeStatement($doc, $RFCns . 'bcp', '', $1);
      }
      elsif($prop =~ /Also RTR(\d+)/i){
	makeStatement($doc, $RFCns . 'rtr', '', $1);
      }
      else{
	warn "unrecognized prop: $prop on $doc";
	push(@props, $prop);
      }
    }

    if(($entry =~ s/\.\s*$//) && # chop of . after date
       ($entry =~ s/\s*([^\.]+)$//)){
      $date = $1;
      $entry =~ s/\.\s*$//; # chop of . after authors
      @authorNames = split(/,\s+/, $entry); #@@BUG: Ed., Jr., and, et. et al.
    }else{
      warn "failed to find date for $num: $e";
    }

    #@@not needed any more? $title =~ s/\"/\'/g; #@@NASTY HACK! but there's no way, in N3, to write strings that end in "

    makeStatement($doc, $RFCns . 'status', '', $status) unless $status eq 'UNKNOWN';

    makeStatement($doc, $RFCns . 'date', '', $date);

    if($date =~ /(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)-(\d+)-(\d\d\d\d)/){
      my($month, $day, $year) = ($1, $2, $3);
      $month = index("Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec", $month)/4+1;
      makeStatement($doc, $DCns . 'date', '',
		    sprintf("%04d-%02d-%02d", $year, $month, $day));
    }elsif($date =~ /(January|February|March|April|May|June|July|August|September|October|November|December) (\d\d\d\d)/){
      my($month, $year) = ($1, $2);
      $month =~ s/(...).*/$1/;
      $month = index("Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec", $month)/4+1;
      makeStatement($doc, $DCns . 'date', '',
		    sprintf("%04d-%02d", $year, $month));
    }else{
      warn "cannot parse date: $date, $doc";
    }

    my($an, @authors);
    foreach $an (@authorNames){
      my($a, $hint);
      $hint = $an;
      $hint =~ s/.* //;
      $hint =~ s/[^a-zA-Z]//g;
      $an =~ s/^\s+//;
      $a = &the($ContactNS . 'fullName', $an, $hint);
      push(@authors, $a);
      makeStatement($doc, $DCns . 'creator', $a);
    }

    if($#authors > 0){
      makeSequenceR($doc, $RFCns . 'authors', @authors);
    }else{
      makeStatement($doc, $RFCns . 'author', $authors[0]);
    }
  }
}


sub eachRelated{
  my($doc, $rel, $rfcs) = @_;

  while($rfcs =~ s/(RFC|NIC|EIN|RTR)\s*(\d+)//){
    my($idprop, $obj);
    my($ty, $num) = ($1, $2);
    $idprop = $RFCns . ($ty eq 'RFC' ? 'rfc' :
			$ty eq 'NIC' ? 'nic' :
			$ty eq 'EIN' ? 'ein' :
			$ty eq 'RTR' ? 'rtr' : undef );
    $obj = the($idprop, $num, "rfc$num-");
    makeStatement($doc, $rel, $obj);
  }
}


sub makeSequenceV{
  # make sequence... by value
  my($subj, $pred, @objs) = @_;

  my($obj);
  my($l) = &genSym('listL');
  my($i) = 1;

  makeStatement($subj, $pred, $l);

  foreach $obj (@objs){
    makeStatement($l, $RDFns . "_" . $i++, '', $obj);
  }
}

sub makeSequenceR{
  # make sequence... by reference

  my($subj, $pred, @objs) = @_;

  my($obj);
  my($l) = &genSym('listR');
  my($i) = 1;

  makeStatement($subj, $pred, $l);

  # @@BUG: loses the "and there are no others" property
  # cf @@ in the RDF issues list
  foreach $obj (@objs){
    makeStatement($l, $RDFns . "_" . $i++, $obj);
  }
}


# primitive RDF/xml API, copied from
# Id: pdkb.pl,v 1.16 2001/07/26 09:19:32 connolly Exp 

my(%Prefixes);
my($lastSubj);


# Gag... I'll figure out perl5 OOP/package stuff
# and turn this into RDF::startDoc or whatever eventually...
sub RDF_startDoc{
  undef(%Prefixes);
  $Out = '';
  $lastSubj = ' '; # URIs don't contain spaces
}

sub bind{
  my($pfx, $ns) = @_;

  if($Prefixes{'rdf'}){
    die "cannot bind prefixes after making a statement";
  }

  $Prefixes{$ns} = $pfx;
}


sub RDF_endDoc{
  if($lastSubj ne ' '){
    $Out .= "</rdf:Description\n>";
  }

  $Out .= "</rdf:RDF\n>";
}


sub makeStatement{
  my($subj, $pred, $objRef, $objLit) = @_;

  #print STDERR "== makeStatement ($subj, $pred, $objRef, $objLit)\n";

  if(!$Prefixes{$RDFns}){
    my($ns);
    $Out .= sprintf("<rdf:RDF");
    $Prefixes{$RDFns} = 'rdf';

    foreach $ns (keys %Prefixes){
      $Out .= sprintf("\n xmlns:%s='%s'",
		     $Prefixes{$ns}, asAttr($ns));
    }
    $Out .= " \n>";
  }


  if($lastSubj ne $subj){
    if($lastSubj ne ' '){
      $Out .= "</rdf:Description\n>";
    }
    $Out .= sprintf("<rdf:Description rdf:about='%s'\n>", asAttr($subj));
  }
  $lastSubj = $subj;

  my($ln, $ns);
  $ns = $pred;
  $ln = $& if ($ns =~ s/[a-zA-Z0-9\._-]+$//);

  my($pfx);

  if($pfx = $Prefixes{$ns}){
    $Out .= sprintf("<%s:%s", $pfx, $ln);
  }else{
    $Out .= sprintf("  <%s xmlns='%s'", asAttr($ln), $ns);
  }

  if($objRef){
    $Out .= sprintf(" rdf:resource='%s'\n/>", asAttr($objRef));
  }
  else{
    if($pfx){
      $Out .= sprintf("\n    >%s</%s:%s\n  >", asContent($objLit), $pfx, $ln);
    }else{
      $Out .= sprintf("\n    >%s</%s\n  >", asContent($objLit), $ln);
    }
  }
}


sub asContent{
    my($c) = @_;

    $c =~ s,&,&amp;,g;
    $c =~ s,<,&lt;,g;
    $c =~ s,>,&gt;,g;
    $c =~ s/[\200-\377]/'&#'.ord($&).';'/ge;

    #@@hhm... protect newlines too?

    return $c
}

sub asAttr{
  my($c) = @_;
  
  $c =~ s,&,&amp;,g;
  $c =~ s,<,&lt;,g;
  $c =~ s,>,&gt;,g;
  $c =~ s,\",&quot;,g;
  $c =~ s,\',&apos;,g;
  $c =~ s/[\200-\377]/'&#'.ord($&).';'/ge;
  
  #@@hhm... protect newlines too?
  
  return $c
}

########
#
# primitive RDF/n3 API, copied from
# Id: grokNavItin.pl,v 1.5 2001/07/26 22:54:54 connolly Exp 
#
# which was probably derived from cwm's API.

my($gen);

sub genSym{
  my($hint) = @_;
  
  $gen++;
  return "#_$hint$gen"; #@@KLUDGE: these should be anonymous.
}


my(%Things);

sub the{
  # this assumes $prop is a daml:UniqueProperty
  my($prop, $val, $hint) = @_;
  my($ret);

  $ret = $Things{$prop, $val};
  return $ret if $ret;
  $ret = genSym($hint);
  makeStatement($ret, $prop, '', $val);
  $Things{$prop, $val} = $ret;
  return $ret;
}


# $Log: rfcIndexGrok.pl,v $
# Revision 1.8  2001/08/16 22:28:39  connolly
# now groks min/max RFC number from the command-line
# to make the size of the output manageable.
#
# Revision 1.7  2001/08/16 04:18:24  connolly
# reworked sequences etc.
#
# Revision 1.6  2001/08/06 23:10:48  connolly
# working on dc:creator and agentName and such
#
# Revision 1.5  2001/08/06 21:24:07  connolly
# done:
#    - convert to (kludgy) makeStatement() API; produce RDF/xml
#    - string quoting, e.g. of title
# thinking about...
#    - relate r:authors property to dc:creator better (use &the(...))
#
