#!/usr/bin/perl 
#
# extract_projdata.pl $Id: extract_projdata.pl,v 1.10 2002/07/25 15:08:54 lmiller Exp $
# hackparse the xhtml SWAD-Europe Workpackage descriptions
#
# hackier version by libby 2002-04-26 with a few cross-checks and
# writing individual html files for each deliverable
# Soon to be superseded by xslt.
# 
# Dan Brickley <danbri@w3.org> 
# January 2002
#
# See end of doc for more details
# nearby:
#        URL: http://www.w3.org/2000/01/sw/#Approach
#        URL: http://www.w3.org/2000/01/sw/swad-chart.rdf

use strict;

# Workpackage description filenames, one per line on STDIN.
my @todo = <STDIN>;

my $debug = 1;   # emit extra diagnostics on STDERR
my $html = 1;

my $DD='views/deliverables';
my $BASE='http://www.w3.org/2001/sw/Europe/plan/workpackages/live/';

my $chartgen = 1; # do we want to generate charts (requires software nearby...)
print STDERR "NOTE: CHARTGEN is OFF. No images generated.\n" unless $chartgen;


my $dt_file='_delivtable.html';
my $dt_content; # HTML deliverables table

# Each of these is declared exactly once (they used to be declared twice,
# the second 'my' silently masking the first).
my $kickoff="2002-05";
my $dt_rdffile="rdf/_esw_projdata.rdf";

# RDF/XML export, accumulated as a string; starts with the document header.
my $dt_rdfcontent = '<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
 xmlns:pm="http://www.w3.org/2002/02/esw/pm#" >'; # todo: add swad-plan vocab


# Write a diagnostic message, terminated by a newline, to STDERR.
# The result of print is handed back to the caller.
sub warning {
  my ($message) = @_;
  return print STDERR $message . "\n";
}


			#### Background knowledge
# Expected project-wide figures and shared state. These file-level lexicals
# are declared before the subs at the end of the file that read them, so
# their position in the file matters.
my $NUM_PARTNERS=5; 	# unlikely to change, but used in tests

## 57 our delivs; 17 std ones
# Expected totals, compared against what was actually scraped when the
# deliverables table is closed.
my $TOTAL_NON_STD_DELIVERABLES=57;
my $TOTAL_STD_DELIVERABLES=17;
my $TOTAL_DELIVERABLES = 43; #WRONG!# check; should be sure of this. todo
my $TOTAL_PERSON_MONTHS = 332.4; ## todo: check this

my $WWWBASE='http://www.w3.org/2001/sw/EU/proposal/jan10/dow/wps/';

# Partner names, in the same order as the effort table columns
# (index 0 corresponds to the '1_assigned_effort' summary key).
my @partners = ('ILRT', 'W3C', 'CCLRC', 'HPLabs', 'Stilo');

# Scratch state shared by worktableValues / gotRows / gotCell.
my (@tdata, @cells);  # needed for table extractor hack

# Parse every workpackage description; yields one hash reference per file.
my @pd = generateSummary(@todo); # todo: pass in text, get back @projdata

# report on our summary
# todo: move all novel calculations into summariser code 

# we build up an overview of project while reading project descriptions
my %deliv;   # "<wp number> <deliverable id>" -> "<month>::<description>"
my %lead; # key is proj name, value is main_partner (for now)
my %starts;  # wp name -> scraped start month

my %charts; # generated charts, wp -> local path to image

  ## REPORTxxx

  # Opening markup of the HTML deliverables listing.
  $dt_content .= "<html><head><title>SWAD-Europe: Deliverables listing</title></head>\n";
  $dt_content .= "<body><h1>Deliverables by month</h1>\n\n";
# $dt_content .= "<table summary=\"deliverables\" border=\"1tr><th>Workpackage</th><th>Deliverable</th><th>Month</th><th>Description</th></tr>\n\n";

foreach (@pd) {
  print "#################################################################\n\n";
  my %p = %{$_};
  foreach (keys %p)  {
    print "$_ -> ". $p{$_} ."\n";
  }


  # Workpackage name (derived from the description's filename by
  # generateSummary); only fall back to the placeholder when it is missing.
  my $wp = defined $p{'wp_name'} ? $p{'wp_name'} : 'unknown_workpackage';
  my $wp_name = $wp; # second name for the same value, used in the effort check
  # print STDERR "WP DEBUG *** name is: $wp \n" if $debug;

  # 'wp_details' is only present when a deliverables list was found.
  # Dereferencing undef under 'use strict' would abort the whole run right
  # after the warning, so fall back to an empty hash instead.
  warning "WP $wp ERROR - no details" unless $p{'wp_details'};
  my %detail = %{ $p{'wp_details'} || {} };  # workpackage details

  my $start_mon =  $p{'wp_start'};
  $starts{$wp}=$start_mon;

  ### Note: the lead is the partner with the most effort unless the
  ### description names a different lead partner (e.g. WP5, where effort is
  ### equal but the lead is actually Stilo).
  # Store an empty lead when none is known; "|| warning ..." used to store
  # print's return value (1), which reads as "partner 1".
  if ($p{'main_partners'}) {
    $lead{$wp} = $p{'main_partners'};
  } else {
    warning "WP $wp no main partners";
    $lead{$wp} = '';
  }

  # Workpackage number: the name with its "esw-wp-" prefix removed.
  my $wpnum = $wp;
  $wpnum =~ s/esw\-wp\-//;

  ### Workpackage-level (rather than deliverable-level) information.
  my $docurl = $BASE . "esw-wp-$wpnum" . ".html";


# One rdf:Description per partner, in effort-table column order, giving the
# effort assigned to that partner on this workpackage. Only the first
# partner's description also records the workpackage start month.
my @partner_homepages = (
  'http://ilrt.org/',
  'http://www.w3.org/',
  'http://www.rl.ac.uk/',
  'http://www-uk.hpl.hp.com/',
  'http://www.stilo.com/',
);

my $partner_no = 0;
foreach my $homepage (@partner_homepages) {
  $partner_no++;
  my $effort = $p{ $partner_no . '_assigned_effort' };

  $dt_rdfcontent .= "\n<rdf:Description>\n";
  $dt_rdfcontent .= "  <pm:partnerHomepage rdf:resource=\"$homepage\"/>\n";
  $dt_rdfcontent .= "  <pm:effort>$effort</pm:effort>\n";
  $dt_rdfcontent .= "  <pm:wp_start>$p{'wp_start'}</pm:wp_start>\n" if $partner_no == 1;
  $dt_rdfcontent .= "  <pm:workpackageUrl rdf:resource=\"$docurl\" />\n";
  $dt_rdfcontent .= "</rdf:Description>";
}


#********************


  print "\n\nDeliverable items:\n";

  # Walk the deliverables of this workpackage and remember every one that
  # has both a due month and a description.
  foreach my $d (sort keys %detail) {
     print "item_id: $d\n";
     my $aboutitem = $detail{$d};
     warning "no deliv. description (text: '$aboutitem')" unless $aboutitem;

     ##xxxtodo: want WP too. danbri work-in-progress

     # Only read $1/$2 when the match succeeded. After a failed match they
     # still hold the captures of an earlier successful match, which used to
     # make a malformed item silently inherit another item's month and text.
     my ($month, $deliv_desc);
     if (defined $aboutitem && $aboutitem =~ m/Month\s+(\d+):\s*(.*)/i) {
        ($month, $deliv_desc) = ($1, $2);
     }
     warning "$wp - empty deliverable $d / $aboutitem" unless $deliv_desc;
     warning "$wp - deliverable without a month (text: '$aboutitem')" unless $month;

     if ($deliv_desc && $month) {
        print "$month :: $deliv_desc\n";
        # Key: "<wp number> <deliverable id>"; value: "<month>::<description>".
        $deliv{ "$wpnum $d" } = $month. '::'. $deliv_desc; ## Store for later...
     }
  }# end workpackage details loop 

#print STDERR "\n\n$start_mon";
 

  ####
  #
  # Generate charts (SVG, PNG) illustrating various quantities, relationships etc.
  #
  # requires: http://biolpc22.york.ac.uk/linux/plotutils/
  #


      # Chart generation: write the per-partner effort figures for this
      # workpackage to a data file, run the external charting tool for
      # multi-partner workpackages, then (re)write the chart index page.
      if ($chartgen) {

  my $chartdata = "views/img/_$wp". ".dat";

  # see also: the old-new.txt file has a simple dependency map. @@todo (graphviz)

  # write out text file data for plotutil tools to read
  #
  open(CHART, '>', $chartdata) || die "Can't open chart config file $chartdata: $!";
  my $ef = $p{'effort_table'};
  my @effort = split(/\s+/,$ef);
  print CHART "#Effort table for $wp : $ef \n";
  my $num_partners=0;
  # one line per partner that has a non-zero effort figure
  for my $pid (0 .. $#partners) {
      if ($effort[$pid]) {
        print CHART $partners[$pid]." ".$effort[$pid] ."\n";
        $num_partners++;
      }
  }
  print CHART "\n";
  close(CHART) || die "Can't finish writing chart config file $chartdata: $!";

  die "No partners found for $wp" unless $num_partners;
  my $label = $p{'heading_text'};
  my $title = "SWAD-E WP $label";
  # if multiple partners on this WP, generate a chart
  if ($num_partners > 1) {
    my $c = 'skyblue2,green,aquamarine,pink,yellow,grey';
    $chartdata =~ s#views/img/##;   # the command below runs inside views/img/
    my $radius=1.0; # size of chart  - float from 0.1 to 1.2  default: 0.8
    # todo: how can we trim the spare whitespace around the chart borders?

    # The title sits inside single quotes on the shell command line, so an
    # apostrophe in the heading text would otherwise break the command.
    (my $shell_title = $title) =~ s/'/'\\''/g;

    foreach my $type ('svg','png') {
        print STDERR "chartgen: $wp - making $type chart for $label\n";
        print STDERR `cd views/img/ ;  bin/ascii_chart -r $radius -P -C $c -T $type -t '$shell_title' -Y Effort < $chartdata > _chart_$wp.$type `;
    }
    # NOTE(review): only the keys of %charts are read below; the literal
    # '_chart_wp' in this value looks like it was meant to contain $wp.
    $charts{$wp} = $WWWBASE.'views/img/_chart_wp';
  } else {
    print STDERR "chartgen: $wp - skipping (single partner WP) chart(s) for $label\n" if $debug;
  }


  # Generate HTML summary of images
  # TODO: move this elsewhere...
  # todo: handle case where no images generated. handle missing images (for 1 partner WPs better).
  #
  # The index is rewritten for every workpackage; the last write lists all charts.
  open(CHARTINDEX, '>', '_chartindex.html') || die "Can't write chart index file: $!";
  print CHARTINDEX "<html><head><title>SWAD-Europe: Effort allocation charts</title>\n";
  print CHARTINDEX "<link rel=\"StyleSheet\" type=\"text/css\"  href=\"style/swad-europe.css\"  />";
  print CHARTINDEX "</head>\n";   # the head element was never closed
  print CHARTINDEX "<body>\n";

  print CHARTINDEX "<h1>Appendix: Charts</h1>\n\n";

  print CHARTINDEX "<h2>SWAD-Europe Gantt Chart</h2>\n\n";
  print CHARTINDEX '<p><img src="views/img/swad-gantt-2002-02-07.gif" alt="SWAD-Europe Workpackage starting date, first and last deliverables" /></p>';

  print CHARTINDEX "<h2>SWAD-Europe Dependency Charts</h2>\n\n";

  # NOTE(review): the png is named "_wp-deps" but the svg "_wp_deps"; confirm
  # which spelling matches the generated files.
  print CHARTINDEX "<p><img src=\"views/img/_wp-deps.png\" alt=\"dependencies\"
/>\n";
  print CHARTINDEX "(<a href=\"views/img/_wp_deps.svg\">svg</a>)\n</p>\n\n";

  print CHARTINDEX "\n\n<h2>SWAD-Europe: Effort allocation charts (partners / by package)</h2>\n\n";

  print CHARTINDEX "<p>The following charts illustrate the relative proportion of effort allocated in all multi-partner workpackages.\n\n</p>";

  foreach my $chart_wp (sort keys %charts) {
    print CHARTINDEX "<h3>WP: $chart_wp</h3>\n\n";
    print CHARTINDEX "<img src=\"views/img/_chart_$chart_wp.png\" alt=\"$chart_wp\" /> (<a href=\"views/img/_chart_$chart_wp.svg\">svg</a>)\n\n";
  }
  # The closing tags belong in the index file; they used to go to STDOUT,
  # leaving the index page unterminated.
  print CHARTINDEX "\n\n</body></html>\n\n";
  close(CHARTINDEX) || die "Can't finish writing chart index file: $!";

} ### end chartgen



  # this a bit odd; check per WP instead of per whole proj.
  # 'total_effort_count' is the sum over all workpackages; generateSummary
  # stores the same figure in every workpackage summary.
  my $total_effort_count = $p{'total_effort_count'};


  if ($total_effort_count)  {
    # Compare with a small tolerance: the total is a sum of decimal
    # fractions, which need not add up to exactly the expected value in
    # binary floating point even when the figures are right.
    warning "total effort count ($total_effort_count) should be same as: $TOTAL_PERSON_MONTHS)" 
	unless (abs($total_effort_count - $TOTAL_PERSON_MONTHS) < 1e-6);
  } else { 
    warning "WP $wp_name has no record of total_effort_count";
  }
 print "\n";



} # end big loop thru workpackage descriptions
  # can now do stuff that requires having seen everything....

##$dt_content .= "\n</table>\n\n";



#####  BY MONTH 
#
# Group the stored deliverables by due month. Each month's value is a
# ";;"-separated string of "(<wp number> <deliverable id>) <description>".
my %bymonth;
# Sorted keys keep the order of entries within a month reproducible
# between runs (hash order is not).
foreach my $deliverable (sort keys %deliv) {
  # LIMIT 2: split only at the first "::" so that a description which
  # itself contains "::" is kept whole instead of being truncated.
  my ($m,$t)=split (/::/,$deliv{$deliverable},2);
  print STDERR "Debug: by month, storing deliv: $deliverable\n" if $debug;
  warning "Bogus deliverable blurb for $deliverable" unless ($m && $t);
  $bymonth{$m} .= "($deliverable) $t ;;";# bad data structure. todo.
}			# and move this to summariser.

print "Deliverables:\n";
foreach (sort {$a <=> $b} keys %bymonth) {
   print "M=$_ DEL=$bymonth{$_}\n"; 
}

#####  BY PARTNER
# using first partner mentioned in 'main_partners' for now. todo: clarify
print "Workpackage leaders (or main partners):\n"; # fix sorting too
						#by using b/month listing
foreach my $wp (sort keys %lead) {
  print "!!!!WP: $wp !!!!LEAD: ".$lead{$wp} ."\n";
}


# An RDF view of the project plan

my $PMVOCAB = 'http://www.w3.org/2002/02/esw/pm#';

# And an HTML view (sorted by due month): open the table and emit its
# header row. Columns not currently shown: name, est start m.
my @header_lines = (
  '<table summary="deliverables" border="1"><tr>',
  '<th>end M</th>',
  '<th>no</th>',
  '<th>WP</th>',
  '<th>desc</th>',
  '<th>lead</th>',
  '<th>est person-m</th>',
  '</tr>',
);
$dt_content .= join("\n", @header_lines) . "\n\n";

# Running state for the by-month pass over the deliverables.
my @delivs;          # document URLs of all deliverables seen
my $count       = 0; # number of non-standard deliverables
my $std_count   = 0; # number of standard deliverables
my $month_count = 0; # summed duration of non-standard deliverables
my $start       = 0; # calculated start value
my $start_mon;       # scraped start value

# Walk the deliverables in due-month order, appending one HTML table row and
# one pm:DeliverableSpec RDF element per deliverable.
foreach my $thismon (sort {$a <=> $b} keys %bymonth) {
  print STDERR "Debug bymonth: thismon=$thismon\n";

  foreach my $info (split(/;;/, $bymonth{$thismon})) {
    print STDERR "Debug info $info\n";

    # Each entry starts with "(<wp number> <deliverable id>)". Only read $1
    # when the substitution really matched: after a failed match $1 still
    # holds the capture of some earlier successful match.
    unless ($info =~ s/^\s*\(([^)]+)\)//) {
      warning "Skipping malformed deliverable entry for month $thismon: '$info'";
      next;
    }
    my ($w, $num, $shortname) = split(/\s+/,$1,3);

    $num =~ s/://;

    # getting the information from the last set of brackets in the
    # description; it is split on commas into duration, deliverable type
    # and security.
    my @thingsinbrackets = split(/\(/,$info);
    my $lastbrackets=pop(@thingsinbrackets);

    my ($dur, $deltype, $security) = split(/,/,$lastbrackets,3);

    # cleaning up
    $dur =~ s/month[s]?//;
    $security =~ s/\)//;

    # lead partner and start month, already in hashes keyed by the long WP name
    my $longwp="esw-wp-" .$w;
    my $leader= $lead{$longwp};
    $start_mon= $starts{$longwp};

    # collecting all delivs, incl std
    my $realmon=realMonth($thismon);

    if ($shortname =~ m/std/) {
      $std_count++;
      $dt_content .=
"\n<tr>
<td valign=\"top\">$realmon ($thismon)</td>
<td valign=\"top\">$num</td>
<td valign=\"top\">$w</td>";
      $dt_content .="\n
<td valign=\"top\"><small>$info</small>
</td><td valign=\"top\">$leader</td>
<td valign=\"top\">$dur</td></tr>";
    }else{
      $count++;
      $month_count = $month_count + $dur;
      $start=$thismon-$dur;
      $dt_content .="\n<tr>
<td valign=\"top\">$realmon ($thismon)</td>
<td valign=\"top\">$num</td>
<td valign=\"top\">$w</td>";
      $dt_content .="\n
<td valign=\"top\"><small>$info</small></td>
<td valign=\"top\">$leader</td>
<td valign=\"top\">$dur</td></tr>\n"; 
    }


    # removed individual deliv generation - should be from rdf anyway


    # The RDF view will include std deliverables too

    # Text before a ": " in the short name, if any. The capture is taken only
    # on a successful match; previously a failed match left $dnum holding
    # whatever an earlier, unrelated match had captured.
    my $dnum = '';
    if (defined $shortname && $shortname =~ m/(.*):\s+(.*)/) {
      $dnum = $1;
    }
    my $docurl= $BASE. "esw-wp-$w". ".html#". "del_".$dnum;
    push (@delivs, $docurl);    

    $docurl = trim($docurl);
    $num = trim($num);
    $shortname = trim($shortname);
    $w = trim($w);
    $thismon = trim($thismon);
    $dur = trim($dur);
    $leader =trim($leader);
    $info = trim($info);
    $kickoff = trim($kickoff);
    $deltype = trim($deltype);
    $security = trim($security);
    $start_mon = trim($start_mon);

    # The description text originates from XHTML source, so it is emitted
    # as-is rather than being escaped a second time.
    $dt_rdfcontent .= "\n<pm:DeliverableSpec rdf:about=\"$docurl$num\"\n"
      . "pm:number=\"$num\" \npm:name=\"$shortname\" \npm:workpackage=\"$w\"\n"
      . "pm:relMonthDue=\"$thismon\" \n"
      . "pm:realDateDue=\"$realmon\" \n"
      . "pm:duration=\"$dur\"\n"
      . "pm:lead=\"$leader\">\n";

    $dt_rdfcontent .= "<pm:workpackageUrl rdf:resource=\"".$BASE."esw-wp-".$w.".html\" />\n";

    $dt_rdfcontent .= "<pm:description>$info\n</pm:description>\n";
    $dt_rdfcontent .= "<pm:mainStartDate>$kickoff</pm:mainStartDate>\n";  
    $dt_rdfcontent .= "<pm:wpStartDate>$start_mon</pm:wpStartDate>\n";  
    $dt_rdfcontent .= "<pm:delivType>$deltype</pm:delivType>\n";  
    $dt_rdfcontent .= "<pm:acl>$security</pm:acl>\n";  
    $dt_rdfcontent .= "</pm:DeliverableSpec>\n\n";

  } 
}



# Close HTML
# Close the deliverables table, append the cross-check of scraped totals
# against the expected ones, and write the HTML file.
$dt_content .= "\n</table>\n\n";

###libby - cross-check totals
$dt_content .="\n<p>Cross checking totals<br />non-standard deliverables:<br />";

if($count==$TOTAL_NON_STD_DELIVERABLES){
  $dt_content .="\n$TOTAL_NON_STD_DELIVERABLES deliverables, ok";
}
else{
  $dt_content .="\n$count deliverables, ERROR: should be $TOTAL_NON_STD_DELIVERABLES";
}

##std
$dt_content .="<br />standard deliverables:<br />";

if($std_count==$TOTAL_STD_DELIVERABLES){
  $dt_content .="\n$TOTAL_STD_DELIVERABLES deliverables, ok";
}
else{
  $dt_content .="\n$std_count deliverables, ERROR: should be $TOTAL_STD_DELIVERABLES";
}

# Close the cross-check paragraph before opening the next one; the second
# <p> used to be nested inside the first, which is not valid (X)HTML.
$dt_content .= "</p>\n<p>total months for non-standard deliverables " .$month_count . "</p>";

$dt_content .= "</body></html>\n\n";

print STDERR "DEBUG: writing deliv table to $dt_file\n" if $debug;
open(my $dt_fh, '>', $dt_file) || die "Can't write deliv table file $dt_file: $!";
print {$dt_fh} $dt_content;
# buffered write errors only surface at close
close($dt_fh) || die "Can't finish writing deliv table file $dt_file: $!";


# Finish RDF stuff (this code getting tangly)
## this broken at the moment - libby

#$dt_rdfcontent .= "<rdf:Seq pm:label=\"Project Schedule (by
#date)\">\n";
#$dt_rdfcontent .= "<rdf:type rdf:resource=\"". $PMVOCAB.
#"ProjectScheduleByDate\"/> \n";
#foreach my $doc (@delivs){ 
#    $dt_rdfcontent .= "<rdf:li rdf:resource=\"$doc\"/>\n";
#}
#$dt_rdfcontent .= "</rdf:Seq>\n";

# Terminate the RDF document and write it out.
$dt_rdfcontent .= "\n\n</rdf:RDF>\n\n\n";

print STDERR "DEBUG: writing RDF table to $dt_rdffile\n" if $debug;
# Three-argument open keeps the mode separate from the filename, and $!
# says why a failure happened (e.g. the rdf/ directory is missing).
open(my $rdf_fh, '>', $dt_rdffile) || die "Can't write RDF deliv table file $dt_rdffile: $!";
print {$rdf_fh} $dt_rdfcontent;
# buffered write errors only surface at close
close($rdf_fh) || die "Can't finish writing RDF deliv table file $dt_rdffile: $!";


#######################################################################

# Strip leading and trailing whitespace from a string.
#   in:  a scalar (may be undef)
#   out: the trimmed copy; undef is passed through unchanged
# Works on a private copy, so the caller's $_ (and any list element it
# aliases inside a for/map) is left untouched.
sub trim {
  my ($text) = @_;
  return $text unless defined $text;
  $text =~ s/^\s+//;
  $text =~ s/\s+$//;
  return $text;
}

#######################################################################
#
#  Summarise XHTML workpackage descriptions (from filename)


# Read each workpackage description file and scrape a summary out of it.
#
#   in:  list of filenames (a trailing newline on each is chomped)
#   out: list of hash references, one per file, with the keys
#          raw_deliverables_markup, raw_milestones_markup, wp_name,
#          wp_details (only when a <li> list was found), wp_h3,
#          heading_number, heading_text, wp_start, effort_table,
#          <n>_assigned_effort (one per effort column), wp_assigned_effort,
#          main_partners, most_months, total_effort_count
#   dies when a description file cannot be opened.
#
# Uses the file-level @tdata / @cells scratch arrays via worktableValues and
# resets them after each file. Also prints progress text to STDOUT and
# reports problems through warning().
sub generateSummary {

  # NOTE(review): this assigns to the file-level @todo rather than to a
  # private copy.
  @todo = @_;
  my @pd; # our project data
  my $total_effort_count=0; 	# sum across all described packages 
				# should check against spreadsheet

  foreach my $wp (@todo) {
   chomp $wp;

    my %summary; # somewhere to store our metadata

    ########### load description data from storage

    # print "<h2>Work Package summary: $wp</h2> \n";  
    my $textof = '';
    open(IN,$wp) || die "Couldn't open workpackage description $wp";
    while(<IN>)  { $textof .= $_; }
    close IN;

    ############ EXTRACT MAIN TEXT CHUNKS

    # workpackage name: the filename without ".html" and a leading "wp_"
    my $wp_name = $wp;
    $wp_name =~ s/\.html$//;
    $wp_name =~ s/^wp_//;



# 
# danbri notes:
# this is pretty rough, and shouldn't be relied on.
# xxx FIXME!
#
## oops - this concats bits of text together which shouldnt be.
#$textof =~ s/\n//g;

    # Greedy (.*) with /s: captures everything from the Deliverables heading
    # up to the last '<' in the file. The substitution below then removes
    # "</body" and the rest of that line.
    # NOTE(review): $1 is read without checking that the match succeeded.
    $textof =~ m#<h4>\s*Deliverables\s*</h4>(.*)<[hHbBpP]?#igs; # tricky
    my $blurb = $1;
    #my $nextsection = $2; # todo
    $blurb =~ s/<\/body.*//ig;
    # print STDERR "extract_deliv: blurb: [[ $blurb ]] \n\n";

    $summary{'raw_deliverables_markup'} = $blurb;
    $summary{'raw_milestones_markup'} = 'TODO';
    $summary{'wp_name'} = $wp_name;


    ############  
    #
    # Extract deliverables sub-structure

    $blurb =~ s#</?ul>##g;
    $blurb =~ s#</li>##g;


    if ($blurb =~ m#<li>#) {
      my @items = split(/<li>/, $blurb);
      my $junk = shift @items;   # text before the first <li>
      # print STDERR "DEBUG: deliverables-parser for wp $wp, got blurb: [[ $blurb ]] \n" if $debug;
      my %wpspec;
      my $c=0;
      foreach my $i (@items) {
        $i =~ s/\s+/ /g;
#        $wpspec{"item_$c"} = $i;
        # The first parenthesised group of the item is removed from the text
        # and used as the item's key.
        # NOTE(review): $1 is read without checking that the match succeeded.
        $i =~ s/\(([^)]+)\)//;
        my $sn=$1;
#	print STDERR "DEBUG: grabbing shortname: '$sn'\n";
        $wpspec{$sn}="WP=$wp ".$i; ### uh-oh xxxx
        $c++;           #junk number version? todo - strip out this stuff
      }
   
      $summary{'wp_details'} = \%wpspec;
    } else {
      warning "WP $wp_name - no list structure in deliverables";
    }


    ############ 
    # worktableValues info 
    # (note: needs globals, see above) 

    # No /s here, so the <h3> ... </h3> span must sit on a single line.
    $textof =~ m#<h3>\s*(.*)\s*</h3>#;
    my $h3 = $1;
    $h3 =~ s/Workpackage description://ig;


    # The captured text is cut at further <h3> / <h4> tags to separate the
    # heading, the lead partner text and the start-date text.
    my ($wp_text, $lead_text) = split(/<h3>/, $h3);
    my ($heading_number, $heading_text) = split(/:/, $wp_text);

    my ($rem, $rem1, $start_text) = split(/<h4>/, $wp_text);
    $start_text =~ s/\s*Start date or starting event: Month(.*)<\/h4>\s*/$1/;
    $start_text=trim($start_text);
#print STDERR "\n\nLIBBYLEAD $start_text  $h3";

### test for lead partner (also got through most effort)

    # reduce "Lead Partner: <name> (<n>)" to the number in brackets
    $lead_text =~ s/\s*Lead Partner:\s*.*\((\d*)\)/$1/;

    $heading_text =~ s/\s+/ /gs;
    $summary{'wp_h3'} = $h3;
    $summary{'heading_number'} = $heading_number;
    $summary{'heading_text'} = $heading_text;
    $summary{'wp_start'} = $start_text;

    warning("heading_number for $wp_name uses non-nums: '$heading_number'")
	if ($heading_number =~ m/[a-zA-Z]/);

    warning "WP $wp_name - no heading text extracted" unless $heading_text;  
    warning "WP $wp_name - no h3 WP name found for wp $wp " unless $h3;

    ###################### Effort Table

    my @effort = worktableValues( $textof );
 
    warning "expected $NUM_PARTNERS in effort allocation table" 
						unless (scalar @effort == $NUM_PARTNERS); 

    $summary{'effort_table'} = join(' ', @effort );

    #warning "Effort table dump: ".$summary{'effort_table'}. "\n"; #


    # effort assignment substructure
    my $assigned_effort=0;
    my $e=$summary{'effort_table'} || warning "WP $wp_name - no effort table";

    my $ccount=1;

    # store each column as "<n>_assigned_effort" (n counts from 1) and sum them
    foreach my $i (split (/\s+/,$e  ) )  {
      $assigned_effort += $i;
	my $filn=$ccount . "_assigned_effort";
      $summary{$filn} = $i; 

      $ccount++;
    }

    print "inner assigned_effort: $assigned_effort\n"; 
    $total_effort_count += $assigned_effort; # after done all, stored per wp
    $summary{'wp_assigned_effort'} = $assigned_effort;

    # Find the partner (1-based column number) with the largest effort.
    my $most = 0;
    my $main = 0;
    my $i = 1;
    # loop thru
    foreach my $month (@effort) { 
      # print "month: $month \n"; 
      if ( $month == $most) {
	warning "2 partners have equal effort  on $wp - ignoring last ";
#        $main .= " $i";   # we might have two partners w/ same effort 
			# (but one is leader)
      } 
 
      if ( $month > $most) {
        $most = $month;
        $main = "$i";
      }

    $i++;
    }

## cross check lead partner:

    # Numeric comparison; when the scraped lead partner number differs from
    # the partner with the most effort, the scraped one wins.
    if($lead_text==$main){
	print "\nlead matches $main, $lead_text for $wp";
    }
    else{
	warning "lead does not match $main, $lead_text for $wp. Overriding lead to $lead_text";
	$main=$lead_text;
    }

    $summary{'main_partners'} = $main;
    $summary{'most_months'} = $most;

    warning "no effort table found for WP" unless @effort;

    # reset the shared table-extractor state so the next file starts with
    # an empty cell list
    undef @tdata;
    undef @cells;

    ######################

    # todo: fixthis
#    warning "blurb exists but has no html list items" if ($blurb && 0);

    warning "no deliverables found for $wp" unless $blurb;
  
    push( @pd, \%summary ); # store this summary
 
  } # loop thru WP descriptions (do we care about order of this?)

  # the project-wide total is only known once every file has been read
  foreach  my $p (@pd) {
    ${$p}{'total_effort_count'} = $total_effort_count; # store in each wp
  }
  return @pd;

} # /end extractSummary












############################################################################
# table extraction subs
#
# Note: change this if you change the structure of HTML TABLE for Effort
 
# Pull the effort figures out of the HTML tables of a workpackage
# description.
#
#   in:  the complete text of the description file
#   out: the list ($cells[13] .. $cells[17]) of the file-level @cells array,
#        also stored in the file-level @tdata
#
# Every <td> found by gotRows is appended to @cells, so the fixed indices
# 13..17 depend on the exact table layout of the source documents and on
# @cells being empty when this is called.
sub worktableValues {
  my $data = shift;
  $data =~ s/\n//g;
  # The replacement is evaluated (/e): the matched row text is handed to
  # gotRows and the match is replaced by gotRows' return value. The (.*) is
  # greedy, so one pass spans from the first <tr> to the last </tr>.
  while ($data =~ s#<tr>\s*(.*)\s*</tr># gotRows ('row: '. $1  ) #ei) { ; }
  @tdata = ($cells[13], $cells[14], $cells[15], $cells[16], $cells[17] );
#  print STDERR "EXTRACTED: TABLEDATA: ". join (' ;', @tdata)."</p>\n";
  return (@tdata);
}

# Handle one chunk of table-row markup: echo it to STDOUT for debugging and
# pass the text of every <td>...</td> cell in it to gotCell. Works on a
# private copy of the markup; returns the result of the substitution.
sub gotRows {
  my ($markup) = @_;
  print "DEBUG: Got table row: $markup \n";
  return $markup =~ s#<td>\s*([^<]*)\s*</td>#gotCell($1)#ge;
}

# Append one cell's text to the shared @cells list.
sub gotCell {
  my ($cell_text) = @_;
  # print "Cell: $cell_text <br/>\n ";
  push(@cells, $cell_text);
}


## turn month 1,2 etc into actual month
# Turn a project-relative month number into a calendar date string
# "YYYY-MM-28". Months up to 8 fall in 2002, 9..20 in 2003 and 21..30 in
# 2004; anything outside those ranges yields undef.
sub realMonth{
  my ($month) = @_;

  # [ first month (undef = no lower bound), last month, offset, year ]
  my @calendar = (
    [ undef,  8,   4, '2002' ],
    [     9, 20,  -8, '2003' ],
    [    21, 30, -20, '2004' ],
  );

  my $realmonth;
  foreach my $span (@calendar) {
    my ($first, $last, $offset, $year) = @$span;
    next if defined $first && !($month >= $first);
    next unless $month <= $last;

    my $mm = $month + $offset;
    $mm = "0" . $mm if $mm < 10;   # two-digit month
    $realmonth = $year . "-" . $mm . "-28";
  }

  return $realmonth;
}

## Reporting.



# Print the opening markup of the HTML workpackage summary report to STDOUT.
# (A per-workpackage <ul> listing of @todo used to follow; it is disabled.)
sub htmlReport {
  my @fragments = (
    "<html><head><title>Automatic Workpackage summary</title>",
    "<body>\n",
    "<h1>Extracted Workpackage summaries</h1>\n\n",
  );
  return print join('', @fragments);
}





#######################################################################
#######################################################################
#######################################################################
#######################################################################
#
#
# Overview
#
#
# This script reads (initially as text; might redo in XSLT) the individual
# workpackage description files, and extracts metadata summaries of each package.
#
# The HTML doc was derived from a Word file and contains little semantic markup.
# The idea here is to clarify our implicit project ontology by refining 
# this perl script and the HTML markup, so that useful project summaries 
# can be automatically extracted from the source markup.
#
# If this approach is to work for ongoing project management, we will need
# to be careful when managing the XHTML source files. Maybe use Amaya?
# we'll also need to write some tests to ensure data integrity and no 
# obvious goofs in content (eg. WPs with no deliverables, etc).
# This stuff can be hacked and hardcoded in Perl initially, perhaps reworked
# using XSLT, Cwm etc as things progress.

# Initial modest goal: a workable 'front page' for the XHTML project
# description based on the XHTML workpackage descriptions.

# We make the following assumptions about structure.
#
# Target datastructure:
# For each workpackage, we want to extract the following info, so we 
# can generate a table of contents, RDF overview, index etc.

## Implicit schema for WP descriptions:
#
#      <td>Del. no.</td> 
#      <td>Deliverable name</td>	--from H4 / list items 
#      <td>WP no.</td> 		 	--from file and H3
#      <td>Lead participant</td>
#      <td>Estimated person months</td>
#      <td>Del. type</td>
#      <td>Security</td>
#      <td>Delivery (Project month)</td>

# ordering issues:
# we initially assume that the workpackage
# descriptions will be loaded in typical order; our html output based on
# this.


# 
# (1.) We can match <h4>Deliverables</h4> and take following content until
# the next H4 as a chunk of wellformed XHTML describing the workpackage.
#
# BADDATA NOTE: wp6.6 lacks <h4>Deliverables</h4>

# (2.) 
# Workpackage number and description:
#
# from filename:  m/wp([^.]).html gives: $1 WP number
# from markup: the first (only) <h3> matches number and description 
#
#examples: (3rd is broken, note missing : after numbers)
#
#wp1.html:<h3>Workpackage description: 1: Project Management</h3>
#wp2-3.html:<h3>Workpackage description: 2: Dissemination and</h3>
#wp6.4.html:<h3>Workpackage description: 6.4 Visualisation Demonstrator</h3>
#wp6.5.html:<h3>Workpackage description: 6.5: XML and Semantic Web Integration research
#
# BADDATA NOTE: fix wp6.4 H3


# (3.)
# The HTML descriptions we get from the H4s contain a list of deliverable names
# preceded by 'Month n:'.
# They don't seem to have the deliverable number, so correlating month to
# deliverable number is tricky

# BADDATA NOTE:
# Some files use <br/> and others use <li>; clean up

# (4.) 
# OPEN ISSUES
# Numbering scheme for deliverables is unclear
# Structure needs clarification. 
# How do we define 'deliverables' vs 'milestones  and expected results' ?
# can both have free text blurbs plus lists?
# can both have internal / external deliverables?

# some h3 are bogus (text leaked out of h3 onto next line, eg. 2.3)

# (5.)
# Effort table issues
# we need to get 'lead partner' from somewhere.
# most cases we can get main partner, but sometimes two might
# have same number of months.

#
#  - internal versus external deliverables
#  - can we extract from milestones or just deliverables section?
#  - how do we indicate dependencies?
#  - planned reorganisation of WP structure; need neutral deliv. names.  
#  - the HTML '<title>' elements are empty.
#  - what is <tbody> ? (html4 maybe?)
#  - We need to parse the HTML TABLE for participants / numbers
#  - tables lack accessibility info ('summary'?); could put useful stuff in there?
#  - Can we infer lead partner for a WP is one with most months? 
#  - we should have contactEmail for each WP and/or deliverable (should we?)
#  - we have no org metadata for partners
#  - look at the other schemas W3C SWAD using for org stuff

# Feature Creep Dept:
#
# see alsos to html, list msgs about deliverables.
# cross-ref to W3C SWAD org chart work
# generate .dot stuff using Dan and Tim's neat tools...


#### GRAPHING FEATURES

# depends on: GNU plotutils
# using utilities for drawing,
# piechart:
# http://www.usf.uni-osnabrueck.de/~breiter/tools/piechart/piecharts.en.html
# and ascii_chart
# http://biolpc22.york.ac.uk/linux/plotutils/
# http://www.gnu.org/software/plotutils/plotutils.html
# don't need gnuplot; all the GNU stuff is in plotutils (and libplot-dev, debian)
# 
# also graphviz.
