#!/usr/bin/perl
use strict;

use Getopt::Long;

my $OrthPattern = qr/^ *Orthology - (\w+) vs. (\w+) \(Sorted by ([^\)]+)\)/;
my $ProvPattern = qr/^Data Attributes:  M - MGI curated, C - HomoloGene calculated, B - MGI curated and HomoloGene calculated$/;
my %MGDprovs = (
    'AA' => 'Amino-acid-sequence-comparison', 
    'CE' => 'Coincident-expression', 
    'CL' => 'Conserved-map-location', 
    'FH' => 'Formation-of-functional-heteropolymers', 
    'FC' => 'Functional-complementation', 
    'IX' => 'Immunological-cross-reaction', 
    'NS' => 'Not-specified', 
    'NT' => 'Nucleotide-sequence-comparison', 
    'SI' => 'Similar-response-to-specific-inhibitors', 
    'SL' => 'Similar-subcellular-location', 
    'SS' => 'Similar-substrate-specificity', 
    'SU' => 'Similar-subunit-structure', 
    'UN' => 'Unreviewed', 
    'XH' => 'Cross-hybridization-to-same-molecular-probe', 
    );
my %NCBItaxNos = (
    10090 => 'Mouse', # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?name=10090
    10116 => 'Rat', # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?name=10116
    148305 => 'RiceBlastFungus', # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?name=148305
    28985 => 'KluyveromycesLactis', # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?name=28985
    33169 => 'EremotheciumGossypii', # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?name=33169
    3702 => 'ThaleCress', # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?name=3702
    4530 => 'Rice', # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?name=4530
    4896 => 'FissionYeast', # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?name=4896
    4932 => 'BakersYeast', # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?name=4932
    5141 => 'NeurosporaCrassa', # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?name=5141
    5833 => 'MalariaParasite', # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?name=5833
    6239 => 'Nematode', # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?name=6239
    7165 => 'MalariaMosquito', # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?name=7165
    7227 => 'FruitFly', # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?name=7227
    9031 => 'Chicken', # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?name=9031
    9598 => 'Chimpanzee', # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?name=9598
    9606 => 'Human', # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?name=9606
    9615 => 'Dog', # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?name=9615
    );

use constant Species => 0;
use constant GeneId => 1;

&main (\@ARGV);

sub main {
    my ($args) = @_;
    my ($iFile, $oFile, $from, $to, $idxs, $ncbis);
    if (GetOptions ("input|i=s" => \$iFile, 
		    "output|o=s" => \$oFile, 
		    "from=s" => \$from, 
		    "to=s" => \$to, 
		    "idxs=s" => sub {
			$idxs ||= [];
			push (@$idxs, split(',', $_[1]));
		    }, 
		    "ncbi=s" => sub {
			my %nameToNum = map {$NCBItaxNos{$_} => $_} keys %NCBItaxNos;
			$ncbis ||= {};
			foreach my $name (split(',', $_[1])) {
			    if (my $num = $nameToNum{$name}) {
				$ncbis->{$num} = 1;
			    } else {
				die "unknown NCBI type \"$name\"";
			    }
			}
		    })) {
	&parse($iFile, $oFile, $from, $to, $idxs, $ncbis);
    }
}

sub parse {
    my ($iFile, $oFile, $from, $to, $idxs, $ncbis) = @_;

    my $in = \*STDIN;
    my $out = \*STDOUT;
    if ($iFile) {
	open (I, '<', $iFile) || 
	    die "couldn't input open \"$iFile\": $!\n";
	$in = \*I;
    }
    my $lineNo = 0;
    my $fmt = undef; # Stays undef for tab-delimited.
    my $fakeRead = undef; # For when we try to parser header but there is none.

    if ($ncbis) {
    } else {

	my $line = &readLine($in, \$lineNo);
	if ($line =~ m/^\#/) {
	    while ($line =~ m/^\#/ || $line =~ m/^$/) {
		$line = &readLine($in, \$lineNo);
	    }

	    if ($line =~ /$OrthPattern/) {
		if (defined $from && $from ne $1) {
		    die "$lineNo: expected mapping from \"$from\" in \"$line\"";
		}
		if (defined $to && $to ne $2) {
		    die "$lineNo: expected mapping to \"$to\" in \"$line\"";
		}
		($from, $to) = ($1, $2);
		$line = &readLine($in, \$lineNo);
	    } else {
		die "$lineNo: failed to match /$OrthPattern/: \"$line\"";
	    }
	    while ($line =~ m/^$/) {
		$line = &readLine($in, \$lineNo);
	    }

	    if ($line =~ /$ProvPattern/) {
		$line = &readLine($in, \$lineNo);
	    } else {
		die "$lineNo: failed to match /$ProvPattern/: \"$line\"";
	    }
	    while ($line =~ m/^$/) {
		$line = &readLine($in, \$lineNo);
	    }
	    my $headingsLine = $line;
	    $line = &readLine($in, \$lineNo);

	    my ($indexes, $headings);
	    if ($line =~ m/^-+[ \-]+/) {
		# space-delimited
		my @cols = split(/(-+)/, $line);
		my @widths;
		if (length($cols[0]) != 0) {
		    die "$lineNo: error reading ";
		}
		for (shift @cols; @cols; ) {
		    push (@widths, length(shift @cols) + length(shift @cols));
		}
		$fmt = join('', map {"A$_"} @widths);
		$headings = [unpack($fmt, $headingsLine)];
	    } else {
		# tab-delimited
		$headings = [split(/\t/, $headingsLine)]
	    }
	    $idxs = &headingIndexes($headings, $from, $to);
	} elsif (!defined $from || !defined $to || !defined $idxs) {
	    die "0: -from, -to and -idxs required if file has no heading";
	} else {
	    # There was no header, so need to fake a read.
	    $fakeRead = $line;
	}
    }
    if ($oFile) {
	open (O, '>', $oFile) || 
	    die "couldn't output open \"$oFile\": $!\n";
	$out = \*O;
    }
    &printHeadings($out, $from, $to);
    if ($ncbis) {
	my $lastHomoGroup = undef;
	my (@entries, %entrez2homoGroup);
	sub printEntries {
	    for (my $iEntry = 1; $iEntry < @entries; $iEntry++) {
		&printEntry($out, 
			    $entries[0][GeneId], $entries[$iEntry][GeneId], 
			    'NCBI', 
			    $iEntry == 1 ? $entries[0][Species] : undef, # don't repeat
			    $entries[$iEntry][Species]);
	    }
	}

	while (my $cells = &read($in, \$lineNo, undef, [0, 1, 4], undef)) {
	    my ($homoGroup, $speciesNo, $geneId) = @$cells;
	    if (!defined $lastHomoGroup) {
		$lastHomoGroup = $homoGroup;
	    } elsif ($lastHomoGroup != $homoGroup) {
		# Pair each nth entry with the first.
		&printEntries();
		$lastHomoGroup = $homoGroup;
		@entries = ();
	    }
	    if ($entrez2homoGroup{$geneId}) {
		warn "$lineNo: entrez gene ID already used for group $entrez2homoGroup{$geneId}";
	    }
	    $entrez2homoGroup{$geneId} = $homoGroup;
	    my $species = $NCBItaxNos{$speciesNo} || die "$lineNo: unknown species number \"$speciesNo\"";
	    if ($ncbis->{$speciesNo}) {
		push (@entries, [$species, $geneId]); # Species, GeneId
	    }
	}
	# Catch the last set (aggregating when EOF was encountered).
	&printEntries();
    } else {
	while (my $cells = &read($in, \$lineNo, $fmt, $idxs, $fakeRead)) {
	    undef $fakeRead;
	    &printEntry($out, @$cells, $from, $to);
	}
    }
    if ($iFile) { close $in }
    if ($oFile) { close $out }
}

sub readLine {
    my ($in, $pLineNo) = @_;
    $$pLineNo++;
    if (my $ret = <$in>) {
	chomp $ret;
	return $ret;
    } else {
	return undef;
    }
}

sub printHeadings {
    my ($out, $from, $to) = @_;
    print $out "\@prefix : <http://www.w3.org/2007/03/entrez-gene#> .
\@prefix rpt: <ftp://ftp.informatics.jax.org/pub/reports/HMD_RatHuman5.rpt#> .
\@prefix eg: <http://sw.neurocommons.org/2007/entrez-gene/> .
";
}

sub headingIndexes {
    my ($headings, $from, $to) = @_;
    my $ret = [];
    for (my $i = 0; $i < @$headings; $i++) {
	if ($headings->[$i] eq "$from EntrezGene ID") {
	    push (@$ret, $i);
	} elsif ($headings->[$i] eq "$to EntrezGene ID") {
	    push (@$ret, $i);
	} elsif ($headings->[$i] eq 'Data Attributes') {
	    push (@$ret, $i);
	}
    }
    if (@$ret != 3) {
	my $all = join('|', @$headings);
	die "did not find all the expected headings in $all";
    }
    return $ret;
}

sub read {
    my ($in, $pLineNo, $fmt, $idxs, $fakeRead) = @_;
    if (my $line = $fakeRead || &readLine($in, $pLineNo)) {
	my @cells;
	if ($fmt) {
	    # space separated
	    @cells = unpack($fmt, $line);
	    map {$cells[$_] =~ s/^ +//} @$idxs; # a few have leading spaces
	    map {$cells[$_] =~ s/ +$//} @$idxs; # and many have trailing spaces
	} else {
	    # tab separated
	    @cells = split(/\t/, $line);
	}
	return [map {$cells[$_]} @$idxs];
    } else {
	return undef;
    }
}

sub printEntry {
    my ($out, $fromGeneId, $toGeneId, $prov, $fromSpecies, $toSpecies) = @_;
    my $evidence = '';
    my $fromPredicate = 'from';
    my $toPredicate = 'to';
    if ($prov eq 'NCBI') {
	$evidence = "; 
  :evidence rpt:HomologoGene-calculated ";
	$fromPredicate = $toPredicate = 'symto';
    } elsif ($prov eq 'M') {
	$evidence = "; 
  :evidence rpt:MGI-curated ";
    } elsif ($prov eq 'C') {
	$evidence = "; 
  :evidence rpt:HomologoGene-calculated ";
    } elsif ($prov eq 'B') {
	$evidence = "; 
  :evidence rpt:MGI-curated , rpt:HomologoGene-calculated ";
    } else {
	my @provs = split(',', $prov);
	foreach my $p (@provs) {
	    if (my $mapped = $MGDprovs{$p}) {
	$evidence .= "; 
  :evidence rpt:$mapped ";		
	    } else {
		warn "\$lineNo: what is provenence \"$p\"?\n";
	    }
	}
    }
    my $fid = "eg:id-$fromGeneId";
    my $tid = "eg:id-$toGeneId";
    my $fromSpeciesStr = $fromSpecies ? "\n$fid a :${fromSpecies}-critter ." : '';
    my $toSpeciesStr = $toSpecies ? "\n$tid a :${toSpecies}-critter ." : '';
    print $out "[ a :PutativeOrthology ; 
  :$fromPredicate $fid ; :$toPredicate $tid $evidence] .$fromSpeciesStr$toSpeciesStr\n";
}
