#!/usr/bin/perl

use strict;
use File::Copy;

# File locations and URL fragments used by this script.
#
# The two scratch files carry the process id ($$) in their names.  DOCDIR
# ends with a slash, so file names are appended to it directly.
use constant {
    PCOUNTFILE                  => "/var/tmp/tmp_pcount.$$",
    PUNFCOUNTFILE               => "/var/tmp/tmp_punfcount.$$",
    LOGFILE                     => '/share/go/logs/GO-counts.log',
    DOCDIR                      => '/share/ftp/go/www/',
    PROJECT_COUNT_FILE          => 'project-filtered-nums.html',
    PROJECT_UNFILTER_COUNT_FILE => 'project-unfiltered-nums.html',
    PROJECT_DATESTAMP           => 'project-datestamp.html',
    ASSOCIATIONSDIR             => '/share/ftp/go/gene-associations',
    SUBMISSIONDIR               => '/share/ftp/go/gene-associations/submission',
    READMEURL                   => 'gene-associations/readme/',
    DOWNLOADURL                 => 'http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/',
    HTTPDOWNLOAD                => 'gene-associations/',
};

# Abbreviated month names, indexed by the zero-based month number that
# localtime() returns.
my @mon2month = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);

#
# Row order of the generated HTML tables.
#
# Each entry is the part of a file name that follows "gene_association.".
# Entries prefixed with "sub/" refer to files in the submission directory.
#
# THIS DEFINES THE ORDER OF THE FILES IN THE OUTPUT.
# Please alphabetize using the genus species name, not the file name.
#
my @order = qw(
    PAMGO_Atumefaciens.gz
    goa_arabidopsis.gz
    tair.gz
    aspgd.gz
    jcvi.gz
    goa_cow.gz
    wb.gz
    cgd.gz
    goa_dog.gz
    goa_zebrafish.gz
    zfin.gz
    PAMGO_Ddadantii.gz
    dictyBase.gz
    fb.gz
    ecocyc.gz
    goa_chicken.gz
    gonuts.gz
    GeneDB_tsetse.gz
    goa_human.gz
    GeneDB_Lmajor.gz
    PAMGO_Mgrisea.gz
    goa_mouse.gz
    mgi.gz
    PAMGO_Oomycetes.gz
    gramene_oryza.gz
    goa_pdb.gz
    GeneDB_Pfalciparum.gz
    pseudocap.gz
    goa_rat.gz
    rgd.gz
    reactome.gz
    sgd.gz
    GeneDB_Spombe.gz
    pombase.gz
    sgn.gz
    goa_pig.gz
    GeneDB_Tbrucei.gz
    goa_uniprot_noiea.gz
    sub/goa_pdb.gz
    sub/reactome.gz
    sub/goa_uniprot.gz
    sub/goa_arabidopsis.gz
    sub/goa_zebrafish.gz
    sub/goa_mouse.gz
    sub/goa_rat.gz
    sub/goa_dog.gz
    sub/goa_pig.gz
);

# HTML description shown in the first table cell for each file, keyed by
# the part of the file name that follows "gene_association.".  Submission
# files are looked up here after their "sub/" prefix has been removed.
#
# The values are emitted into the page verbatim, so they must be valid
# HTML fragments: every <span> is closed and a literal ampersand is
# written as &amp;.  Entries that do not name a species carry no
# <span class="spp"> wrapper.
my %species = (
    'PAMGO_Atumefaciens.gz'=>'<span class="spp">Agrobacterium tumefaciens</span> str. C58<br>PAMGO',
    'goa_arabidopsis.gz'=>'<span class="spp">Arabidopsis thaliana</span><br>GO Annotations @ EBI',
    'tair.gz'=>'<span class="spp">Arabidopsis thaliana</span><br>TAIR',
    'aspgd.gz'=>'<span class="spp">Aspergillus nidulans</span><br>AspGD',
    'goa_cow.gz'=>'<span class="spp">Bos taurus</span><br>GO Annotations @ EBI',
    'wb.gz'=>'<span class="spp">Caenorhabditis elegans</span><br>WormBase',
    'cgd.gz'=>'<span class="spp">Candida albicans</span><br>CGD',
    'goa_dog.gz'=>'<span class="spp">Canis lupus familiaris</span><br>GO Annotations @ EBI',
    'goa_zebrafish.gz'=>'<span class="spp">Danio rerio</span><br>GO Annotations @ EBI',
    'zfin.gz'=>'<span class="spp">Danio rerio</span><br>ZFIN',
    'PAMGO_Ddadantii.gz'=>'<span class="spp">Dickeya dadantii</span><br>PAMGO',
    'dictyBase.gz'=>'<span class="spp">Dictyostelium discoideum</span><br>dictyBase',
    'fb.gz'=>'<span class="spp">Drosophila melanogaster</span><br>FlyBase',
    'ecocyc.gz'=>'<span class="spp">Escherichia coli</span><br>PortEco',
    'goa_chicken.gz'=>'<span class="spp">Gallus gallus</span><br>GO Annotations @ EBI',
    # Not a species name: plain text, like the other multispecies entries.
    'gonuts.gz'=>'Gene Ontology Normal Usage Tracking System (GONUTS)',
    'GeneDB_tsetse.gz'=>'<span class="spp">Glossina morsitans</span><br>Sanger&nbsp;GeneDB',
    'goa_human.gz'=>'<span class="spp">Homo sapiens</span><br>GO Annotations @ EBI',
    'GeneDB_Lmajor.gz'=>'<span class="spp">Leishmania major</span><br>Sanger&nbsp;GeneDB',
    'PAMGO_Mgrisea.gz'=>'<span class="spp">Magnaporthe grisea</span><br>PAMGO',
    'goa_mouse.gz'=>'<span class="spp">Mus musculus</span><br>GO Annotations @ EBI',
    'mgi.gz'=>'<span class="spp">Mus musculus</span><br>MGI',
    'PAMGO_Oomycetes.gz'=>'Oomycetes<br>PAMGO',
    'gramene_oryza.gz'=>'<span class="spp">Oryza sativa</span><br>Gramene',
    'goa_pdb.gz'=>'Protein Data Bank [multispecies]<br>GO Annotations @ EBI',
    'GeneDB_Pfalciparum.gz'=>'<span class="spp">Plasmodium falciparum</span><br>Sanger&nbsp;GeneDB',
    'pseudocap.gz'=>'<span class="spp">Pseudomonas aeruginosa</span> PAO1<br>PseudoCAP',
    'goa_rat.gz'=>'<span class="spp">Rattus norvegicus</span><br>GO Annotations @ EBI',
    'rgd.gz'=>'<span class="spp">Rattus norvegicus</span><br>RGD',
    'reactome.gz'=>'Reactome [multispecies]<br>CSHL &amp; EBI',
    'jcvi.gz'=>'Comprehensive Microbial Resource [multispecies]<br>JCVI',
    'sgd.gz'=>'<span class="spp">Saccharomyces cerevisiae</span><br>SGD<br>Stanford University',
    'GeneDB_Spombe.gz'=>'<span class="spp">Schizosaccharomyces pombe</span><br>PomBase<br>University of Cambridge, UK',
    'pombase.gz'=>'<span class="spp">Schizosaccharomyces pombe</span><br>PomBase<br>University of Cambridge, UK',
    'sgn.gz'=>'<span class="spp">Solanaceae</span><br>SGN',
    'goa_pig.gz'=>'<span class="spp">Sus scrofa</span><br>GO Annotations @ EBI',
    'GeneDB_Tbrucei.gz'=>'<span class="spp">Trypanosoma brucei</span><br>Sanger&nbsp;GeneDB',
    'goa_uniprot_noiea.gz'=>'UniProt [multispecies]<br>IEA annotations have been removed<br>GO Annotations @ EBI',
    'goa_uniprot.gz'=>'UniProt [multispecies]<br>GO Annotations @ EBI',
);

# README file name for each association file, keyed like %species.  The
# value is appended to READMEURL to build the README link of a table row.
#
# NOTE(review): goa_dog.gz and goa_pig.gz point at goa_pdb.README while
# every other per-species goa_* file uses goa.README -- confirm this is
# intended.
# NOTE(review): gonuts.gz has a %species entry but no entry here.
my %readme = (
    'aspgd.gz'              => 'aspgd.README',
    'cgd.gz'                => 'cgd.README',
    'dictyBase.gz'          => 'dictyBase.README',
    'ecocyc.gz'             => 'EcoCyc.README',
    'fb.gz'                 => 'fb.README',
    'GeneDB_Lmajor.gz'      => 'GeneDB_Lmajor.README',
    'GeneDB_Pfalciparum.gz' => 'GeneDB_Pfalciparum.README',
    'GeneDB_Spombe.gz'      => 'pombase.README',
    'GeneDB_Tbrucei.gz'     => 'GeneDB_Tbrucei.README',
    'GeneDB_tsetse.gz'      => 'GeneDB_tsetse.README',
    'goa_arabidopsis.gz'    => 'goa.README',
    'goa_chicken.gz'        => 'goa.README',
    'goa_cow.gz'            => 'goa.README',
    'goa_dog.gz'            => 'goa_pdb.README',
    'goa_human.gz'          => 'goa.README',
    'goa_mouse.gz'          => 'goa.README',
    'goa_pdb.gz'            => 'goa_pdb.README',
    'goa_pig.gz'            => 'goa_pdb.README',
    'goa_rat.gz'            => 'goa.README',
    'goa_uniprot.gz'        => 'goa.README',
    'goa_uniprot_noiea.gz'  => 'goa.README',
    'goa_zebrafish.gz'      => 'goa.README',
    'gramene_oryza.gz'      => 'gramene_oryza.README',
    'jcvi.gz'               => 'jcvi_prokaryotic.README',
    'mgi.gz'                => 'mgi.README',
    'PAMGO_Atumefaciens.gz' => 'PAMGO_Atumefaciens.README',
    'PAMGO_Ddadantii.gz'    => 'PAMGO_Ddadantii.README',
    'PAMGO_Mgrisea.gz'      => 'PAMGO_Mgrisea.README',
    'PAMGO_Oomycetes.gz'    => 'PAMGO_Oomycetes.README',
    'pombase.gz'            => 'pombase.README',
    'pseudocap.gz'          => 'PseudoCAP.README',
    'reactome.gz'           => 'reactome.README',
    'rgd.gz'                => 'rgd.README',
    'sgd.gz'                => 'sgd.README',
    'sgn.gz'                => 'sgn.README',
    'tair.gz'               => 'tair.README',
    'wb.gz'                 => 'WormBase.README',
    'zfin.gz'               => 'zfin.README',
);

# Append to the run log.
open (LOG, ">>" . LOGFILE) or &logdie ("Cannot open " . LOGFILE . "\n");

# Make LOG the default output handle, so that every later print without an
# explicit handle lands in the log, and switch that handle to unbuffered.
select (LOG);
$| = 1;

# Banner that marks the start of this run in the log.
print "\n-----------------------------\n",
      scalar(localtime()), "\tGO association counts\n",
      "-----------------------------\n";

# Working state shared by the top-level code and &dodir:
#   $filemoddate    - modification date (m/d/yyyy) of the file being reported
#   $filesubmitdate - declared here; not referenced by the code below
#   $filesize       - human-readable size of the file being reported
#   @filelist       - paths of the gene_association files found by &dodir
#   $filteredfile   - 1 for a filtered file, 0 for a submission file
my $filemoddate = 0;
my $filesubmitdate = 0;
my $filesize = 0;
# Start empty: assigning 0 here would seed the list with a bogus "0" entry.
my @filelist = ();
my $filteredfile = 0;
#my $filerevision = 0;

# &dodir tests directory entries by their bare names, so the walk has to
# start with the associations directory as the current directory.
chdir (ASSOCIATIONSDIR) or &logdie ("FATAL: cannot chdir to " . ASSOCIATIONSDIR . ": $!\n");

# Collect the gene_association files into @filelist.  A link count of 0
# makes &dodir stat the directory itself.
&dodir (ASSOCIATIONSDIR, 0);

#
# Pick up gene_association files that exist on disk but are not listed in
# @order.  Such a file is appended to @order, so its statistics end up at
# the bottom of the HTML table with the bare file name as description, and
# a notice goes to both the log and STDERR.
#
my $assocdir = ASSOCIATIONSDIR;

for my $xfile (@filelist) {

    # only paths found by &dodir that name a gene_association file
    next unless $xfile =~ /\/gene_association\./;

    # reduce the path to the suffix form used as key in @order
    $xfile =~ s#$assocdir/gene_association\.##;

    # already known
    next if grep { $_ eq $xfile } @order;

    # goa_uniprot.gz is never added automatically
    next if $xfile eq "goa_uniprot.gz";

    push(@order, $xfile);
    print "Found new file: $xfile\n";
    warn "Found new file: $xfile\n";
}

# Both table fragments are first written to scratch files in /var/tmp/;
# they are moved into the document directory later, and only if needed.
open (POUT, ">" . PCOUNTFILE) || &logdie ("Cannot open " . PCOUNTFILE . "\n");
open (PUNFOUT, ">" . PUNFCOUNTFILE) || &logdie ("Cannot open " . PUNFCOUNTFILE . "\n");

# Emit one table row per file, in the order defined by @order.
foreach my $entry (@order) {

    # Work on a copy: the loop variable is an alias into @order, and the
    # prefix removal below must not rewrite the list itself.
    my $sfile = $entry;
    my $file = "";

    # A leading "sub/" marks a file from the submission directory.  The
    # match is anchored so that a file name that merely contains "sub" is
    # not mistaken for one.
    if ( $sfile =~ s{^sub/}{} ) {
        $file = SUBMISSIONDIR . "/gene_association." . $sfile;
        $filteredfile = 0;
    } else {
        $file = ASSOCIATIONSDIR . "/gene_association." . $sfile;
        $filteredfile = 1;
    }

    #  index to the array from stat()
    #  dev,ino,mode,nlink,uid,gid,rdev,size,atime,mtime,ctime,blksize,blocks
    my @stats = stat $file;

    # A file that cannot be examined has no size or date to report; say so
    # in the log and on STDERR and leave it out of the table.
    unless (@stats) {
        my $reason = "$!";
        print "Cannot stat $file: $reason\n";
        warn "Cannot stat $file: $reason\n";
        next;
    }

    # human-readable size, one decimal place above 1023 bytes
    my $bytes = $stats[7];

    if ($bytes < 1024) {
        $filesize = "$bytes b";
    } elsif ($bytes < 1048576) {
        $filesize = sprintf ("%.1f kb", $bytes / 1024);
    } elsif ($bytes < 1073741824) {
        $filesize = sprintf ("%.1f mb", $bytes / 1024 / 1024);
    } else {
        $filesize = sprintf ("%.1f gb", $bytes / 1024 / 1024 / 1024);
    }

    # date the file was last modified, as m/d/yyyy
    my ($mday, $mon, $year) = (localtime($stats[9]))[3,4,5];
    $filemoddate = ($mon + 1) . "/$mday/" . ($year + 1900);

    # List-form pipe open: the file name is handed to gzcat as a single
    # argument and is never interpreted by a shell.
    open (GA, '-|', '/usr/bin/gzcat', $file) || &logdie ("Cannot gzcat $file\n");

    # total annotations, non-IEA annotations, distinct gene products
    my ($tcount, $nonieacount, $gpcount) = &associationcounts();

    close GA;

#    open (CVSREV, "/usr/local/bin/cvs -d /share/go/cvs status $file |") || &logdie ("Cannot determine status of $file: $!\n");
#
#    while (<CVSREV>) {
#        chomp;
#
#        if ( /\sRepository revision:\s+(\d+\.\d+)\s/ ) {
#            $filerevision = $1;
#        }
#    }
#
#    close (CVSREV);

    # name should be defined
    unless (defined($species{$sfile})) {
        $species{$sfile} = $sfile;
        # STDOUT outs to logfile, STDERR goes to cron user
        print "Found $sfile, no description for this file available.\n";
        warn "Found $sfile, no description for this file available.\n";
    }

    # removed files with no annotations.  This includes
    # goa_arabidopsis, goa_rat, goa_mouse, GeneDB_Spombe, and several others
    next if $gpcount == 0;

    # Files without a %readme entry get a link to the README directory.
    my $readmefile = defined($readme{$sfile}) ? $readme{$sfile} : '';

    my ($out, $download);

    if ( $filteredfile ) {
        # filtered files are only listed with more than 9 gene products
        next unless $gpcount > 9;
        $out = \*POUT;
        $download = DOWNLOADURL . "gene_association.$sfile?rev=HEAD";
    } else {
        $out = \*PUNFOUT;
        $download = HTTPDOWNLOAD . "submission/gene_association.$sfile";
    }

    print $out "<tr><td>$species{$sfile}</td>\n";
    print $out "<td>$gpcount</td>\n";
    print $out "<td>$tcount<br>($nonieacount non-IEA)</td>\n";
    print $out "<td>$filemoddate</td>\n";
    print $out "<td><ul><li><a href=\"$download\">annotations</a> [$filesize]</li><li><a href=\"" . READMEURL . "$readmefile\">README</a></li></ul></td></tr>\n";
}

# Buffered write errors only surface at close, so check it.
close (POUT) || &logdie ("Cannot close " . PCOUNTFILE . ": $!\n");
close (PUNFOUT) || &logdie ("Cannot close " . PUNFCOUNTFILE . ": $!\n");

#
# Publish the freshly generated tables only if at least one of them differs
# from the copy already in DOCDIR.  Both the filtered and the unfiltered
# table are compared.  cksum prints one line per file:
#   <checksum> <byte count> <file name>
#
my @newfiles = (PCOUNTFILE, PUNFCOUNTFILE);
my @livefiles = (DOCDIR . PROJECT_COUNT_FILE, DOCDIR . PROJECT_UNFILTER_COUNT_FILE);

open (SUMOUT, '-|', '/usr/bin/cksum', @newfiles, @livefiles) || &logdie ("cksum failed: $!\n");

# checksum and size reported for each file name
my %cksum = ();

while ( <SUMOUT> ) {
    chomp;

    my ($sum, $bytes, $name) = split(' ', $_, 3);

    $cksum{$name} = "$sum $bytes" if defined($name);
}

# The exit status is not checked on purpose: a published copy that does not
# exist yet produces no output line, and that case is treated as a change.
close (SUMOUT);

my $changed = 0;

foreach my $i (0 .. $#newfiles) {
    my $newsum = $cksum{$newfiles[$i]};
    my $cachesum = $cksum{$livefiles[$i]};

    $changed = 1 unless (defined($newsum) && defined($cachesum) && $newsum eq $cachesum);
}

if ($changed) {

    # clone LOG filehandle to STDOUT and STDERR to capture output from cvs commit
    open (STDOUT, ">&LOG") || &logdie ("Could not dup LOG filehandle: $!\n");
    open (STDERR, ">&LOG") || &logdie ("Could not dup LOG filehandle: $!\n");

    # tmp files moved to production directory, a sandbox of the GO CVS
    move(PCOUNTFILE, DOCDIR . PROJECT_COUNT_FILE) || &logdie ("Cannot move " . PROJECT_COUNT_FILE . " into place in " . DOCDIR . "\n");
    move(PUNFCOUNTFILE, DOCDIR . PROJECT_UNFILTER_COUNT_FILE) || &logdie ("Cannot move " . PROJECT_UNFILTER_COUNT_FILE . " into place in " . DOCDIR . "\n");

    # current date; localtime() returns the year as an offset from 1900
    my ($mon, $day, $year) = (localtime())[4,3,5];
    $year += 1900;

    # map the zero-based month number to its abbreviated name
    my $month = $mon2month[$mon];

    # used as the commit message; contains no whitespace
    my $dstr = "$year/$month/$day";

    # overwrite file with current date
    open (DATE, "/bin/date '+%B %e, %Y' |") || &logdie ("Cannot run date command: $!\n");
    open (DATEFILE, ">" . DOCDIR . PROJECT_DATESTAMP) || &logdie ("Cannot open project-datestamp.html: $!\n");

    while ( <DATE> ) {
        chomp;
        print DATEFILE "$_\n";
    }

    close (DATEFILE);
    close (DATE);

    # Commit files.  The command changes into DOCDIR first, so all three
    # files are named relative to it.
    my $cmd = "cd " . DOCDIR . " && /usr/local/bin/cvs -d /share/go/cvs commit -m $dstr " . PROJECT_COUNT_FILE . " " . PROJECT_UNFILTER_COUNT_FILE . " " . PROJECT_DATESTAMP;

    my $status = system($cmd);

    # system() returns -1 (with $! set) if the command could not be started;
    # otherwise a non-zero value is the wait status of the command.
    if ($status == -1) {
        &logdie ("CVS commit failed: $!\n");
    } elsif ($status) {
        &logdie ("CVS commit failed: exit status " . ($status >> 8) . ", signal " . ($status & 127) . "\n");
    }

} else {
    print "No change needed. File not changed and CVS commit not run.\n";
    # no changes -- remove tmp files
    unlink PCOUNTFILE;
    unlink PUNFCOUNTFILE;
}

close LOG;

exit 0;

#
# Recursively collect gene_association files below a directory.
#
#   $dir   - path of the directory to scan.  It must also be the current
#            working directory, because entries are tested by bare name.
#   $nlink - link count of $dir, or 0 to have it looked up here.
#
# Every regular file whose name contains "gene_association." is appended
# to @filelist as "$dir/<name>".  Symbolic links, entries whose name
# contains "submission", and .svn directories are skipped.
#
# A link count of 2 is taken to mean "no subdirectories".  Otherwise
# entries are examined until (link count - 2) directories have been seen.
# The sub changes into each subdirectory and back out again, and dies via
# &logdie if a directory cannot be read or entered.
#
sub dodir {
    my ($dir,$nlink) = @_;
    my ($dev,$ino,$mode,$subcount);
    my @filenames;

    # At the top level, we need to find nlink ourselves.

    ($dev,$ino,$mode,$nlink) = stat($dir) unless $nlink;

    # Get the list of files in the current directory.
    opendir(DIR, $dir) || &logdie ("Cannot open \"$dir\" for reading: $!\n");
    @filenames = readdir(DIR);
    closedir(DIR);

    if ($nlink == 2) { # This dir has no subdirectories.
        for (@filenames) {
            next if $_ eq '.';
            next if $_ eq '..';
            next if -l $_;
            next if $_ =~ /submission/;
            if ($_ =~ /gene_association\./ && -f $_ ) {
                push (@filelist, "$dir/$_");
            }
        }
    } else { # This dir has subdirectories.
        $subcount = $nlink - 2;
        for (@filenames) {
            next if $_ eq '.';
            next if $_ eq '..';
            next if -l $_;
            next if $_ =~ /submission/;
            if ( $_ =~ /gene_association\./ && -f $_ ) {
                push (@filelist, "$dir/$_");
            }
            next if $subcount == 0;  # Seen all the subdirs?

            # Get link count and check for directoriness.

            ($dev,$ino,$mode,$nlink) = lstat($_);
            next unless -d $_;

            # One more of the expected subdirectories has been seen.
            --$subcount;

            # Version-control bookkeeping is not part of the data.  readdir
            # returns bare names, so compare the name itself.
            next if $_ eq '.svn';

            # It really is a directory, so do it recursively.  The
            # parentheses matter: "chdir $_ || ..." would apply || to $_
            # and never notice a failed chdir.

            chdir($_) || &logdie ("Cannot chdir to $_: $!");
            &dodir("$dir/$_", $nlink);
            chdir('..') || &logdie ("Cannot chdir back to $dir: $!");
        }
    }
}


#
# Tally the gene association data readable from the already-open GA handle.
#
# Lines that start with "!" and lines with fewer than nine tab-separated
# columns are ignored.  Returns a three-element list:
#   - number of annotation lines
#   - number of annotation lines whose seventh column is not 'IEA'
#   - number of distinct values in the second column (gene products)
#
sub associationcounts {

    my $total = 0;
    my $noniea = 0;

    # second-column values seen so far
    my %seen = ();

    while ( <GA> ) {
        chomp;

        # comment line
        next if /^\!/;

        my @field = split(/\t/);

        # simple check of an annotation, the aspect is stated
        next unless defined($field[8]);

        $total++;
        $noniea++ unless $field[6] eq 'IEA';
        $seen{$field[1]} = 1;
    }

    return ($total, $noniea, scalar(keys %seen));
}

# Report a fatal error and terminate: the message is printed to the
# currently selected output handle and then raised with die.
#   $_[0] - the message text
sub logdie
{
    my ($message) = @_;

    print $message;
    die $message;
}
