#!/usr/bin/perl
#
# Script Name: filter-gene-association-wrapper.pl
# Date:        Sep 2005
# 
# Description:
#  This is a wrapper script to process the filtered gene-associations files 
#  created by filter-gene-association.pl script.  Briefly, this wrapper script 
#  does the following things:
#
# 1. cvs update of /go/gene-associations/ and /go/gene-associations/submission/ 
#    directories to get the most up-to-date files from the cvs repository.
#
# 2. For each gzipped gene-association file in /go/gene-associations/submission/ directory, 
#    call the filtering script to filter out bad data.  For example, to process 
#    the gene-associations file from SGD:
#
#    > filter-gene-association.pl -r -i gene-association.sgd.gz
#
# Note:  The script filter-gene-association.pl creates a .filtered.gz and .report file per GA
#        file. At this stage, the .filtered.gz file has no cvs version or date in its header
#        section.
#
# 3. For each .filtered.gz file, open up the appropriate gzipped file under
#    /go/gene-associations/ and write out its contents (except the cvs version line and the
#    cvs date lines) to a .current file 
#
# 4. Use checksum to determine if the new file is different from the current version.
#
#    i) If the files are the same, e-mail the report, delete report and filtered files;
#       no need to check anything into CVS
#
#    ii) If the files are different, write out the new CVS version and date to the header 
#        .filtered file, gzip, and move it to the ftp site (just another sandbox):
#
#        /share/ftp/go/gene-associations/ 
#
#    Also, checkin the moved copy to cvs.
#
# Author:     Maintained by the Gene Ontology Consortium 
#             Anand Sethuraman (anand@genome.stanford.edu)
#
###############################################################################

use strict;
use File::Copy;

############ Define GO CVS directory, OBO and abbreviations files #############
my $gocvsbase = '/share/ftp/pub/go';
my $gobinbase = '/share/ftp/pub/go/software/utilities';

my $geneAssocDir = "$gocvsbase/gene-associations/";
my $submissionDir = "$gocvsbase/gene-associations/submission/";

my $cvsDir = '/share/ftp/pub/go/gene-associations/';
my $cvs = '/tools/gnu/bin/cvs';
my $cvsLocation = '/share/go/cvs';

### variables to get data from gene association config files
my $configfile;
my (%email_report);

my $gzip = '/usr/bin/gzip --best';
my $gzipFileExtn = 'gz';
my $gzcat = '/usr/bin/gzcat';
my $gunzip = '/usr/bin/gunzip';
my $mailer = '/usr/bin/mailx';
my $DEBUG = 1;
my $grep = '/usr/xpg4/bin/grep'; ### Does not need to write anything to the standard output; 
                                 ### run with -q option; Exits with zero status if an input line
                                 ### is selected.
our $opt_i;  #interactive mode

use Getopt::Std;

getopts('i');

my $interactive = defined($opt_i);

print "Interactive Mode On!\n" if ($interactive);

### get month, day, and year for the subject line for the report email
my ($sec,$min,$hour,$mday,$mon,$year) = localtime(time);
$year += 1900;
$mon += 1;
($mon = "0" . $mon) if ($mon < 10);
($mday = "0" . $mday) if ($mday < 10);

### What day of the week
my $dayofweek = `date +%a`;
chomp $dayofweek;

# create /tmp/gatmp directory with process number in filename
my $gatmp = "/tmp/gatmp-$$";
mkdir $gatmp || die "Cannot create directory $gatmp: $!\n";

###############################################################################

### Do a CVS update to make sure all submitted gene-association files are current
chdir($geneAssocDir);
my $status = system("$cvs -q update -dP");

die "CVS update failed\n" if ($status);

opendir(NEW, $submissionDir) || die "Cannot open $submissionDir for reading : $!\n";

### read in all the gene-association files
my @newFiles = grep(/\.$gzipFileExtn$/i, readdir NEW);

for my $file (@newFiles) {

    ### do not include results of previous filter script if that were not cleaned up
    next if ( $file =~ /.filtered./ );

    ### skip time check if running interactively
    unless ($interactive) {

	my $now = time;

        ### If its not Saturday then check the modification time of the file.
	### If the file is less than 24 hours old, ie. its new, then check it.
        ### Otherwise assume that it has already been checked and jump out of the for loop.

	if ( $dayofweek ne "Sat" ) {
	    my ( @filestats ) = stat "submission/$file";

	    ### Only need to process files that have been committed in the past 24 hours
	    ### 24 hours * 60 minutes * 60 seconds = 86400 seconds

	    # print "\n* Stat of $file *\n" if ($DEBUG);
	    next if ( ($now - $filestats[9]) > 86400);

	} else {

	    ### To get here if it is Saturday.
	    ### Recheck all files, except the UniProt file unless its new.
	    ### If the file is over 24 hours old jump to the end of the for loop.

	    if ( $file eq "gene_association.goa_uniprot.gz" ) {
		my ( @filestats ) = stat "submission/$file";
		next if ( ($now - $filestats[9]) > 86400);
	    }
	}

    }

    # print "\n* Continuing on $file *\n" if ($DEBUG);

    my $base_file_name;

    if ($file =~ m/(.+)\.$gzipFileExtn/i) {
	$base_file_name = $1;
    }

    my $input = 'N';

    if ($interactive) {
	print "Process $file [y:N] : ";

	$input = <STDIN>;

	chomp($input);

	next unless ($input =~ /^[yY].*$/);
    }

    print "\n* Processing: $file *\n" if ($DEBUG);

    ### remove any old files first
    if (-e "/tmp/${base_file_name}.report") {
	system("/usr/bin/rm -f /tmp/${base_file_name}.report");
    }
    if (-e "/tmp/${base_file_name}.filtered.gz") {
	system("/usr/bin/rm -f /tmp/${base_file_name}.filtered.gz");
    }

    ### run the filtering script on this file to create .filtered.gz and .report files
    my $status = system ("/usr/bin/csh -c 'unlimit; $gobinbase/filter-gene-association.pl -q -r -i $submissionDir$file'");

    die "$gobinbase/filter-gene-association.pl failed\n" if ($status);

    # If file exists, normal case, generate the cksum. If the file
    # does not exist, create it and add it to the repository.

    my $checkSumOld = "";

    if (-e "$geneAssocDir$file") {
	$checkSumOld = &checkSumGAFile("$geneAssocDir$file");
    } else {
	system("/usr/bin/touch $geneAssocDir$file");
	print "/usr/bin/touch $geneAssocDir$file\n";
	system("$cvs -Q -d $cvsLocation add -m newfile $geneAssocDir$file");
	print "$cvs -Q -d $cvsLocation add $geneAssocDir$file\n";
	system("/usr/bin/csh -c 'unlimit; $cvs -Q -d $cvsLocation commit -m newfile $geneAssocDir$file'");
	print "$cvs -Q -d $cvsLocation commit -m newfile $geneAssocDir$file\n";
    }
	
    ### parse the config file
    &parse_gene_assoc_config_file($base_file_name);

    ### start examing the .filtered.gz file to see whether it should
    ### be checked into cvs or not

    ### sometimes all the lines in the sumitted GA file may have been
    ### filtered out and so its corresponding .filtered.gz file may
    ### not have any lines, except the header section.

    my $checkSumNew = &checkSumGAFile("/tmp/${base_file_name}.filtered.gz");

    ### is filtered file identical to the version in CVS
    ### force commit if in interactive mode
    if ($checkSumOld eq $checkSumNew) {

	print "- Filtered file is the same as the previous version\n" if ($DEBUG);

	# don't skip to next file if running interactively
	unless ($interactive) { 
	    ### no need to commit the newly filtered file into CVS
	    ### delete .filtered.gz and .report files and jump to next file.
	    if (-e "/tmp/${base_file_name}.report") {
		system("/usr/bin/rm -f /tmp/${base_file_name}.report");
	    }
	    if (-e "/tmp/${base_file_name}.filtered.gz") {
		system("/usr/bin/rm -f /tmp/${base_file_name}.filtered.gz");
	    }

	    ### move on to processing the next gene-association file
	    next; 	### jump to end of for loop
	} 
    }

    ### newly filtered file is different (or being forced) from the current GA file
    ### proceed with checking it into cvs

    my ($newversion, $newdate) = &get_cvs_header_info($base_file_name);

    # clean up old file
    if (-e "/tmp/${base_file_name}.$gzipFileExtn") {
	system("/usr/bin/rm -f /tmp/${base_file_name}.$gzipFileExtn");
    }

    ### before commmitting to CVS, write out the new CVS version and date into the header
    open (IN, "$gzcat /tmp/${base_file_name}.filtered.gz |") || die "Cannot open /tmp/${base_file_name}.filtered.gz for reading: $!\n";
    open (TMP, "> $gatmp/$base_file_name") || die "Cannot open $gatmp/$base_file_name for writing:$!\n";
    print TMP "!CVS Version: Revision: $newversion \$\n";
    print TMP "!GOC Validation Date: $newdate \$\n";
#
# Add submission date to header
#
    my ( @substats ) = stat "$submissionDir$file";
    my ($subsec,$submin,$subhour,$submday,$submon,$subyear) = localtime(@substats[9]);
    $submon = $submon + 1;
    $subyear = $subyear + 1900;
    print TMP "!Submission Date: $submon/$submday/$subyear\n";

    print TMP <<EndofMessage;
!
! The above \"Submission Date\" is when the annotation project provided
! this file to the Gene Ontology Consortium (GOC).  The \"GOC Validation
! Date\" indicates when this file was last changed as a result of a GOC
! validation and filtering process.  The \"CVS Version\" above is the
! GOC version of this file.
!
! Note: The contents of this file may differ from that submitted to the
! GOC. The identifiers and syntax of the file have been checked, rows of
! data not meeting the standards set by the GOC have been removed. This
! file may also have annotations removed because the annotations for the
! listed Taxonomy identifier are only allowed in a file provided by
! another annotation project.  The original submitted file is available from:
!  http://www.geneontology.org/gene-associations/submission/
!
! For information on which taxon are allowed in which files please see:
!  http://www.geneontology.org/GO.annotation.shtml\#script
!
EndofMessage

    while (defined(my $line = <IN>)) {
	print TMP "$line";
    }
    close(TMP);
    close(IN);

    ### now gzip, copy over to the ftp site and check into cvs
    my $status = system("$gzip $gatmp/$base_file_name");

    die "gzip $base_file_name failed: $!\n" if ($status);

    copy ("$gatmp/${base_file_name}.$gzipFileExtn", $cvsDir) || die "copy failed $gatmp/${base_file_name}.$gzipFileExtn $cvsDir: $!\n";
    my $dstr = `date +%Y%m%d`;
    chomp $dstr;
    my $status = system("/usr/bin/csh -c 'unlimit; $cvs -Q -d $cvsLocation commit -m $dstr $cvsDir${base_file_name}.$gzipFileExtn >> /dev/null'");

    die "cvs commit of $base_file_name failed: $!\n" if ($status);

    # if no errors there is no report
    if (-e "/tmp/${base_file_name}.report") {
	&mail_report("$base_file_name");
	system("/usr/bin/rm -f /tmp/${base_file_name}.report");
    }

    if (-e "/tmp/${base_file_name}.filtered.gz") {
	system("/usr/bin/rm -f /tmp/${base_file_name}.filtered.gz");
    }
    
    if (-e "$gatmp/${base_file_name}.$gzipFileExtn") {
	system("/usr/bin/rm -f $gatmp/${base_file_name}.$gzipFileExtn");
    }

    print "Committed new version and cleaned up\n" if ($interactive);

}

closedir(NEW);

exit;

###############################################################################
################################ FUNCTIONS ####################################
###############################################################################

###############################################################################
sub parse_gene_assoc_config_file {
###############################################################################

    my ($base_file_name) = @_;  ### eg: gene_association.sgd.gz

    $configfile = "${submissionDir}${base_file_name}.conf";

    my $file_name = "${base_file_name}.gz";
    my $species;

    open (META, $configfile) || die "Cannot open file $configfile for reading: $!\n";

    while ( <META> ) {
        chomp;

	if (m/^email\_report\=(.+)$/) {
            $email_report{ lc($file_name) } = $1;          
        }
    }

    close (META);

}

###############################################################################
sub mail_report {
###############################################################################

    my ($base_file_name) = @_;

    my $gene_assoc_file = $base_file_name . '.' . $gzipFileExtn;

    my $report_file_name = $base_file_name . '.report';

    my $report = "";

    open (REPORT, "/tmp/${report_file_name}") || die "Cannot open output file /tmp/${report_file_name} for reading: $!\n";
    while(defined (my $line = <REPORT>)) { 
	$report .= $line;
    }
    close(REPORT);

    my $recipients = $email_report{ lc($gene_assoc_file) };
    my $subject = "GO Filter Report: $gene_assoc_file, $mon-$mday-$year";

    if ($recipients =~ m/\@/) {
	open(MAIL, "| $mailer -s '$subject' -b 'cherry\@stanford.edu' $recipients") || die "Could not open mailer\n";
	print MAIL $report;
	close(MAIL);
    }
    else {
        print STDERR "- No recipient e-mail in config: $gene_assoc_file\n\n$subject\n$report\n"; 
    }
}

#####################################################################
sub get_cvs_header_info {
#####################################################################

### This method will query the CVS repository to find latest revision 
### number of the gene_association file 
    
    my ($base_file_name) = @_;
    
    my $date = `date +%m/%d/%Y`;

    chomp($date);

    open(CVSREV, "$cvs -d $cvsLocation status $geneAssocDir${base_file_name}.$gzipFileExtn |") 
	or die "Cannot find status of current file\n";

    my $nextversion = '';

    while (<CVSREV>) {
        chomp;

        if ( /\sRepository revision:\s+(\d+)\.(\d+)\s/ ) {

            my ($major, $minor) = ($1, $2);

            $minor++;

            $nextversion = $major . '.' . $minor;

        }
    }

    close (CVSREV);

    die "nextversion not set\n" if ($nextversion eq ''); 
    
    return($nextversion, $date);

}

###############################################################################
sub checkSumGAFile {
###############################################################################

    my $file = shift;

    die "$file doesn't exist: $!" if !(-e $file);

    my $checkSum = `$gzcat $file | $grep -v '^!' | /tools/gnu/bin/md5`;

    $checkSum =~ s/ .*$//;

    if ($?){

        die "An error occured when running md5 : $? : $!";

    }

    chomp ($checkSum);

#    print "checkSumGAFile: $file\t$checkSum\n" if ($DEBUG);

    return $checkSum;

}

###############################################################################