#!/usr/bin/perl
#

# filter-paint-association files to segregate individual files per reference genome species
# version: $Revision: 1.11 $
# date: $Date: 2011/11/30 20:36:39 $
#
# specification of the gene_association format is defined at:
#   http://www.geneontology.org/GO.annotation.html#file
#
# Requires Perl 5.6.1 or later
#
# POD documentation is at the bottom of the file.
#  to see the documentation use pod2text from the command line
#
# Maintained by BBOP for the Gene Ontology Consortium
#  author: Suzanna Lewis (suzi@berkeleybop.org) script, maintained since August 2010
#
###############################################################################

use strict;
use Getopt::Long;
use IO::Compress::Gzip qw(gzip $GzipError);

############ Get and check user passed in single character switches ##########
# process command line arguments
my $opt_h = '';
my $opt_d = '';
my $opt_r = '';
my $opt_v = '';

GetOptions(
    'verbose' => \$opt_v,
    'help' => \$opt_h,
    'datadirectory=s' => \$opt_d,
    'report' => \$opt_r,
    );

############ Define GO SVN directory, OBO and abbreviations files #############

# if GO SVN directory structure changed this will also need to change
my $paintdir = $opt_d;
my $submissiondir = "$paintdir/pre-submission";

# separate gene_association.paint_XXX.gz files will be written out to the gene-associations/submission/paint/pre-submission dir
# (where XXX will be substituted by the name of one of the ref. genome organisms)
###############################################################################

my $printhelp = defined($opt_h);
my $debug = defined($opt_v);
# TRUE if the user wants the details report
# otherwise just the summary is provided
my $writereport = defined($opt_r);

###############################################################################
################## Define variables ###################
###############################################################################

# boiler plate configuration file used for all gaf files
my %conf = ();
&populate_conf_content;

# hash by taxon ID to hash of organism name (for naming the file) 
# and an array of GAF lines
my %organism = ();
# fill in the hash with reference genome taxon IDs
&populate_organism_hash;

# Column position of GO database ID
use constant DBID => 1;
# Column position of WITH
use constant WITH => 7;
# Column position of GO aspect
use constant ASPECT => 8;
# Column position of taxon ID
use constant TAXON => 12;

# For report writing
my $ancestors;
my $proteins;
my $ances_cnt;
my $annot_cnt;

### parse PAINT gene associations files
&parse_paint_assoc_files;

### write reference genome organism specific gene association files
&write_gene_assoc_files;

my @error_buf = ();

exit;

###############################################################################
################################           ####################################
################################ FUNCTIONS ####################################
################################           ####################################
###############################################################################


###############################################################################
sub parse_paint_assoc_files
{
###############################################################################

    my $base_file_name = "";
    my $dirpath = "/tmp/";
    my $savedirpath = "";
    my $input_gaf_version = 1;
    my @report_buf = ();

    opendir(DIR, $paintdir) || die "can't open $paintdir: $!\n";

    my $total_ancestors = 0;
    my $total_terms = 0;
    my $total_uniques = 0;
    my $total_new;
    my $total_ances = {'C' => 0, 'P' => 0, 'F' => 0};
    my $total_annot = {'C' => 0, 'P' => 0, 'F' => 0};
    my $total_fams = 0;

    while ( defined(my $file = readdir (DIR))) 
    {
	if ($file =~ /PTHR\d{5}$/)
	{
	    $total_fams++;
	    $ancestors = {};
    	    $proteins = {};
    	    $ances_cnt = {'C' => 0, 'P' => 0, 'F' => 0};
    	    $annot_cnt = {'C' => 0, 'P' => 0, 'F' => 0};

	    my $gaf_file = "$paintdir/$file/$file.gaf";
	    push (@report_buf, "\n$file\n") if ($writereport);
            print STDERR "Filtering $gaf_file\n" if ($debug);
	    &parse_paint_assoc_file($gaf_file);

    	    if ($writereport)
    	    {
		my $ancestor_total = keys %$ancestors;
		$total_ancestors += $ancestor_total;
		push(@report_buf, "Number of unique ancestors annotated:\t$ancestor_total\n");
		my $ances_sum = $ances_cnt->{'C'} + 
				     $ances_cnt->{'P'} + 
				     $ances_cnt->{'F'};
		$total_terms += $ances_sum;
		my $reportline = "Number of terms annotated to ancestors:\t$ances_sum [";
		$reportline .= " $ances_cnt->{'P'} BP,";
		$reportline .= " $ances_cnt->{'C'} CC,";
		$reportline .= " $ances_cnt->{'F'} MF ]\n";
		$total_ances->{'P'} += $ances_cnt->{'P'};
		$total_ances->{'C'} += $ances_cnt->{'C'};
		$total_ances->{'F'} += $ances_cnt->{'F'};
		push(@report_buf, $reportline);
	
		my $protein_total = keys %$proteins;
		$total_uniques += $protein_total;
		push(@report_buf, "Number of unique proteins annotated:\t$protein_total\n");

		my $annot_sum = $annot_cnt->{'C'} + 
				   $annot_cnt->{'P'} + 
				   $annot_cnt->{'F'};
		$total_new += $annot_sum;
		$reportline = "Number of new protein annotations:\t$annot_sum [";
		$reportline .= " $annot_cnt->{'P'} BP,";
		$reportline .= " $annot_cnt->{'C'} CC,";
		$reportline .= " $annot_cnt->{'F'} MF ]\n";
		$total_annot->{'P'} += $annot_cnt->{'P'};
		$total_annot->{'C'} += $annot_cnt->{'C'};
		$total_annot->{'F'} += $annot_cnt->{'F'};
		push(@report_buf, $reportline);

    	    }
	}
    }

    if ($writereport) 
    {
	push(@report_buf, "\nTotal number of families annotated:\t$total_fams\n");
	push(@report_buf, "Total number of unique ancestors annotated:\t$total_ancestors\n");
	my $reportline = "Total number of terms annotated to ancestors:\t$total_terms [";
	$reportline .= " $total_ances->{'P'} BP,";
	$reportline .= " $total_ances->{'C'} CC,";
	$reportline .= " $total_ances->{'F'} MF ]\n";
	push(@report_buf, $reportline);
	push(@report_buf, "Total number of unique proteins annotated:\t$total_uniques\n");
	$reportline = "Total number of new protein annotations:\t$total_new [";
	$reportline .= " $total_annot->{'P'} BP,";
	$reportline .= " $total_annot->{'C'} CC,";
	$reportline .= " $total_annot->{'F'} MF ]\n";
	push(@report_buf, $reportline);
    }
    closedir DIR;

    if ($writereport)
    {
	print STDERR "Writing report to $submissiondir/paintreport.txt\n";
	open (REPORT, "> $submissiondir/paintreport.txt") || die "Cannot open report file $submissiondir/paintreport.txt for writing: $!\n";
	if (@error_buf) {
	    print REPORT "********** ERROR(S) **********\n\n";
	    foreach my $line (@error_buf) 
	    {
 	        print REPORT $line;
	    }
	    print REPORT "\n********** END ERRORS **********\n";
	}
	foreach my $line (@report_buf) 
	{
 	    print REPORT $line;
	}
	close REPORT;
    } 
}

###############################################################################
sub parse_paint_assoc_file
{
###############################################################################
    my ($gaf_file) = @_;

    if (!(open (INPUT, $gaf_file))) {
	print STDERR "Cannot open GAF file $gaf_file for reading: $!\n";
	push(@error_buf, "Could not open $gaf_file for reading\n");
	return;
    }
    my %others = ();

    # Begin input loop
    while ( defined(my $line = <INPUT>) )
    {
	chomp $line;
	
	$line =~ s/GO_REF/PAINT_REF/;
	$line =~ s/UniProtKB\/Swiss-Prot/UniProtKB/;
	$line =~ s/colocates_with/colocalizes_with/;
	$line =~ s/taxon:284812/taxon:4896/;

	# split TAB delimited columns
	my @cols = split(/\t/, $line);

	my $db = $cols[0];
	my $tax_id = $cols[TAXON];

	if ($line !~ /\tSTOP\t/ && $line !~ /\tCUT\t/) {
	    if ($db !~ /PANTHER/ && $tax_id =~ /^taxon/)
	    {
	        # this is one of the child/leaf node (i.e. extant protein) annotations.
	        $annot_cnt->{$cols[ASPECT]}++;
	        $proteins->{$cols[DBID]} = 1;
	        ##print "Added $cols[DBID] to proteins from:\n$line\n";

	        my $found = 0;
	        my @taxa = keys %organism;
	        while (@taxa && !$found)
	        {
		    my $taxon = pop(@taxa);
		    if ($taxon =~ /^$tax_id$/) 
		    {
		        # clean up the with column first to contain solely the ancestor
		        # Only do this for those without "NOT" because
		        # those don't have anything in the "with" column
		        if ($line !~ /\tNOT/)
		        {
		    	    $line =~ s/\t\S*\|?(PANTHER:PTN\d{9})\S*\t/\t$1\t/;
		    	    if (!$1) {
				print STDERR "Couldn't find PTHR family in with column, in line: $line\n";
			        exit 1;
		    	    }
		        }
			# update to the new evidence codes
			$line =~ s/\tIDS\t/\tIBD\t/;
			$line =~ s/\tIAS\t/\tIBA\t/;
			$line =~ s/\tIMR\t/\tIKR\t/;

		        push(@{$organism{$taxon}{'lines'}}, "$line\n");
		        $found = 1;
		    }
	        }
	        if (!$found)
	        {
		    $others{$tax_id} = $db if ($debug);
		    $line =~ s/\t\S*\|?(PANTHER:PTN\d{9})\S*\t/\t$1\t/;
		    push(@{$organism{'other'}{'lines'}}, "$line\n");
	        }
	    } elsif ($db =~ /PANTHER/ && $cols[WITH] !~ /PANTHER/) 
	    {
	        $ancestors->{$cols[DBID]} = 1;
	        $ances_cnt->{$cols[ASPECT]}++;
	    }
	}
    }

    if ($debug)
    {
	##foreach my $taxa (keys %others)
	##{
	##    print STDERR "Did not find $taxa for $others{$taxa} in organism hash\n";
	##}	    
	##foreach my $taxon (keys %organism)
	##{
	##    my $linecount = @{$organism{$taxon}{'lines'}};
	##    print STDERR "Found $linecount annotations for $taxon\n";
	##}
    }

    close(INPUT);

}

###############################################################################
sub write_gene_assoc_files
{
###############################################################################
    my ($sec,$min,$hour,$mday,$mon,$currentyear) = localtime(time);
    $currentyear += 1900;
    $mon += 1;
	    ($mon = "0" . $mon) if ($mon < 10);
	    ($mday = "0" . $mday) if ($mday < 10);
	    my $datetoday = "$mon/$mday/$currentyear";

	    my $gene_assoc_header;
	    $gene_assoc_header .= "!gaf-version: 2.0\n";
	    $gene_assoc_header .= "!\n";
	    $gene_assoc_header .= "! $datetoday\n";
	    $gene_assoc_header .= "!\n";

	    # probably there is a better way to do this, but need to
	    # put all the alternate taxon ids for a single MOD into a
	    # single file. Doing this by first clearing the file and
	    # just writing out the header


	    foreach my $taxon (keys %organism)
	    {
		my $gaf_file = "$submissiondir/gene_association.paint_${organism{$taxon}{'name'}}.gaf";
		open (GAF, "> ${gaf_file}") || die "Cannot open GAF file $gaf_file for writing: $!\n";
		print GAF $gene_assoc_header;
		my $linecount = @{$organism{$taxon}{'lines'}};

		print GAF "! $linecount associations from PAINT for ${organism{$taxon}{'name'}}\n";
		print STDERR "! $linecount associations from PAINT for ${organism{$taxon}{'name'}}\n" if ($debug);
		print GAF "!\n";

		close GAF;
	    }

	    # now write out the contents using append
	    foreach my $taxon (keys %organism)
	    {
		my $base_file_name = "$submissiondir/gene_association.paint_${organism{$taxon}{'name'}}";
		my $gaf_file = "$base_file_name.gaf";
		print "Writing gene association file ${gaf_file}\n" if ($debug);

		write_gene_assoc_config_file($base_file_name);

		open (GAF, ">> ${gaf_file}") || die "Cannot write output ${gaf_file}: $!\n";

		my @lines = @{$organism{$taxon}{'lines'}};
		foreach my $line (@lines)
		{
		    print GAF $line;
		}
		close GAF;

	    	my $status = gzip $gaf_file => "$base_file_name.gz" 
			or die "gzip failed: $GzipError\n";

		#close GAFGZ;
	    }
	}

	###############################################################################
	sub write_gene_assoc_config_file
	{
	###############################################################################
	    my ($base_file_name) = @_;  ### eg: gene_association.sgd

	    my $configfile = "${base_file_name}.conf";

	    open (CONF, "> $configfile") || die "Cannot open config file $configfile for writing: $!\n";

	    foreach my $label (keys %conf)
	    {
		my $confline = $label . ":\t" . $conf{$label} . "\n"; 
		print CONF $confline;
	    }

	    close (CONF);

	}

	###############################################################################
	sub populate_organism_hash
	###############################################################################
	{
	    $organism{'taxon:9606'} = { 'name' => 'goa_human', 'lines' => [] };
	    $organism{'taxon:10090'} = { 'name' => 'mgi', 'lines' => [] };
	    $organism{'taxon:10116'} = { 'name' => 'rgd', 'lines' => [] };
	    $organism{'taxon:559292'} = { 'name' => 'sgd', 'lines' => [] };
	    $organism{'taxon:4932'} = { 'name' => 'sgd', 'lines' => [] };
	    $organism{'taxon:237561'} = { 'name' => 'cgd', 'lines' => [] };
	    $organism{'taxon:44689'} = { 'name' => 'dictyBase', 'lines' => [] };
	    $organism{'taxon:7227'} = { 'name' => 'fb', 'lines' => [] };
	    $organism{'taxon:4896'} = { 'name' => 'pombase', 'lines' => [] };
	    $organism{'taxon:3702'} = { 'name' => 'tair', 'lines' => [] };
	    $organism{'taxon:6239'} = { 'name' => 'wb', 'lines' => [] };
	    $organism{'taxon:7955'} = { 'name' => 'zfin', 'lines' => [] };
	    $organism{'taxon:83333'} = { 'name' => 'ecocyc', 'lines' => [] };
	    $organism{'taxon:9031'} = { 'name' => 'goa_chicken', 'lines' => [] };
	    $organism{'other'} = { 'name' => 'other', 'lines' => [] };
	}

###############################################################################
sub check_options
###############################################################################
{
    if ($printhelp)
    {

	print STDERR <<END;

      Usage:  $0 [-h] [-v] [-r] [-i input file]

	  -h displays this message
	  -d verbose mode (for debugging), quiet mode is default
          -r write e-mail report
	  -i input to present a standard file as input.
             Default is all PAINT directories

	  examples:

	      check a particular file for annotations

		  % $0 -i gene-associations/submission/paint/PTHR22573/PTHR22573.save.gaf

	      run in debug mode

		  % $0 -d

END

    exit;

    }

}

###############################################################################
sub populate_conf_content
###############################################################################
{
    %conf = ( 'project_name' => 'RefGenome',
	      'contact_email' => 'pantherdb-paint-users@lists.sourceforge.net',
              'project_url' => 'http://gocwiki.geneontology.org/index.php/Paint',
              'funding_source' => 'NIH (NHGRI)', 
              'email_report' => 'pantherdb-paint-users@lists.sourceforge.net' );
}

###############################################################################

__END__

=head1 NAME

I<filter-paint-association.pl> - creates separate gaf files for each ref. genome species from paint files

=head1 SYNOPSIS

=over

=item print usage

  filter-paint-association.pl -h

=back

=over

=item run checks on the specified gene association file

  filter-paint-association.pl -i gene_association.sgd.gz

=back

=head1 DESCRIPTION

Filter paint gene association files to generate GAFs that are categorized according to organism
rather than according to protein family. The newly generated GAF files are committed to the
standard gene_association/submission/paint/pre-submission directory.

=head1 ARGUMENTS

Arguments can control debug messages, reporting, and working from a specific protein family.

=over

=item -h

print usage message

=item -d

debug mode

=item -i

name of input paint gene association file.

=item -r

creates a report file in the submission/paint/pre-submission directory: .paint_report

=back

=head1 INPUT

The specification of the gene_association format is defined at:
http://www.geneontology.org/GO.annotation.html#file

=over

=back

=head1 OUTPUT

The default output is a set of gene association association files
defined according to organism. These are written into the 
gene_association/submission/paint/pre-submission directory, gzipped and committed to SVN.
When using -r option, an additional output file will be created: .paint_report

=cut

