#!/usr/bin/perl
use strict;

my @dbs;
if ($ARGV[0] && $ARGV[0] =~ /^\-.+/) {
    my $opt = shift @ARGV;
    if ($opt eq '-h' || $opt eq '--help') {
        usage();
        exit 0;
    }
    elsif ($opt eq '--db') {
        push(@dbs, shift);
    }
    elsif ($opt eq '--refg') {
        @dbs = qw(chicken dictybase ecocyc fb genedb_spombe human mgi rgd sgd tair wb zfin);
    }
    else {
        print STDERR "Unknown option: $opt\n";
        exit 1;
    }
}

if (!@dbs) {
    @dbs = @ARGV;
}

if (!@dbs) {
    print usage();
    exit 1;
}

my %errh = ();
my %err_count_by_type = ();

foreach my $db (@dbs) {
    if ($db =~ /gp2protein\.(\S+)\.gz/) {
        $db = $1;
    }
    my $idspace = $db;

    %err_count_by_type = ();

    my $gp2proteinf = "gp2protein.$db.gz";
    my $assocf = "../gene-associations/gene_association.$db.gz";
    if ($db eq 'uniprot') {
        $idspace = 'UniProtKB';
        $assocf = "../gene-associations/gene_association.goa_uniprot.gz";
    }
    if ($db eq 'human') {
        $idspace = 'UniProtKB';
        $assocf = "../gene-associations/gene_association.goa_human.gz";
    }
    if ($db eq 'chicken') {
        $idspace = 'UniProtKB';
        $assocf = "../gene-associations/gene_association.goa_chicken.gz";
    }

# ----------------------------------------
# parse assocs
# ----------------------------------------
    my %geneh = ();
    if (!$assocf) {
        flag('no_file',"no file: $assocf");
        next;
    }
    open(F,"gzip -dc $assocf |") || die "cannot open $assocf";
    while (<F>) {
        next if (/^\!/);
        my @vals = split;
        if (lc($vals[0]) ne lc($idspace)) {
            flag('db_mismatch_in_assocfile',"$vals[0] != $idspace");
        }
        $geneh{$vals[1]} = $vals[11];
    }
    close(F);

# ----------------------------------------
# parse gp2f
# ----------------------------------------
    
    if (!$gp2proteinf) {
        flag('no_file',"no file: $gp2proteinf");
        next;
    }
    open(F,"gzip -dc $gp2proteinf |") || die "cannot open $gp2proteinf";
    my %hasseq = ();
    my %hasseqtype = ();
    my %seqtypeh = ();
    while (<F>) {
        next if (/^\!/);
        my @vals = split;
        #my $g = "$idspace:$vals[0]";
        my $g = "$vals[0]";
        if ($g =~ /(\w+):(.*)/) {
            if (lc($1) ne lc($idspace)) {
                flag('db_mismatch_in_gp2protein',"$1 != $idspace");
            }
            $g = $2;
        }
        if (!$geneh{$g}) {
            # no assoc - ok
        }
        if (!$vals[1]) {
            $vals[1] = "NULL:";
        }
        my @seqs = split(/;/,$vals[1]);
        if (!@seqs) {
            @seqs = ("NULL:");
        }
        my @seqtypes = map {/([\w\-]+):/;$1} @seqs;
        $hasseq{$g} = 1;
        $hasseqtype{$_}++ foreach @seqtypes;
        $seqtypeh{$_}->{$g} = 1 foreach @seqtypes;
        if (!$geneh{$g}) {
            # no assoc - ok
        }
    }
    close(F);

    foreach my $g (keys %geneh) {
        my $t = $geneh{$g};
        if (!$hasseq{$g} && ($t eq 'gene' || $t eq 'protein')) {
            flag('noseq',"not found in gp2protein: $g");
        }
    }

    my $num_genes_with_seqs = keys %hasseq;
    print "=== $db ===\n\n";
    printf "* num_entities_annotated: %d\n", scalar(keys %geneh);
    foreach my $t (keys %geneh) {
        #printf "** Entity_type: $t %d\n", scalar (keys %{$geneh{$t}});
    }
    printf "* num_genes_with_seqs: %d\n", $num_genes_with_seqs;
    foreach my $st (keys %seqtypeh) {
        printf "** Seq_type: $st %d %d %\n", scalar (keys %{$seqtypeh{$st}}), (scalar (keys %{$seqtypeh{$st}}) / $num_genes_with_seqs) * 100;
    }
    print "\n";
    foreach my $t (keys %err_count_by_type) {
        printf "** Flag: $t: %d\n", $err_count_by_type{$t};
    }
    print "\n\n";
}

exit(0);

sub flag {
    my $type = shift;
    my $msg = shift;
    $err_count_by_type{$type}++;
    if ($errh{$msg}) {
        return;
    }
    $errh{$msg} = 1;
    printf STDERR "$type\t$msg\n";
}

sub scriptname {
    my @p = split(/\//,$0);
    pop @p;
}


sub usage {
    my $sn = scriptname();

    print <<EOM;
    $sn FILE [FILE2..]

        This file filters certain content from the full/extended GO obo file
        to generate the GO basic file.

        At this time this includes the following steps:

        1. filters inter-ontology link
        2. filters intra-molecular_function ontology regulates links

        As of Feb 9 2009, the full GO will include regulation links that span
        ontologies (BP to MF), and regulation links within MF. This script
        will filter out these additions.

        Note that the actual behavior of this script may change with time, as
        we add more advanced content to the full GO.

        The filtered ontology is written on STDOUT

        A report is written on STDERR

EOM
}

