#!/usr/bin/perl use strict; my @dbs; if ($ARGV[0] && $ARGV[0] =~ /^\-.+/) { my $opt = shift @ARGV; if ($opt eq '-h' || $opt eq '--help') { usage(); exit 0; } elsif ($opt eq '--db') { push(@dbs, shift); } elsif ($opt eq '--refg') { @dbs = qw(chicken dictybase ecocyc fb genedb_spombe human mgi rgd sgd tair wb zfin); } else { print STDERR "Unknown option: $opt\n"; exit 1; } } if (!@dbs) { @dbs = @ARGV; } if (!@dbs) { print usage(); exit 1; } my %errh = (); my %err_count_by_type = (); foreach my $db (@dbs) { if ($db =~ /gp2protein\.(\S+)\.gz/) { $db = $1; } my $idspace = $db; %err_count_by_type = (); my $gp2proteinf = "gp2protein.$db.gz"; my $assocf = "../gene-associations/gene_association.$db.gz"; if ($db eq 'uniprot') { $idspace = 'UniProtKB'; $assocf = "../gene-associations/gene_association.goa_uniprot.gz"; } if ($db eq 'human') { $idspace = 'UniProtKB'; $assocf = "../gene-associations/gene_association.goa_human.gz"; } if ($db eq 'chicken') { $idspace = 'UniProtKB'; $assocf = "../gene-associations/gene_association.goa_chicken.gz"; } # ---------------------------------------- # parse assocs # ---------------------------------------- my %geneh = (); if (!$assocf) { flag('no_file',"no file: $assocf"); next; } open(F,"gzip -dc $assocf |") || die "cannot open $assocf"; while () { next if (/^\!/); my @vals = split; if (lc($vals[0]) ne lc($idspace)) { flag('db_mismatch_in_assocfile',"$vals[0] != $idspace"); } $geneh{$vals[1]} = $vals[11]; } close(F); # ---------------------------------------- # parse gp2f # ---------------------------------------- if (!$gp2proteinf) { flag('no_file',"no file: $gp2proteinf"); next; } open(F,"gzip -dc $gp2proteinf |") || die "cannot open $gp2proteinf"; my %hasseq = (); my %hasseqtype = (); my %seqtypeh = (); while () { next if (/^\!/); my @vals = split; #my $g = "$idspace:$vals[0]"; my $g = "$vals[0]"; if ($g =~ /(\w+):(.*)/) { if (lc($1) ne lc($idspace)) { flag('db_mismatch_in_gp2protein',"$1 != $idspace"); } $g = $2; } if (!$geneh{$g}) { # no assoc - ok } if (!$vals[1]) { $vals[1] = "NULL:"; } my @seqs = split(/;/,$vals[1]); if (!@seqs) { @seqs = ("NULL:"); } my @seqtypes = map {/([\w\-]+):/;$1} @seqs; $hasseq{$g} = 1; $hasseqtype{$_}++ foreach @seqtypes; $seqtypeh{$_}->{$g} = 1 foreach @seqtypes; if (!$geneh{$g}) { # no assoc - ok } } close(F); foreach my $g (keys %geneh) { my $t = $geneh{$g}; if (!$hasseq{$g} && ($t eq 'gene' || $t eq 'protein')) { flag('noseq',"not found in gp2protein: $g"); } } my $num_genes_with_seqs = keys %hasseq; print "=== $db ===\n\n"; printf "* num_entities_annotated: %d\n", scalar(keys %geneh); foreach my $t (keys %geneh) { #printf "** Entity_type: $t %d\n", scalar (keys %{$geneh{$t}}); } printf "* num_genes_with_seqs: %d\n", $num_genes_with_seqs; foreach my $st (keys %seqtypeh) { printf "** Seq_type: $st %d %d %\n", scalar (keys %{$seqtypeh{$st}}), (scalar (keys %{$seqtypeh{$st}}) / $num_genes_with_seqs) * 100; } print "\n"; foreach my $t (keys %err_count_by_type) { printf "** Flag: $t: %d\n", $err_count_by_type{$t}; } print "\n\n"; } exit(0); sub flag { my $type = shift; my $msg = shift; $err_count_by_type{$type}++; if ($errh{$msg}) { return; } $errh{$msg} = 1; printf STDERR "$type\t$msg\n"; } sub scriptname { my @p = split(/\//,$0); pop @p; } sub usage { my $sn = scriptname(); print <