#!/usr/bin/perl
#
# Gene association count report.
#
# For every gene association file listed in @order (plus any new file found
# on disk) this script counts annotations and gene products, writes the
# figures to two temporary report files (filtered and unfiltered/submission),
# and, if the cksum of the filtered report differs from the published copy,
# moves both reports into DOCDIR, refreshes the date stamp file and runs a
# CVS commit.  Progress and errors are appended to LOGFILE.
#
# NOTE(review): the output files are named *.html and the comments talk about
# an "HTML table", yet the string literals below contain no markup, and the
# constants READMEURL, DOWNLOADURL, HTTPDOWNLOAD and the %readme hash are not
# referenced anywhere.  This copy of the script looks like it has lost
# everything between angle brackets -- confirm against the canonical version
# before deploying.
#

use strict;
use warnings;
use File::Copy;

use constant PCOUNTFILE => "/var/tmp/tmp_pcount.$$";
use constant PUNFCOUNTFILE => "/var/tmp/tmp_punfcount.$$";
use constant LOGFILE => '/share/go/logs/GO-counts.log';
use constant DOCDIR => '/share/ftp/go/www/';
use constant PROJECT_COUNT_FILE => 'project-filtered-nums.html';
use constant PROJECT_UNFILTER_COUNT_FILE => 'project-unfiltered-nums.html';
use constant PROJECT_DATESTAMP => 'project-datestamp.html';
use constant ASSOCIATIONSDIR => '/share/ftp/go/gene-associations';
use constant SUBMISSIONDIR => '/share/ftp/go/gene-associations/submission';
use constant READMEURL => 'gene-associations/readme/';
use constant DOWNLOADURL => 'http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/';
use constant HTTPDOWNLOAD => 'gene-associations/';

# Abbreviated month names indexed by the 0-based month from localtime().
my @mon2month = (
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
);

#
# Main tables for the report.  Information matching file name with its
# source and description.
#
# THIS DEFINES THE ORDER OF THE FILES IN THE OUTPUT.
# Please alphabetize using the genus species name, not the file name.
# Entries prefixed with "sub/" are read from SUBMISSIONDIR and reported in
# the unfiltered file; all others are read from ASSOCIATIONSDIR.
#
my @order = (
    'PAMGO_Atumefaciens.gz',
    'goa_arabidopsis.gz',
    'tair.gz',
    'aspgd.gz',
    'jcvi.gz',
    'goa_cow.gz',
    'wb.gz',
    'cgd.gz',
    'goa_dog.gz',
    'goa_zebrafish.gz',
    'zfin.gz',
    'PAMGO_Ddadantii.gz',
    'dictyBase.gz',
    'fb.gz',
    'ecocyc.gz',
    'goa_chicken.gz',
    'gonuts.gz',
    'GeneDB_tsetse.gz',
    'goa_human.gz',
    'GeneDB_Lmajor.gz',
    'PAMGO_Mgrisea.gz',
    'goa_mouse.gz',
    'mgi.gz',
    'PAMGO_Oomycetes.gz',
    'gramene_oryza.gz',
    'goa_pdb.gz',
    'GeneDB_Pfalciparum.gz',
    'pseudocap.gz',
    'goa_rat.gz',
    'rgd.gz',
    'reactome.gz',
    'sgd.gz',
    'GeneDB_Spombe.gz',
    'pombase.gz',
    'sgn.gz',
    'goa_pig.gz',
    'GeneDB_Tbrucei.gz',
    'goa_uniprot_noiea.gz',
    'sub/goa_pdb.gz',
    'sub/reactome.gz',
    'sub/goa_uniprot.gz',
    'sub/goa_arabidopsis.gz',
    'sub/goa_zebrafish.gz',
    'sub/goa_mouse.gz',
    'sub/goa_rat.gz',
    'sub/goa_dog.gz',
    'sub/goa_pig.gz',
);

# Description printed for each file (keyed by file name without "sub/").
my %species = (
    'PAMGO_Atumefaciens.gz'  => "Agrobacterium tumefaciensstr. C58\nPAMGO",
    'goa_arabidopsis.gz'     => "Arabidopsis thaliana\nGO Annotations \@ EBI",
    'tair.gz'                => "Arabidopsis thaliana\nTAIR",
    'aspgd.gz'               => "Aspergillus nidulans\nAspGD",
    'goa_cow.gz'             => "Bos taurus\nGO Annotations \@ EBI",
    'wb.gz'                  => "Caenorhabditis elegans\nWormBase",
    'cgd.gz'                 => "Candida albicans\nCGD",
    'goa_dog.gz'             => "Canis lupus familiaris\nGO Annotations \@ EBI",
    'goa_zebrafish.gz'       => "Danio rerio\nGO Annotations \@ EBI",
    'zfin.gz'                => "Danio rerio\nZFIN",
    'PAMGO_Ddadantii.gz'     => "Dickeya dadantii\nPAMGO",
    'dictyBase.gz'           => "Dictyostelium discoideum\ndictyBase",
    'fb.gz'                  => "Drosophila melanogaster\nFlyBase",
    'ecocyc.gz'              => "Escherichia coli\nPortEco",
    'goa_chicken.gz'         => "Gallus gallus\nGO Annotations \@ EBI",
    'gonuts.gz'              => "Gene Ontology Normal Usage Tracking System (GONUTS)",
    'GeneDB_tsetse.gz'       => "Glossina morsitans\nSanger GeneDB",
    'goa_human.gz'           => "Homo sapiens\nGO Annotations \@ EBI",
    'GeneDB_Lmajor.gz'       => "Leishmania major\nSanger GeneDB",
    'PAMGO_Mgrisea.gz'       => "Magnaporthe grisea\nPAMGO",
    'goa_mouse.gz'           => "Mus musculus\nGO Annotations \@ EBI",
    'mgi.gz'                 => "Mus musculus\nMGI",
    'PAMGO_Oomycetes.gz'     => "Oomycetes\nPAMGO",
    'gramene_oryza.gz'       => "Oryza sativa\nGramene",
    'goa_pdb.gz'             => "Protein Data Bank [multispecies]\nGO Annotations \@ EBI",
    'GeneDB_Pfalciparum.gz'  => "Plasmodium falciparum\nSanger GeneDB",
    'pseudocap.gz'           => "Pseudomonas aeruginosa PAO1\nPseudoCAP",
    'goa_rat.gz'             => "Rattus norvegicus\nGO Annotations \@ EBI",
    'rgd.gz'                 => "Rattus norvegicus\nRGD",
    'reactome.gz'            => "Reactome [multispecies]\nCSHL & EBI",
    'jcvi.gz'                => "Comprehensive Microbial Resource [multispecies]\nJCVI",
    'sgd.gz'                 => "Saccharomyces cerevisiae\nSGD\nStanford University",
    'GeneDB_Spombe.gz'       => "Schizosaccharomyces pombe\nPomBase\nUniversity of Cambridge, UK",
    'pombase.gz'             => "Schizosaccharomyces pombe\nPomBase\nUniversity of Cambridge, UK",
    'sgn.gz'                 => "Solanaceae\nSGN",
    'goa_pig.gz'             => "Sus scrofa\nGO Annotations \@ EBI",
    'GeneDB_Tbrucei.gz'      => "Trypanosoma brucei\nSanger GeneDB",
    'goa_uniprot_noiea.gz'   => "UniProt [multispecies]\nIEA annotations have been removed\nGO Annotations \@ EBI",
    'goa_uniprot.gz'         => "UniProt [multispecies]\nGO Annotations \@ EBI",
);

# README file name for each association file.  Not referenced by the code
# visible in this copy of the script (see NOTE(review) at the top).
my %readme = (
    'aspgd.gz'               => 'aspgd.README',
    'cgd.gz'                 => 'cgd.README',
    'dictyBase.gz'           => 'dictyBase.README',
    'fb.gz'                  => 'fb.README',
    'ecocyc.gz'              => 'EcoCyc.README',
    'GeneDB_Lmajor.gz'       => 'GeneDB_Lmajor.README',
    'GeneDB_Pfalciparum.gz'  => 'GeneDB_Pfalciparum.README',
    'GeneDB_Spombe.gz'       => 'pombase.README',
    'pombase.gz'             => 'pombase.README',
    'GeneDB_Tbrucei.gz'      => 'GeneDB_Tbrucei.README',
    'GeneDB_tsetse.gz'       => 'GeneDB_tsetse.README',
    'goa_arabidopsis.gz'     => 'goa.README',
    'goa_chicken.gz'         => 'goa.README',
    'goa_cow.gz'             => 'goa.README',
    'goa_human.gz'           => 'goa.README',
    'goa_mouse.gz'           => 'goa.README',
    'goa_pdb.gz'             => 'goa_pdb.README',
    'goa_dog.gz'             => 'goa_pdb.README',
    'goa_pig.gz'             => 'goa_pdb.README',
    'goa_rat.gz'             => 'goa.README',
    'goa_uniprot.gz'         => 'goa.README',
    'goa_uniprot_noiea.gz'   => 'goa.README',
    'goa_zebrafish.gz'       => 'goa.README',
    'gramene_oryza.gz'       => 'gramene_oryza.README',
    'mgi.gz'                 => 'mgi.README',
    'PAMGO_Atumefaciens.gz'  => 'PAMGO_Atumefaciens.README',
    'PAMGO_Ddadantii.gz'     => 'PAMGO_Ddadantii.README',
    'PAMGO_Oomycetes.gz'     => 'PAMGO_Oomycetes.README',
    'PAMGO_Mgrisea.gz'       => 'PAMGO_Mgrisea.README',
    'pseudocap.gz'           => 'PseudoCAP.README',
    'rgd.gz'                 => 'rgd.README',
    'reactome.gz'            => 'reactome.README',
    'sgd.gz'                 => 'sgd.README',
    'tair.gz'                => 'tair.README',
    'jcvi.gz'                => 'jcvi_prokaryotic.README',
    'sgn.gz'                 => 'sgn.README',
    'wb.gz'                  => 'WormBase.README',
    'zfin.gz'                => 'zfin.README',
);

open(LOG, '>>', LOGFILE) or logdie ("Cannot open " . LOGFILE . "\n");

# Make LOG the default output handle so that plain print() goes to the log,
# and turn on autoflush for it.
select (LOG);
$| = 1;

print "\n-----------------------------\n";
print scalar localtime, "\tGO association counts\n";
print "-----------------------------\n";

# Full paths of the gene_association files found on disk; filled by dodir().
# (Was initialised with a bogus single element 0.)
my @filelist = ();

chdir (ASSOCIATIONSDIR)
    or logdie ("FATAL: cannot chdir to " . ASSOCIATIONSDIR . ": $!\n");

dodir(ASSOCIATIONSDIR, 0);

#
# Detect if a new gene_association file has been added to the repository.
# If yes, then add it to the order array and print a warning.
# The statistics for this file will be included at the end of the report;
# the description will just be the filename.
#
my $assocdir = ASSOCIATIONSDIR;
my %known    = map { $_ => 1 } @order;

foreach my $path (@filelist) {
    next unless $path =~ m{/gene_association\.};

    # Reduce ".../gene-associations/gene_association.foo.gz" to "foo.gz".
    (my $name = $path) =~ s{\A\Q$assocdir\E/gene_association\.}{};

    next if $known{$name};
    next if $name eq "goa_uniprot.gz";

    push(@order, $name);
    $known{$name} = 1;

    # print goes to the log file, warn goes to STDERR (the cron user)
    print "Found new file: $name\n";
    warn "Found new file: $name\n";
}

# output files, initially written to /var/tmp/
open(POUT, '>', PCOUNTFILE) or logdie ("Cannot open " . PCOUNTFILE . "\n");
open(PUNFOUT, '>', PUNFCOUNTFILE) or logdie ("Cannot open " . PUNFCOUNTFILE . "\n");

# output stats in the order defined in the order array
foreach my $entry (@order) {

    # Work on a copy so that @order itself is not rewritten, and only treat
    # a leading "sub/" as the submission marker (a bare /sub/ match would
    # also catch any file name that merely contains "sub").
    my $sfile        = $entry;
    my $filteredfile = 1;
    my $file;

    if ($sfile =~ s{\Asub/}{}) {
        $file         = SUBMISSIONDIR . "/gene_association." . $sfile;
        $filteredfile = 0;
    }
    else {
        $file = ASSOCIATIONSDIR . "/gene_association." . $sfile;
    }

    # stat() indices: 7 = size in bytes, 9 = mtime
    my ($bytes, $mtime) = (stat $file)[7, 9];

    unless (defined $bytes) {
        # A missing file produces no report row (its counts would be zero),
        # so skip it here instead of running gzcat on a nonexistent path.
        print "Cannot stat $file: $!\n";
        next;
    }

    my $filesize = format_filesize($bytes);

    my ($mday, $mon, $year) = (localtime($mtime))[3, 4, 5];
    $mon  += 1;
    $year += 1900;
    my $filemoddate = "$mon/$mday/$year";

    # List-form pipe open: the file name never passes through a shell.
    open(GA, '-|', '/usr/bin/gzcat', $file) or logdie ("Cannot gzcat $file\n");

    # total annotations, non-IEA annotations, distinct gene products
    my ($total, $noniea, $gene_products) = associationcounts();

    close GA;

    # Disabled: lookup of the CVS repository revision of $file.
    #
    # open (CVSREV, "/usr/local/bin/cvs -d /share/go/cvs status $file |") || &logdie ("Cannot determine status of $file: $!\n");
    #
    # while (<CVSREV>) {
    #     chomp;
    #
    #     if ( /\sRepository revision:\s+(\d+\.\d+)\s/ ) {
    #         $filerevision = $1;
    #     }
    # }
    #
    # close (CVSREV);

    # name should be defined; fall back to the file name
    unless (defined($species{$sfile})) {
        $species{$sfile} = $sfile;

        # print goes to the log file, warn goes to STDERR (the cron user)
        print "Found $sfile, no description for this file available.\n";
        warn "Found $sfile, no description for this file available.\n";
    }

    # Skip files with no annotations.  This includes
    # goa_arabidopsis, goa_rat, goa_mouse, GeneDB_Spombe, and several others
    next if $gene_products == 0;

    if ($filteredfile) {
        # filtered files are only reported with more than 9 gene products
        if ($gene_products > 9) {
            print POUT "$species{$sfile}\n";
            print POUT "$gene_products\n";
            print POUT "$total\n($noniea non-IEA)\n";
            print POUT "$filemoddate\n";
            print POUT "\n\n\n";
        }
    }
    else {
        print PUNFOUT "$species{$sfile}\n";
        print PUNFOUT "$gene_products\n";
        print PUNFOUT "$total\n($noniea non-IEA)\n";
        print PUNFOUT "$filemoddate\n";
        print PUNFOUT "\n\nannotations [$filesize]\nREADME\n\n\n";
    }
}

# Buffered write errors only surface at close; do not publish a short file.
close(POUT)    or logdie ("Cannot close " . PCOUNTFILE . ": $!\n");
close(PUNFOUT) or logdie ("Cannot close " . PUNFCOUNTFILE . ": $!\n");

#
# Check whether the project counts changed by comparing cksum output for the
# new filtered report and the published one; if they differ, update.
# NOTE(review): only the filtered report is compared -- a change that affects
# just the unfiltered report does not trigger an update.
#
open(SUMOUT, '-|', '/usr/bin/cksum', PCOUNTFILE, DOCDIR . PROJECT_COUNT_FILE)
    or logdie ("cksum failed: $!\n");

my $newsum   = 0;
my $cachesum = 0;

# cksum prints "<checksum> <size> <file>" per file
while (my $line = <SUMOUT>) {
    chomp $line;
    my ($sum, undef, $name) = split(/\s+/, $line);
    next unless defined $name;

    if ($name eq PCOUNTFILE) {
        $newsum = $sum;
    }
    elsif ($name eq DOCDIR . PROJECT_COUNT_FILE) {
        $cachesum = $sum;
    }
}

# cksum exits non-zero when the published file does not exist yet; that is
# not an error here ($cachesum stays 0), so the close status is ignored.
close(SUMOUT);

if ($newsum != $cachesum) {

    # clone LOG filehandle to STDOUT and STDERR to capture output from cvs commit
    open(STDOUT, '>&', \*LOG) or logdie ("Count not dup LOG filehandle: $!\n");
    open(STDERR, '>&', \*LOG) or logdie ("Count not dup LOG filehandle: $!\n");

    # tmp files moved to production directory, a sandbox of the GO CVS
    move(PCOUNTFILE, DOCDIR . PROJECT_COUNT_FILE)
        or logdie ("Cannot move " . PROJECT_COUNT_FILE . " into place in " . DOCDIR . "\n");
    move(PUNFCOUNTFILE, DOCDIR . PROJECT_UNFILTER_COUNT_FILE)
        or logdie ("Cannot move " . PROJECT_UNFILTER_COUNT_FILE . " into place in " . DOCDIR . "\n");

    # Build the commit message "YYYY/Mon/D"; localtime() years count from
    # 1900 and months are 0-based indices into @mon2month.
    my ($mon, $day, $year) = (localtime())[4,3,5];
    $year += 1900;
    my $month = $mon2month[$mon];
    my $dstr  = "$year/$month/$day";

    # overwrite the date stamp file with the current date
    open(DATE, '-|', '/bin/date', '+%B %e, %Y')
        or logdie ("Cannot run date command: $!\n");
    open(DATEFILE, '>', DOCDIR . PROJECT_DATESTAMP)
        or logdie ("Cannot open project-datestamp.html: $!\n");

    while (my $line = <DATE>) {
        chomp $line;
        print DATEFILE "$line\n";
    }

    close(DATEFILE) or logdie ("Cannot close project-datestamp.html: $!\n");
    close(DATE);

    # commit files
    my $cmd = "cd " . DOCDIR
        . " && /usr/local/bin/cvs -d /share/go/cvs commit -m $dstr "
        . PROJECT_COUNT_FILE . " "
        . PROJECT_UNFILTER_COUNT_FILE . " "
        . DOCDIR . PROJECT_DATESTAMP;

    my $status = system($cmd);

    # $! is only meaningful when system() returns -1 (command could not be
    # started); otherwise the child's exit information is in $?.
    if ($status == -1) {
        logdie ("CVS commit failed: could not run command: $!\n");
    }
    elsif ($status != 0) {
        logdie (sprintf("CVS commit failed: exit status %d (wait status %d)\n",
                        $status >> 8, $status));
    }
}
else {
    print "No change needed. \nFile not changed and CVS commit not run.\n";

    # no changes -- remove tmp files
    unlink(PCOUNTFILE, PUNFCOUNTFILE);
}

close LOG;

exit 0;

#
# dodir($dir, $nlink)
#
# Recursively collect gene_association files below $dir, pushing the path of
# every regular file whose name contains "gene_association" onto the
# file-level @filelist.
#
#   $dir   - path of the directory to scan
#   $nlink - accepted for backward compatibility and ignored.  The earlier
#            implementation used the directory link count to decide whether
#            subdirectories existed and chdir'ed into each one; its chdir
#            error check never fired because of "||" precedence.  Entries
#            are now tested by full path, so no chdir is needed.
#
# Symbolic links, any entry whose name contains "submission", and ".svn"
# directories are skipped.  Dies via logdie() if a directory cannot be read.
#
sub dodir {
    my ($dir, $nlink) = @_;

    # Read all names first so DIR is closed before we recurse and reuse it.
    opendir(DIR, $dir) or logdie ("Cannot open \"$dir\" for reading: $!\n");
    my @filenames = readdir(DIR);
    closedir(DIR);

    for my $name (@filenames) {
        next if $name eq '.' || $name eq '..';

        my $path = "$dir/$name";

        next if -l $path;
        next if $name =~ /submission/;

        # The old test matched "/.svn/" against a bare entry name, which can
        # never contain a slash; compare the name itself.
        next if $name eq '.svn';

        if (-d $path) {
            dodir($path, 0);
        }
        elsif ($name =~ /gene_association/ && -f $path) {
            push (@filelist, $path);
        }
    }

    return;
}

#
# associationcounts()
#
# Counts from one gene associations file, read from the already-open GA
# filehandle until end of file.
#
# Lines starting with "!" and lines with fewer than nine tab-separated
# columns are ignored.  Returns a three-element list:
#   - number of annotation lines,
#   - number of those whose 7th column is not 'IEA',
#   - number of distinct values seen in the 2nd column (gene products).
#
sub associationcounts {
    my $tcount       = 0;
    my $noniea_count = 0;
    my %seen_gp      = ();

    # Read into a lexical so the caller's $_ is left alone.
    while (my $line = <GA>) {
        chomp $line;

        next if $line =~ /^\!/;

        my @cols = split(/\t/, $line);

        # simple check of an annotation: the aspect column is present
        next unless defined($cols[8]);

        $noniea_count++ if $cols[6] ne 'IEA';
        $tcount++;
        $seen_gp{ $cols[1] } = 1;
    }

    return ($tcount, $noniea_count, scalar keys %seen_gp);
}

#
# format_filesize($bytes)
#
# Returns the size as a string in the largest fitting unit: "N b" below
# 1024 bytes, otherwise one decimal place followed by " kb", " mb" or " gb"
# (powers of 1024).
#
sub format_filesize {
    my ($bytes) = @_;

    return "$bytes b" if $bytes < 1024;
    return sprintf("%.1f", $bytes / 1024) . " kb" if $bytes < 1048576;
    return sprintf("%.1f", $bytes / 1024 / 1024) . " mb" if $bytes < 1073741824;
    return sprintf("%.1f", $bytes / 1024 / 1024 / 1024) . " gb";
}

#
# logdie($message)
#
# Print $message to the currently selected output handle (the log file once
# LOG has been selected) and then die with the same message on STDERR.
#
sub logdie {
    my ($message) = @_;

    print $message;
    die $message;
}