#!/usr/bin/perl -w
#
# By Jennifer I Clark
# 6th July 2005	
#
# This script checks definition dbxrefs in the gene_ontology_edit.obo file.
# 1) It checks them to see that the prefix e.g. 'GO' in GO:jic
# is listed in the GO.xrf_abbs file.
# 2) It looks at all dbxrefs whose prefix is listed in GO.curator_dbxrefs and
# whose part after the colon contains no digits (so numeric IDs such as
# RESID:AA35278 are not examined) and checks that they are listed in the
# GO.curator_dbxrefs file. e.g. if it finds 'GO:jc' instead of 'GO:jic' then
# it reports an error.
#
# The script can run from its current location in the go/software/utilities folder. 
#


use strict;

# File-scoped working variables shared by the four parts of the script.

# Per-line buffers, one for each file-reading loop.
my ($line, $line2, $line3, $line4);

# Values captured from the two reference files.
my ($reference_database_abbrev, $reference_curator_dbxref);
my $database_abbrev_from_GO_curator_dbxrefs;
my $hash_of_database_abbrev_from_GO_curator_dbxrefs;   # scalar; distinct from the hash of the same name

# Values taken from the live ontology file.
my @live_dbxrefs;
my ($live_dbxref, $live_prefix);
my ($live_dbxrefs_non_numerical_endings, $live_prefix_non_numerical_endings);

# Lookup tables; keys are the accepted strings and every stored value is 1.
my %hash_of_database_abbrev                         = ();
my %hash_of_curator_database_abbrev                 = ();
my %hash_of_curator_database_full                   = ();
my %hash_of_database_abbrev_from_GO_curator_dbxrefs;

# Key/value pair for the commented-out hash-dumping loops further down.
my ($k, $v);


#-----------------------------------------------------------------------------------------
# Part 1
#
#This first bit gets all the database abbreviations and synonyms out of the file 
#GO.xrf_abbs and stores them in a hash.
#
# If you want to check that synonyms are not used in the live file then comment out the 
# second 'if' statement.
#
#

# Load every 'abbreviation:' and 'synonym:' value from GO.xrf_abbs into
# %hash_of_database_abbrev (key => 1), together with a fixed list of extra
# prefixes that are tolerated in the ontology file.
open(my $xrf_abbs_fh, '<', '../../doc/GO.xrf_abbs')
    or die "Can't open GO.xrf_abbs: $!\n";
print "GO.xrf_abbs is being processed.\n";

# These below are the error dbxref prefixes that need to be removed from the GO file
# or added to the GO.xrf_abbs file.
# These lines can be removed once the files are fixed.
# The lines for http, ISSN etc. should stay as they do not belong in GO.xrf_abbs.
#
# They are registered once, before the file is read, so they are present even
# if GO.xrf_abbs turns out to be empty and are not re-assigned for every line.
$hash_of_database_abbrev{"http"}    = 1;
$hash_of_database_abbrev{""}        = 1;
$hash_of_database_abbrev{"TRAIT"}   = 1;    # Addition to GO.xrf_abbs requested.
$hash_of_database_abbrev{"PubChem"} = 1;    # Addition to GO.xrf_abbs requested.
$hash_of_database_abbrev{"CGN"}     = 1;    # CGN is the acronym of Cognia which
                                            # is a company so not in GO.xrf_abbs.
$hash_of_database_abbrev{"AZ"}      = 1;    # AZ is the acronym of AstraZeneca which
                                            # is a company so not in GO.xrf_abbs.
$hash_of_database_abbrev{"GOC"}     = 1;

while (my $xrf_line = <$xrf_abbs_fh>) {
    # Strip the line ending and any trailing blanks (also copes with CRLF
    # files) so that the captured abbreviation is exactly the key to look up.
    $xrf_line =~ s/\s+\z//;

    if ($xrf_line =~ m/abbreviation: (.*)/) {
        $hash_of_database_abbrev{$1} = 1;
    }

    # Second 'if': comment this one out to stop synonyms being accepted.
    if ($xrf_line =~ m/synonym: (.*)/) {
        $hash_of_database_abbrev{$1} = 1;
    }
}

close $xrf_abbs_fh;
print "GO.xrf_abbs hash has been created.\n";

#Uncomment the code below to print the contents of this hash. 

#while ( ($k,$v) = each %hash_of_database_abbrev ) {
#    print "$k => $v\n";
#}


#-----------------------------------------------------------------------------------------
# Part 2
#
#This next bit gets all the database abbreviations out of the file 
#GO.curator_dbxrefs and stores them in a hash.

# Collect the database prefix (the text before the first colon) of every line
# of GO.curator_dbxrefs into %hash_of_database_abbrev_from_GO_curator_dbxrefs.
open(my $curator_prefix_fh, '<', '../../doc/GO.curator_dbxrefs')
    or die "Can't open GO.curator_dbxrefs: $!\n";
print "GO.curator_dbxrefs is being processed.\n";

while (my $curator_line = <$curator_prefix_fh>) {
    chomp $curator_line;

    # Only lines that actually contain a colon contribute a prefix.  The
    # negated class stops at the FIRST colon, so a later colon elsewhere on
    # the line cannot be swallowed into the prefix.
    next unless $curator_line =~ m/^([^:]*):/;

    $hash_of_database_abbrev_from_GO_curator_dbxrefs{$1} = 1;
}

close $curator_prefix_fh;
print "GO.curator_dbxrefs hash has been created.\n";

#Uncomment the code below to print the contents of this hash. 

#while ( ($k,$v) = each %hash_of_database_abbrev_from_GO_curator_dbxrefs ) {
#    print "$k => $v\n";
#}

# NOTE: the original author observed the first line of the 'GO.curator_dbxrefs'
# file ('Abbreviation	Database	Name') ending up in the hash.  Lines without a
# colon are now skipped instead of re-recording a stale or undefined prefix,
# which was presumably the cause -- confirm against the real file.


#-----------------------------------------------------------------------------------------
# Part 3
#
#This third bit gets all the curator dbxrefs out of the file 
#GO.curator_dbxrefs and stores them in a hash.

# Collect every complete curator dbxref (first tab-delimited field, which must
# contain a colon) from GO.curator_dbxrefs into %hash_of_curator_database_full.
open(my $curator_dbxref_fh, '<', '../../doc/GO.curator_dbxrefs')
    or die "Can't open GO.curator_dbxrefs: $!\n";
print "GO.curator_dbxrefs is being processed.\n";

while (my $curator_line = <$curator_dbxref_fh>) {
    chomp $curator_line;

    # Skip lines that do not start with a 'prefix:name' field followed by a
    # tab; previously such lines re-inserted the preceding dbxref (or an
    # undefined key).  Excluding tabs from the capture keeps it to one field.
    next unless $curator_line =~ m/^([^:\t]*:[^\t]*)\t/;
    my $curator_dbxref = $1;

    $hash_of_curator_database_full{$curator_dbxref} = 1;

    # This line below is added so that if OBO-Edit puts quotes after the dbxref then the script will not object.
    $hash_of_curator_database_full{"$curator_dbxref \"\""} = 1;
}

close $curator_dbxref_fh;

# Curator dbxrefs accepted regardless of the file contents, each with and
# without the empty quotes that OBO-Edit may append.
$hash_of_curator_database_full{"SP:kwdictionary"} = 1;
$hash_of_curator_database_full{"SP:kwdictionary \"\""} = 1;
$hash_of_curator_database_full{"MGI:curators"} = 1;
$hash_of_curator_database_full{"MGI:curators \"\""} = 1;
print "GO.curator_dbxrefs hash has been created.\n";


#Uncomment the code below to print the contents of this hash. 

#while ( ($k,$v) = each %hash_of_curator_database_full ) {
#    print "$k => $v\n";
#}

#-----------------------------------------------------------------------------------------
# Part 4
#
# This last bit checks the live definition dbxrefs in gene_ontology_edit.obo.
#
#

# Check the dbxref list at the end of every 'def:' line in
# gene_ontology_edit.obo and print a complaint for
#   * any prefix that is not a key of %hash_of_database_abbrev, and
#   * any dbxref whose prefix is a curator prefix, whose part after the colon
#     contains no digits, and which is not a key of
#     %hash_of_curator_database_full.
open(my $obo_fh, '<', '../../ontology/gene_ontology_edit.obo')
    or die "Can't open gene_ontology_edit.obo: $!\n";
print "gene_ontology_edit.obo is being processed.\n";

while (my $obo_line = <$obo_fh>) {
    # Remove the line ending and trailing blanks so that a stray CR or space
    # after the closing bracket does not hide the line from the check.
    $obo_line =~ s/\s+\z//;

    # Finds lines with def dbxrefs.
    next unless $obo_line =~ m/^def:.*\[(.*)\]$/;

    # Gets individual dbxrefs.
    my @def_dbxrefs = split m/, */, $1;

    # First pass: every prefix must be listed in GO.xrf_abbs.
    foreach my $dbxref (@def_dbxrefs) {
        # The prefix is the text before the FIRST colon, so URLs with a port
        # or descriptions containing a colon still yield e.g. "http".
        next unless $dbxref =~ m/^([^:]*):/;
        my $prefix = $1;

        if (!exists $hash_of_database_abbrev{$prefix}) {
            # Or complains.
            print "\"$prefix\" is not a good database acronym.\n";
        }
    }

    # Second pass (kept separate so that all acronym complaints for a line are
    # printed before its curator complaints): dbxrefs with no digits after
    # the colon are checked against the curator list.
    foreach my $dbxref (@def_dbxrefs) {
        next unless $dbxref =~ m/^(([^:]*):\D+)$/;
        my ($whole_dbxref, $prefix) = ($1, $2);

        # Finds those that have a prefix listed in GO.curator_dbxrefs.
        next unless exists $hash_of_database_abbrev_from_GO_curator_dbxrefs{$prefix};

        # Checks that the full dbxrefs correspond to a dbxref in GO.curator_dbxrefs.
        if (!exists $hash_of_curator_database_full{$whole_dbxref}) {
            # Or else complains.
            print "\"$whole_dbxref\" is not a good curator dbxref.\n";
        }
    }
}

close $obo_fh;

print "The database abbreviations in the definition dbxrefs of the gene_ontology_edit.obo file have been checked.\n";
print "The curator dbxrefs in the gene_ontology_edit.obo file have been checked.\n";