#!/usr/bin/perl -w
# count the number of terms, defined terms, obsoletes and other such stuff.

use strict;
use Data::Dumper;

## Command line: the OBO file is mandatory; the path to the go/ directory
## is optional and, when omitted, is derived from the OBO file path.
die "script invocation:\n\ncounts.pl <obo_file> <path_to_go_directory>\n\npath_to_go_directory is optional if the obo_file path contains go/ontology\n"
	unless @ARGV;

my $obo_file = shift @ARGV;
die "Missing the required file $obo_file\n"
	unless $obo_file && -f $obo_file;

## $path_to_go ends up either empty (everything is then looked up relative
## to the current directory) or as a directory path ending in a slash.
my $path_to_go = "";
if (! @ARGV)
{	## no directory given: try to work it out from the OBO file path
	if ($obo_file =~ /\/?go\/ontology\//)
	{	## keep everything up to and including the "go/" in front of "ontology"
		$path_to_go = $obo_file;
		$path_to_go =~ s/(.*?\/?go\/)ontology.+/$1/;
	}
	elsif ($obo_file =~ /^ontology\//)
	{	## already in the go/ directory, so the empty path is the right one
	}
	else
	{	## cannot tell where go/ is; carry on with the empty path
	}
}
else
{	## directory given explicitly: make sure it ends in a slash
	$path_to_go = shift @ARGV;
	my $last_char = substr($path_to_go, -1, 1);
	$path_to_go .= "/" unless $last_char eq '/';
}

#print STDERR "path to go: $path_to_go\n";
## make sure we can find the dirs we need
## $path_to_go is either empty or already ends in a slash, so the directory
## name is built by plain concatenation. The same string is used in the error
## message, so the message reports exactly the path that was tested (the old
## message added a second slash, and pointed at /www when the path was empty).
my $www_dir = $path_to_go . 'www';
if (! -d $www_dir)
{	die "Could not find the www directory - was not at $www_dir!";
}

## the template that is read and the HTML file that is written both live in www/
my $template = $path_to_go . "www/counts.tmpl";
my $output = $path_to_go . "www/counts.html";

## make sure we've got the template file
if (! -f $template)
{	die "The file $template could not be found!";
}

## open the template file and read the whole of it into $tmpl_str.
## $tmpl_str starts as an empty string so that an empty template does not
## leave it undefined for the substitutions further down.
my $tmpl_str = '';
{	## three-argument open with a lexical handle: the file name is passed
	## separately from the mode, so it is never parsed for mode characters
	## or stripped of leading/trailing whitespace
	open( my $tmpl_fh, '<', $template ) or die "Could not open template file $template: $!";
	while (my $line = <$tmpl_fh>)
	{	$tmpl_str .= $line;
	}
	close($tmpl_fh);
}

my $data;
## open the OBO file and parse the contents
## Results are collected in $data:
##   TOTAL    - number of [Term] stanzas, obsolete ones included
##   OBS      - number of obsolete terms
##   ns       - hash of namespace => number of non-obsolete terms
##              ('unspecified' when a term has no namespace line)
##   def      - number of non-obsolete terms that have a def: line
##   VERSION  - revision taken from the header's cvs version remark
##   DATE     - date taken from the header
{	## read one blank-line-separated chunk (stanza) per iteration
	local $/ = "\n\n";
	## three-argument open with a lexical handle, so the file name is
	## only ever treated as a file name
	open( my $obo_fh, '<', $obo_file ) or die "Could not open $obo_file: $!";
	print STDERR "opening $obo_file...\n";

	## The pattern is assembled from two pieces, presumably so that the
	## revision keyword never appears literally in this script, where CVS
	## keyword expansion could rewrite it -- TODO confirm.
	## It is compiled once here rather than on every pass through the loop.
	my $regex = 'remark: cvs version: \$' . 'Revision: (.*?) \$';
	my $version_re = qr/$regex$/m;

	my $header;
	while (<$obo_fh>)
	{	#print STDERR "START\n$_\nEND\n";
		## anchored to the start of a line: only a real stanza header
		## counts, not "[Term]" mentioned somewhere in another stanza's text
		if (/^\[Term\]/m)
		{	$data->{TOTAL}++;
			## collect namespace, obsolete, def info
			## obsolete terms are counted and then skipped, so the
			## namespace and def counts cover non-obsolete terms only
			if (/^is_obsolete: true/m)
			{	$data->{OBS}++;
				next;
			}
			if (/^namespace: (.*?)$/m)
			{	$data->{ns}{$1}++;
			}
			else
			{	$data->{ns}{unspecified}++;
			}
			if (/^def: /m)
			{	$data->{def}++;
			}
		}
		elsif (! $header)
		{	## keep looking until a chunk with the cvs version remark turns up
			if ($_ =~ $version_re)
			## ok, this is the header
			{	$header++;
				## save the revision before the next match overwrites $1
				$data->{VERSION} = $1;
				if (/^date: (.*?)$/m)
				{	$data->{DATE} = $1;
				}
			}
		}
	}
	close($obo_fh);
}

#print STDERR "data: " . Dumper($data);

## do the stats
## a file without any terms cannot be summarised
die "No terms were found in the file. How curious!" unless $data->{TOTAL};

## no obsolete terms seen: treat the count as zero
$data->{OBS} ||= 0;

## subtotal = terms that are not obsolete
$data->{subtotal} = $data->{TOTAL} - $data->{OBS};
die "No extant terms found in ontology!" if $data->{subtotal} == 0;

## DEF = percentage (one decimal place) of non-obsolete terms with a
## definition, or a plain 0 when no term has one
$data->{DEF} = $data->{def}
	? sprintf("%.1f", $data->{def} / $data->{subtotal} * 100)
	: 0;

#print STDERR "data: " . Dumper($data) . "\n\n";

## Replace the first occurrence of each placeholder word in the template
## with the corresponding value from $data.
## The qw() list must sit inside parentheses: "foreach my $x qw(...)" without
## them is a syntax error from Perl 5.18 onwards.
foreach my $x (qw(TOTAL OBS DEF VERSION DATE))
{	my $value = $data->{$x};
	## VERSION and DATE are only set when the OBO header was recognised;
	## say so explicitly and leave the placeholder blank, as before
	if (! defined $value)
	{	warn "No value found for $x; leaving it blank in the output.\n";
		$value = '';
	}
	$tmpl_str =~ s/$x/$value/;
}

if ($data->{ns}{unspecified})
{	warn "Found " . $data->{ns}{unspecified} . " terms without a namespace specified.\n";
}

## The template text between LOOP_START and LOOP_END is repeated once per
## namespace (sorted by name), with the first NAMESPACE replaced by the
## namespace name and the first NUM by the number of non-obsolete terms
## counted for it. The result replaces the whole LOOP_START ... LOOP_END section.
if ($tmpl_str =~ /LOOP_START(.*?)LOOP_END/s)
{	my $ns_tmpl = $1;
	my $ns_str = '';
	foreach my $ns (sort keys %{$data->{ns}})
	{	my $temp = $ns_tmpl;
		## fill in NUM first: the count is purely numeric, so it cannot
		## introduce text for the NAMESPACE substitution to hit, whereas a
		## namespace name containing "NUM" could be clobbered if the
		## substitutions ran the other way round
		$temp =~ s/NUM/$data->{ns}{$ns}/;
		$temp =~ s/NAMESPACE/$ns/;
		$ns_str .= $temp;
	}
	$tmpl_str =~ s/LOOP_START.*?LOOP_END/$ns_str/s;
}
else
{	## without the markers there is nowhere to put the namespace counts;
	## say so rather than dropping them silently
	warn "No LOOP_START ... LOOP_END section found in the template; namespace counts were not added.\n";
}

## write the filled-in template to the output file
## (three-argument open with a lexical handle, so $output is only ever
## treated as a file name)
open( my $out_fh, '>', $output ) or die "Could not create output file $output: $!";
print {$out_fh} $tmpl_str or die "Could not write to output file $output: $!";
## errors from buffered writes may only show up at close, so check it too
close($out_fh) or die "Could not close output file $output: $!";

exit(0);
