#!/usr/bin/perl -w

=head1 NAME

get_term_info.pl - retrieve term information from a list of GO IDs

=head1 SYNOPSIS

 get_term_info.pl --input /data/cytoscape_output.txt --go_file /go/doc/GO.terms_alt_ids
 --output /my_files/mapping.txt --format ind

=head1 DESCRIPTION

Takes a file of GO IDs (or bare numbers representing GO IDs), one per line, and
outputs the term name and ontology designation for each ID.

=head2 Input parameters

=head3 Required

=over

=item -i || --input /path/to/file_name

input file of GO IDs

=item -g || --go_file /path/to/GO.terms_alt_ids

GO.terms_alt_ids file, available at http://www.geneontology.org/doc/GO.terms_alt_ids

=item -o || --output /path/to/file_name

output file for results

=back

=head3 Optional switches

=over

=item -f || --format

choose a format for your output

options:

 i    display the term ID
 n    display the term name
 d    display the term domain (the ontology or namespace of the term)
 o    display "obs" if the term is obsolete

By default, the output is in the form

ID [tab] name [tab] domain

but it can be configured by entering a string representing the data you want
to show.

E.g.

-f ind   =>  default output, i.e.  ID [tab] name [tab] domain

-f ni    =>  name [tab] ID

-f indo  =>  ID [tab] name [tab] domain [tab] obs

=item -v || --verbose

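prints progress messages to STDERR. Verbose mode can also be switched on by
setting the GO_VERBOSE or DEBUG environment variable.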

=back

=cut

use strict;
use Data::Dumper;
$Data::Dumper::Sortkeys = 1;

# Script entry point: pass the command-line arguments (by reference) to
# run_script(), then finish with a success exit status once it returns.
run_script( \@ARGV );

exit 0;

# Main driver: parse the command line, read the requested GO IDs, look them up
# in the GO.terms_alt_ids file and write the selected columns to the output
# file.
#
# Arguments: a reference to the argument list (normally \@ARGV), which is
#            handed straight to parse_options().
# Returns:   nothing.
# Dies:      if a file cannot be opened, the output cannot be written, or the
#            input file contains no usable IDs.
# Output:    matched terms go to the output file; IDs that match neither a
#            primary nor an alternate ID are listed on STDERR.
sub run_script {

	my $options = parse_options(@_);

	# check verbosity
	if (! defined $options->{verbose})
	{	$options->{verbose} = $ENV{GO_VERBOSE} || 0;
	}

	print STDERR "Parsed options. Now starting script...\n" if $options->{verbose};

	## open up the input and parse the data; only the first number on each
	## line is used, and it is normalised to the GO:nnnnnnn form
	my $input = {};
	open(my $in_fh, '<', $options->{'input'}) or die("Could not open " . $options->{'input'} . "! $!");
	while (my $line = <$in_fh>)
	{	next unless $line =~ /[\w\d]/;
		## pull out the content
		if ($line =~ /\s*g?o?:?0*(\d+)\s*/i)
		{	my $digits = $1;
			if (length($digits) > 7)
			{	warn "GO:$digits is not a valid GO ID (too many digits)";
				next;
			}
			$input->{ "GO:" . sprintf("%07d", $digits) }++;
		}
	}
	close $in_fh;

	if (! keys %$input)
	{	die("No data found in input file. Dying");
	}

	## human-readable names for the domain codes (f, p, c) and for the
	## format codes (i, n, d, o) used in the output header
	my $name_h = {
		'f' => 'molecular function',
		'p' => 'biological process',
		'c' => 'cellular component',
		'obs' => 'obsolete',
		'i' => 'term ID',
		'n' => 'term name',
		'd' => 'ontology',
		'o' => 'obsolete',
	};

	## GO:ID [tab] GO:ID2 GO:ID3 [tab] name [tab] F|P|C [tab] obs
	## i = 0; n = 2; d = 3; o = 4
	my $file_format = { 'i' => '0', 'n' => 2, 'd' => 3, 'o' => 4 };
	my @tr = map { $file_format->{$_} } @{$options->{format}};
	## turn a row (array ref laid out like a GO file line) into an output line
	my $print_id = sub {
		my $row = shift;
		return join("\t", map { $row->[$_] || "" } @tr);
	};

	## open the GO file before creating the output file, so that an unreadable
	## GO file does not leave a truncated output file behind
	open(my $go_fh, '<', $options->{'go_file'}) or die("Could not open " . $options->{'go_file'} . "! $!");
	open(my $out_fh, '>', $options->{'output'}) or die("Could not create " . $options->{'output'} . ": $!");
	print {$out_fh} "! File format:\n! " . join(" [tab] ", map { $name_h->{$_} } @{$options->{'format'}}) . "\n";

	## requested alternate ID => row to print for it
	my $go_data = {};
	while (my $line = <$go_fh>)
	{	next unless $line =~ /\w/;
		next if $line =~ /^!/;
		## strip the line ending (LF or CRLF) so the last field stays clean
		$line =~ s/\r?\n\z//;
		## GO:ID [tab] GO:ID2 GO:ID3 [tab] name [tab] F|P|C [tab] obs
		my @term = split(/\t/, $line);

		## translate the F|P|C code exactly once; translating it again for the
		## alternate-ID rows would look up the already-expanded name and
		## yield an empty domain
		if (defined $term[3])
		{	$term[3] = $name_h->{ lc( $term[3] ) };
		}

		if ($input->{ $term[0] })
		{	print {$out_fh} $print_id->( \@term ) . "\n";
			delete $input->{ $term[0] };
		}

		if (defined $term[1] && $term[1] =~ /GO/)
		{	my ($id, @rest) = @term;
			## only remember the alternate IDs that were actually asked for
			foreach my $alt (split(/\s+/, $rest[0]))
			{	next unless $input->{$alt};
				$go_data->{$alt} = [ $id." ($alt)", @rest ];
			}
		}
	}
	close $go_fh;

	## check if we still have any terms left
	my $warned;
	if (keys %$input)
	{	foreach my $id (sort keys %$input)
		{	if ($go_data->{$id})
			{	if (! $warned)
				{	print {$out_fh} "! The following IDs have been merged into other IDs\n".
					"! Merged ID shown in brackets\n";
					$warned++;
				}
				print {$out_fh} $print_id->( $go_data->{$id} ) . "\n";
				delete $input->{$id};
			}
		}
		if (keys %$input)
		{	print STDERR "No entries found for the following IDs:\n" . join("\n", sort keys %$input) . "\n";
		}
	}

	## buffered write errors only surface when the handle is closed
	close $out_fh or die("Could not write " . $options->{'output'} . ": $!");
	return;
}

# parse the options from the command line
# Consume the leading "-x" / "--xxx" switches from the argument list and
# collect them into an options hash, which check_options() then validates.
#
# Arguments: reference to the argument array; every switch that is read (and
#            its value, if it takes one) is shifted off the array.
# Returns:   the result of check_options() for the collected options.
sub parse_options {
	my $argv = shift;
	my $opt;

	# switches that expect a value => key the value is stored under
	my %value_key_for = (
		'-i' => 'input',    '--input'   => 'input',
		'-g' => 'go_file',  '--go_file' => 'go_file',
		'-o' => 'output',   '--output'  => 'output',
		'-f' => 'f_string', '--format'  => 'f_string',
	);

	while (@$argv && $argv->[0] =~ /^\-/) {
		my $switch = shift @$argv;

		if (exists $value_key_for{$switch}) {
			# the value is only taken if the next argument is not a switch
			next unless @$argv && $argv->[0] !~ /^\-/;
			$opt->{ $value_key_for{$switch} } = shift @$argv;
		}
		elsif ($switch eq '-h' || $switch eq '--help') {
			system("perldoc", $0);
			exit(0);
		}
		elsif ($switch eq '-v' || $switch eq '--verbose') {
			$opt->{verbose} = 1;
		}
		else {
			warn("Error: no such option: $switch. Ignoring");
		}
	}

	return check_options($opt);
}


# process the input params
# Validate the options gathered by parse_options() and fill in the defaults.
#
# Arguments: hash ref of options (undef if no options were supplied).
# Returns:   the same hash ref, with {format} set to an array ref of column
#            codes (i, n, d, o) in the order they first appear in the format
#            string, or to the default i, n, d. {verbose} is forced on when
#            $ENV{DEBUG} is set.
# Dies:      via die_msg() if no options were given, if a required file name
#            is missing, or if the format string has neither 'i' nor 'n'.
# Warns:     about characters in the format string that are not i, n, d or o.
sub check_options {
	my $opt = shift;
	my @errs;

	if (!$opt)
	{	die_msg( "Error: please ensure you have specified an input file, a GO terms file and an output file." );
	}

	if (!$opt->{input})
	{	push @errs, "specify an input file using -i /path/to/<file_name>";
	}

	if (!$opt->{go_file})
	{	push @errs, "specify a GO.terms_alt_ids file using -g /path/to/<file_name>";
	}

	if (!$opt->{output})
	{	push @errs, "specify an output file using -o /path/to/<file_name>";
	}

	if ($opt->{f_string})
	{	if (index($opt->{f_string}, 'i') == -1 && index($opt->{f_string}, 'n') == -1)
		{	push @errs, "please specify either term ID or term name in the output format";
		}

		## keep each recognised code once, in order of first appearance
		my (%seen, @format);
		foreach my $code (split(//, $opt->{f_string}))
		{	if ($code !~ /\A[indo]\z/)
			{	warn("Warning: unrecognised format character '$code'. Ignoring\n");
				next;
			}
			push @format, $code unless $seen{$code}++;
		}
		$opt->{format} = \@format if @format;
	}

	if (! $opt->{format})
	{	## use the default format
		$opt->{format} = [ "i", "n", "d" ];
	}

	if ($ENV{DEBUG})
	{	$opt->{verbose} = 1;
	}

	if (@errs)
	{	die_msg( join("\n", @errs) );
	}

	return $opt;
}

# Abort the script with the given message, followed by a hint on how to reach
# the built-in help.
#
# Arguments: the error message (optional; defaults to an empty string).
# Dies:      always. The hint uses $0 so that it names the script as it was
#            invoked.
sub die_msg {
	my $msg = shift || "";
	die join("\n", $msg, "The help documentation can be accessed with the command\n\t$0 --help\n");
}

=head1 AUTHOR

Amelia Ireland

=cut