=head1 NAME

gencode_tracking_system::sources::gtf

=head1 DESCRIPTION

Helper script for the GENCODE tracking system.
Read GTF file, store in internal tracking system db

 example:
   17      secretsource    gene    18107433        18108488        4919    +       .       gene_id gene000000; gene_name "Fpr-rs1"  ;

 The attributes in column 9 are split by ";", the type / value pairs then by " ".
 Can also be used for GFF2 files.
 Can also be used for GFF3 files if the attribute separator $sep is set to "=".
 Expects to find some id in the attributes.

=head1 CONTACT

Felix Kokocinski, fsk@sanger.ac.uk

=head1 COPYRIGHT

Copyright Felix Kokocinski, 2008-2010, 
supported by Wellcome Trust Sanger Institute (UK) 
and National Human Genome Research Institute (USA).

Released under GPL v2.

=cut

package gencode_tracking_system::sources::gtf;

use strict;
use warnings;
use gencode_tracking_system::core;
use gencode_tracking_system::config;
use base 'Exporter';

our @EXPORT = qw( run_update );

#state shared with run_update(); these five are overwritten from its arguments on every call
my ($tracking_dbh, $prepare_hash, $user_id, $category_id, $das_source);
#number of features stored so far; file-scoped, so it is not reset between run_update() calls
my $features    = 0;
#status assigned to every gene / transcript built from the file
my $gene_status = 'new';
#second return value of the most recent store_features() call
my $updated;

#define the input file format
#what fields to use
#only lines whose feature type (column 3) equals $import_type are imported
my $import_type = "gene";
#name of the attribute whose value is used as the object id
my $att_id_type = "gene_id";
#attribute separator between key and value (" " for GTF / GFF2)
my $sep = " ";


#Parse the attribute column (column 9) of a single GTF / GFF line.
#Arguments: the raw attribute string and the complete input line (the latter
#is only used in warnings).
#Returns a list of two strings: the value of the attribute named $att_id_type
#("" if there is none) and all key / value pairs formatted as "key = value\n".
sub _parse_attributes {
  my ($attributes, $line) = @_;

  my $id_to_use = "";
  my $remarks   = "";

  #get attributes, split by ";"
  my @attributes = split(";", $attributes);
  foreach my $attribute (@attributes){
    #remove all surrounding whitespace, not just a single character
    $attribute =~ s/^\s+//;
    $attribute =~ s/\s+$//;
    #whitespace after the final ";" leaves an empty element, nothing to report
    next if($attribute eq "");

    #replace other separators with single space, a bit hacky;
    #\Q..\E makes sure the separator is matched literally
    $attribute =~ s/\Q$sep\E/ / if($sep ne " ");

    #separate into key and value; both stay undefined if the pattern does not match
    my ($att_type, $att_value) = $attribute =~ /^(\w+)?\s(\"?[\w\:\@\.\-\+\s]+\"?)\s?$/;
    if(!(defined($att_type) && defined($att_value))){
      warn "Missing key / value information for \"$attribute\" in \n$line\n";
      next;
    }
    $att_value =~ s/"//g;
    #several blanks between key and value would otherwise end up in the value
    $att_value =~ s/^\s+//;
    $att_value =~ s/\s+$//;

    #decide which value to use as the object ID
    if($att_type eq $att_id_type){
      $id_to_use = $att_value;
    }

    #store all attributes as remarks
    $remarks .= $att_type." = ".$att_value."\n";
  }

  return ($id_to_use, $remarks);
}


#Read the file configured for a source and store every line of type
#$import_type as a gene with one transcript of identical extent.
#Arguments (kept in the file-scoped variables of the same name):
#  $tracking_dbh, $prepare_hash, $user_id, $category_id - passed on unchanged
#                 to store_features()
#  $das_source  - key into %OTHER_SERVERS, which supplies 'file' and,
#                 optionally, 'data_type' (used as biotype, default 'new_genes')
#Dies if no file is configured or the file cannot be opened.
sub run_update {
  ($tracking_dbh, $prepare_hash, $user_id, $category_id, $das_source) = @_;

  #TODO: load dynamically
  my $gene_type   = 'new_genes';
  if(defined($OTHER_SERVERS{$das_source}->{'data_type'})){
    $gene_type = $OTHER_SERVERS{$das_source}->{'data_type'};
  }

  my $file        = $OTHER_SERVERS{$das_source}->{'file'};
  die "No file configured for source $das_source!\n" unless(defined($file));
  print STDERR "Using file $file\n";

  my ($current_gene_id, $current_transcript_id);

  #open data file; three-argument form so the file name can not change the mode
  open(my $in, '<', $file) or die "cant open file $file: $!\n";

  #go through file, read and store data as genes & transcripts
  while (my $line = <$in>){
    chomp $line;

    #skip blank lines and comment / directive lines starting with "#"
    next if($line =~ /^\s*(?:#|$)/);

    #columns: chr, source, type, start, end, score, strand, phase, attributes
    my ($chr, undef, $type, $start, $end, undef, $strand, undef, $attributes) = split("\t", $line);

    #check type
    #store only gene or/and transcript level entries?
    #store genes/transcripts as types given, using ids in attributes? -> requires entire file to be read first

    next unless(defined($type) && $type eq $import_type);

    #a line without the ninth column is truncated and can not provide an id
    if(!defined($attributes)){
      warn "Missing attribute column in \n$line\n";
      next;
    }

    my ($id_to_use, $remarks) = _parse_attributes($attributes, $line);
    if($id_to_use eq ""){
      #the feature is still stored, as before, but the problem is made visible
      warn "No $att_id_type attribute found in \n$line\n";
    }

    $chr =~ s/^chr//;

    my %transcript;

    #build structure for transcript
    $transcript{'chrom'}        = $chr;
    $transcript{'biotype'}      = $gene_type;
    $transcript{'status'}       = $gene_status;
    $transcript{'type'}         = "transcript";
    $transcript{'start'}        = $start;
    $transcript{'end'}          = $end;
    #use the strand given in the file instead of always storing '+';
    #unstranded or unknown entries (e.g. ".") keep the previous default '+'
    $transcript{'strand'}       = ($strand eq '-') ? '-' : '+';
    $transcript{'id'}           = $id_to_use;
    $transcript{'description'}  = $remarks;

    #clone gene from transcript
    my %gene = %transcript;
    #set the type before printing so the verbose output shows what is stored
    $gene{'type'} = "gene";

    #store gene
    if($VERBOSE){
      print STDERR "GENE:\n";
      print_element(\%gene);
    }
    ($current_gene_id, $updated) = store_features($tracking_dbh, $prepare_hash, \%gene, 'gene',
                                                  0, 0, $user_id, $category_id);

    #store transcript
    if($VERBOSE){
      print STDERR "TRANSCRIPT:\n";
      print_element(\%transcript);
    }
    ($current_transcript_id, $updated) = store_features($tracking_dbh, $prepare_hash, \%transcript, 'transcript',
                                                        $current_gene_id, 0, $user_id, $category_id);

    $features++;

    #last if($features > 5); #for tests
  }

  close($in) or warn "Problem closing file $file: $!\n";

  print STDERR "Have ".$features." features.\n" if($VERBOSE);
}


1;

