#!/usr/local/ensembl/bin/perl -w

# Helper script for the GENCODE tracking system, to be run nightly.
#
# Connect to the external DAS servers,
# fetch data, store in internal tracking system db
# and recalculate dependant fields in db

#example:
# <FEATURE id="ENSP00000306241.dup.243624" label="ENSP00000306241.dup.243624">
#  <TYPE id="pseudogene_exon" category="transcription">Pseudogene Exon</TYPE>
#  <START>118892</START>
#  <END>119095</END>
#  <METHOD id="yale_pseudopipe">Yale PseudoPipe</METHOD>
#  <ORIENTATION>-</ORIENTATION>
#  <NOTE>Duplicated Pseudogene.  Parent protein: ENSP00000306241</NOTE>
#  <LINK href="http://tables.gersteinlab.org/human36/ENSP00000306241.dup.243624">http://tables.gersteinlab.org/human36/ENSP00000306241.dup.243624</LINK>
# </FEATURE>

package gencode_tracking_system::sources::yale_pseudogenes;

use strict;
use gencode_tracking_system::core;
use gencode_tracking_system::config;
use base 'Exporter';

our @EXPORT = qw( run_update );

sub run_update {
  my ($response, $chrom, $tracking_dbh, $prepare_hash, $user_id, $category_id, $previous_genes) = @_;

  my $current_gene_id;
  my $current_transcript_id;
  my $current_feature_id;
  my $gene_status = 'new';
  my %new_features = ();
  my $prefix = "YALE";

  # go through loci
  while (my ($url, $features) = each %$response) {

    my %genes = ();
    my %transcripts = ();
    my %sub_elements = ();

    if(ref $features eq "ARRAY"){
      print "Received ".scalar @$features." features.\n" if $VERBOSE;

    FEATURES:
      foreach my $feature (@$features) {

	#remove duplicates from overlapping regions
	if(defined $previous_genes and exists $previous_genes->{$feature->{'feature_id'}}){
	  next FEATURES;
	}
	$new_features{$feature->{'feature_id'}} = 1;

	#get notes
	my @notes = @{ $feature->{'note'} };
	my ($gene_type, $parent_id) = split('\. ', $notes[0]);
	#$parent_id =~ s/Parent protein\: //g;

	my %sub_element;
	my $grouphash = $feature->{'group'}->[0];

	#build structure for exons and general items
	#get element type
	my $element_type = lc($feature->{'type'});

	$element_type    =~ m/((intron)|(UTR)|(exon))/g;
	$element_type    = $1 || "exon";

	my $group_type   = $feature->{'feature_id'};
	my $strand       = $feature->{'orientation'};
	my $phase        = ".";
	if($feature->{'phase'}){
	  $phase = $feature->{'phase'};
	}
	elsif($element_type eq "exon"){
	  $phase = "0";
	}
	else{
	  $phase = ".";
	}

	$sub_element{'id'}         = $feature->{'feature_id'};
	$sub_element{'chrom'}      = $chrom;
	$sub_element{'biotype'}    = $prefix."_".$gene_type;
	$sub_element{'status'}     = $gene_status;
	$sub_element{'type'}       = $element_type;
	$sub_element{'start'}      = $feature->{'start'};
	$sub_element{'end'}        = $feature->{'end'};
	$sub_element{'score'}      = ".";
	$sub_element{'strand'}     = $strand;
	$sub_element{'phase'}      = $phase;
	$sub_element{'parent'}     = '';

	if(!exists $transcripts{ $feature->{'feature_id'} }){

	  my %transcript;

	  #build structure for transcript
	  $transcript{'chrom'}        = $chrom;
	  $transcript{'biotype'}      = $gene_type;
	  $transcript{'status'}       = $gene_status;
	  $transcript{'type'}         = "transcript";
	  $transcript{'start'}        = $feature->{'start'};
	  $transcript{'end'}          = $feature->{'end'};
	  $transcript{'strand'}       = $strand;
	  $transcript{'description'}  = $parent_id;
	  $transcript{'id'}           = $feature->{'feature_id'};
	  $transcript{'parent'}       = '';
	  $transcript{'alias'}        = '';

	  $transcripts{ $feature->{'feature_id'} } = \%transcript;
	}
	else{
	  if($feature->{'start'} < $transcripts{ $feature->{'feature_id'} }->{'start'} ){
	    $transcripts{ $feature->{'feature_id'} }->{'start'} = $feature->{'start'};
	  }
	  elsif($feature->{'end'} > $transcripts{ $feature->{'feature_id'} }->{'end'} ){
	    $transcripts{ $feature->{'feature_id'} }->{'end'} = $feature->{'end'};
	  }
	}

	#save entry for exons, etc.
	if(!defined($sub_elements{$feature->{'feature_id'}})){
	  $sub_elements{$feature->{'feature_id'}} = [];
	}
	push(@{ $sub_elements{$feature->{'feature_id'}} }, \%sub_element);
	
      }
      @$features = ();

      foreach my $transcript (keys %transcripts){

	#clone gene from transcript
	my %gene = %{ $transcripts{$transcript} };
	$gene{'type'} = "gene";

	#store gene
	if($VERBOSE){
	  print "GENE:\n";
	  print_element(\%gene);
	}
	$current_gene_id = store_features($tracking_dbh, $prepare_hash, \%gene, 'gene', 0, 0, 
        				  $user_id, $category_id);

	#store transcript
	if($VERBOSE){
	  print "TRANSCRIPT $transcript:\n";
	  print_element($transcripts{$transcript});
	}
	$current_transcript_id = store_features($tracking_dbh, $prepare_hash, $transcripts{$transcript},
						'transcript', $current_gene_id, 0, $user_id, $category_id);

	#store sub-features
	foreach my $sub_element (@{ $sub_elements{$transcripts{$transcript}->{'id'} } }){
	  if($VERBOSE){
	    print "ELEMENT:\n";
	    print_element($sub_element);
	  }
	  $current_feature_id = store_features($tracking_dbh, $prepare_hash, $sub_element, 'subfeature', 
					       $current_gene_id, $current_transcript_id, $user_id, $category_id);
	}

      }

    }
  }

  return \%new_features;
}


1;
