#!/usr/local/ensembl/bin/perl -w

# Helper script for the GENCODE tracking system.
#
# Read tab-delimited file, store in internal tracking system db
#
# LOAD ENTRIES FOR CCDS ENTRIES FOUND IN ENSEMBL BUT MISSING IN LOUTRE
#

package gencode_tracking_system::sources::missing;

use strict;
use gencode_tracking_system::core;
use gencode_tracking_system::config;
use base 'Exporter';

# Only run_update is exported from this package.
our @EXPORT    = qw( run_update );

# CCDS ids for which create_entry() has already been called (id => 1).
# Used to skip duplicates and to report the total at the end of run_update().
my %ccds_ids;
# File-level state shared by run_update() and create_entry():
#   $tracking_dbh, $prepare_hash, $user_id, $category_id
#       - passed through to get_project_for_category() / store_features()
#   $current_gene_id - result of get_project_for_category() in run_update()
#                      (create_entry() declares its own lexical of the same name)
#   $upd             - second return value of store_features() in create_entry()
#   $ga              - used in create_entry() as an object providing
#                      fetch_by_transcript_id / fetch_by_transcript_stable_id
# NOTE(review): no assignment to $ga is visible in this file, and $ens_db, $ta
# and $hga are not referenced anywhere else in it - confirm where the Ensembl
# adaptors are meant to be set up.
my ($tracking_dbh, $prepare_hash, $user_id, $category_id, $current_gene_id, $ens_db, $ga, $ta, $upd, $hga);
# Tracking id of the first transcript stored for the current input line.
# run_update() resets it to 0 per line; create_entry() relates every further
# transcript of that line to it via issue_relation().
my ($related_id);

# When true, run_update() stops once more than 20 data lines were processed.
my $test = 0;

# Read the "missing CCDS" report and store one gene/transcript pair per CCDS id
# via create_entry().
#
# Arguments (positional):
#   $tracking_dbh, $prepare_hash, $user_id, $category_id
#     - stored in the file-level variables of the same name, from where they
#       are handed to get_project_for_category() here and to store_features()
#       / issue_relation() in create_entry().
#
# Returns the number of data lines processed.
# Dies if no project is found for the category or the data file cannot be
# opened.
sub run_update {
  # Assign to the file-level variables (not to fresh lexicals) so that
  # create_entry() works with the handles supplied by the caller.
  ($tracking_dbh, $prepare_hash, $user_id, $category_id) = @_;

  #get general project instead:
  $current_gene_id = get_project_for_category($prepare_hash, $tracking_dbh, $category_id);
  die "Can't find general project for category $category_id!\n" unless $current_gene_id;

  #number of data lines handled so far
  my $features = 0;

  # NOTE(review): no input column is assigned to $missing_type and
  # create_entry() never reads the corresponding argument, so it is passed as
  # undef - confirm whether one of the trailing columns was meant to fill it.
  my $missing_type;

  #open data file
  my $file = "/nfs/acari/fsk/2_encode/data/ensembl/missing_CCDS.v53.new.das";
  open(my $in, '<', $file) or die "cant open file $file! ($!)\n";

  #go through file, read and store data as genes & transcripts
  while (my $line = <$in>) {
    chomp $line;
    #ignore non-data lines (data lines start with a digit)
    next unless $line =~ /^\d/;

    #tab-separated columns; some are read only to document the layout
    my ($num, $loc, $int_ensembl_ids, $ccds_ids, $havana_ids, $havana_biotype, $missing_label, $tag2) = split("\t", $line);

    #location is colon-separated; only chromosome, start, end and strand are used
    my ($c, $ass, $chrom, $start, $end, $strand) = split(":", $loc);

    #entries created from the same line get related to the first one
    $related_id = 0;

    if ($ccds_ids =~ /,/) {
      #several CCDS ids on one line: one entry per id
      foreach my $ccds_id (split(",", $ccds_ids)) {
        my $use_id = $int_ensembl_ids;
        my $transcript_id;

        if ($int_ensembl_ids =~ /,/) {
          #several Ensembl ids: pick the one belonging to this CCDS id
          $transcript_id = get_transcript_id_for_ccds_id($int_ensembl_ids, $ccds_id);

          if ($transcript_id) {
            $use_id = $transcript_id;
          }
          else {
            warn ">>>cant find ccds-transcript for $line\n";
          }
        }

        #$transcript_id stays undef when nothing was looked up or found;
        #create_entry() then tries the lookup itself
        create_entry($chrom, $start, $end, $strand, $havana_ids, $use_id, $ccds_id, $missing_type, $transcript_id, $missing_label);
      }
    }
    else {
      create_entry($chrom, $start, $end, $strand, $havana_ids, $int_ensembl_ids, $ccds_ids, $missing_type, $int_ensembl_ids, $missing_label);
    }

    $features++;

    #test mode: stop after a handful of lines
    last if $test and $features > 20;
  }

  close($in);

  print STDERR "WE FOUND ".(scalar keys %ccds_ids)." CCDS IDS.\n";

  return $features;

}


# Store one CCDS id as a gene plus a transcript in the tracking system.
#
# Arguments (positional):
#   $chrom, $start, $end  - location written to both stored features
#   $strand               - ignored; always replaced by get_strand()
#   $havana_ids           - comma-separated ids; the first is passed to
#                           get_transc_id(), all are listed in the description
#   $int_ensembl_ids      - comma-separated Ensembl ids; values starting with
#                           "ENST" are replaced by the stable ids of their genes
#   $ccds_id              - used as feature id and as de-duplication key
#   $missing_type         - not used
#   $etranscript_id       - Ensembl transcript id; looked up from
#                           $int_ensembl_ids when empty
#   $missing_label        - appended to the description as a note
#
# Does nothing except print "SEEN!" when the CCDS id was handled before.
# Uses the file-level $tracking_dbh, $prepare_hash, $user_id, $category_id,
# $ga and $related_id.
# NOTE(review): $ga is not assigned anywhere in the visible file; both gene
# lookups below die on an undefined adaptor until that is set up.
sub create_entry {
  my ($chrom, $start, $end, $strand, $havana_ids, $int_ensembl_ids, $ccds_id, $missing_type, $etranscript_id, $missing_label) = @_;
  #local to this entry; shadows the file-level $current_gene_id
  my ($current_gene_id, $current_transcript_id);

  #keep track of ids
  if (exists $ccds_ids{$ccds_id}) {
    print STDERR "SEEN!\n";
    return;
  }
  $ccds_ids{$ccds_id} = 1;

  #fill in whichever of transcript id / gene ids is missing from the other
  if (!$etranscript_id and $int_ensembl_ids) {
    $etranscript_id = get_transcript_id_for_ccds_id($int_ensembl_ids, $ccds_id);
  }

  if (!$int_ensembl_ids and $etranscript_id) {
    #call the method on the returned gene object, not on an anonymous hash
    #NOTE(review): presumably $etranscript_id is what fetch_by_transcript_id
    #expects - confirm against the adaptor API
    my $gene = $ga->fetch_by_transcript_id($etranscript_id);
    if ($gene) {
      $int_ensembl_ids = $gene->stable_id;
    }
    else {
      warn ">>>cant find gene for transcript $etranscript_id\n";
    }
  }

  #transcript ids given: replace them by the ids of their genes
  if (defined $int_ensembl_ids and $int_ensembl_ids =~ /^ENST/) {
    my %gids;
    foreach my $tid (split(',', $int_ensembl_ids)) {
      my $gene = $ga->fetch_by_transcript_stable_id($tid);
      #skip transcripts without a gene instead of dying on undef
      next unless $gene;
      my $gid = $gene->stable_id;
      $gids{$gid} = 1 if $gid;
    }
    #sorted so the stored description is the same on every run
    $int_ensembl_ids = join(", ", sort keys %gids);
  }

  #the strand from the input is always overridden
  $strand = get_strand($int_ensembl_ids);

  #only the first havana id is used to look up a havana transcript
  $havana_ids = '' unless defined $havana_ids;
  my ($hav_id) = split(',', $havana_ids);
  my $hav_t_id;
  if (!$hav_id) {
    $hav_id   = '--';
    $hav_t_id = '--';
  }
  else {
    $hav_t_id = get_transc_id($hav_id);
  }
  $hav_t_id = '--' unless $hav_t_id;

  #add space after each comma, show the first "." as "--"
  $havana_ids =~ s/,/, /g;
  $havana_ids =~ s/\./\-\-/;

  #missing values appear as empty strings in the description
  foreach my $field ($int_ensembl_ids, $etranscript_id, $missing_label) {
    $field = '' unless defined $field;
  }

  my $description = "CCDS: ".$ccds_id."\nENSEMBL-IDS: ".$int_ensembl_ids.
    "\nENSEMBL-TRANSCRIPT-IDS: ".$etranscript_id.
    "\nHAVANA-GENE-IDS: ".$havana_ids."\nHAVANA-TRANSCRIPT-IDS: ".$hav_t_id.
    "\nNOTE: HAVANA-IDs supplied for orientation only.\nNOTE: ".$missing_label;

  #build structure for transcript
  my %transcript = (
    'chrom'       => $chrom,
    'type'        => 'transcript',
    'start'       => $start,
    'end'         => $end,
    'strand'      => $strand,
    'id'          => "".$ccds_id,
    'description' => $description,
  );

  #clone gene from transcript
  my %gene = (%transcript, 'type' => "gene");

  #store as gene
  if ($VERBOSE) {
    print "GENE:\n";
    print_element(\%gene);
  }
  ($current_gene_id, $upd) = store_features($tracking_dbh, $prepare_hash, \%gene, 'gene',
                                            0, 0, $user_id, $category_id);

  #store as transcript, with the gene just stored as its parent
  $transcript{'parent'} = "".$ccds_id;
  if ($VERBOSE) {
    print "TRANSCRIPT:\n";
    print_element(\%transcript);
  }
  ($current_transcript_id, $upd) = store_features($tracking_dbh, $prepare_hash, \%transcript, 'transcript',
                                                  $current_gene_id, 0, $user_id, $category_id);

  #relate every further transcript of the same input line to the first one
  if ($related_id) {
    issue_relation($tracking_dbh, $current_transcript_id, $related_id);
  }
  else {
    $related_id = $current_transcript_id;
  }

}



1;

__END__
