#!/usr/local/ensembl/bin/perl -w

# Helper script for the GENCODE tracking system.
#
# Read tab-delimited file, store in internal tracking system db
#
# LOAD ENTRIES FOR cDNAs FOUND IN ENSEMBL BUT MISSING IN LOUTRE
#

package gencode_tracking_system::sources::ensembl_missing_cdnas;

use strict;
use gencode_tracking_system::core;
use gencode_tracking_system::config;
use base 'Exporter';

# run_update is exported by default into every package that imports this module.
our @EXPORT    = qw( run_update );

# File-scoped state assigned from the arguments of run_update(). create_entry()
# reads $tracking_dbh, $prepare_hash, $user_id and $category_id from here when
# it calls store_features(); $das_source is only used inside run_update().
my ($tracking_dbh, $prepare_hash, $user_id, $category_id, $das_source);
# Incremented once per create_entry() call and returned by run_update().
# Nothing in this file resets it after this initialisation.
my $features    = 0;

# Read the tab-delimited file configured for a DAS source and store every data
# line as a transcript entry via create_entry().
#
# Arguments (copied into the file-scoped variables of the same name):
#   $tracking_dbh  - passed on to the project/feature helper functions
#                    (presumably the tracking database handle - confirm)
#   $prepare_hash  - passed on to get_project_for_category()/store_features()
#   $user_id       - passed on to store_features()
#   $category_id   - category used for the project lookup and for storing
#   $das_source    - key into %OTHER_DAS_SERVERS; its 'file' entry names the
#                    input file, its 'category' entry is used when a general
#                    project has to be created
#
# Data lines start with a digit and hold six tab-separated columns:
#   number, location, internal Ensembl id, Havana ids, cDNA id, date
# where location is a ':'-separated string whose 3rd..6th fields are
# chromosome, start, end and strand.
#
# Returns the file-scoped $features counter, which create_entry() increments
# once per stored line. Dies if no file is configured or it cannot be opened.
sub run_update {
  ($tracking_dbh, $prepare_hash, $user_id, $category_id, $das_source) = @_;

  #my $file        = "/nfs/acari/fsk/2_encode/data/ensembl/e56/recent_unannotated_cdnas.das";
  my $file = $OTHER_DAS_SERVERS{$das_source}->{'file'};
  die "No file configured for DAS source!\n" unless defined $file;
  print STDERR "Using file $file\n";

  my %cdna_ids      = ();
  my $missing_label = "";

  #open data file (three-argument form, so the name cannot inject an open mode)
  open(my $in, '<', $file) or die "can't open file $file: $!\n";

  #use the general project of this category; only create one if none exists yet
  my $current_gene_id = get_project_for_category($prepare_hash, $tracking_dbh, $category_id);
  unless ($current_gene_id) {
    warn "Can't find general project for category $category_id!\n";
    $current_gene_id = create_general_project($tracking_dbh, $OTHER_DAS_SERVERS{$das_source}->{'category'});
  }

  #go through file, read and store data as transcripts
  while (my $line = <$in>) {
    chomp $line;
    #ignore non-data lines
    next unless $line =~ /^\d/;

    #columns 1 (running number) and 3 (internal Ensembl id) are not used
    my (undef, $loc, undef, $havana_ids, $cdna_id, $date) = split(/\t/, $line);

    #a line without a cDNA id cannot be turned into a meaningful entry
    if (!defined $cdna_id or $cdna_id eq '') {
      warn "Skipping malformed line $. in $file: $line\n";
      next;
    }
    #split drops trailing empty columns, so the date may be missing
    $date = '' unless defined $date;

    #the first two location fields are not used
    my (undef, undef, $chrom, $start, $end, $strand) = split(/:/, $loc);

    #keep track of ids; repeated ids get a "_<count>" suffix to stay unique
    if (defined $cdna_ids{$cdna_id}) {
      print "SEEN $cdna_id!\n";
      $cdna_ids{$cdna_id}++;
      $cdna_id .= "_".$cdna_ids{$cdna_id};
    }
    else {
      $cdna_ids{$cdna_id} = 1;
    }

    #add space
    $havana_ids =~ s/,/, /g;

    create_entry($chrom, $start, $end, $strand, $havana_ids, $cdna_id, $current_gene_id, $missing_label, $date);
  }
  close($in);

  print STDERR "WE FOUND ".(scalar keys %cdna_ids)." different cDNA IDS.\n";

  return $features;

}

# Assemble the transcript record for one cDNA and pass it to store_features().
#
# Arguments:
#   $chrom, $start, $end, $strand - location values copied into the record
#   $havana_ids      - text placed on the HAVANA-TRANSCR-IDS description line
#   $cdna_id         - cDNA identifier; the record id is "MisC_" plus this value
#   $current_gene_id - passed to store_features() after the feature type
#   $missing_label   - accepted but not used in this sub
#   $date            - text placed on the DATE description line
#
# Prints the record first when $VERBOSE is true, and increments the file-scoped
# $features counter after the store_features() call.
sub create_entry {
  my ($chrom, $start, $end, $strand, $havana_ids, $cdna_id, $current_gene_id, $missing_label, $date) = @_;

  #three-line free-text description
  my $description = join("\n",
                         "cDNA: ".$cdna_id,
                         "HAVANA-TRANSCR-IDS: ".$havana_ids,
                         "DATE: ".$date);

  #build structure for transcript
  my %transcript = (
    'chrom'       => $chrom,
    'biotype'     => "cDNA",
    'type'        => "transcript",
    'start'       => $start,
    'end'         => $end,
    'strand'      => $strand,
    'id'          => "MisC_".$cdna_id,
    'description' => $description,
  );

  if ($VERBOSE) {
    print "TRANSCRIPT:\n";
    print_element(\%transcript);
  }

  #store as transcript; the call stays in list context, the results are not used
  my ($stored_id, $was_updated) = store_features(
    $tracking_dbh, $prepare_hash, \%transcript, 'transcript',
    $current_gene_id, 0, $user_id, $category_id
  );

  $features++;
}


1;

__END__

#remove count in id
update issues set subject=CONCAT("MisC_", SUBSTR(subject, LOCATE("_", subject)+1)) where category_id=7 ;

#remove project custom values
delete cv from custom_values cv, issues i where cv.customized_id=i.id and cv.custom_field_id!=8 and value="missing_cDNA";

#change "biotype"
update custom_values cv, issues i set cv.value="cDNA" where cv.customized_id=i.id and cv.custom_field_id=8 and cv.value="missing_cDNA";
