#!/usr/local/ensembl/bin/perl -w

# Helper script for the GENCODE tracking system.
#
# Read tab-delimited file, store in internal tracking system db
#
# LOAD ENTRIES FOR EXONS FOUND IN ENSEMBL BUT MISSING IN LOUTRE
#

package gencode_tracking_system::sources::ensembl_missing_exons;

use strict;
use gencode_tracking_system::core;
use gencode_tracking_system::config;
use base 'Exporter';

# run_update is the only symbol exported (by default) to users of this module.
our @EXPORT  = qw( run_update );

# File-level state shared between run_update and create_entry.
# The four handles/ids below are (re)assigned from run_update's arguments on
# every call and are read by create_entry when it calls store_features.
my ($tracking_dbh, $prepare_hash, $user_id, $category_id);
# Number of entries passed to store_features by create_entry. It is only ever
# incremented in this file, so it accumulates across run_update calls.
my $features = 0;

# Read the Ensembl "unique exons" file and store every listed exon as a
# missing-exon entry in the tracking system.
#
# Arguments (positional):
#   $tracking_dbh, $prepare_hash, $user_id, $category_id
# They are saved in the file-level variables of the same names so that
# create_entry can pass them on to store_features.
#
# Data lines are tab-separated and must start with a digit; everything else is
# skipped. Fields, by position (names taken from the original variable names):
#   0 row number (unused), 1 location, 2 Ensembl id (unused),
#   3 Havana ids (comma-separated), 4 exon id(s) (comma-separated),
#   5 exon length, 6 exon number.
# The location is colon-separated; fields 2..5 are chrom, start, end, strand.
# One entry is created per exon id on a line.
#
# Returns the file-level $features counter.
# Dies if the data file cannot be opened, or if no project is found for
# $category_id.
#
# NOTE(review): $features is not reset here, so a second call in the same
# process returns the running total rather than a per-call count -- confirm
# whether callers rely on that.
sub run_update {
  ($tracking_dbh, $prepare_hash, $user_id, $category_id) = @_;

  my $missing_label = "missing_exon";
  my $file          = "/nfs/acari/fsk/2_encode/data/ensembl/e54/unique_exons.das2";

  # Three-argument open with a lexical handle: the path can never be parsed as
  # a mode, and the handle does not leak into the package namespace.
  open(my $in, '<', $file) or die "Can't open file $file: $!\n";

  # All entries are attached to the single general project of this category.
  my $current_gene_id = get_project_for_category($prepare_hash, $tracking_dbh, $category_id);
  die "Can't find general project for category $category_id!\n" unless $current_gene_id;

  # Go through the file and store one entry per missing exon.
  while (my $line = <$in>) {
    chomp $line;

    # Ignore non-data lines.
    next unless $line =~ /^\d/;

    my (undef, $loc, undef, $havana_ids, $exons_id, $exon_len, $exon_num) = split(/\t/, $line);
    my (undef, undef, $chrom, $start, $end, $strand) = split(/:/, $loc);

    # Make the id list readable in the stored description.
    $havana_ids =~ s/,/, /g;

    # A field without a comma is used verbatim (even when empty), so it still
    # yields exactly one entry; otherwise one entry per listed exon id.
    my @exon_ids = $exons_id =~ /,/ ? split(/,/, $exons_id) : ($exons_id);

    foreach my $exon_id (@exon_ids) {
      create_entry($chrom, $start, $end, $strand, $havana_ids, $exon_id, $exon_len, $exon_num, $current_gene_id, $missing_label);
    }
  }
  close($in);

  return $features;
}


# Build the feature record for one missing exon and hand it to store_features
# as a 'transcript' attached to $current_gene_id.
#
# Arguments (positional):
#   $chrom, $start, $end, $strand  - location of the exon
#   $havana_ids                    - text listed in the description
#   $exons_id                      - exon id; the stored id is "MisEx_" . $exons_id
#   $exon_len, $exon_num           - listed in the description
#   $current_gene_id               - passed to store_features as the parent id
#   $missing_label                 - accepted but not used in this sub
#
# Uses the file-level $tracking_dbh, $prepare_hash, $user_id and $category_id,
# and increments the file-level $features counter.
# Returns the value $features had before the increment.
sub create_entry {
  my ($chrom, $start, $end, $strand, $havana_ids, $exons_id, $exon_len, $exon_num, $current_gene_id, $missing_label) = @_;

  # Multi-line description, one "LABEL: value" pair per line.
  my $summary = join("\n",
    "HAVANA-TRANSCR-IDS: $havana_ids",
    "MISSING-EXON: $exons_id",
    "EXON-LENGTH: $exon_len",
    "EXON-NUM: $exon_num",
  );

  # Structure describing the exon; it is stored with feature type 'transcript'.
  my %transcript = (
    'id'          => "MisEx_$exons_id",   ## prev.: "MisEx_".$ensembl_id;
    'type'        => "transcript",
    'biotype'     => "exon",
    'chrom'       => $chrom,
    'start'       => $start,
    'end'         => $end,
    'strand'      => $strand,
    'description' => $summary,
  );

  print "TRANSCRIPT:\n" if $VERBOSE;
  #print_element(\%transcript);

  # Called in list context, as before; the returned values are not needed here.
  my @stored = store_features(
    $tracking_dbh, $prepare_hash, \%transcript, 'transcript',
    $current_gene_id, 0, $user_id, $category_id,
  );

  return $features++;
}

1;

__END__

ENSEMBL-exons
->condense all issues to one project
delete p from projects p, issues i where i.project_id=p.id and i.category_id=4;
insert into projects set id=154477, name="Missing_Exons", description="Missing Exons when compared to Ensembl models", is_public=1, created_on="2008-09-08 17:00:07", updated_on="2008-09-08 17:00:07", identifier="misex", status_id=1, status=1, Gchrom=0, Gstart=0, Gend=0, Gstrand=0;
update issues set project_id=154477 where category_id =4;

#move existing entries to "archive"
update issues i set status_id=6 WHERE i.category_id=4;

-> remove unresolved missing_exon flags;
delete from flags where flag_name="missing_exon" and issue_id > 0 and checked_date is null;
select count(*) from flags where flag_name="missing_exon" and issue_id > 0 and checked_date is null;

-> set new missing_exon flags;
check_data.pl

#remove custom values for old projects:
delete cv from custom_values cv, issues i where cv.customized_id=i.id and i.category_id=4 and cv.customized_type="Project" and cv.customized_id != 154477;

>done for dev & prod!

-----
CONGO

delete p from projects p, issues i where i.project_id=p.id and i.category_id=11;
insert into projects set id=31617, name="CONGO_Exons", description="Exons prdicted by MITs CONGO method", is_public=1, created_on="2008-08-25 02:32:25", updated_on="2008-08-25 02:32:25", identifier="congoex", status_id=1, status=1, Gchrom=0, Gstart=0, Gend=0, Gstrand=0;
update issues set project_id=31617 where category_id=11;

-----
introns

delete p from projects p, issues i where i.project_id=p.id and i.category_id=9;
insert into projects set id=146021, name="Missing_Introns", description="Missing Introns when compared to Ensembl models", is_public=1, created_on="2008-09-08 15:14:14", updated_on="2008-09-08 15:14:14", identifier="misintron", status_id=1, status=1, Gchrom=0, Gstart=0, Gend=0, Gstrand=0;
update issues set project_id=146021 where category_id=9;

-----
cdnas
delete p from projects p, issues i where i.project_id=p.id and i.category_id=7;
insert into projects set id=203441, name="Missing_cDNAs", description="Missing cDNAs when compared to Ensembl models", is_public=1, created_on="2008-09-11 15:12:59", updated_on="2008-09-11 15:12:59", identifier="miscdna", status_id=1, status=1, Gchrom=0, Gstart=0, Gend=0, Gstrand=0;
update issues set project_id=203441 where category_id=7;





 delete cv from custom_values cv, issues i where cv.customized_id=i.id and i.category_id=11 and cv.customized_type="Project" and cv.customized_id != 31617;
