#!/usr/local/ensembl/bin/perl -w

# Helper script for the GENCODE tracking system.
#
# Read tab-delimited file, store in internal tracking system db
#
# LOAD ENTRIES FOR INTRONS FOUND IN ENSEMBL BUT MISSING IN LOUTRE
#

package gencode_tracking_system::sources::ensembl_missing_introns;

use strict;
use gencode_tracking_system::core;
use gencode_tracking_system::config;
use base 'Exporter';

our @EXPORT    = qw( run_update );

# Load entries for introns found in Ensembl but missing in loutre.
#
# Reads the tab-delimited intron file, turns every data line (a line that
# starts with a digit) into a feature hash of type "transcript" / biotype
# "intron" and hands it to store_features(), attached to the project returned
# by get_project_for_category() for the given category.
#
# Arguments:
#   $tracking_dbh - passed through to get_project_for_category() and store_features()
#   $prepare_hash - passed through to get_project_for_category() and store_features()
#   $user_id      - passed through to store_features()
#   $category_id  - category whose general project receives the features
#
# Returns the number of data lines handed to store_features().
# Dies if the input file cannot be opened or if no project is found for the
# category.
sub run_update {
  my ($tracking_dbh, $prepare_hash, $user_id, $category_id) = @_;

  my $file = "/nfs/acari/fsk/2_encode/data/ensembl/e54/unique_intron_patterns.das";

  # Three-argument open with a lexical handle: the file name can never be
  # interpreted as an open mode, and the handle is not a package-global
  # bareword that other code could clobber. $! reports why the open failed.
  open(my $in, '<', $file) or die "cant open file $file: $!\n";

  # All features are attached to the general project of the category.
  my $current_gene_id = get_project_for_category($prepare_hash, $tracking_dbh, $category_id);
  die "Can't find general project for category $category_id!\n" unless $current_gene_id;

  my $features = 0;

  # Go through the file, store every data line as a transcript.
  while (my $line = <$in>) {
    chomp $line;

    # Ignore non-data lines (header, blank lines, ...).
    next unless $line =~ /^[\d]/;

    # Columns: count, location, Ensembl transcript id, Havana transcript ids,
    # exon ids, intron coordinates, intron length, intron number.
    # Columns 1 and 3 are not used.
    my (undef, $loc, undef, $havana_ids, $exons_ids, $intron_coords, $intron_len, $intron_num)
      = split(/\t/, $line);

    # Location is colon-separated; only the last four fields are used.
    my (undef, undef, $chrom, $start, $end, $strand) = split(/:/, $loc);

    # Add a space after each comma for readability of the description.
    $havana_ids =~ s/,/, /g;

    # Build the feature id from the exon ids: first '-' becomes '_',
    # every "ENSE" prefix is stripped.
    my $use_ids = $exons_ids;
    $use_ids =~ s/-/_/;
    $use_ids =~ s/ENSE//g;

    # Build structure for transcript.
    my %transcript = (
      'chrom'       => $chrom,
      'biotype'     => "intron",
      'type'        => "transcript",
      'start'       => $start,
      'end'         => $end,
      'strand'      => $strand,
      'id'          => "MisIn_".$use_ids,
      'description' => "HAVANA-TRANSCR-IDS: ".$havana_ids."\n".
                       "MISSING-INTRON: ".$exons_ids."\n".
                       "INTRON-LENGTH: ".$intron_len."\n".
                       "INTRON-COORDS: ".$intron_coords."\n".
                       "INTRON-NUM: ".$intron_num,
    );

    if ($VERBOSE) {
      print STDERR "TRANSCRIPT:\n";
      print_element(\%transcript);
    }

    # Store as transcript; the returned id is not needed here.
    store_features($tracking_dbh, $prepare_hash, \%transcript, 'transcript',
                   $current_gene_id, 0, $user_id, $category_id);

    $features++;
  }
  close($in);

  return $features;
}

1;



__DATA__

Count   Genomic location        Ensembl Transcript stable ID    Havana Transcripts      Ensembl Exon stable IDs Intron coordinates      Intron length
   Intron number (from 5')
2       chromosome:NCBI36:3:213454:426098:1     ENST00000397491 OTTHUMT00000207155      ENSE00000912306-ENSE00000912308 358766-361271   2506    7


->different ids required!

-> remove unresolved missing_intron flags:
delete from flags where flag_name="missing_intron" and issue_id > 0 and checked_date is null;
->retire old issues:
update issues set status_id=6 where category_id=9;

#remove custom values for old projects:
delete cv from custom_values cv, issues i where cv.customized_id=i.id and i.category_id=9 and cv.customized_type="Project" and cv.customized_id != 146021;
