#!/usr/local/ensembl/bin/perl -w

# Helper script for the GENCODE tracking system.
#
# Reads the tab-delimited files of RNA-Seq gene/exon locations listed in
# %intypes and stores each location in the internal tracking system db.
#
#

package gencode_tracking_system::sources::ensembl_rna_seq_loc;

use strict;
use gencode_tracking_system::core;
use gencode_tracking_system::config;
use base 'Exporter';

our @EXPORT    = qw( run_update );

# Running count of locations seen per chromosome name. do_update clears it at
# the start of each file and uses the counter to build a unique id suffix
# ("<chrom>_<n>") for every stored location.
my %ids;
# File-scoped state shared by the subs below. $tracking_dbh, $prepare_hash,
# $user_id and $category_id are assigned in run_update/do_update and read by
# create_entry and store_as_gene; $file and $type are assigned in do_update.
# $ens_db, $ga, $upd and $hga are not referenced anywhere in this file.
my ($tracking_dbh, $prepare_hash, $user_id, $category_id, $ens_db, $ga, $upd, $hga, $file, $type);

# Input files to load, keyed by feature type. run_update calls do_update once
# per entry, passing the path as the file and the key as the type.
my %intypes = (
	       "gene" => "/nfs/acari/fsk/2_encode/data/ensembl/rnaseq/new_genes/joined.2.txt",
	       "exon" => "/nfs/acari/fsk/2_encode/data/ensembl/rnaseq/new_exons/joined.2.txt",
	      );


# Exported entry point: loads every input file configured in %intypes.
#
# Takes four positional arguments (tracking db handle, prepare hash, user id,
# category id). They are stored in the file-scoped variables of the same
# names and handed on unchanged to do_update, once per %intypes entry.
# The iteration order over %intypes is whatever "keys" yields.
sub run_update {
  ($tracking_dbh, $prepare_hash, $user_id, $category_id) = @_;

  # The file/type pairs used to be hard-coded here; they now come from
  # %intypes (type => path).
  for my $feature_type (keys %intypes){
    my $path = $intypes{$feature_type};
    do_update($tracking_dbh, $prepare_hash, $user_id, $category_id, $path, $feature_type);
  }
}


# Load one tab-delimited input file and store each location it contains.
#
# Positional arguments:
#   tracking db handle, prepare hash, user id, category id
#       - kept in the file-scoped variables of the same names, where
#         create_entry and store_as_gene read them
#   $file - path of the input file
#   $type - feature type label; passed to create_entry as the type
#
# Input format, as parsed here: columns are tab-separated. Column 1 is a
# colon-separated location whose fields 3-6 are chrom, start, end and strand.
# Column 2 is either '.' or a second location in the same format; when
# present it replaces the first one, and the first is appended to the
# description as a "GENE-LOC" line. Column 3 is the description. Any further
# columns are ignored. Lines starting with '#' and blank lines are skipped.
# Lines from which no chrom/start/end can be parsed are reported on STDERR
# and skipped instead of being stored with undefined coordinates.
#
# Returns the number of locations stored.
# Dies if the file cannot be opened or no parent project can be found/created.
sub do_update{
  ($tracking_dbh, $prepare_hash, $user_id, $category_id, $file, $type) = @_;

  print STDERR "DOING $type WITH $file.\n";
  my $features = 0;
  %ids = ();

  # Three-argument open with a lexical handle: the path can never be read as
  # an open mode, and the handle does not clash with other users of "IN".
  open(my $in_fh, '<', $file) or die "cant open file $file: $!\n";

  # All locations hang off one general project for the category; create it
  # on first use.
  my $current_gene_id = get_project_for_category($prepare_hash, $tracking_dbh, $category_id);
  if(!$current_gene_id){
    $current_gene_id = store_as_gene();
  }
  else{ print "Have project $current_gene_id.\n"; }
  die "Can't find general project for category $category_id!\n" unless $current_gene_id;

  LINE:
  while (my $line = <$in_fh>){
    chomp $line;
    # ignore comment lines and blank lines
    next LINE if( $line =~ /^#/ );
    next LINE if( $line !~ /\S/ );

    my ($loc, $loc2, $desc) = split(/\t/, $line);
    $desc = '' unless defined $desc;

    my (undef, undef, $chrom, $start, $end, $strand) = split(/:/, $loc);

    # Only switch to the second location when the column really holds one;
    # a missing or empty column must not wipe out the first location.
    if(defined $loc2 && $loc2 ne '' && $loc2 ne '.'){
      $desc .= "\nGENE-LOC: $chrom $start-$end, $strand";
      (undef, undef, $chrom, $start, $end, $strand) = split(/:/, $loc2);
    }

    unless(defined $chrom && defined $start && defined $end){
      print STDERR "Skipping malformed line $. in $file: $line\n";
      next LINE;
    }

    # per-chromosome running number keeps the generated ids unique
    my $serial = ++$ids{$chrom};
    $features++;
    create_entry($chrom, $start, $end, $strand, $current_gene_id, $desc, $type, $chrom.'_'.$serial);
  }
  close($in_fh);

  print STDERR "WE FOUND $features locations.\n";

  return $features;

}


# Build a transcript record for a single location and hand it to
# store_features.
#
# Positional arguments: chrom, start, end, strand, id of the parent project
# (passed to store_features as its fifth argument), description text,
# feature type (stored as the record's biotype) and an id suffix. The record
# id becomes "RNASeq_<type>_<suffix>" and the description is prefixed with
# "DESC: ". The db handle, prepare hash, user id and category id come from
# the file-scoped variables.
sub create_entry {
  my ($chrom, $start, $end, $strand, $parent_id, $desc, $type, $id) = @_;

  # transcript record in the key layout passed to store_features
  my %transcript = (
    'chrom'       => $chrom,
    'biotype'     => $type,
    'type'        => "transcript",
    'start'       => $start,
    'end'         => $end,
    'strand'      => $strand,
    'id'          => "RNASeq_${type}_${id}",
    'description' => "DESC: ".$desc,
  );

  # dump the record first when verbose output is switched on
  if($VERBOSE){
    print "TRANSCRIPT:\n";
    print_element(\%transcript);
  }

  # store as transcript
  my ($stored_id, $was_updated) = store_features($tracking_dbh, $prepare_hash, \%transcript, 'transcript',
                                                 $parent_id, 0, $user_id, $category_id);

}


# Create the general "novel_RNASeq_location" gene record that do_update uses
# as the parent project for every stored location.
#
# Takes no arguments; the db handle, prepare hash, user id and category id
# come from the file-scoped variables. Returns the first value handed back
# by store_features (the new gene id).
sub store_as_gene {

  # placeholder gene record: it has no real genomic position
  my %project = (
    'chrom'       => '0',
    'biotype'     => "RNASeq",
    'type'        => "gene",
    'start'       => 0,
    'end'         => 0,
    'strand'      => '.',
    'id'          => "novel_RNASeq_location",
    'description' => "New gene & exon locations inferred from RNA-Seq data",
  );

  print "storing ueber-project ".$project{'id'}."\n";

  # dump the record first when verbose output is switched on
  if($VERBOSE){
    print "GENE:\n";
    print_element(\%project);
  }

  # store as gene
  my ($new_gene_id, $was_updated) = store_features($tracking_dbh, $prepare_hash, \%project, 'gene',
                                                   0, 0, $user_id, $category_id);

  return $new_gene_id;
}


1;

__END__

insert into projects set name="novel_RNASeq_location", description="New gene & exon locations inferred from RNA-Seq data", is_public=1, created_on=now(), updated_on=now(), identifier="novel_RNASeq_location", status_id=1, Gchrom=0, Gstart=0, Gend=0;

