#!/usr/local/ensembl/bin/perl -w

# Helper script for the GENCODE tracking system.
#
# Read tab-delimited file, store in internal tracking system db
#
# LOAD ENTRIES FOR CCDS ENTRIES FOUND IN ENSEMBL BUT MISSING IN LOUTRE
#

package gencode_tracking_system::sources::ensembl_transcribed_pseudos;

use strict;
use gencode_tracking_system::core;
use gencode_tracking_system::config;
use base 'Exporter';

# run_update is the only sub exported to callers; the other subs in this
# file are not exported.
our @EXPORT    = qw( run_update );

# Ids already processed by create_entry() (used there to skip repeats);
# run_update() reports the number of keys on STDERR when it finishes.
my %ccds_ids;
# State shared by the subs in this file:
#   $tracking_dbh, $prepare_hash, $user_id, $category_id
#             - copied from run_update()'s arguments and handed on to
#               store_features() / set_flag()
#   $ens_db   - connection returned by connect_ensembl() in run_update()
#   $ga       - gene adaptor obtained from $ens_db
#   $hga      - gene adaptor for the "loutre_human" database opened in
#               run_update()
#   $upd      - second value returned by store_features() in create_entry()
my ($tracking_dbh, $prepare_hash, $user_id, $category_id, $ens_db, $ga, $upd, $hga);

# Entry point of this source module.
#
# Connects to the Ensembl database (settings come from the imported $ENS_*
# variables) and to the "loutre_human" database, then reads the
# tab-delimited data file line by line.  The create_entry() calls are
# currently commented out, so for every data line this sub only prints
# what would be created.
#
# Parameters (stored in the file-level variables of the same name so that
# the helper subs can use them):
#   1. tracking database handle
#   2. prepare hash passed on to store_features()
#   3. user id
#   4. category id
#
# Returns the number of data lines read from the file.
# Dies if the data file cannot be opened.
sub run_update {
  ($tracking_dbh, $prepare_hash, $user_id, $category_id) = @_;

  my $file     = "/nfs/acari/fsk/2_encode/data/ensembl/cdnas_overlapping_genes.jen.das";
  my $features = 0;

  $ens_db = connect_ensembl($ENS_HOST, $ENS_PORT, $ENS_NAME, $ENS_USER, $ENS_PASS);
  $ga     = $ens_db->get_GeneAdaptor;

  my $hdbhost = "otterlive";
  my $hdbport = 3301;
  my $hdbname = "loutre_human";
  my $hdbuser = "ottro";
  my $hdbpass;    # deliberately left undefined: no password is supplied

  my $h_db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
    -host   => $hdbhost,
    -user   => $hdbuser,
    -pass   => $hdbpass,
    -port   => $hdbport,
    -dbname => $hdbname,
  );
  $hga = $h_db->get_GeneAdaptor;

  #open data file (three-arg form, so the path is never parsed as a mode)
  open(my $in, '<', $file) or die "cant open file $file: $!\n";

  #go through file, one entry per data line
  while (my $line = <$in>) {
    chomp $line;

    #ignore non-data lines: data lines start with a digit
    next unless $line =~ /^\d/;

    #columns: running number, location, cDNAs, Havana ids, Havana biotype
    my ($num, $loc, $cdnas, $havana_ids, $havana_biotype) = split("\t", $line);

    #location is colon-separated; the chromosome is the third field
    my ($c, $ass, $chrom, $start, $end, $strand) = split(":", $loc);

    if ($havana_ids =~ /,/) {
      #several Havana ids on one line: one entry per id
      foreach my $havana_id (split(",", $havana_ids)) {
        print "CREATING1 $chrom, $start, $end, $havana_id, $cdnas.\n";
        #create_entry($chrom, $start, $end, $strand, $havana_id, $cdnas);
      }
    }
    else {
      print "CREATING2 $chrom, $start, $end, $havana_ids, $cdnas.\n";
      #create_entry($chrom, $start, $end, $strand, $havana_ids, $cdnas);
    }

    $features++;
  }

  close($in);

  print STDERR "WE FOUND ".(scalar keys %ccds_ids)." CCDS IDS.\n";

  return $features;

}


# Set a named flag on the transcript that get_transc_id() resolves for the
# given gene id.
#
# Parameters:
#   $hav_id    - gene stable id, resolved to a transcript id by
#                get_transc_id()
#   $flag_name - name of the flag handed to set_flag()
#
# The flag is only written when $WRITE is true; the message is only printed
# when $VERBOSE is true.  $flag_count is incremented on every call.
sub set_flag_for_entry {
  my ($hav_id, $flag_name) = @_;

  # Package-level call counter.  Binding it with 'our' keeps 'use strict'
  # satisfied and refers to the same variable whether or not the symbol is
  # imported from another module.
  our $flag_count;

  my $hav_t_id = get_transc_id($hav_id);

  print STDERR "  Setting flag $flag_name to ".$hav_t_id.".\n" if($VERBOSE);
  if($WRITE){
    set_flag($tracking_dbh, $hav_t_id, $flag_name, undef, $user_id, 0);
  }
  $flag_count++;

}

# Build one tracking entry and store it twice through store_features():
# first as a gene, then as a transcript attached to that gene.
#
# Parameters:
#   $chrom, $start, $end - copied into the entry's 'chrom'/'start'/'end'
#   $strand              - replaced by the result of get_strand() before
#                          it is used (the guard around that call is
#                          commented out)
#   $havana_ids          - comma-separated ids; the first one is resolved
#                          to a transcript id with get_transc_id()
#   $missing_type        - stored as the 'type' of the transcript entry
#
# Returns early, after printing "SEEN!", when the id is already recorded in
# %ccds_ids.
#
# NOTE(review): $ccds_id, $int_ensembl_ids and $etranscript_id are used
# below but are neither parameters of this sub nor declared anywhere in
# this file.  Unless gencode_tracking_system::core or ::config export them,
# 'use strict' rejects this sub at compile time.  The body looks as if it
# was carried over from a loader with a longer argument list - TODO confirm
# and decide where these three values should come from.
sub create_entry {
  my ($chrom, $start, $end, $strand, $havana_ids, $missing_type) = @_;
  my ($current_gene_id, $current_transcript_id);

  #keep track of ids: every id is processed at most once
  if(defined $ccds_ids{$ccds_id}){
    print STDERR "SEEN!\n";
    return;
  }
  else{
    $ccds_ids{$ccds_id} = 1;
  }

  #the strand passed in by the caller is always overwritten here
  #if(!$strand){
    $strand = get_strand($int_ensembl_ids);
  #}

  #print "Splitting $havana_ids.\n";
  #only the first id of the comma-separated list is looked up
  my ($hav_id) = split(',', $havana_ids);
  my $hav_t_id = get_transc_id($hav_id);

  #add space after each comma, for the description text only
  $havana_ids =~ s/,/, /g;


  my $description = "CCDS: ".$ccds_id."\nENSEMBL-IDS: ".$int_ensembl_ids.
    "\nENSEMBL-TRANSCRIPT-IDS: ".$etranscript_id.
    "\nHAVANA-GENE-IDS: ".$havana_ids."\nHAVANA-TRANSCRIPT-IDS: ".$hav_t_id.
    "\nNOTE: HAVANA-IDs supplied for orientation only.";

  my %transcript;
  $current_transcript_id = 0;

  #build structure for transcript
  $transcript{'chrom'}        = $chrom;
  $transcript{'biotype'}      = undef;
  $transcript{'status'}       = undef;
  $transcript{'type'}         = $missing_type;
  $transcript{'start'}        = $start;
  $transcript{'end'}          = $end;
  $transcript{'strand'}       = $strand;
  $transcript{'id'}           = "".$ccds_id;
  $transcript{'description'}  = $description;

  print "CCDS\t$ccds_id\t$strand\n";

  #clone gene from transcript (shallow copy; all values are plain scalars)
  my %gene = %transcript;

  #store as gene
  #(the verbose dump runs before 'type' is switched to "gene", so it still
  # shows $missing_type)
  if($VERBOSE){
    print "GENE:\n";
    print_element(\%gene);
  }
  $gene{'type'} = "gene";
  ($current_gene_id, $upd) = store_features($tracking_dbh, $prepare_hash, \%gene, 'gene',
				    0, 0, $user_id, $category_id);

  #store as transcript, passing the id returned for the gene stored above
  if($VERBOSE){
    print "TRANSCRIPT:\n";
    print_element(\%transcript);
  }
  ($current_transcript_id, $upd) = store_features($tracking_dbh, $prepare_hash, \%transcript, 'transcript',
					  $current_gene_id, 0, $user_id, $category_id);

}


# Resolve a gene stable id to the stable id of its first transcript, using
# the $hga gene adaptor.
#
# Parameters:
#   $hav_g_id - gene stable id, or "." as a placeholder for "no gene"
#
# Returns the stable id of the first transcript in the list returned by
# get_all_Transcripts, or the string "--" when
#   - the id is the placeholder ".",
#   - the gene cannot be fetched (a warning is issued), or
#   - the gene has no transcripts (a warning is issued).
sub get_transc_id {
  my ($hav_g_id)= @_;

  if($hav_g_id eq "."){ return "--"; }
  my $gene = $hga->fetch_by_stable_id($hav_g_id);
  if(!$gene){
    warn "Can't get gene $hav_g_id. (1)\n";
    return "--";
  }
  my @h_transcripts = @{$gene->get_all_Transcripts};
  print "HAVE ".(scalar @h_transcripts)." TRANSCRIPTS.\n";

  # The variable must carry the same name here and in the checks below:
  # a misspelt declaration left the checked variable undeclared.
  my $h_transcript = $h_transcripts[0];
  if(!$h_transcript){
    warn "Can't get transcripts from gene $hav_g_id. (1)\n";
    return "--";
  }

  return $h_transcript->stable_id;
}


# Look up the strand of a gene through the $ga gene adaptor.
#
# Parameters:
#   $hav_g_id - a gene stable id, or a comma-separated list of ids, in
#               which case only the first id is looked up
#
# Returns the gene's strand.  When the gene cannot be fetched a warning is
# issued and 1 is returned instead.
sub get_strand {
  my ($hav_g_id) = @_;

  # In list context the match yields the captured first id, or nothing when
  # the argument is not a comma-separated list.
  my ($first_id) = $hav_g_id =~ /^([\w]+),.+/;
  my $lookup_id  = defined $first_id ? $first_id : $hav_g_id;

  my $gene = $ga->fetch_by_stable_id($lookup_id);
  return $gene->strand if $gene;

  warn "Can't get gene $lookup_id. (2)\n";
  return 1;
}



1;

__END__
