#!/usr/local/ensembl/bin/perl -w

# Helper script for the GENCODE tracking system.
#
# Read tab-delimited file, store in internal tracking system db
#
# LOAD ENTRIES FOR CCDS ENTRIES FOUND IN ENSEMBL BUT MISSING IN LOUTRE
#

package gencode_tracking_system::sources::missing_ccds;

use strict;
use gencode_tracking_system::core;
use gencode_tracking_system::config;
use base 'Exporter';

our @EXPORT    = qw( run_update );

# CCDS ids that create_entry has already stored (id => 1); used to skip duplicates.
my %ccds_ids;
# State shared between the subs of this module: run_update assigns the handles,
# ids and adaptors, create_entry assigns $upd from the store_features result.
my ($tracking_dbh, $prepare_hash, $user_id, $category_id, $ens_db, $ga, $ta, $upd, $hga);
# Id of the first transcript stored for the current input line (0 = none yet);
# run_update resets it per line, create_entry relates later transcripts to it.
my ($related_id);

# Debug switch: when true, run_update stops once more than 20 data lines were read.
my $test = 0;

# Entry point of this source module: read the (hard-coded) tab-delimited CCDS
# report and hand every CCDS id of every "Public" line to create_entry.
#
# Arguments (copied into file-level variables shared with the helper subs):
#   $tracking_dbhh - tracking database handle, passed on to store_features
#                    and issue_relation
#   $prepare_hashh - passed on to store_features
#   $user_idd      - passed on to store_features
#   $category_idd  - passed on to store_features
#
# Returns the number of data lines processed. This is not the number of stored
# entries: one line can carry several CCDS ids and duplicates are skipped.
# Dies if the input file cannot be opened.
sub run_update {
  my ($tracking_dbhh, $prepare_hashh, $user_idd, $category_idd) = @_;

  my $file        = "/nfs/users/nfs_f/fsk/2_encode/data/ensembl/e56/unannotated_CCDS_loutre2.das";
  my $features    = 0;
  $tracking_dbh   = $tracking_dbhh;
  $prepare_hash   = $prepare_hashh;
  $user_id        = $user_idd;
  $category_id    = $category_idd;
  $ens_db         = connect_ensembl($ENS_HOST, $ENS_PORT, $ENS_NAME, $ENS_USER, $ENS_PASS);
  $ga             = $ens_db->get_GeneAdaptor;
  $ta             = $ens_db->get_TranscriptAdaptor;
  my $hdbhost     = "otterlive";
  my $hdbport     = 3301;
  my $hdbname     = "loutre_human";
  my $hdbuser     = "ottro";
  my $hdbpass;

  my $missing_type = "missing_ccds";

  #second connection (loutre/Havana); its gene adaptor is used by get_transc_id
  my $h_db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
                                                 -host    => $hdbhost,
                                                 -user    => $hdbuser,
                                                 -pass    => $hdbpass,
                                                 -port    => $hdbport,
                                                 -dbname  => $hdbname,
                                                );
  $hga = $h_db->get_GeneAdaptor;

  #open data file (three-argument open, lexical handle, report the OS error)
  open(my $in, '<', $file) or die "cant open file $file: $!\n";

  #go through file, read and store data as genes & transcripts
  while (my $line = <$in>){
    chomp $line;
    #ignore non-data lines (data lines start with a running number)
    next unless $line =~ /^\d/;

    my ($num, $loc, $int_ensembl_ids, $ccds_ids, $havana_ids, $havana_biotype, $missing_label, $tag2) = split("\t", $line);

    #only public entries are loaded; lines lacking the 8th column are skipped as well
    next unless defined $tag2 and $tag2 eq "Public";

    #$strand stays undefined if the location has no 6th field; create_entry looks it up then
    my ($c, $ass, $chrom, $start, $end, $strand) = split(":", $loc);

    #entries created from the same line get related to the first one
    $related_id = 0;

    if($ccds_ids =~ /,/){

      foreach my $ccds_id (split(",", $ccds_ids)){
        my $use_id = $int_ensembl_ids;
        my $transcript_id;

        if($int_ensembl_ids =~ /,/){
          #several ensembl ids: pick the one carrying this CCDS xref.
          #The helper returns the placeholder '--' if there is none.
          $transcript_id = get_transcript_id_for_ccds_id($int_ensembl_ids, $ccds_id);

          if($transcript_id and $transcript_id ne '--'){
            $use_id = $transcript_id;
          }
          else{
            warn ">>>cant find ccds-transcript for $line\n";
          }
        }
        #print STDERR "ENTRY[1] $chrom, $start, $end, $havana_ids, $use_id, $ccds_id, $missing_type, $missing_label.\n";
        create_entry($chrom, $start, $end, $strand, $havana_ids, $use_id, $ccds_id, $missing_type, $transcript_id, $missing_label);
      }

    }
    else{
      #print STDERR "ENTRY[2] $chrom, $start, $end, $havana_ids, $int_ensembl_ids, $ccds_ids, $missing_label, $missing_type.\n";
      create_entry($chrom, $start, $end, $strand, $havana_ids, $int_ensembl_ids, $ccds_ids, $missing_type, $int_ensembl_ids, $missing_label);
    }

    $features++;

    #test mode: stop after a handful of lines
    last if $test and $features > 20;

  }

  close($in);

  print STDERR "WE FOUND ".(scalar keys %ccds_ids)." CCDS IDS.\n";

  return $features;

}


# Store one CCDS id as a gene plus a child transcript in the tracking system.
#
# Arguments:
#   $chrom, $start, $end - location of the entry
#   $strand          - strand; if undefined it is looked up with get_strand()
#   $havana_ids      - comma-separated Havana gene ids (the first one is used
#                      to look up a Havana transcript id)
#   $int_ensembl_ids - comma-separated Ensembl ids; a value starting with ENST
#                      is treated as transcript ids and mapped to gene ids
#   $ccds_id         - CCDS id; used as feature id and for de-duplication
#   $missing_type    - accepted for call compatibility, not used here
#   $etranscript_id  - Ensembl transcript id; looked up from $int_ensembl_ids
#                      when empty
#   $missing_label   - text appended to the description as a NOTE
#
# Returns nothing. Side effects: records $ccds_id in %ccds_ids, stores two
# features through store_features(), and either relates the new transcript to
# $related_id or remembers it there if no related id is set yet.
sub create_entry {
  my ($chrom, $start, $end, $strand, $havana_ids, $int_ensembl_ids, $ccds_id, $missing_type, $etranscript_id, $missing_label) = @_;
  my ($current_gene_id, $current_transcript_id);

  #keep track of ids: every CCDS id is stored only once
  if(exists $ccds_ids{$ccds_id}){
    print STDERR "SEEN!\n";
    return;
  }
  $ccds_ids{$ccds_id} = 1;

  #no transcript given: search the supplied ids for the one carrying this CCDS
  if(!$etranscript_id and $int_ensembl_ids){
    $etranscript_id = get_transcript_id_for_ccds_id($int_ensembl_ids, $ccds_id);
  }

  #no gene given: derive it from the transcript ('--' is the "not found" placeholder)
  if(!$int_ensembl_ids and $etranscript_id and $etranscript_id ne '--'){
    my $gene = $ga->fetch_by_transcript_stable_id($etranscript_id);
    if($gene){
      $int_ensembl_ids = $gene->stable_id;
    }
    else{
      warn "Can't get gene for transcript $etranscript_id. (3)\n";
    }
  }

  #optional fields may be missing; use empty strings so matching and
  #concatenation below behave as before but without "uninitialized" warnings
  $int_ensembl_ids = '' unless defined $int_ensembl_ids;
  $etranscript_id  = '' unless defined $etranscript_id;
  $havana_ids      = '' unless defined $havana_ids;
  $missing_label   = '' unless defined $missing_label;

  #transcript ids were supplied: replace them by the (unique) gene ids
  if($int_ensembl_ids =~ /^ENST/){
    my %gids = ();
    foreach my $tid (split(',', $int_ensembl_ids)){
      my $gene = $ga->fetch_by_transcript_stable_id($tid);
      if(!$gene){
        warn "Can't get gene for transcript $tid. (3)\n";
        next;
      }
      my $gid = $gene->stable_id;
      if($gid){ $gids{$gid} = 1; }
    }
    #sorted, so that description and strand lookup do not depend on hash order
    $int_ensembl_ids = join(", ", sort keys %gids);
  }

  if(!defined($strand)){
    $strand = get_strand($int_ensembl_ids);
  }

  #the first havana gene id is used to fetch a havana transcript id
  my ($hav_id) = split(',', $havana_ids);
  my $hav_t_id = $hav_id ? get_transc_id($hav_id) : '--';
  if(!$hav_t_id){ $hav_t_id = '--' }

  #add space after the commas; the first "." becomes "--"
  #NOTE(review): presumably meant for the "." placeholder of empty columns - confirm
  $havana_ids =~ s/,/, /g;
  $havana_ids =~ s/\./\-\-/;

  my $description = "CCDS: ".$ccds_id."\nENSEMBL-IDS: ".$int_ensembl_ids.
    "\nENSEMBL-TRANSCRIPT-IDS: ".$etranscript_id.
    "\nHAVANA-GENE-IDS: ".$havana_ids."\nHAVANA-TRANSCRIPT-IDS: ".$hav_t_id.
    "\nNOTE: HAVANA-IDs supplied for orientation only.\nNOTE: ".$missing_label;

  my %transcript;
  $current_transcript_id = 0;

  #build structure for transcript
  $transcript{'chrom'}        = $chrom;
  $transcript{'type'}         = 'transcript';
  $transcript{'start'}        = $start;
  $transcript{'end'}          = $end;
  $transcript{'strand'}       = $strand;
  $transcript{'id'}           = "".$ccds_id;
  $transcript{'description'}  = $description;

  #clone gene from transcript
  my %gene = %transcript;
  $gene{'type'} = "gene";

  #store as gene
  if($VERBOSE){
    print "GENE:\n";
    print_element(\%gene);
  }
  ($current_gene_id, $upd) = store_features($tracking_dbh, $prepare_hash, \%gene, 'gene',
                                            0, 0, $user_id, $category_id);

  #store as transcript, with the gene stored above as parent
  $transcript{'parent'}  = "".$ccds_id;
  if($VERBOSE){
    print "TRANSCRIPT:\n";
    print_element(\%transcript);
  }
  ($current_transcript_id, $upd) = store_features($tracking_dbh, $prepare_hash, \%transcript, 'transcript',
                                                  $current_gene_id, 0, $user_id, $category_id);

  #relate to the first transcript already stored, or become that first one
  if($related_id){
    issue_relation($tracking_dbh, $current_transcript_id, $related_id);
  }
  else{
    $related_id = $current_transcript_id;
  }

}



# Look up a Havana gene through the loutre gene adaptor ($hga) and return the
# stable id of the first transcript in its transcript list.
# Returns "--" for the "." placeholder, for an unknown gene and for a gene
# without transcripts (the latter two with a warning).
sub get_transc_id {
  my ($hav_g_id)= @_;

  #placeholder used for "no havana gene"
  return "--" if $hav_g_id eq ".";

  my $hav_gene = $hga->fetch_by_stable_id($hav_g_id);
  unless($hav_gene){
    warn "Can't get gene $hav_g_id. (1)\n";
    return "--";
  }

  my $first_transcript = $hav_gene->get_all_Transcripts->[0];
  unless($first_transcript){
    warn "Can't get transcripts from gene $hav_g_id. (1)\n";
    return "--";
  }

  return $first_transcript->stable_id;
}


# Return the strand of a gene fetched through the Ensembl gene adaptor ($ga).
# If a comma-separated list is passed, only the first id is used.
# Returns "." (with a warning) if the gene cannot be fetched.
sub get_strand {
  my ($hav_g_id) = @_;

  #keep only the first id of a list
  my $first_id = ($hav_g_id =~ /^([\w]+),.+/) ? $1 : $hav_g_id;

  my $gene = $ga->fetch_by_stable_id($first_id);
  return $gene->strand if $gene;

  warn "Can't get gene $first_id. (2)\n";
  return ".";
}


# Return the display id of the first database entry named 'CCDS' attached to
# the translation of the given transcript object.
# Returns undef if the translation has no such entry, or if the transcript has
# no translation (in which case a message is printed to STDERR).
sub get_ccds_id {
  my ($transcript) = @_;

  my $not_found;   #intentionally left undefined
  my $tln = $transcript->translation;

  unless (defined $tln) {
    print STDERR "NO TRANSLATION DEFINED FOR ".$transcript->stable_id.".\n";
    return($not_found);
  }

  my $entries = $tln->get_all_DBEntries;
  for my $entry (@$entries) {
    next unless $entry->dbname eq 'CCDS';
    my $ccds_display_id = $entry->display_id;
    return($ccds_display_id);
  }

  return($not_found);
}

# Among a comma-separated list of transcript stable ids, find the transcript
# whose CCDS xref (see get_ccds_id) equals $ccds_id and return its stable id.
# Ids that cannot be fetched through the transcript adaptor ($ta) are reported
# on STDERR and skipped. Returns the placeholder '--' if nothing matches.
sub get_transcript_id_for_ccds_id {
  my ($transcript_ids, $ccds_id) = @_;

  my @candidate_ids = split(',', $transcript_ids);

  for my $candidate_id (@candidate_ids){
    my $candidate = $ta->fetch_by_stable_id($candidate_id);
    unless($candidate){
      print STDERR "Cant get $candidate_id ($transcript_ids) (4).\n";
      next;
    }

    my $xref = get_ccds_id($candidate);
    next unless $xref;
    return($candidate->stable_id) if $xref eq $ccds_id;
  }

  return('--');
}





1;

__END__

link: http://www.ncbi.nlm.nih.gov/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&DATA=CCDS4613

lines: 
104     chromosome:NCBI36:6:157140756:157572093 ENSG00000049618 CCDS5251.1      OTTHUMG00000015887,OTTHUMG00000015890   processed_transcript    havana_missing_ccds
105     chromosome:NCBI36:6:158991034:159105889 ENSG00000164674 CCDS34563.1     OTTHUMG00000015916      protein_coding  havana_missing_ccds
14      chromosome:NCBI36:3:49429216:49441761   ENSG00000145020,ENSG00000145029 CCDS2797.1,CCDS2798.1   .       .       no_havana_genes
30      chromosome:NCBI36:6:32057780:32111173   ENSG00000204342,ENSG00000204319 CCDS4734.1,CCDS34418.1  OTTHUMG00000031186,OTTHUMG00000031187   protein_coding  havana_missing_ccds


---------

fix strands:
grep "^CCDS" ccds.fix | grep "\-1" | awk '{print "UPDATE issues set Tstrand=\"-\" WHERE subject=\""$2"\";"}' > ccds.sql
grep "^CCDS" ccds.fix | grep "\-1" | awk '{print "UPDATE projects set Gstrand=\"-\" WHERE name=\""$2"\";"}' >> ccds.sql
perl -p -i -e "s/\./_/g" ccds.sql
