#!/usr/local/ensembl/bin/perl -w

# Helper script for the GENCODE tracking system.
#
# Read tab-delimited file, store in internal tracking system db
#
# LOAD ENTRIES FOR CCDS ENTRIES FOUND IN ENSEMBL BUT MISSING IN LOUTRE
#

package gencode_tracking_system::sources::missing_ccds;

use strict;
use gencode_tracking_system::core;
use gencode_tracking_system::config;
use base 'Exporter';

# run_update is the only sub exported by this package.
our @EXPORT    = qw( run_update );

# CCDS ids already handed to create_entry (key: CCDS id, value: 1);
# create_entry consults it to skip ids it has seen before.
my %ccds_ids;
# State shared by the subs in this file; every one of these is assigned at the
# start of run_update ($ga is the gene adaptor obtained from $ens_db).
my ($tracking_dbh, $prepare_hash, $user_id, $category_id, $ens_db, $ga);

# Read the unannotated-CCDS report file and pass every CCDS id it lists to
# create_entry.
#
# Arguments:
#   $tracking_dbhh, $prepare_hashh, $user_idd, $category_idd
#     Copied into the file-level variables $tracking_dbh, $prepare_hash,
#     $user_id and $category_id so that the helper subs can reach them
#     (presumably the tracking db handle, its prepared statements and the
#     ids to store under - confirm against the caller).
#
# Returns: the number of data lines read from the file (not the number of
#          entries created; a line can list several CCDS ids).
# Dies if the report file cannot be opened.
sub run_update {
  my ($tracking_dbhh, $prepare_hashh, $user_idd, $category_idd) = @_;

  my $file        = "/nfs/acari/fsk/2_encode/data/ensembl/unannotated_CCDS.das";
  my $features    = 0;
  $tracking_dbh   = $tracking_dbhh;
  $prepare_hash   = $prepare_hashh;
  $user_id        = $user_idd;
  $category_id    = $category_idd;
  $ens_db         = connect_ensembl();
  $ga             = $ens_db->get_GeneAdaptor;

  #open data file (3-arg open, lexical handle; report the OS error on failure)
  open(my $in, '<', $file) or die "cant open file $file: $!\n";

  #go through file, read and store data as genes & transcripts
  while (my $line = <$in>){
    chomp $line;
    #ignore non-data lines (data lines start with a number)
    next if($line !~ /^\d/);

    my ($num, $loc, $int_ensembl_ids, $ccds_ids, $havana_ids, $havana_biotype, $missing_type) = split("\t", $line);

    #location looks like chromosome:NCBI36:6:157140756:157572093[:strand]
    my ($c, $ass, $chrom, $start, $end, $strand) = split(":", $loc);

    if($ccds_ids =~ /,/){
      foreach my $ccds_id (split(",", $ccds_ids)){

        #Resolve the gene for THIS ccds id from the full list of candidates.
        #The list itself must stay untouched: overwriting it would make all
        #following ccds ids of the line inherit the first id's gene.
        my $ensembl_ids_for_ccds = $int_ensembl_ids;
        if($int_ensembl_ids =~ /,/){
          my $use_id = check_ccds_id($int_ensembl_ids, $ccds_id);
          if($use_id){
            $ensembl_ids_for_ccds = $use_id;
          }
        }
        create_entry($chrom, $start, $end, $strand, $havana_ids, $ensembl_ids_for_ccds, $ccds_id, $missing_type);
      }

    }
    else{
      create_entry($chrom, $start, $end, $strand, $havana_ids, $int_ensembl_ids, $ccds_ids, $missing_type);
    }

    $features++;

  }

  close($in);

  print STDERR "WE FOUND ".(scalar keys %ccds_ids)." CCDS IDS.\n";

  return $features;

}


# Build the gene and transcript records for one CCDS id.
#
# Arguments: $chrom, $start, $end, $strand  - location of the feature
#            $havana_ids                    - comma-separated Havana gene ids
#            $int_ensembl_ids               - Ensembl gene id(s)
#            $ccds_id                       - the CCDS id, used as the record id
#            $missing_type                  - stored as the transcript's type
#
# A CCDS id is handled only once: repeats are reported on STDERR and skipped.
# Storing is currently switched off (the store_features calls are commented
# out), so apart from registering the id in %ccds_ids this only prints the
# records when $VERBOSE is set.
sub create_entry {
  my ($chrom, $start, $end, $strand, $havana_ids, $int_ensembl_ids, $ccds_id, $missing_type) = @_;

  #skip ids that were handled before, remember new ones
  if (defined $ccds_ids{$ccds_id}) {
    print STDERR "SEEN!\n";
    return;
  }
  $ccds_ids{$ccds_id} = 1;

  #only referenced by the disabled store_features calls below
  my $current_gene_id;
  my $current_transcript_id = 0;

  #put a space behind every comma of the Havana id list
  (my $spaced_havana_ids = $havana_ids) =~ s/,/, /g;

  my $description = "CCDS: $ccds_id\nENSEMBL-IDS: $int_ensembl_ids\nHAVANA-GENE-IDS: $spaced_havana_ids";

  #transcript record
  my %transcript = (
    'chrom'       => $chrom,
    'biotype'     => undef,
    'status'      => undef,
    'type'        => $missing_type,
    'start'       => $start,
    'end'         => $end,
    'strand'      => $strand,
    'id'          => $ccds_id,
    'description' => $description,
  );

  #the gene record starts out as a copy of the transcript record
  my %gene = %transcript;

  #gene: printed first, its type is switched to "gene" afterwards
  if ($VERBOSE) {
    print STDERR "GENE:\n";
    print_element(\%gene);
  }
  $gene{'type'} = "gene";
#  $current_gene_id = store_features($tracking_dbh, $prepare_hash, \%gene, 'gene',
#				    0, 0, $user_id, $category_id);

  #transcript
  if ($VERBOSE) {
    print STDERR "TRANSCRIPT:\n";
    print_element(\%transcript);
  }
#  $current_transcript_id = store_features($tracking_dbh, $prepare_hash, \%transcript, 'transcript',
#					  $current_gene_id, 0, $user_id, $category_id);

}


# Find out which of several Ensembl genes carries a given CCDS id.
#
# Arguments: $gene_ids - comma-separated Ensembl gene stable ids
#            $ccds_id  - CCDS id to look for
#
# Every gene is fetched through the file-level gene adaptor $ga, and the
# DBEntries of each of its translations are searched for one with dbname
# 'CCDS' whose display_id equals $ccds_id.
#
# Returns: the stable id of the first gene with such an entry, or 0 if none
#          of the genes has one.
sub check_ccds_id {
  my ($gene_ids, $ccds_id) = @_;

  foreach my $gene_id (split(',', $gene_ids)){
    #fetch_by_stable_id of the Ensembl GeneAdaptor hands back a single gene
    #object (or undef); a list reference is accepted as well, in case $ga is
    #an adaptor that returns one
    my $fetched = $ga->fetch_by_stable_id($gene_id);
    my $gene    = (ref($fetched) eq 'ARRAY') ? $fetched->[0] : $fetched;

    #an unknown stable id must not abort the whole update
    if(!defined $gene){
      print STDERR "No gene found for stable id ".$gene_id."\n";
      next;
    }

    foreach my $transcript (@{$gene->get_all_Transcripts}) {
      #only transcripts with a translation can carry the xref
      my $translation = $transcript->translation;
      next if(!defined $translation);

      foreach my $dbe (@{$translation->get_all_DBEntries}) {
        next if($dbe->dbname ne 'CCDS');

        my $ccds_xref = $dbe->display_id;
        print STDERR "Translation ".$translation->stable_id.
                     " has DBEntry with display_id ".$ccds_xref.
                     " from dbname ".$dbe->dbname."\n" if $VERBOSE;

        if($ccds_xref eq $ccds_id){
          return $gene_id;
        }
      }
    }
  }

  return 0;
}

1;

__END__

link: http://www.ncbi.nlm.nih.gov/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&DATA=CCDS4613

lines: 
104     chromosome:NCBI36:6:157140756:157572093 ENSG00000049618 CCDS5251.1      OTTHUMG00000015887,OTTHUMG00000015890   processed_transcript    havana_missing_ccds
105     chromosome:NCBI36:6:158991034:159105889 ENSG00000164674 CCDS34563.1     OTTHUMG00000015916      protein_coding  havana_missing_ccds
14      chromosome:NCBI36:3:49429216:49441761   ENSG00000145020,ENSG00000145029 CCDS2797.1,CCDS2798.1   .       .       no_havana_genes
30      chromosome:NCBI36:6:32057780:32111173   ENSG00000204342,ENSG00000204319 CCDS4734.1,CCDS34418.1  OTTHUMG00000031186,OTTHUMG00000031187   protein_coding  havana_missing_ccds
