#!/usr/local/ensembl/bin/perl -w

# Helper script for the GENCODE tracking system.
#
# Read tab-delimited file, store in internal tracking system db
#
# LOAD ENTRIES FOR HGNC ENTRIES FOUND IN ENSEMBL BUT MISSING IN LOUTRE
#

package gencode_tracking_system::sources::missing_hgnc;

use strict;
use gencode_tracking_system::core;
use gencode_tracking_system::config;
use base 'Exporter';

our @EXPORT    = qw( run_update );

# Identifiers already handled by create_entry() in this process; create_entry()
# consults and fills it to skip duplicates, run_update() reports its size.
# NOTE(review): despite the name, run_update() passes the Ensembl HGNC symbol
# as the key - confirm the naming is just historical.
my %ccds_ids;
# Shared state: everything except $upd is filled in by run_update() and read
# by the helper subs. $ga is the gene adaptor of the Ensembl connection
# ($ens_db), $hga the gene adaptor of the loutre connection; $upd receives the
# second return value of store_features() in create_entry().
my ($tracking_dbh, $prepare_hash, $user_id, $category_id, $ens_db, $ga, $upd, $hga);

# run_update($tracking_dbh, $prepare_hash, $user_id, $category_id)
#
# Entry point of this source module. Keeps the four arguments in the
# file-scoped variables used by the helper subs, connects to the Ensembl and
# loutre databases, then reads the tab-delimited report and calls
# create_entry() for every row where Ensembl has an HGNC symbol but the
# Havana gene does not.
#
# Row handling:
#   - rows not starting with a digit are ignored (headers, comments)
#   - rows listing several Havana gene ids are logged to STDERR and skipped
#   - rows without a Havana gene (".") are only logged to STDERR
#
# Returns the number of rows for which create_entry() was called.
# NOTE(review): that count includes rows create_entry() skips as already
# seen, so it can be larger than the number printed in the final STDERR
# message - confirm which of the two the caller expects.
#
# Dies if the input file cannot be opened.
sub run_update {
  my ($tracking_dbhh, $prepare_hashh, $user_idd, $category_idd) = @_;

  my $file        = "/nfs/th_group/fsk/data/encode_data/ensembl/e56/unannotated_HGNCs.das";
  my $features    = 0;
  $tracking_dbh   = $tracking_dbhh;
  $prepare_hash   = $prepare_hashh;
  $user_id        = $user_idd;
  $category_id    = $category_idd;
  $ens_db         = connect_ensembl($ENS_HOST, $ENS_PORT, $ENS_NAME, $ENS_USER, $ENS_PASS);
  $ga             = $ens_db->get_GeneAdaptor;
  my $hdbhost     = "otterlive";
  my $hdbport     = 3301;
  my $hdbname     = "loutre_human";
  my $hdbuser     = "ottro";
  my $hdbpass;
  my $flag_name   = 'missing_hgnc';

  my $h_db        = connect_ensembl($hdbhost, $hdbport, $hdbname, $hdbuser, $hdbpass);
  $hga            = $h_db->get_GeneAdaptor;

  #open data file: three-argument open with a lexical handle, so the path is
  #never parsed for a mode and no global bareword handle is left behind;
  #the OS error is included in the message
  open(my $in_fh, '<', $file) or die "cant open file $file: $!\n";

  #go through file, read and store data as genes & transcripts
  while (my $line = <$in_fh>) {
    chomp $line;
    #ignore non-data lines
    next unless $line =~ /^\d/;

    #columns: 0 count, 1 location, 2 ensembl gene id, 3 ensembl hgnc,
    #4 ensembl ccds, 5 havana gene id, 6 havana hgnc, 7 havana ccds, 8 synonym
    #(only the columns that are actually used are unpacked)
    my ($loc, $ensembl_id, $ensembl_hgnc, $havana_ids, $havana_hgnc) =
      (split("\t", $line))[1, 2, 3, 5, 6];

    #short rows leave trailing fields undefined; treat them as empty strings
    #so the comparisons below behave as before without uninitialized warnings
    foreach my $field ($loc, $ensembl_id, $ensembl_hgnc, $havana_ids, $havana_hgnc) {
      $field = '' unless defined $field;
    }

    #location looks like <x>:<assembly>:<chrom>:<start>:<end>:<strand>;
    #the first two parts are not needed
    my (undef, undef, $chrom, $start, $end, $strand) = split(":", $loc);

    if ($havana_ids =~ /\,/) {
      print STDERR "MultVals: $line\n";
      next;
    }

    if ($havana_ids eq ".") {
      #missing locus
      print STDERR "MISSING HAVANA GENE.\n";
    }
    elsif (($ensembl_hgnc ne ".") and ($havana_hgnc eq ".")) {
      #create entry
      create_entry($chrom, $start, $end, $strand, $havana_ids, $ensembl_id, $ensembl_hgnc, $flag_name);

      $features++;
    }
  }

  close($in_fh);

  print STDERR "WE FOUND ".(scalar keys %ccds_ids)." FEATURES.\n";

  return $features;

}


# create_entry($chrom, $start, $end, $strand, $havana_ids, $int_ensembl_ids,
#              $ccds_id, $flag_name)
#
# Stores one gene record and one transcript record (child of that gene) in
# the tracking database via store_features(). Both records share the same
# coordinates, id ("HGNC_<identifier>") and description.
#
#   $chrom, $start, $end  location of the entry
#   $strand               ignored; replaced by the strand looked up through
#                         get_strand() for $int_ensembl_ids
#   $havana_ids           Havana gene id(s); the first one is used to look
#                         up a transcript id for the description
#   $int_ensembl_ids      Ensembl gene id(s)
#   $ccds_id              identifier the entry is keyed on (the caller in
#                         this file passes the Ensembl HGNC symbol)
#   $flag_name            accepted but not used here
#
# An identifier that was already processed is reported with "SEEN!" on
# STDERR and nothing is stored. No meaningful return value.
sub create_entry {
  my ($chrom, $start, $end, $strand, $havana_ids, $int_ensembl_ids, $ccds_id, $flag_name) = @_;

  #keep track of ids: every identifier is handled at most once
  if (defined $ccds_ids{$ccds_id}) {
    print STDERR "SEEN!\n";
    return;
  }
  $ccds_ids{$ccds_id} = 1;

  #the strand always comes from the Ensembl gene, not from the caller
  $strand = get_strand($int_ensembl_ids);

  #only the first Havana gene is used for the transcript lookup
  my ($first_havana_gene) = split(',', $havana_ids);
  my $havana_transcript   = get_transc_id($first_havana_gene);

  #add space after each comma to make the id lists readable
  (my $havana_list  = $havana_ids)      =~ s/,/, /g;
  (my $ensembl_list = $int_ensembl_ids) =~ s/,/, /g;

  my $description = join("\n",
    "HGNC-IDS: ".$ccds_id,
    "ENSEMBL-IDS: ".$ensembl_list,
    "HAVANA-GENE-IDS: ".$havana_list,
    "HAVANA-TRANSCRIPT-IDS: ".$havana_transcript,
    "NOTE: HAVANA-IDs supplied for orientation only.");

  #build structure for transcript
  #NOTE(review): the transcript record carries type "gene" while the gene
  #copy below is switched to "transcript" - this looks inverted; confirm
  #against what store_features() expects before changing it
  my %transcript = (
    'chrom'       => $chrom,
    'type'        => "gene",
    'start'       => $start,
    'end'         => $end,
    'strand'      => $strand,
    'id'          => "HGNC_".$ccds_id,
    'description' => $description,
  );

  print "HGNC\n";

  #clone gene from transcript; the verbose dump happens before its type
  #is changed
  my %gene = %transcript;
  if ($VERBOSE) {
    print "GENE:\n";
    print_element(\%gene);
  }
  $gene{'type'} = "transcript";

  #store as gene
  my ($gene_db_id, $transcript_db_id);
  ($gene_db_id, $upd) = store_features(
    $tracking_dbh, $prepare_hash, \%gene, 'gene',
    0, 0, $user_id, $category_id);

  #store as transcript, attached to the gene that was just stored
  $transcript{'parent'} = $transcript{'id'};
  if ($VERBOSE) {
    print "TRANSCRIPT:\n";
    print_element(\%transcript);
  }
  ($transcript_db_id, $upd) = store_features(
    $tracking_dbh, $prepare_hash, \%transcript, 'transcript',
    $gene_db_id, 0, $user_id, $category_id);

}


# get_transc_id($hav_g_id)
#
# Looks up the gene with the given stable id through the loutre gene adaptor
# ($hga) and returns the stable id of the first transcript in the list the
# gene reports. Returns the placeholder "--" when the id is ".", or - after
# a warning - when the gene or its first transcript cannot be obtained.
sub get_transc_id {
  my ($gene_stable_id) = @_;

  #"." marks a missing id in the input file
  return "--" if $gene_stable_id eq ".";

  my $gene = $hga->fetch_by_stable_id($gene_stable_id);
  unless ($gene) {
    warn "Can't get gene $gene_stable_id. (1)\n";
    return "--";
  }

  my $first_transcript = $gene->get_all_Transcripts->[0];
  unless ($first_transcript) {
    warn "Can't get transcripts from gene $gene_stable_id. (1)\n";
    return "--";
  }

  return $first_transcript->stable_id;
}


# get_strand($ids)
#
# Returns the strand of the gene fetched through the Ensembl gene adaptor
# ($ga). When the argument is a comma-separated list, only the leading id is
# looked up. If the gene cannot be fetched a warning is issued and 1 is
# returned as the fallback.
sub get_strand {
  my ($ensembl_ids) = @_;

  #reduce "ID1,ID2,..." to its first element; a single id is used as is
  my $stable_id = ($ensembl_ids =~ /^([\w]+),.+/) ? $1 : $ensembl_ids;

  my $gene = $ga->fetch_by_stable_id($stable_id);
  unless ($gene) {
    warn "Can't get gene $stable_id. (2)\n";
    return 1;
  }

  return $gene->strand;
}



1;

__END__

Column layout of the input file (tab-delimited, one record per line):

Count   Genomic_location        Ensembl_Gene_stable_id  Ensembl_HGNC_symbol     Ensembl_CCDS    Havana_Gene_stable_id   Havana_HGNC_symbol      Havana_CCDS     Synonym_display_name

