=head1 NAME

gencode_tracking_system::sources::missing_ccds

=head1 DESCRIPTION

Helper script for the GENCODE tracking system, to be run nightly.

Reads a tab-delimited file and stores its entries in the internal tracking system db.
Loads entries for CCDS models found in Ensembl but missing in Loutre.
Uses Ensembl to look up ids.

=head1 CONTACT

Felix Kokocinski, fsk@sanger.ac.uk

=head1 COPYRIGHT

Copyright Felix Kokocinski, 2008-2010, 
supported by Wellcome Trust Sanger Institute (UK) 
and National Human Genome Research Institute (USA).

Released under GPL v2.

=cut

package gencode_tracking_system::sources::missing_ccds;

use strict;
use gencode_tracking_system::core;
use gencode_tracking_system::config;
use base 'Exporter';

#only run_update is exported, the other subs are called from within this file
our @EXPORT    = qw( run_update );

#CCDS ids already handled by create_entry; never cleared in this file,
#so every id is processed at most once
my %ccds_ids;
#file-scoped state shared between the subs: most of it is initialised in run_update,
#$upd receives the second return value of store_features in create_entry
my ($tracking_dbh, $prepare_hash, $user_id, $category_id, $ens_db, $ga, $ta, $upd, $hga, $hta, $current_gene_id);

#test switch: when true, run_update stops reading once more than 20 rows were processed
my $test = 0;

#Entry point of this source module: read the configured tab-delimited file
#and hand every usable CCDS id to create_entry.
#
#Arguments:
#  $tracking_dbhh  tracking system db handle (kept in $tracking_dbh)
#  $prepare_hashh  passed on to get_project_for_category and store_features
#  $user_idd       user id passed on to store_features
#  $category_idd   category id used for the project lookup and for storing
#  $das_source     key into %OTHER_SERVERS; its 'file' and 'category' entries are read
#
#Returns the number of input rows that passed the filters (a row listing
#several CCDS ids counts once).
#Dies if no input file is configured, the file can't be opened or no
#general project can be found/created for the category.
sub run_update {
  my ($tracking_dbhh, $prepare_hashh, $user_idd, $category_idd, $das_source) = @_;

  my $file        = $OTHER_SERVERS{$das_source}->{'file'};
  #fail before any db work is done if the source has no input file configured
  die "No input file configured for source $das_source!\n" unless(defined $file and length $file);
  print STDERR "Using file $file\n";

  my $features    = 0;
  $tracking_dbh   = $tracking_dbhh;
  $prepare_hash   = $prepare_hashh;
  $user_id        = $user_idd;
  $category_id    = $category_idd;
  $ens_db         = connect_ensembl($ENS_HOST, $ENS_PORT, $ENS_NAME, $ENS_USER, $ENS_PASS);
  $ga             = $ens_db->get_GeneAdaptor;
  $ta             = $ens_db->get_TranscriptAdaptor;

  #get general project instead:
  #all entries of this source are stored under one project, create it if needed
  $current_gene_id = get_project_for_category($prepare_hash, $tracking_dbh, $category_id);
  if(!$current_gene_id){
    $current_gene_id = create_general_project($tracking_dbhh, $das_source);
  }
  die "Can't find general project for category $category_idd!\n" unless $current_gene_id;

  my $missing_type = $OTHER_SERVERS{$das_source}->{'category'};

  #second connection, used by get_gene_id / get_transc_id to resolve Havana ids
  my $h_db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
                                                 -host    => $HAV_HOST,
                                                 -user    => $HAV_USER,
                                                 -pass    => $HAV_PASS,
                                                 -port    => $HAV_PORT,
                                                 -dbname  => $HAV_NAME,
                                                );
  $hga = $h_db->get_GeneAdaptor;
  $hta = $h_db->get_TranscriptAdaptor;


  #open data file
  #three-argument open: the file name can never be mistaken for an open mode
  open(my $in, '<', $file) or die "Can't open file $file: $!\n";

  #go through file, read and store data as genes & transcripts
  while (my $line = <$in>){
    chomp $line;
    #ignore non-data lines, data rows start with the running number
    next unless $line =~ /^\d/;

    my ($num, $loc, $int_ensembl_ids_old, $ccds_ids, $havana_ids, $havana_biotype,
        $missing_label, $tag2, $clonename, $int_ensembl_ids) = split(/\t/, $line);

    #only use official / finished CCDS entries
    #(short rows leave the trailing columns undefined)
    next unless(defined $tag2 and $tag2 eq "Public");

    #ignore correct annotation
    next if($missing_label eq "match_found");

    $clonename = "" if(!$clonename);

    # "loc" is the CLUSTER location only...

    #a row can list several comma-separated CCDS ids, each gets its own entry;
    #a column without comma is passed on unchanged
    my @row_ccds_ids = ($ccds_ids =~ /,/) ? split(/,/, $ccds_ids) : ($ccds_ids);
    foreach my $ccds_id (@row_ccds_ids){
      create_entry($havana_ids, $int_ensembl_ids, $ccds_id, $missing_type, $missing_label, $clonename);
    }

    $features++;

    if($test and $features > 20){ last }

  }

  close($in);
  print STDERR "WE FOUND ".(scalar keys %ccds_ids)." CCDS IDS.\n";

  return $features;

}


#Look up one CCDS id in Ensembl and store it as a transcript entry of the
#general project ($current_gene_id) in the tracking system.
#
#Arguments:
#  $havana_ids       comma-separated Havana ids of the input row ("." = none)
#  $int_ensembl_ids  not used here
#  $ccds_id          CCDS id to look up
#  $missing_type     not used here
#  $missing_label    note of the input row, copied into the description
#  $clonename        clone name of the input row, copied into the description
#
#Returns 1 if the id was skipped (seen before or not resolvable in Ensembl),
#0 after store_features was called.
sub create_entry {
  my ($havana_ids, $int_ensembl_ids, $ccds_id, $missing_type, $missing_label, $clonename) = @_;

  #columns missing from a short input row arrive undefined
  $havana_ids = "" unless defined $havana_ids;
  $clonename  = "" unless defined $clonename;

  #keep track of ids
  if(exists $ccds_ids{$ccds_id}){
    print STDERR "SEEN $ccds_id!\n";
    return 1;
  }
  $ccds_ids{$ccds_id} = 1;

  #fetch ccds entry
  my $ccds_transcripts = $ta->fetch_all_by_external_name($ccds_id);
  if(!$ccds_transcripts or !@$ccds_transcripts){
    warn "can't get entry for $ccds_id!\n";
    return 1;
  }

  my $ccds_transcript;
  if(scalar @$ccds_transcripts > 1){
    #several hits: use the one that really carries this CCDS xref
    warn "Multiple entries for $ccds_id!\n";
    $ccds_transcript = get_transcript($ccds_transcripts, $ccds_id);
  }
  else{
    $ccds_transcript = $ccds_transcripts->[0];
  }
  if(!$ccds_transcript){
    warn "can't get entry for $ccds_id!\n";
    return 1;
  }
  print STDERR "HAVE CCDS $ccds_id, ".$ccds_transcript->stable_id."\n";

  #get the location data from Ensembl
  #(the Ensembl gene is not looked up any more: its id was never used and a
  # failed lookup aborted the whole run)
  my $chrom  = $ccds_transcript->seq_region_name;
  my $start  = $ccds_transcript->seq_region_start;
  my $end    = $ccds_transcript->seq_region_end;
  my $strand = $ccds_transcript->strand;
  my $e_transcript_id = $ccds_transcript->stable_id;


  #look at the core annotation ids
  #only use the first one
  #'--' stands for "no id", also for ids matching neither OTT pattern
  my ($hav_id) = split(',', $havana_ids);
  my ($hav_t_id, $hav_g_id) = ('--', '--');
  if($hav_id){
    if($hav_id =~ /^OTT...T.+/){
      $hav_t_id = $hav_id;
      $hav_g_id = get_gene_id($hav_t_id) || '--';
    }
    elsif($hav_id =~ /^OTT...G.+/){
      $hav_g_id = $hav_id;
      $hav_t_id = get_transc_id($hav_g_id) || '--';
    }
  }

  #add space
  #and replace the "." placeholder; only elements that are exactly "." are
  #touched, dots inside an id are left alone
  $havana_ids = join(', ', map { $_ eq '.' ? '--' : $_ } split(/,/, $havana_ids, -1));

  $clonename =~ s/^clone\:\://;
  $clonename =~ s/\:$//;

  my $description = "CCDS: ".$ccds_id.
    "\nENSEMBL-TRANSCRIPT-ID: ".$e_transcript_id;
  if($hav_g_id ne '--'){
    $description .= "\nHAVANA-GENE-IDS: ".$havana_ids;
  }
  if($hav_t_id ne '--'){
    $description .= "\nHAVANA-TRANSCRIPT-IDS: ".$hav_t_id;
  }
  $description .= "\nNOTE: HAVANA-IDs supplied for orientation only, please use location data.".
                  "\nNOTE: ".$missing_label;
  if($clonename ne "clone_name_unknown"){
    $description .= "\nCLONE: ".$clonename;
  }

  #build structure for AnnoTrack transcript
  my %transcript = (
    'chrom'       => $chrom,
    'type'        => 'transcript',
    'start'       => $start,
    'end'         => $end,
    'strand'      => $strand,
    'id'          => $ccds_id,
    'description' => $description,
    #store as transcript, the CCDS id doubles as parent name
    'parent'      => "".$ccds_id,  #???
  );

  if($VERBOSE){
    print "TRANSCRIPT:\n";
    print_element(\%transcript);
  }

  #the id returned first is not needed here; the second value is kept in the
  #file-level $upd
  my $current_transcript_id = 0;
  ($current_transcript_id, $upd) = store_features($tracking_dbh, $prepare_hash, \%transcript, 'transcript',
                                                  $current_gene_id, 0, $user_id, $category_id);

  return 0;
}



#Resolve a Havana gene id to the stable id of the first transcript that the
#Havana gene adaptor ($hga) lists for this gene.
#Returns "--" for the "." placeholder and whenever the lookup fails.
sub get_transc_id {
  my ($hav_g_id) = @_;

  #"." is passed through as "--" without touching the db
  return "--" if $hav_g_id eq ".";

  my $hav_gene = $hga->fetch_by_stable_id($hav_g_id);
  unless($hav_gene){
    warn "Can't get gene $hav_g_id. (1)\n";
    return "--";
  }

  #only the first transcript of the gene is reported
  my $gene_transcripts = $hav_gene->get_all_Transcripts;
  my $first_transcript = $gene_transcripts->[0];
  unless($first_transcript){
    warn "Can't get transcripts from gene $hav_g_id. (1)\n";
    return "--";
  }

  return $first_transcript->stable_id;
}

#Resolve a Havana transcript id to the stable id of its gene, using the
#Havana gene adaptor ($hga).
#Returns "--" for the "." placeholder and when no gene is found.
sub get_gene_id {
  my ($hav_t_id) = @_;

  #"." is passed through as "--" without touching the db
  return "--" if $hav_t_id eq ".";

  my $parent_gene = $hga->fetch_by_transcript_stable_id($hav_t_id);
  return $parent_gene->stable_id if $parent_gene;

  warn "Can't get gene for $hav_t_id. (1)\n";
  return "--";
}


#Find the CCDS id attached to a transcript.
#The xrefs of the transcript itself are checked first, then those of its
#translation (if there is one). The display id of the first xref whose
#dbname is 'CCDS' is returned, undef if there is none.
sub get_ccds_id {
  my ($transcript) = @_;

  #scan one xref list; returns (found-flag, display id of the first CCDS xref)
  my $first_ccds = sub {
    my ($xrefs) = @_;
    for my $xref (@$xrefs) {
      next unless $xref->dbname eq 'CCDS';
      my $display_id = $xref->display_id;
      return (1, $display_id);
    }
    return (0, undef);
  };

  my $transcript_xrefs = $transcript->get_all_DBEntries;
  my ($found, $ccds_xref) = $first_ccds->($transcript_xrefs);
  return($ccds_xref) if $found;

  #nothing on the transcript, try the translation
  my $translation = $transcript->translation;
  return(undef) unless defined $translation;

  my $translation_xrefs = $translation->get_all_DBEntries;
  ($found, $ccds_xref) = $first_ccds->($translation_xrefs);

  return($ccds_xref);
}


#Pick, from a list of transcripts, the first one whose CCDS xref (as found by
#get_ccds_id) equals the given CCDS id.
#Returns the transcript, or undef if none matches.
sub get_transcript {
  my ($transcripts, $ccds_id) = @_;

  for my $candidate (@$transcripts){
    my $candidate_ccds = get_ccds_id($candidate);
    #transcripts without a (true) CCDS id can't match
    next unless $candidate_ccds;
    next unless $candidate_ccds eq $ccds_id;
    return($candidate);
  }

  return(undef);
}





1;

__END__

link: http://www.ncbi.nlm.nih.gov/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&DATA=CCDS4613

lines: 
Count   Genomic_location        Ensembl_Gene_stable_ids CCDS_ids        Havana_Gene_stable_ids  Havana_biotype  Note    ccds_status
1       chromosome:GRCh37:11:133711934:133715341:1      ENST00000299140 CCDS8493.1      .       .       no_havana_genes Public
2       chromosome:GRCh37:11:133788934:133826649:1      ENST00000321016 CCDS44779.1     .       .       no_havana_genes Public
3       chromosome:GRCh37:11:132290087:133402219:1      ENST00000374778,ENST00000331898 CCDS31722.1,CCDS8492.1  .       .       no_havana_genes Public


---------

fix strands:
grep "^CCDS" ccds.fix | grep "\-1" | awk '{print "UPDATE issues set Tstrand=\"-\" WHERE subject=\""$2"\";"}' > ccds.sql
grep "^CCDS" ccds.fix | grep "\-1" | awk '{print "UPDATE projects set Gstrand=\"-\" WHERE name=\""$2"\";"}' >> ccds.sql
perl -p -i -e "s/\./_/g" ccds.sql


#use 1 project for all existing CCDS entries
projects.id=440200
delete pt, cv from projects p, issues i, projects_trackers pt, custom_values cv where p.id=i.project_id and i.category_id=19 and p.id=pt.project_id and p.id=cv.id and cv.customized_type="Project";
delete p from projects p, issues i where p.id=i.project_id and i.category_id=19;
update issues set project_id=440200 where category_id=19;


