=head1 NAME

gencode_tracking_system::sources::havana_db

=head1 DESCRIPTION

Helper script for the GENCODE tracking system, to be run nightly.

Connects to the Otter database (MySQL server) holding the manual genome
annotation, fetches the data and stores or updates it in the internal
tracking system database.

=head1 CONTACT

Felix Kokocinski, fsk@sanger.ac.uk

=head1 COPYRIGHT

Copyright Felix Kokocinski, 2008-2010, 
supported by Wellcome Trust Sanger Institute (UK) 
and National Human Genome Research Institute (USA).

Released under GPL v2.

=cut

package gencode_tracking_system::sources::havana_db;

use strict;
use gencode_tracking_system::core;
use gencode_tracking_system::config;
use Bio::Otter::Lace::Defaults;
use Bio::Otter::MFetcher;
use Date::Format;
use base 'Exporter';

# run_update is the only sub exported to callers of this module.
our @EXPORT = qw( run_update );

# File-scoped counters: $g counts genes and $t counts transcripts. Both are
# incremented in create_objects and printed at the end of run_update.
my $g = 0;
my $t = 0;

# Debug switch read by run_update: when true, only chromosome "21" is
# processed and the slice is limited to a fixed coordinate range.
my $test = 0;

# Fetch the manually annotated genes for every visible Havana sequence set
# and store/update them in the tracking system via create_objects.
#
# Arguments:
#   $tracking_dbh, $prepare_hash, $user_id, $category_id
#                - handed through unchanged to create_objects
#   $chrom_lens  - accepted for interface compatibility, not used here
#
# Dies if no slice can be fetched for a sequence set.
# Prints one line per chromosome and a final summary of the number of
# genes and transcripts handled.
sub run_update {
  my ($tracking_dbh, $prepare_hash, $user_id, $category_id, $chrom_lens) = @_;

  my $dataset_name = "human";
  my $ncbi_version = "GRCh37";

  #the counters are file-scoped; reset them so that the summary below
  #describes this run only, even if run_update is called more than once
  $g = 0;
  $t = 0;

  my $cl         = Bio::Otter::Lace::Defaults::make_Client();
  my $dataset    = $cl->get_DataSet_by_name($dataset_name);
  my $loutre_dba = $dataset->get_cached_DBAdaptor;
  my $slice_Ad   = $loutre_dba->get_SliceAdaptor;
  my $gene_Ad    = $loutre_dba->get_GeneAdaptor;

  #map chromosome name => sequence set name; if several visible sets
  #belong to the same chromosome the one listed last wins
  my %sets;
  foreach my $sset ( @{ $dataset->get_all_visible_SequenceSets } ){
    my ($chrom) = ($sset->name) =~ /Chr(\d+|\w+)-.*/i;
    next unless $chrom;
    $sets{$chrom} = $sset->name;
  }

  foreach my $chromosome (sort _sortbychrnum keys %sets){
    print "CHROMOSOME $chromosome\n";

    if($test){ next unless $chromosome eq "21" }

    my $hav_chr_name = $sets{$chromosome};

    #test mode only looks at a small fixed region instead of the whole set
    my ($region_start, $region_end) = $test ? (30954891, 30985000) : (undef, undef);
    my $hav_slice = $slice_Ad->fetch_by_region('chromosome', $hav_chr_name,
                                               $region_start, $region_end, undef, 'Otter');
    die "\nCould not get slice for ".$hav_chr_name."\n\n" if(!$hav_slice);

    my $ncbi_genes = $gene_Ad->fetch_all_genes_on_reference_slice($chromosome, $hav_chr_name,
                                                                  $hav_slice, $ncbi_version);

    #treat a missing result like an empty gene list
    foreach my $gene (@{ $ncbi_genes || [] }) {
      #create objects and store in GT
      create_objects($gene, $chromosome, $prepare_hash, $tracking_dbh, $user_id, $category_id);
    }

  }
  print "\nFound $g genes and $t transcripts\n\n";
}


# Store one gene and its transcripts in the tracking system and, for every
# transcript that store_features reports as updated, replace the
# transcript's exon/UTR subfeatures.
#
# Arguments:
#   $gene         - gene object providing the accessors used below
#                   (get_all_Attributes, get_all_Transcripts, description, ...)
#   $chromosome   - value written to the 'chrom' field of every record
#   $prepare_hash, $tracking_dbh, $user_id, $category_id
#                 - handed through unchanged to store_features and
#                   remove_subfeatures
#
# Returns 0 without storing anything if the gene's "Name" attribute
# contains a ':', otherwise 1.
#
# Side effects: increments the file-scoped gene/transcript counters and
# prints diagnostics when $VERBOSE is set.
sub create_objects {
  my ($gene, $chromosome, $prepare_hash, $tracking_dbh, $user_id, $category_id) = @_;

  #collect name and synonyms from the gene attributes
  my $gene_name;
  my @gene_alias;
  my ($gene_atts) = $gene->get_all_Attributes();
  foreach my $att (@$gene_atts) {
    my $att_name = $att->name;
    if($att_name eq "Name"){
      $gene_name = $att->value;
    }
    elsif($att_name eq "Synonym"){
      push(@gene_alias, $att->value);
    }
  }

  #names containing ':' are not stored
  #NOTE(review): presumably a prefix marking annotation from another source -- confirm
  return 0 if($gene_name and $gene_name =~ /:/);

  my @gene_description;
  my $gene_description = $gene->description;
  push(@gene_description, 'Description: '.$gene_description) if($gene_description);
  push(@gene_description, 'Genename: '.$gene_name) if($gene_name);
  push(@gene_description, 'Genealias: '.(join(', ', @gene_alias))) if(scalar @gene_alias);

  #build structure for gene
  #(one assignment per field keeps every accessor in scalar context)
  my %gene;
  $gene{'chrom'}       = $chromosome;
  $gene{'biotype'}     = $gene->biotype;
  $gene{'status'}      = $gene->status;
  $gene{'type'}        = "gene";
  $gene{'start'}       = $gene->seq_region_start;
  $gene{'end'}         = $gene->seq_region_end;
  $gene{'strand'}      = $gene->strand;
  $gene{'created_on'}  = _format_timestamp($gene->created_date);
  $gene{'updated_on'}  = _format_timestamp($gene->modified_date);
  $gene{'description'} = join("\n", @gene_description);
  $gene{'id'}          = $gene->stable_id;
  $gene{'version'}     = $gene->version;

  #store/update entry for gene
  $g++;
  print_element(\%gene) if($VERBOSE);

  #the second return value (update flag) is not needed for genes
  my ($current_gene_id) = store_features($tracking_dbh, $prepare_hash, \%gene, 'gene',
                                         0, 0, $user_id, $category_id);

  #store transcripts
  TRANSCRIPT:
  foreach my $transcript (@{$gene->get_all_Transcripts}){
    my $transcript_name = "";
    my @transcript_alias;
    my @note;
    my ($atts) = $transcript->get_all_Attributes();
    foreach my $att (@$atts) {
      my $att_name = $att->name;
      if($att_name eq "Name"){
        $transcript_name = $att->value;
      }
      elsif($att_name eq "Synonym"){
        push(@transcript_alias, $att->value);
      }
      elsif($att_name =~ /\A(?:mRNA|CDS) (?:start|end) not found\z/){
        #these flags are only reported when set to a non-zero value
        if ( $att->value != 0 ) {
          push(@note, "Note:".$att_name);
        }
      }
      elsif ( $att->value ) {
        push(@note, "Note:".$att->value);
      }
    }

    #Skip only this transcript. Returning from the sub here would silently
    #drop all remaining transcripts of a gene that has already been stored,
    #making the result depend on the order of the transcripts.
    next TRANSCRIPT if($transcript_name and $transcript_name =~ /:/);

    my @transcript_description;
    push(@transcript_description, 'Genename: '.$gene_name) if($gene_name);
    push(@transcript_description, 'Transcriptname: '.$transcript_name) if($transcript_name);
    push(@transcript_description, 'Transcriptalias: '.(join(', ', @transcript_alias))) if(scalar @transcript_alias);
    my $transcript_description = $transcript->description;
    push(@transcript_description, 'Description: '.$transcript_description) if($transcript_description);
    push(@transcript_description, @note) if(scalar @note);

    #build structure for transcript
    my %transcript;
    $transcript{'chrom'}       = $chromosome;
    $transcript{'biotype'}     = $transcript->biotype;
    $transcript{'status'}      = $transcript->status;
    $transcript{'type'}        = "transcript";
    $transcript{'start'}       = $transcript->seq_region_start;
    $transcript{'end'}         = $transcript->seq_region_end;
    $transcript{'strand'}      = $transcript->strand;
    $transcript{'created_on'}  = _format_timestamp($transcript->created_date);
    $transcript{'updated_on'}  = _format_timestamp($transcript->modified_date);
    $transcript{'description'} = join("\n", @transcript_description);
    $transcript{'id'}          = $transcript->stable_id;
    $transcript{'version'}     = $transcript->version;
    $transcript{'parent'}      = $gene->stable_id;

    $t++;
    print_element(\%transcript) if($VERBOSE);

    my ($current_transcript_id, $updated_t) = store_features($tracking_dbh, $prepare_hash, \%transcript,
                                                             'transcript', $current_gene_id, 0,
                                                             $user_id, $category_id);

    #subfeatures are only rewritten for transcripts reported as updated
    next TRANSCRIPT unless $updated_t;

    print "updated transcript! Refreshing subfeatures for $current_transcript_id!\n" if $VERBOSE;
    remove_subfeatures($tracking_dbh, $prepare_hash, $current_transcript_id);

    my @subfeatures;
    foreach my $exon (@{$transcript->get_all_Exons}){
      push(@subfeatures, _exon_subfeatures($exon, $transcript, $chromosome));
    }

    foreach my $sub_element (sort {$a->{'start'} <=> $b->{'start'}} @subfeatures){
      print "storing_subfeature: ".$sub_element->{'type'}.", ".$sub_element->{'start'}
            ."-".$sub_element->{'end'}."\n" if $VERBOSE;
      store_features($tracking_dbh, $prepare_hash, $sub_element,
                     'subfeature', $current_gene_id, $current_transcript_id,
                     $user_id, $category_id);
    }
  }

  return 1;
}


# Format an epoch time as YYYY-MM-DDTHH:MM:SS followed by the numeric
# timezone offset (Date::Format's %z).
sub _format_timestamp {
  my ($epoch) = @_;

  return time2str("%Y-%m-%dT%H:%M:%S%z", $epoch);
}


# Turn one exon of a transcript into the list of subfeature records (hash
# refs) to store for it: a single "UTR" record for an exon without coding
# region, otherwise an "exon" record covering the coding part (with phase)
# plus a "UTR" record for each non-coding end of the exon.
sub _exon_subfeatures {
  my ($exon, $transcript, $chromosome) = @_;

  my %sub_element;
  $sub_element{'chrom'}      = $chromosome;
  $sub_element{'parent'}     = $transcript->stable_id;
  $sub_element{'created_on'} = _format_timestamp($exon->created_date);
  $sub_element{'updated_on'} = _format_timestamp($exon->modified_date);
  $sub_element{'start'}      = $exon->seq_region_start;
  $sub_element{'end'}        = $exon->seq_region_end;
  $sub_element{'strand'}     = $exon->seq_region_strand;
  $sub_element{'type'}       = "exon";

  my $coding_start = $exon->coding_region_start($transcript);
  my $coding_end   = $exon->coding_region_end($transcript);

  if($coding_start and $VERBOSE){ print "CONGS: ".$exon->seq_region_start." - $coding_start.\n"; }

  if ( !defined $coding_start ) {
    #non-coding exon
    $sub_element{'type'} = "UTR";
    return (\%sub_element);
  }

  my @parts;
  if ($coding_start > $exon->seq_region_start) {
    #separate UTR record for the part before the coding region
    my %utr_before = %sub_element;
    $utr_before{'type'}   = "UTR";
    $utr_before{'end'}    = $coding_start-1;
    $sub_element{'start'} = $coding_start;
    push(@parts, \%utr_before);
  }
  if ($coding_end < $exon->seq_region_end) {
    #separate UTR record for the part after the coding region
    my %utr_after = %sub_element;
    $utr_after{'type'}  = "UTR";
    $utr_after{'start'} = $coding_end+1;
    $sub_element{'end'} = $coding_end;
    push(@parts, \%utr_after);
  }

  #only the coding part carries a phase
  $sub_element{'phase'} = $exon->phase;
  push(@parts, \%sub_element);

  return @parts;
}


# Build a two-level lookup from the seq_region attributes 'equiv_asm' and
# 'chr' to the seq_region name, e.g.
#   $aeh->{'NCBI36'}{'11'} = 'chr11-02';
#
# Arguments:
#   $dba - database adaptor; only $dba->dbc()->prepare(...) is used
#
# Returns a hash reference (empty if the query yields no rows). If several
# rows share the same equiv_asm/chr pair, the row fetched last wins.
sub fetch_otter_asm_equiv_hash {
    my ($dba) = @_;

    my $sth = $dba->dbc()->prepare(qq{
        SELECT ae_val.value
          , cn_val.value
          , sr.name
        FROM seq_region sr
          , seq_region_attrib ae_val
          , seq_region_attrib cn_val
          , attrib_type ae_at
          , attrib_type cn_at
        WHERE sr.seq_region_id = ae_val.seq_region_id
          AND ae_val.attrib_type_id = ae_at.attrib_type_id
          AND ae_at.code = 'equiv_asm'
          AND sr.seq_region_id = cn_val.seq_region_id
          AND cn_val.attrib_type_id = cn_at.attrib_type_id
          AND cn_at.code = 'chr'
        });
    $sth->execute;

    my %region_name_for;
    # fetchrow_array is the documented DBI method; it returns an empty list
    # once all rows are consumed, which ends the loop.
    while ( my ($equiv_asm, $equiv_chr, $region_name) = $sth->fetchrow_array ) {
        $region_name_for{$equiv_asm}{$equiv_chr} = $region_name;
    }
    return \%region_name_for;
}


# Sort comparator for chromosome names, for use as "sort _sortbychrnum LIST".
# Compares the package variables $a and $b.
#
# A name is split at its first underscore into a chromosome part and an
# optional suffix; a leading "chr" (any case) is ignored. Order:
#   1. numeric chromosomes in numeric order (1, 2, ... 10, ...)
#   2. all other chromosomes in string order (MT, X, Y, ...)
#   3. within one chromosome, the bare name before names with a suffix,
#      suffixes in string order
# Returns 0 only for identical names, so the resulting order does not
# depend on the input order.
sub _sortbychrnum {

  #limit 2: keep everything after the first underscore as the suffix
  my ($achrom, $asuffix) = split /_/, $a, 2;
  my ($bchrom, $bsuffix) = split /_/, $b, 2;

  #split returns nothing for an empty name
  $achrom = '' unless defined $achrom;
  $bchrom = '' unless defined $bchrom;

  $achrom =~ s/\Achr//i;
  $bchrom =~ s/\Achr//i;

  #require at least one digit so that an empty name is not used as a number
  my $a_is_num = ($achrom =~ /\A[0-9]+\z/) ? 1 : 0;
  my $b_is_num = ($bchrom =~ /\A[0-9]+\z/) ? 1 : 0;

  #numbered chromosomes sort before named ones
  return $b_is_num <=> $a_is_num if ($a_is_num != $b_is_num);

  my $order = $a_is_num ? ($achrom <=> $bchrom) : ($achrom cmp $bchrom);
  return $order if $order;

  #same chromosome: the name without suffix comes first
  if (!defined $asuffix) {
    #final tie-break on the raw names, 0 for identical names
    return defined $bsuffix ? -1 : ($a cmp $b);
  }
  return 1 unless defined $bsuffix;

  return ($asuffix cmp $bsuffix) || ($a cmp $b);
}


1;

__END__


