#!/usr/local/ensembl/bin/perl -w

# Helper script for the GENCODE tracking system.
#
# sub-project Zfish
# Read tab-delimited file, store in internal tracking system db
#

package gencode_tracking_system::sources::zfishlist;

use strict;
use gencode_tracking_system::core;
use gencode_tracking_system::config;
use base 'Exporter';

our @EXPORT    = qw( run_update );

# File-scoped working state used by run_update(). The hashes are keyed by
# the gene name read from the input file:
#   %genes    - genes seen so far; the value is used as the description of
#               the gene's transcript record
#   %exons    - array ref of exon records collected for the gene
#   %starts   - smallest exon start seen for the gene
#   %ends     - largest exon end seen for the gene
#   %chroms   - chromosome taken from the exon line that introduced the gene
#   %strands  - strand taken from the exon line that introduced the gene
#   %problems - issue text taken from the exon line that introduced the gene
my (%genes, %exons, %starts, %ends, %chroms, %strands, %problems);
# number of exon lines collected from the input file
my $exons = 0;
# number of transcripts handed to store_features(); returned by run_update()
my $features = 0;
# copies of the arguments given to run_update()
my ($tracking_dbh, $prepare_hash, $user_id, $category_id);
# values returned by store_features() for the gene currently being stored
my ($current_gene_id, $updated);

# run_update($tracking_dbh, $prepare_hash, $user_id, $category_id)
#
# Reads the ZFIN/Otter comparison file, groups its exon lines by gene and,
# for every gene, stores one gene record, one transcript record and (when
# store_features() reports the transcript as updated) the exons as
# subfeatures of that transcript.
#
# The four arguments are handed on unchanged to store_features() and
# remove_subfeatures(); they are also copied into the file-scoped variables
# of the same names.
#
# Returns the number of transcripts handed to store_features().
# Dies if the input file cannot be opened or if a data line does not have
# the expected format.
sub run_update {
  my ($tracking_dbhh, $prepare_hashh, $user_idd, $category_idd) = @_;

  my $file        = "/nfs/acari/fsk/2_encode/data/other/ZFIN_Otter_comparison.final";
  $tracking_dbh   = $tracking_dbhh;
  $prepare_hash   = $prepare_hashh;
  $user_id        = $user_idd;
  $category_id    = $category_idd;

  #the accumulators live at file scope; reset them so that a repeated call
  #neither appends the same exons twice nor inflates the counters
  %genes    = ();
  %exons    = ();
  %starts   = ();
  %ends     = ();
  %chroms   = ();
  %strands  = ();
  %problems = ();
  $exons    = 0;
  $features = 0;

  #open data file
  open(my $in, '<', $file) or die "cant open file $file: $!\n";

  print STDERR "FOUND $category_id.\n";

  #go through file, read and collect the exons of every gene
  while (my $line = <$in>){
    chomp $line;
    #ignore non-data lines
    next if($line =~ /^#/);
    next if($line =~ /^\s*$/);

    print STDERR ">$line\n";

    my ($problem, $gene, $clone, $cdna, $chromosomal, $exon) = _parse_zfish_line($line);

    print STDERR "ENTRY ".$exon->{"chrom"}.", ".$exon->{"start"}.", ".$exon->{"end"}.", ".$exon->{"strand"}.", $problem, $gene, $clone, $cdna.\n";

    #new gene: chromosome, strand, problem and description are taken from
    #the first exon line that mentions it
    if(!exists($genes{$gene})){
      $exons{$gene}    = [];
      $starts{$gene}   = 999999999;
      $ends{$gene}     = 0;
      $chroms{$gene}   = $exon->{"chrom"};
      $strands{$gene}  = $exon->{"strand"};
      $problems{$gene} = $problem;
      $genes{$gene}    = "Issue: $problem\nGene: $gene\nClone: $clone\ncDNA: $cdna\n Location: $chromosomal";
    }

    #store exons
    push(@{ $exons{$gene} }, $exon);
    $exons++;

    #adjust gene coords so that they span all exons seen so far
    if($exon->{"start"} < $starts{$gene}){
      $starts{$gene} = $exon->{"start"};
    }
    if($exon->{"end"} > $ends{$gene}){
      $ends{$gene} = $exon->{"end"};
    }
  }

  close($in);

  #sorted so that the order of the store calls is reproducible
  foreach my $gene (sort keys %genes){

    my $c = 0;

    #build structure for gene
    #NOTE(review): the gene record uses the span and strand collected from
    #its exons - confirm this is what store_features() should receive
    my %gene = (
      'chrom'       => $chroms{$gene},
      'start'       => $starts{$gene},
      'end'         => $ends{$gene},
      'strand'      => $strands{$gene},
      'description' => '',
      'id'          => $gene,
      'type'        => 'gene',
    );

    #store gene
    ($current_gene_id, $updated) = store_features($tracking_dbh, $prepare_hash, \%gene, 'gene',
                                                  0, 0, $user_id, $category_id);

    #build structure for transcript
    my %transcript = (
      'chrom'       => $chroms{$gene},
      'type'        => 'transcript',
      'start'       => $starts{$gene},
      'end'         => $ends{$gene},
      'strand'      => $strands{$gene},
      'id'          => $gene,
      'description' => $genes{$gene},
    );

    #store transcript
    my ($current_transcript_id, $updated_1) = store_features($tracking_dbh, $prepare_hash, \%transcript,
                                                             'transcript', $current_gene_id, 0,
                                                             $user_id, $category_id);
    $features++;

    #exons are only (re)written when the transcript itself was reported as updated
    if($updated_1){
      print STDERR "updated transcript! Refreshing subfeatures for $current_transcript_id!\n";
      remove_subfeatures($tracking_dbh, $prepare_hash, $current_transcript_id);

      #store exons, numbered per gene in file order
      foreach my $exon (@{ $exons{$gene} }){
        $exon->{'id'} = $gene."_".$c++;
        my ($current_feature_id, $updated_2) = store_features($tracking_dbh, $prepare_hash, $exon,
                                                              'subfeature', $current_gene_id, $current_transcript_id,
                                                              $user_id, $category_id);
      }
    }

  }

  print STDERR "WE FOUND ".(scalar keys %genes)." genes & ".(scalar $exons)." exons.\n";

  return $features;

}

# _parse_zfish_line($line)
#
# Splits one data line into its fields and builds the exon record from the
# chromosomal location. Example line:
#Needs annotation: tbx22 AL926996.1:1-77 CR293501.10:48991-49067(1)              chr14_20080612:4806760-4806836(1)
#
# Returns ($problem, $gene, $clone, $cdna, $chromosomal, \%exon).
# Dies with "FORMAT <line>" if either the line or its chromosomal location
# does not match the expected pattern, so that captures left over from an
# earlier match can never end up in a record.
sub _parse_zfish_line {
  my ($line) = @_;

  #a failed match returns an empty list, which makes the assignment false
  my ($problem, $gene, $clone, $cdna, $chromosomal) =
    $line =~ /^([\w\s]+)\:\s([\w\d\-\.\:]+)\s([\w\d\:\-\.]+)\s+([\w\d\:\-\.\(\)]+)\s\s([\w\d\:\-\.\(\)]+)/
      or die "FORMAT $line\n";

  my ($chrom, $start, $end, $strand) =
    $chromosomal =~ /^chr([\w\d]+)_\d+\:(\d+)-(\d+)\(?(.*)\)?$/
      or die "FORMAT $line\n";

  #the greedy strand capture may still carry a bracket
  $strand =~ s/\(//;
  $strand =~ s/\)//;

  my %exon = (
    "chrom"  => $chrom,
    "start"  => $start,
    "end"    => $end,
    "strand" => $strand,
    "type"   => "subfeature",
    "parent" => $cdna,
  );

  return ($problem, $gene, $clone, $cdna, $chromosomal, \%exon);
}


1;

__END__


cbi4: /lustre/cbi4/work1/zfish/ZFIN/cDNA_placements/200907/ZFIN_Otter_comparison.final

Needs annotation: tbx22 AL926996.1:1-77 CR293501.10:48991-49067(1)              chr14_20080612:4806760-4806836(1)
Needs annotation: tbx22 AL926996.1:78-245       CR293501.10:49154-49321(1)              chr14_20080612:4806923-4807090(1)
