#!/usr/bin/env perl

=head1 LICENSE

Copyright [2018-2025] EMBL-European Bioinformatics Institute

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

      https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

=cut

=head1 NAME

simulate_variation

=head1 DESCRIPTION

Script to generate all possible SNPs (VCFv4 file) for protein_coding gene(s) given specific species or chromosome or gene.

=cut

use strict;
use warnings;
use Bio::EnsEMBL::Registry;
use Getopt::Long qw(GetOptions);

my ($help, $chromOnly, $gene, $registryFile,
$exonOnly, $intronOnly, $codingOnly,$onlyMane,$refseq_genes,$spliceai
);
my ($species, $assembly, $edges, $outFile) = ("homo_sapiens", "grch38", 0, "simulated.vcf");

usage() if (!scalar(@ARGV));

GetOptions( "species=s" => \$species,
    "chrom=s" => \$chromOnly,
    "gene=s"  => \$gene,
    "edge=i"  => \$edges,
    "output|o=s"  => \$outFile,
    "exonsOnly"   => \$exonOnly,
    "codingOnly"  => \$codingOnly,
    "onlyMane"    => \$onlyMane,
    "intronsOnly" => \$intronOnly,
    "assembly=s"  => \$assembly,
    "refseq"      => \$refseq_genes,
    "spliceai"    => \$spliceai,
    "registry=s"  => \$registryFile,
    "help!" => \$help,
) or usage();

usage() if ($help);

die "Illegal arguments -exonsOnly -intronsOnly can't be used together !" if $exonOnly && $intronOnly;
die "Illegal arguments -intronsOnly -codingsOnly can't be used together !" if $codingOnly && $intronOnly;
die "Illegal arguments -onlyMane can only be used alone !" if $onlyMane && ($exonOnly || $codingOnly || $intronOnly);

my $registry = 'Bio::EnsEMBL::Registry';
if (defined $registryFile){
  $registry->load_all($registryFile);
} else {
  my $port = 3306;
  if ($assembly eq 'grch37') {
    $port = 3337;
  }
  $registry->load_registry_from_db(
    -host => 'ensembldb.ensembl.org',
    -user => 'anonymous',
    -port => $port,
  );
}

print "getting species ($species)";
print ", chrom ($chromOnly)" if $chromOnly;
print ", gene ($gene)", if $gene;
print ", out: $outFile\n";
my $dbtype = $refseq_genes ? "otherfeatures" : "core";
my $logic_name = $refseq_genes ? "refseq_import" : undef;

my $slice_adaptor = $registry->get_adaptor( $species, $dbtype, 'slice' );
my $gene_adaptor = $registry->get_adaptor( $species, $dbtype, "gene" );

my $biotype = 'protein_coding';

my $FHO;
open ($FHO, ">$outFile") or die "Can't open file to write: $outFile, $!\n";

print_header($FHO, $assembly, $spliceai);

my @geneIDs;
if ($chromOnly || $gene){ #if chrom or gene specified then use the most stringent
  if ($gene){
    if ($gene !~ /^ENS.*/){
      @geneIDs = @{ $gene_adaptor->fetch_all_by_external_name($gene)};
    } else {
      @geneIDs = $gene_adaptor->fetch_by_stable_id($gene);
    }
    processGenes(\@geneIDs, $FHO);
  } else{
    my $chr_slice = $slice_adaptor->fetch_by_region( 'chromosome', $chromOnly);
    @geneIDs = @{ $chr_slice->get_all_Genes_by_type($biotype, $logic_name, 1)} if defined($chr_slice);
    print "processing genes: ", scalar @geneIDs, "\n";
    processGenes(\@geneIDs, $FHO);
  }
} else{ # print for all protein_coding genes for the species
  my $chroms = $slice_adaptor->fetch_all('chromosome');
  print "nr of chroms: ", scalar @$chroms, "\n";
  while (my $slice = shift @$chroms) {
    print "processing chrom: ", $slice->seq_region_name, "\n";
    @geneIDs = @{ $slice->get_all_Genes_by_type($biotype, $logic_name, 1)};
    processGenes(\@geneIDs, $FHO);
  }
}

close($FHO);


sub print_header {
  my $FHO = shift;
  my $assembly = shift;
  my $spliceai = shift;

  print $FHO "##fileformat=VCFv4.2", "\n";
  if(defined $spliceai) {
    my $reference = $assembly eq "grch38" ? "GRCh38/hg38" : "GRCh37/hg19";
    print $FHO "##reference=$reference", "\n";
    print $FHO "##contig=<ID=1,length=248956422>", "\n";
    print $FHO "##contig=<ID=2,length=242193529>", "\n";
    print $FHO "##contig=<ID=3,length=198295559>", "\n";
    print $FHO "##contig=<ID=4,length=190214555>", "\n";
    print $FHO "##contig=<ID=5,length=181538259>", "\n";
    print $FHO "##contig=<ID=6,length=170805979>", "\n";
    print $FHO "##contig=<ID=7,length=159345973>", "\n";
    print $FHO "##contig=<ID=8,length=145138636>", "\n";
    print $FHO "##contig=<ID=9,length=138394717>", "\n";
    print $FHO "##contig=<ID=10,length=133797422>", "\n";
    print $FHO "##contig=<ID=11,length=135086622>", "\n";
    print $FHO "##contig=<ID=12,length=133275309>", "\n";
    print $FHO "##contig=<ID=13,length=114364328>", "\n";
    print $FHO "##contig=<ID=14,length=107043718>", "\n";
    print $FHO "##contig=<ID=15,length=101991189>", "\n";
    print $FHO "##contig=<ID=16,length=90338345>", "\n";
    print $FHO "##contig=<ID=17,length=83257441>", "\n";
    print $FHO "##contig=<ID=18,length=80373285>", "\n";
    print $FHO "##contig=<ID=19,length=58617616>", "\n";
    print $FHO "##contig=<ID=20,length=64444167>", "\n";
    print $FHO "##contig=<ID=21,length=46709983>", "\n";
    print $FHO "##contig=<ID=22,length=50818468>", "\n";
    print $FHO "##contig=<ID=X,length=156040895>", "\n";
    print $FHO "##contig=<ID=Y,length=57227415>", "\n";
    print $FHO "##contig=<ID=MT,length=16569>", "\n";
  }
  else {
    print $FHO '##INFO=<ID=GENE,Number=1,Type=String,Description="Gene symbol or Gene stable id">',"\n";
    print $FHO '##INFO=<ID=FEATURE,Number=1,Type=String,Description="Feature id">',"\n";
  }
  print $FHO "#".join("\t", qw/CHROM POS ID REF ALT QUAL FILTER INFO/), "\n";
}

#subroutine: for a given set of genes, extract the needed features + call generateSNPs (writes out all possible SNPs)
sub processGenes{
  my $geneArr = shift;
  my $fho = shift;

  foreach my $geneID (@$geneArr) {
    next unless $geneID->biotype eq $biotype;

    my @parts; # set of exons OR introns
    if ($codingOnly){
      my %exons;
      my @transc = @{ $geneID->get_all_Transcripts() };
      while ( my $transcript = shift @transc ) {
        next unless $transcript->biotype() eq $biotype;
        foreach my $exon (@{ $transcript->get_all_translateable_Exons() }) {
            $exons{$exon->stable_id} = $exon;
        }
      }
      @parts = values %exons;
    } elsif ($exonOnly){
      @parts = @{$geneID->get_all_Exons()};
    } elsif($intronOnly){
      @parts = @{$geneID->get_all_Introns()};
    } elsif ($onlyMane){
      my @transc = @{ $geneID->get_all_Transcripts() };
      while ( my $transcript = shift @transc ) {
        if($transcript->is_mane()) {
          @parts = ($transcript);
        }
      }
    }
      else {
      @parts = ($geneID);
    }

    generateSNPs(\@parts,defined $geneID->external_name ? $geneID->external_name : $geneID->stable_id, $fho, $spliceai);
  }
}  

#subroutine: for a given set of features + given edge, write out all possible SNPs
sub generateSNPs {
  my $elements = shift;
  my $gene_name = shift;
  my $fho = shift;
  my $spliceai = shift;

  my @bases = qw/A C T G/;
  while( my $part = shift (@$elements)){
    my $start = $part->seq_region_start - $edges;
    my $pos = $start;
    my $chrom = $part->seq_region_name;
    my $partID;
    $partID = $part->stable_id if $exonOnly;
    $partID = $part->display_id if $intronOnly;
    $partID = $part->display_id;
    my $seq = $slice_adaptor->fetch_by_region("chromosome", $chrom, $start ,$part->seq_region_end + $edges)->seq;
    my @seqAr = split("", $seq);
    my ($id, $qual, $filter, $info) = (".", ".", ".", ".");
    for ( my $i = 0; $i < @seqAr; $i++ ) {
         my $ref_seq = $seqAr[$i];
         my @tmp_alts = grep {$_ ne $ref_seq} @bases;
         $pos = $start + $i;
         if(!$spliceai) {
           $info = 'GENE='.$gene_name.';'.'FEATURE='.$partID;
         }
         while(my $tmp_alt = shift @tmp_alts) {
           if(!$spliceai) {
             $id = $chrom."-".$pos."-".$ref_seq."-".$tmp_alt;
           }
           printf $fho "%s\t%s\t%s\t%s\t%s\t", $chrom, $pos, $id, $ref_seq, $tmp_alt;
           printf $fho "%s\t%s\t%s\n", $qual, $filter, $info;
        }
    }
  }
}

sub usage {

  print qq{
  Help: dev\@ensembl.org , helpdesk\@ensembl.org

  Usage:  ./simulate_variation [OPTIONS]

  Generate a VCFv4 file with all possible SNPs for protein_coding genes given specific species or chromosome or gene.

  Options:

      -species           Generate SNPs for specified species (Default: human)
      -assembly          Assembly to use if species is human (Default: grch38)
      -refseq            Use RefSeq genes/transcripts if species is human
      -registry          Ensembl registry file containing database connection details

      -output|o          Output file (Default: simulated.vcf)

      -chrom             Generate SNPs only for specified chromosome
      -gene              Generate all SNPs for specified gene
      -spliceai          Generate files for SpliceAI tool
      -codingOnly        Generate SNPs only for translatable exons of protein_coding transcripts
      -exonsOnly         Generate SNPs only for all exons of the genes
      -intronsOnly       Generate SNPs only for all introns of the genes
      -onlyMane          Generate SNPs only for MANE transcripts

      -edge              Length of flanking region in which to generate SNPs (Default: 0)

      -help              Print this message
  } . "\n";
  exit(0);
}
