#!/usr/bin/env perl

=head1 LICENSE

Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Copyright [2016-2025] EMBL-European Bioinformatics Institute

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

=cut


=head1 CONTACT

 Please email comments or questions to the public Ensembl
 developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.

 Questions may also be sent to the Ensembl help desk at
 <http://www.ensembl.org/Help/Contact>.

=cut

=head2 import_clinvar_xml

  - parse ClinVar XML and import data for dbSNP or dbVar variants
  - add the short variant data if ClinVar is releasing ahead of dbSNP
  - only import ClinVar records for structural variants already in ensembl
  - use the positions of the features held in ensembl where possible

  - attribs entered
  - review_status    = Status
  - external_id      = Acc
  - clinvar_clin_sig = Desc
  - risk_allele      = HGVS allele (dbSNP variants only)

=cut

use strict;
use warnings;
use Getopt::Long;
use Data::Dumper;
use Date::Manip::Date;
use XML::LibXML::Reader;
use XML::Hash::XS qw();
use Text::ParseWords;
use Scalar::Util 'blessed';
use List::MoreUtils qw(uniq);

use Bio::EnsEMBL::Variation::Utils::Sequence qw( get_hgvs_alleles SO_variation_class);
use Bio::EnsEMBL::Variation::Utils::SpecialChar qw(replace_char decode_text);
use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap);
use Bio::EnsEMBL::Variation::Utils::Config qw(%ATTRIBS);
use Bio::EnsEMBL::Variation::Utils::Reports qw(report_counts);
use Bio::EnsEMBL::Variation::VariationFeature;
use Bio::EnsEMBL::Slice;
use Bio::DB::Fasta;
use Bio::EnsEMBL::Registry;

our $DEBUG = 0;

my ($data_file, $registry_file, $assembly, $structvar, $done_file, $clean, $insert_tv);

GetOptions ("data_file=s"  => \$data_file,
            "registry=s"   => \$registry_file,
            "assembly=s"   => \$assembly,
            "structvar"    => \$structvar, 
            "done_file=s"  => \$done_file,
            "debug"        => \$DEBUG,
            "clean"        => \$clean,
            "insert_tv=s"  => \$insert_tv
            );

usage() unless defined $data_file && defined $registry_file && defined $assembly;
die "File not found: $data_file\n" unless -e $data_file;

my $reg = 'Bio::EnsEMBL::Registry';
$reg->load_all($registry_file);
my $dba = $reg->get_DBAdaptor('homo_sapiens','variation');
## only merging records by name, so include fails
$dba->include_failed_variations(1);
my $dbh = $dba->dbc;
$dbh->reconnect_when_lost(1);

my $variation_adaptor     = $dba->get_VariationAdaptor('human', 'variation', );
my $var_feat_adaptor      = $dba->get_VariationFeatureAdaptor('human', 'variation', );
my $tva                   = $dba->get_TranscriptVariationAdaptor('human', 'variation', );
my $structvar_adaptor     = $dba->get_StructuralVariationAdaptor('human', 'variation', );
my $attrib_adaptor        = $dba->get_AttributeAdaptor();

my $pheno_feat_adaptor    = $reg->get_adaptor('homo_sapiens', 'variation', 'phenotypefeature');
my $phenotype_adaptor     = $reg->get_adaptor('homo_sapiens', 'variation', 'phenotype');

my $slice_adaptor         = $reg->get_adaptor('homo_sapiens', 'core', 'slice');
$slice_adaptor->dbc->reconnect_when_lost(1);

# If insert_tv = 1
# update can only continue if MTMP_transcript_variation table exists
if($insert_tv){
  check_mtmp_table($dba);
}

## fetch ClinVar source object and update version based on input file name
# fetch ClinVar and OMIM and updates version for both
my $omimstr= 'OMIM';
my $source      = get_source($data_file, 'ClinVar');
my $omim_source = get_source($data_file, $omimstr);

my $phenotype_evidence = 'Phenotype_or_Disease';
my $pheno_evidence_id = get_attrib_id('evidence',$phenotype_evidence);

## pre-load submitter ids - shortcutting the API for speed
## get ClinVar submitters names from the database
my $submitters = get_submitter_ids($dba);

# count number of rows before import
# report transcript_variation counts the even if the table is not going to be populated to make sure the count is correct
my @count_tables = qw(variation variation_feature transcript_variation);
report_counts($dba, "before", \@count_tables);

## remove old data
clean_old_data() if defined $clean; 

## OMIM set id
my $omim_set_id   = get_set($dbh, 'ph_omim');
my %omimVarSet;

my $pheno_set_id   = get_set($dbh, "ph_variants"); #All phenotype/disease variants
my $clinvar_set_id = get_set($dbh, "ClinVar"); #All ClinVar variants
my $clinassoc_set_id = get_set($dbh, "clin_assoc"); #ClinVar pathogenic variants


## default ClinVar phenotype description if non available
my $default_pheno  = "ClinVar: phenotype not specified";
my $non_specified_class_id = get_attrib_id('phenotype_type', 'non_specified');
my %non_specified_pheno = ("None"=>1,
                     "Not provided"=> 1,
                     "not specified" => 1,
                     "Not in OMIM" => 1,
                     "Variant of unknown significance" => 1,
                     "not_provided" => 1,
                     "ClinVar: phenotype not specified" => 1,
                     "See cases" => 1,
                     "?" => 1, "." => 1);
my $haplotype_type = "Haplotype";

## get current phenotypes list
## the phenotype list includes a clean version of the descriptions
my %phenotype_cache = %{get_phenotype_cache()};

## accepted clinsig values (germline)
my $def_clinsig = $ATTRIBS{clinvar_clin_sig};
my %known_clinsig = map { $_ => 1  } @{$def_clinsig};

## accepted clinsig values (somatic + somatic oncogenicity)
my $def_clinsig_somatic = $ATTRIBS{clinvar_somatic};
my %known_clinsig_somatic = map { $_ => 1  } @{$def_clinsig_somatic};

## handle incomplete runs - takes list of ClinVar RC accessions 
my %done;
if (defined $done_file){
  open my $donelist, $done_file or die "Unable to open list of variants already entered : $!\n";
  while(<$donelist>){
    my $id = (split)[1];
    $done{$id} = 1;
  }
  close $donelist;
}


## parse input file
my $data_file_fh;
if ($data_file =~/.gz/) {
  open ($data_file_fh, '<:gzip', $data_file )
    or die "Cannot read file $data_file: $!\n"; #.xml.gz
} else {
  open ( $data_file_fh, '<', $data_file )
    or die "Cannot read file $data_file: $!\n"; #.xml
}

my $reader = XML::LibXML::Reader->new(IO => $data_file_fh);

my $step=1;
while($reader->read) {
  next unless $reader->nodeType == XML_READER_TYPE_ELEMENT;
  next unless $reader->name eq 'ReleaseSet' || $reader->name eq 'ClinVarSet';

  #get ReleaseSet date from within file
  if ($reader->name eq 'ReleaseSet') {
    my $releaseDate = $reader->getAttribute('Dated'); #2019-04-04
    if (defined $releaseDate) {
      my $versionDate = new Date::Manip::Date;
      $versionDate->parse($releaseDate);
      print "INFO: ReleaseSet version $releaseDate.\n";
    }
  } else {
    my $xml = $reader->readOuterXml;
    my $conv = XML::Hash::XS->new(utf8 => 0, encoding => 'utf-8');
    my $set  = $conv->xml2hash($xml, encoding => 'latin1'); #nice encoding of special char
    process_clinvar_set($set);
  }

}

close ($data_file_fh);

# count number of rows after import
report_counts($dba, "after", \@count_tables);


sub process_clinvar_set {
  my $set = shift;

  ## dump current structure on exit
  my $current = Dumper $set;

  my %record;
  eval{
    ## get accesion & version
    $record{Acc} = get_accession($set->{ReferenceClinVarAssertion}->{ClinVarAccession});
    print "\nProcessing $record{Acc}\n" if $DEBUG == 1;

    ## recover from partial loads with file of accessions already loaded
    if ($done{$record{Acc}}){
      print "INFO: skipping $record{Acc}\n" if $DEBUG == 1;
      return undef;
    }

    ## clinical significance, confidence of assertation and date of assertion
    $record{Classifications} = get_clinsig($set->{ReferenceClinVarAssertion}->{Classifications});

    # Skip invalid clinical significance (germline)
    if (defined $record{Classifications}->{germline} && 
        ($record{Classifications}->{germline}->{clin_sign} =~ "no classifications from unflagged records" ||
         $record{Classifications}->{germline}->{clin_sign} eq "")) {
      return undef;
    }

    ## check for somatic, Autosomal dominant, etc.
    $record{inheritance_type} = get_inheritance($set->{ReferenceClinVarAssertion}->{AttributeSet});

    # Some entries do not have mode of inheritance
    # Try to get the somatic status from the sample origin
    if(!$record{inheritance_type}) {
      my $sample_origin = get_sample_origin($set->{ReferenceClinVarAssertion}->{ObservedIn});
      if(defined $sample_origin && $sample_origin eq "somatic") {
        $record{inheritance_type} = "Somatic mutation";
      }
    }

    ## trait info (using Acc for error logging)
    ($record{disease}, $record{ontology_accession}) = get_disease($set->{ReferenceClinVarAssertion}->{TraitSet}->{Trait}, $record{Acc});

    ## extract arrayref of PMIDs for citations
    # The germline classification has the citations under ObservedData
    # The somatic classification has the citations under ClinVarAssertion - Classification - "SomaticClinicalImpact"
    $record{citations} = get_citations($set->{ReferenceClinVarAssertion}->{ObservedIn}, "ObservedData");
    $record{citations_somatic} = get_citations($set->{ClinVarAssertion}, "Classification");

    ## Variant name, HGVS, OMIM id and location
    if(defined $set->{ReferenceClinVarAssertion}->{GenotypeSet} && $DEBUG == 1) {
      print "WARNING: Found GenotypeSet(type: $set->{ReferenceClinVarAssertion}->{GenotypeSet}->{Type}) found for $record{Acc}.\n";
    }

    if (! defined $set->{ReferenceClinVarAssertion}->{MeasureSet} ||
      !defined $set->{ReferenceClinVarAssertion}->{MeasureSet}->{Type}){
        print "WARNING: No MeasureSet Type found for $record{Acc}.\n" if $DEBUG == 1;
        return undef;
    }

    if ( $set->{ReferenceClinVarAssertion}->{MeasureSet}->{Type} ne "Variant" &&
      $set->{ReferenceClinVarAssertion}->{MeasureSet}->{Type} ne $haplotype_type ) {
        print "WARNING: Unsupported MeasureSet Type found for $record{Acc}: $set->{ReferenceClinVarAssertion}->{MeasureSet}->{Type} .\n" if $DEBUG == 1;
        return undef;
    }

    $record{clinvar_variant_id} = $set->{ReferenceClinVarAssertion}->{MeasureSet}->{Acc};
    $record{feature_info} = get_feature($set->{ReferenceClinVarAssertion}->{MeasureSet}->{Measure}, $record{Acc},$set->{ReferenceClinVarAssertion}->{MeasureSet}->{Type});

    ($record{submitters}, $record{submitters_somatic}) = get_submitters($set->{ClinVarAssertion}, $record{feature_info});

    if (defined $structvar ){
      if( defined $record{feature_info}->{dbVar} && $record{feature_info}->{dbVar}->[0] =~/\d+/ ){
        print "Importing SV :  $record{feature_info}->{dbVar}->[0]\n" if $DEBUG == 1;
        import( \%record);
      }
    }
    else{
      if( exists $record{feature_info}->{dbSNP} && $record{feature_info}->{dbSNP}->[0] =~/\d+/ ){
         print "Importing Var :  $record{feature_info}->{dbSNP}->[0] ($record{Acc})\n" if $DEBUG == 1;
         import( \%record);
      } elsif (exists $record{feature_info}{Chr} && exists $record{feature_info}{start}) {
        print "Importing Var based on location+allele_ref lookup of existing match in DB\n" if $DEBUG == 1;
        import( \%record, 1);
      }
      else{
    	my $message =  "Not importing var: ";
    	$message .= " rs: $record{feature_info}->{dbSNP} "        if defined $record{feature_info}->{dbSNP} ;
    	$message .= " on chr: $record{feature_info}->{Chr} "      if defined $record{feature_info}->{Chr} ;
    	$message .= " with HGVS: $record{feature_info}->{hgvs_g}" if defined $record{feature_info}->{hgvs_g} ;
    	$message .= " due to missing data ($record{Acc})\n";
      print $message if $DEBUG == 1;
      }
    }
  };
  if( $@ ne ''){
      #die "ERROR: $@\n";
      die "ERROR: $@\n$current\n\n";
  } 
}

## find old set id ( and remove linked variants)
##  or enter new one
sub get_set{

  my $dbh = shift;
  my $set = shift; ## short attrib name for set

  ## info on sets
  my $data =      {
    'ph_omim'           => {
      'desc' => 'Variants linked to entries in the Online Mendelian Inheritance in Man (OMIM) database',
      'name' => 'OMIM phenotype variants'},
    'ph_variants'       => {
      'desc' => 'Variants that have been associated with a phenotype or a disease',
      'name' => 'All phenotype/disease-associated variants'},
    'ClinVar'           => {
      'desc' => 'Variants with ClinVar annotation',
      'name' => 'All ClinVar'},
    'clin_assoc' => {
      'desc' => 'Variants described by ClinVar as being probable-pathogenic, pathogenic, drug-response or histocompatibility',
      'name' => 'Clinically associated variants'
     },

  };

  my $set_ext_sth = $dbh->prepare(qq[ select variation_set_id
                                      from variation_set, attrib
                                      where variation_set.short_name_attrib_id = attrib.attrib_id
                                      and attrib.value =? ]);

  my $set_ins_sth = $dbh->prepare(qq[ insert into variation_set
                                     (name, description, short_name_attrib_id)
                                      values ( ?,?,?  ) ]);

  my $set_del_sth = $dbh->prepare(qq[ delete from variation_set_variation where variation_set_id = ?]);

  ### look for old set record
  $set_ext_sth->execute( $set);
  my $set_id = $set_ext_sth->fetchall_arrayref();

  if (defined $set_id->[0]->[0] ){
    ## remove old set content
    $set_del_sth->execute($set_id->[0]->[0]) if defined $clean;
    return $set_id->[0]->[0] ;
  }

  ### enter new set record
  my $attrib_id = get_attrib_id('short_name',$set);

  $set_ins_sth->execute( $data->{$set}->{name}, $data->{$set}->{desc}, $attrib_id );
  $set_id = $dbh->db_handle->last_insert_id(undef, undef, qw(variation_set set_id)) or die "no insert id for set $set\n";

  return $set_id;

}


## get ClinVar accession & version
sub get_accession{

  my $ClinVarAccession = shift;

  my $accession =  $ClinVarAccession->{Acc} .".".  $ClinVarAccession->{Version} ;

  return $accession;
}

## Fetch and clean clinical significance
## There are three types of classification:
##   - GermlineClassification
##   - SomaticClinicalImpact
##   - OncogenicityClassification
## A single submission can have more than one type of classification.
sub get_clinsig {
  my $classifications = shift;

  my $desc;
  my $clinical_impact_assertion;
  my $clinical_impact_clin_sig;
  my $status;
  my $date;
  my $type;
  my %result_by_type; # clinical significance by type

  # Check by type of classification
  if (defined $classifications->{GermlineClassification}) {
    $type = "germline";
    # Clinical significance - conflicting interpretations need an explanation
    if ($classifications->{GermlineClassification}->{Description}->{content} =~/Conflicting interpretations of pathogenicity/) {
      ## Description = 'conflicting data from submitters' - the values are in the explanation
      defined $classifications->{GermlineClassification}->{Explanation} ?
      $desc = "\L$classifications->{GermlineClassification}->{Explanation}->{content}" :
      print "warning: conflicting ClinicalSignificance but no Explanation\n";
      $desc |='';
      $desc =~ s/\(\d+\)//g; ## remove bracketed counts
      $desc =~ s/\;/\,/g;    ## switch to comma delimited for set
    }
    else {
      $desc = "\L$classifications->{GermlineClassification}->{Description}->{content}";
    }

    # Remove empty space from beginning and end of the description
    $desc =~ s/^\s*(.*?)\s*$/$1/;

    my %germline_data = (
      "clin_sign" => $desc,
      "status" => $classifications->{GermlineClassification}->{ReviewStatus},
      "date" => $classifications->{GermlineClassification}->{Description}->{DateLastEvaluated}
    );

    $result_by_type{$type} = \%germline_data;
  }

  # Somatic classification
  if (defined $classifications->{SomaticClinicalImpact}) {
    my @somatic_desc;
    my @dates;
    my @clinical_assertions;
    my @clinical_impact_cs;
    $type = "somatic";
    # {SomaticClinicalImpact}->{Description} can be array or hash
    if (ref($classifications->{SomaticClinicalImpact}->{Description}) eq "ARRAY") {
      for my $description (@{$classifications->{SomaticClinicalImpact}->{Description}}) {
        # clinical significance
        my $tmp_desc = $description->{content};
        # clinical impact assertion type
        if($description->{ClinicalImpactAssertionType}) {
          $tmp_desc .= ":" . $description->{ClinicalImpactAssertionType};
        }
         # clinical impact clinical significance
        if($description->{ClinicalImpactClinicalSignificance}) {
          $tmp_desc .= ":" . $description->{ClinicalImpactClinicalSignificance};
        }
        push @somatic_desc, $tmp_desc;

        # date last evaluated
        push @dates, $description->{DateLastEvaluated};
      }
      $desc = join(",", @somatic_desc);
      $date = join(",", @dates);
    }
    else {
        $desc = $classifications->{SomaticClinicalImpact}->{Description}->{content};
        if($classifications->{SomaticClinicalImpact}->{Description}->{ClinicalImpactAssertionType}) {
          $desc .= ":" . $classifications->{SomaticClinicalImpact}->{Description}->{ClinicalImpactAssertionType};
        }
        if($classifications->{SomaticClinicalImpact}->{Description}->{ClinicalImpactClinicalSignificance}) {
          $desc .= ":" . $classifications->{SomaticClinicalImpact}->{Description}->{ClinicalImpactClinicalSignificance};
        }
        $date = $classifications->{SomaticClinicalImpact}->{Description}->{DateLastEvaluated};
    }

    # Remove empty space from beginning and end of the description
    $desc =~ s/^\s*(.*?)\s*$/$1/;

    my %somatic_data = (
      "clin_sign" => $desc,
      "status" => $classifications->{SomaticClinicalImpact}->{ReviewStatus},
      "date" => $date
    );

    $result_by_type{$type} = \%somatic_data;
  }

  # Oncogenic classification
  if (defined $classifications->{OncogenicityClassification}) {
    $type = "oncogenicity";
    $desc = $classifications->{OncogenicityClassification}->{Description}->{content};

    # Remove empty space from beginning and end of the description
    $desc =~ s/^\s*(.*?)\s*$/$1/;

    my %oncogenic_data = (
      "clin_sign" => $desc,
      "status" => $classifications->{OncogenicityClassification}->{ReviewStatus},
      "date" => $classifications->{OncogenicityClassification}->{Description}->{DateLastEvaluated}
    );

    $result_by_type{$type} = \%oncogenic_data;
  }

  return \%result_by_type;
}

=head2 get_inheritance
Mode of inheritance attribute holds somatic status.
The 'ModeOfInheritance' depends on the submitter,
most of the submitters do not provide a value.
=cut
sub get_inheritance{

  my $attribute_set = shift;

  my $moi;

  my $attributes = to_array($attribute_set);
  foreach my $attribute(@{$attributes}){
    next unless defined $attribute->{Attribute} &&
                      $attribute->{Attribute}->{Type} eq 'ModeOfInheritance';

    $moi = $attribute->{Attribute}->{content};
  }

  return $moi;
}

=head2 get_sample_origin
Method to fetch the origin of the sample:
germline, somatic
=cut
sub get_sample_origin {
  my $observed_in = shift;

  my $origin;

  my $attributes = to_array($observed_in);

  foreach my $attribute(@{$attributes}){
    next unless defined $attribute->{Sample} && defined $attribute->{Sample}->{Origin};

    $origin = $attribute->{Sample}->{Origin};
  }

  return $origin;
}

=head2 get_feature

- get variant/ structural variant info + location on required assembly
- inlude OMIM ids and HGVS for short variants 
=cut

sub get_feature{

  my $Measure   = shift;
  my $accession = shift;
  my $type      = shift;

  my %feature;

  my $measures = to_array($Measure);
  foreach my $measure(@{$measures}){

#    next unless(ref($measure) eq 'HASH'); ##
    if(ref($measure) ne 'HASH'){
      print "Multiple measures for $accession - not loading\n" if $DEBUG == 1;
      next;
    } 

    ## dbSNP/ dbVAR and OMIM ids
    if(defined $measure->{XRef}){
      my $xref_set = to_array($measure->{XRef});
      foreach my $xref( @{$xref_set} ){
        next unless ref($xref) eq  'HASH';
        if (defined $xref->{Type}) {
          #save eg: measure_id(15437) -> xref_db(OMIM) -> type(Allelic variant) -> id(612779.003)
          push @{$feature{measureXrefs}{$measure->{ID}}{$xref->{DB}}{$xref->{Type}} }, $xref->{ID} ;
        } else {
          $xref->{DB} eq 'dbVar' ?
          push @{$feature{$xref->{DB}}},$xref->{ID} :
          push @{$feature{measureXrefs}{$measure->{ID}}{$xref->{DB}} }, $xref->{ID} ;
        }
      }
      #TODO: if + haplo push refactor here for non dbSNP ids
      if (defined $feature{measureXrefs}{$measure->{ID}}{$omimstr} &&
          defined $feature{measureXrefs}{$measure->{ID}}{$omimstr}{'Allelic variant'}) {

          if (defined $feature{measureXrefs}{$measure->{ID}}{'dbSNP'}) {
            my %tmpAllelicID;
            foreach my $allele (@{$feature{measureXrefs}{$measure->{ID}}{$omimstr}{'Allelic variant'}}){
              foreach my $tmpRS (@{$feature{measureXrefs}{$measure->{ID}}{'dbSNP'}{'rs'}}){
                push @{$tmpAllelicID{$allele}}, $tmpRS;
              }
            }
            # for haplotype entries: allelic variant ids are not real synonyms & this will result in them not being saved
            $feature{measureXrefs}{$measure->{ID}}{OmimAllele2dbSNP} = \%tmpAllelicID unless $type eq $haplotype_type;

          } else {
            if (scalar @{$feature{measureXrefs}{$measure->{ID}}{$omimstr}{'Allelic variant'}} > 1 && $DEBUG == 1) {
              print "Multiple OMIM Allelic variant xrefs for $accession!\n";
            }
            $feature{$omimstr} = $feature{measureXrefs}{$measure->{ID}}{$omimstr}{'Allelic variant'}->[0];
          }
      }
      push @{$feature{'haplo'}{'rs'}}, @{$feature{measureXrefs}{$measure->{ID}}{'dbSNP'}{'rs'}} if $type eq $haplotype_type && defined $feature{measureXrefs}{$measure->{ID}}{'dbSNP'};
   }

    ## position on required assembly
    next unless defined $measure->{SequenceLocation};
    my $seqLocs = to_array($measure->{SequenceLocation});

    my $feature_found;

    foreach my $loc(@{$seqLocs}){

      next unless ref($loc) eq  'HASH'; 
      next unless $loc->{Assembly} eq $assembly;

      if($feature_found) {
        $feature{ypar}{Chr} = $loc->{Chr};
        $feature{ypar}{start} = $loc->{start};
        $feature{ypar}{end} = $loc->{stop};
        $feature{ypar}{accession} = $loc->{Accession};
      }
      else{
        $feature{Chr}       = $loc->{Chr};
        $feature{start}     = $loc->{start};
        $feature{end}       = $loc->{stop};
        $feature{accession} = $loc->{Accession};

        if($type eq $haplotype_type) {
          $feature{measureXrefs}{$measure->{ID}}{Chr}   = $loc->{Chr};
          $feature{measureXrefs}{$measure->{ID}}{start} = $loc->{start};
          $feature{measureXrefs}{$measure->{ID}}{end}   = $loc->{stop};
        }
        $feature_found = 1;
      }
    }

    ## save SPDI canonical
    ## For Y PAR variants there is only one canonical spdi
    $feature{canonicalSPDI} = $measure->{CanonicalSPDI} if defined $measure->{CanonicalSPDI};

    ## HGVS genomic - used for allele extraction for novel variants 
    my $assembly_number  = $assembly; ## uses assembly without GRCh
    $assembly_number     =~ s/\D+//;

    my $attrib_set = to_array($measure->{AttributeSet});
    foreach my $attrib (@{$attrib_set}){

      next unless (defined $attrib->{Attribute}->{integerValue} &&
                           $attrib->{Attribute}->{integerValue} == $assembly_number );

      next unless (defined $attrib->{Attribute}->{Type} &&
                           $attrib->{Attribute}->{Type} =~ /HGVS,\s+genomic,\s+top\s+level/ );

      # Make sure the chr matches the hgvs chromosome
      if(defined $feature{accession} && $feature{accession} =~ $attrib->{Attribute}->{Accession}) {
        $feature{hgvs_g} = $attrib->{Attribute}->{Change};
      }
      elsif(defined $feature{ypar} && $feature{ypar}{accession} =~ $attrib->{Attribute}->{Accession}) {
        $feature{ypar}{hgvs_g} = $attrib->{Attribute}->{Change};
      }

      $feature{measureXrefs}{$measure->{ID}}{hgvs_g} = $attrib->{Attribute}->{Change} if $type eq $haplotype_type;
    }

    ## find reported genes, note: each measure set can have a MeasureRelationship gene
    if(defined $measure->{MeasureRelationship}){
      my @genes;          
      my $meas_rels = to_array($measure->{MeasureRelationship});
      foreach my $meas (@{$meas_rels}){
        push @genes, $meas->{Symbol}->{ElementValue}->{content} if $meas->{Symbol}->{ElementValue}->{content} =~ /\w/;
      }
      $feature{gene} = join(",", @genes);
      $feature{measureXrefs}{$measure->{ID}}{gene} = $feature{gene} if $type eq $haplotype_type;
    }

    #last processing of this specific measure
    # if multiple rsIDs for the same measure set in ReferenceClinVarAssertion print warning
    if(defined $feature{measureXrefs}{$measure->{ID}}{dbSNP} && scalar @{$feature{measureXrefs}{$measure->{ID}}{dbSNP}{rs}} > 1 && $DEBUG == 1) {
      print "multiple rsIDs for same measure ($measure->{ID}) in RCV ($accession), rsIDs: ".join(",",@{$feature{measureXrefs}{$measure->{ID}}{dbSNP}{rs}} )."\n";
    }

    #populate higher level hash of rsIDs and specific coordinates, hgvs_g
    if (defined $feature{'haplo'} && defined $feature{measureXrefs}{$measure->{ID}}{'dbSNP'} ){
      foreach my $rsID (@{$feature{measureXrefs}{$measure->{ID}}{'dbSNP'}{'rs'}}){
        my $varID = 'rs'.$rsID;
        #check if the existing record has same values as the latest measure set: {measureXrefs}{$measure->{ID}
        if (defined $feature{$varID} &&
            (defined $feature{$varID}{'Chr'} &&
             $feature{$varID}{Chr} != $feature{measureXrefs}{$measure->{ID}}{Chr} ||
             $feature{$varID}{start} != $feature{measureXrefs}{$measure->{ID}}{start} ||
             $feature{$varID}{end} != $feature{measureXrefs}{$measure->{ID}}{end} ||
             $feature{$varID}{hgvs_g} != $feature{measureXrefs}{$measure->{ID}}{hgvs_g}) ){

              # these phenotypes will end up not being imported unless the rsID already exists
             print "WARNING: removing location/hgvs for rsID with multiple locations/hgvs, as either of them can be correct: rs$rsID\n" if $DEBUG == 1;
             delete @{$feature{$varID}}{qw/Chr start end hgvs_g gene/}; # $feature{'rs'.$rsID} left in palce as a mark
             delete @feature{qw/Chr start end hgvs_g gene/};
        } else {
          @{$feature{$varID}}{qw/Chr start end hgvs_g gene/} = @{$feature{measureXrefs}{$measure->{ID}}}{qw/Chr start end hgvs_g gene/};
        }
      }
    }
  }

  # if haplotype present then save corresponding rsIDs for phenotype_feature_attrib entry
  if ($type eq $haplotype_type && defined $feature{haplo} && defined $feature{haplo}{rs} ) {
    my @rsIDs = keys %{ { map { $_ => 1 } @{$feature{haplo}{rs}} } };
    $feature{haplo}{rs} = \@rsIDs;
    # only if more than 1 rsID present make them phenotype_feature_attribs
    if( scalar @rsIDs >1 && (scalar keys %{$feature{measureXrefs}}  == scalar @rsIDs) ){
      my %index;
      @index{@rsIDs} = (0..$#rsIDs);
      foreach my $rs (@{$feature{haplo}{rs}}){
        foreach my $rs2 (@{$feature{haplo}{rs}}){
          next if $rs eq $rs2;
          push @{$feature{haplo}{'rs'.$rs}}, 'rs'.$rs2;
        }
      }
      #remove global details as multiple rsIDs present, each with own coordinates and hgvs_g
      delete @feature{qw/Chr start end hgvs_g gene/};
    } else {
      delete $feature{haplo};
    }
  }

  return \%feature;
}

=head2 get_disease

- extract prefered disease name & ontology terms
- only take first disease but report if more are present
 
=cut
sub get_disease{

  my ($Trait, $accession) = @_;

  my ($disease, $ontology_accession);

  my $traits = to_array($Trait);
  if(scalar(@{$traits}) > 1 && $DEBUG == 1) {
    print "Multiple traits for $accession\n";
  }

  foreach my $trait (@{$traits}){

    next if defined $disease; ## How should multi-disease assertions be handled? - log to monitor

    my $names = to_array($trait->{Name});

    foreach my $name ( @{$names} ){

      next unless $name->{ElementValue}->{Type} eq "Preferred";
 
      $disease =  $name->{ElementValue}->{content};

      my $xrefs = to_array($name->{XRef});
      my @ontology_accs;
      foreach my $xref (@{$xrefs}){
        next unless $xref->{DB};
        push(@ontology_accs, $xref->{ID})
          if $xref->{DB} eq "Human Phenotype Ontology";

        push(@ontology_accs, 'Orphanet:' .$xref->{ID})
          if $xref->{DB} eq "Orphanet";
      }
      $ontology_accession = $ontology_accs[0]; #only first ontlogy accession will be returned
     }
  }
  return ($disease, $ontology_accession);
}

=head2 get_citations

- extract any pubmed ids supporting this ascertation
 
=cut
sub get_citations{

  my $structure = shift;
  my $label = shift;

  my @citations;
 
  my $observed_in = to_array($structure);

  foreach my $observed_in (@{$observed_in}){

    my $observed_data = to_array($observed_in->{$label});
    foreach my $obs( @{$observed_data} ){

      if($label eq "Classification" && $obs->{GermlineClassification}) {
        next;
      }

      if ($obs->{Citation}){
        my $citations = to_array($obs->{Citation});
        foreach my $cit(@{$citations}){
          my $ids = to_array($cit->{ID});
          foreach my $id (@{$ids}){
            push @citations, $id->{content} if $id->{Source} && $id->{Source} eq 'PubMed';
          }
        }
      }
    }
  }

  my @citations_uniq = uniq @citations;

  return \@citations_uniq;
}

=head2 get_submitters

- extract any submitters of assertations from the ClinVar release

=cut
sub get_submitters{

  my $structure = shift;
  my $mainXrefs = shift;

  my @submitters;
  my @submitters_somatic;

  my $assertions = to_array($structure);
  foreach my $assert(@{$assertions}){
    my $submitter_id = $assert->{ClinVarSubmissionID}->{submitter};
    my $submitter_type;

    if($assert->{Classification}->{GermlineClassification}) {
      push @submitters, $submitter_id;
      $submitter_type = "germline";
    }
    else {
      push @submitters_somatic, $submitter_id;
      $submitter_type = "somatic";
    }

    # if submitter OMIM then get OMIM allelic variant ID and corresponding rsID from reference record
    # TODO update to support all submitter types
    next unless $submitter_id eq $omimstr && $submitter_type eq "germline";

    if (defined $assert->{ExternalID} && $assert->{ExternalID}->{DB} eq $omimstr) {
      my $omimAlleleID = $assert->{ExternalID}->{ID};

      if($DEBUG == 1 && $assert->{ExternalID}->{Type} ne 'Allelic variant') {
        print "OMIM submitter with ExternalID BUT not for variation type, assertion(", $assert->{ClinVarAccession}->{Acc}, ") id type(",$assert->{ExternalID}->{Type} ,")!!\n";
      }

      #iterate over the measureSet xrefs saved from the ReferenceClinVarAssertion, in case of multiple sets of xref, the one with the expected OMIM submitter allele is used
      foreach my $tmpMeasure (keys %{$mainXrefs->{'measureXrefs'}}) {
        if (defined $mainXrefs->{'measureXrefs'}{$tmpMeasure} &&
            defined $mainXrefs->{'measureXrefs'}{$tmpMeasure}{'OmimAllele2dbSNP'} &&
            defined $mainXrefs->{'measureXrefs'}{$tmpMeasure}{'OmimAllele2dbSNP'}{$omimAlleleID}) {
          if (scalar @{$mainXrefs->{'measureXrefs'}{$tmpMeasure}{'OmimAllele2dbSNP'}{$omimAlleleID}} > 1 && $DEBUG == 1) {
            print "multiple rsIDs for same OMIM allelic variant id in same measure: $tmpMeasure, rs:", join(",", @{$mainXrefs->{'measureXrefs'}{$tmpMeasure}{'OmimAllele2dbSNP'}{$omimAlleleID}}), "!\n";
          }
          $mainXrefs->{$omimstr} ||= $omimAlleleID;
          $mainXrefs->{'MIM'} = (split/\./, $omimAlleleID)[0] unless defined $mainXrefs->{'MIM'};
          # if there is a OMIM submitter record, then use the OMIM alleleID and corresponding rsID for the record (for haplotype records multiple Omim allelic ids can be mentioned in the ReferenceClinVarAssertion measures)
          $mainXrefs->{'dbSNP'} = $mainXrefs->{'measureXrefs'}{$tmpMeasure}{'OmimAllele2dbSNP'}{$omimAlleleID} unless defined $mainXrefs->{'dbSNP'};
        }
      }
      $mainXrefs->{'MIM'} = (split/\./, $omimAlleleID)[0] unless defined $mainXrefs->{'MIM'};
    } elsif ($DEBUG == 1){
      print "OMIM submitter but no ExternalID, assertion(", $assert->{ClinVarAccession}->{Acc}, ") gene(",$mainXrefs->{gene} ,")!!\n";
    }
  }

  #if no OMIM submitter than make sure rsID from dbSNP is at upper hash level
  if (! defined $mainXrefs->{'dbSNP'}){
    foreach my $tmpMeasure (keys %{$mainXrefs->{'measureXrefs'}}) {
      push @{$mainXrefs->{'dbSNP'}}, @{$mainXrefs->{'measureXrefs'}{$tmpMeasure}{'dbSNP'}{'rs'}} if defined $mainXrefs->{'measureXrefs'}{$tmpMeasure}{'dbSNP'} && defined $mainXrefs->{'measureXrefs'}{$tmpMeasure}{'dbSNP'}{'rs'};
    }
    # make sure the list contains unique rsIDs to deal with haplotypes/records encountered with multiple sets using same rsID
    @{$mainXrefs->{'dbSNP'}} = keys %{ { map { $_ => 1 } @{$mainXrefs->{'dbSNP'}} } } if defined $mainXrefs->{'dbSNP'};
  }

  return \@submitters, \@submitters_somatic;
}

sub to_array{

  my $structure = shift;

  return undef unless defined $structure;

  my @array;
  if (ref($structure ) eq 'ARRAY'){
    @array = @{$structure};
  }
  else{
    push @array, $structure;
  }
  return \@array;
}



=head2  import

- check all required info present & update db with a single ClinVar
 
=cut

sub import{

  my $record = shift;
  my $try_match = shift // 0;

  my $feature_object;
  my $alt_allele; ## disease associated allele from HGVS
  my $feat;       ## use stored *variation_features where possible

  if(defined $record->{feature_info}->{dbSNP}){
    foreach my $rs (@{ $record->{feature_info}->{dbSNP} }){
      my $rsID = 'rs'.$rs;
      # for haplotypes (multiple rsIDs) each rsID has it's own hgvs_g string
      if (defined $record->{feature_info}->{$rsID}){
        @{$record->{feature_info}}{qw/hgvs_g Chr start end gene/} = @{$record->{feature_info}->{$rsID}}{qw/hgvs_g Chr start end gene/};
      }

      # Get variant - insert if not found
      ($feature_object, $feat, $alt_allele) = get_variant($record, $rs);

      next unless defined $feature_object;

      add_clinvar_data($feature_object, $feat, $alt_allele, $record);

    }
  }
  elsif($try_match ){
    print "try to match ", $record->{Acc}, "\n" if $DEBUG == 1;
    ($feature_object, $feat, $alt_allele) = get_variant($record, undef);
    if (! defined $feature_object){
      print "no variation found for $record->{Acc}, $record->{clinvar_variant_id}\n" if $DEBUG == 1;
      return undef;
    }
    add_clinvar_data($feature_object, $feat, $alt_allele, $record);
  }
  elsif ($DEBUG == 1){
    print "Can't import ". $record->{Acc} ." as no dbSNP or location\n" if $DEBUG == 1;
  }
}

sub add_clinvar_data{
  my $variation_object = shift;
  my $feat             = shift;
  my $alt_allele       = shift;
  my $record           = shift;

  return undef if ! defined $variation_object;

  ## add phenotype evidence attrib to variation (either existing one or a newly inserted one)
  $variation_object->add_evidence_value($phenotype_evidence);
  $variation_adaptor->update($variation_object);

  ## add synonym
  add_synonyms($variation_object, $record->{clinvar_variant_id}, $source);
  add_synonyms($variation_object, $record->{Acc}, $source);
  add_synonyms($variation_object, $record->{feature_info}{$omimstr}, $omim_source) if defined $record->{feature_info}{$omimstr};

  ## add phenotype_feature & attrib (there may not be an alt_allele)

  import_phenotype_feature($record, $variation_object, 'Variation', $feat, $alt_allele);

}

sub import_phenotype_feature{

  my $record         = shift;
  my $feature_object = shift;
  my $type           = shift;
  my $feat           = shift;
  my $alt_allele     = shift;

  # deal with risk alleles longer than varchar 255
  if ( defined $alt_allele && length($alt_allele) > 255 ) {
    warn "alt_allele longer than 255 and will be removed from risk_allele: $alt_allele\n";
    $alt_allele = '';
  }

  ## deal with non-specified phenos
  $record->{disease} = $default_pheno unless $record->{disease} =~/\w+/;
  $record->{disease} = $default_pheno if $record->{disease} eq "not provided";
  $record->{disease} = $default_pheno if $record->{disease} eq "not specified";

  ## check for new clin_sig
  my ($found, $term) = check_known_clinsig($record->{Classifications});

  if (!$found && $term) {
    warn "clin_sig not known: >". $term ."< feature: ". $feature_object->name ."\n";
  }

  # Remove special characters from the phenotype description and submitter name
  $record->{disease} = decode_text( $record->{disease});
  $record->{disease} = replace_char( $record->{disease});

  # TODO: fix
  foreach my $sub (@{$record->{submitters}}){
    $sub = replace_char($sub);
  }

  ## look for existing or enter new phenotype object
  my $pheno="";
  eval {
    $pheno = get_phenotype($record->{disease}, $record->{ontology_accession});
  };
  if ($@) {
    warn "EXCEPTION in fetch of phenotype (RCV: ". $record->{Acc} .", VCV:". $record->{clinvar_variant_id}.") via SQL statement: $@";
    return;
  }

  for my $classification (keys %{$record->{Classifications}}) {
    my %attribs;
    my %submitter_ids;
    my $dna_type = undef;

    # Germline classification
    if ($classification eq "germline") {
      # avoids empty entry if explanation is missing for conflicting evidence
      $attribs{clinvar_clin_sig} = $record->{Classifications}->{germline}->{clin_sign};
      $dna_type = "Germline";
      $attribs{review_status} = $record->{Classifications}->{germline}->{status};

      if(defined $record->{Classifications}->{germline}->{date}){
        $attribs{DateLastEvaluated} = $record->{Classifications}->{germline}->{date};
      }
    }
    # Somatic classification
    elsif ($classification eq "somatic") {
      # avoids empty entry if explanation is missing for conflicting evidence
      $attribs{somatic_clin_sig} = $record->{Classifications}->{somatic}->{clin_sign};
      $dna_type = "Somatic";
      $attribs{somatic_status} = $record->{Classifications}->{somatic}->{status};

      if(defined $record->{Classifications}->{somatic}->{date}){
        $attribs{somatic_date} = $record->{Classifications}->{somatic}->{date};
      }
    }
    # Oncogenicity classification
    # In the phenotype feature this classification has DNA_type "Somatic"
    elsif ($classification eq "oncogenicity") {
      # avoids empty entry if explanation is missing for conflicting evidence
      $attribs{oncogenic_clin_sig} = $record->{Classifications}->{oncogenicity}->{clin_sign};
      $dna_type = "Somatic";
      $attribs{oncogenic_status} = $record->{Classifications}->{oncogenicity}->{status};

      if($record->{Classifications}->{oncogenicity}->{date}) {
        $attribs{oncogenic_date} = $record->{Classifications}->{oncogenicity}->{date};
      }
    }
    else {
      print "Warning: no classification type\n";
    }

    $attribs{external_id}      = $record->{Acc};
    $attribs{risk_allele}      = $alt_allele if defined $alt_allele && $alt_allele ne "-" && $alt_allele ne '';
    $attribs{associated_gene}  = $record->{feature_info}->{gene} if defined $record->{feature_info}->{gene};
    $attribs{MIM}              = $record->{feature_info}->{MIM} if defined $record->{feature_info}->{MIM};

    # Prepare the citations
    # Germline and somatic have publications in different places
    my $citations_data = $classification eq "germline" ? $record->{citations} : $record->{citations_somatic};
    $attribs{pubmed_id} = join(",", @{$citations_data}) if $citations_data && exists $citations_data->[0];    

    if (defined $attribs{pubmed_id} && length($attribs{pubmed_id}) > 255) {
      $attribs{pubmed_id} = substr($attribs{pubmed_id}, 0, 255);
      $attribs{pubmed_id} = substr($attribs{pubmed_id}, 0,rindex($attribs{pubmed_id}, ","));
    }

    $attribs{inheritance_type} = $record->{inheritance_type} if defined $record->{inheritance_type};
    $attribs{variation_names}  = join(",", @{$record->{feature_info}->{haplo}->{$feature_object->name}}) if defined $record->{feature_info}->{haplo} && defined $record->{feature_info}->{haplo}->{$feature_object->name};

    # Submitters
    my $submitters_data = $classification eq "germline" ? $record->{submitters} : $record->{submitters_somatic};
    foreach my $sub (@{$submitters_data}){
      ##enter submitter unless already available
      unless ($submitters->{$sub}){
        $submitters->{$sub} = add_submitter($sub);
        print "Added submitter $sub id " . $submitters->{$sub}  ." for sub\n" if $DEBUG == 1;
      }
      $submitter_ids{ $submitters->{$sub} } = 1;
    }
    $attribs{submitter_id} = join(",", keys %submitter_ids) if (keys %submitter_ids) >0;

    #if OMIM is a submitter save the variation in the ph_omim variation_set_variation
    if (exists $submitters->{$omimstr} &&
        exists $submitter_ids{$submitters->{$omimstr}} &&
        ! defined $omimVarSet{$feature_object->dbID()}){
      update_variation_set($feature_object->dbID());
    }

    # feat is the VariationFeature which contains the genome coordinates for this variation
    foreach my $feature (@{$feat}){
      print "entering phenotype_feature type : $type & object id: ".  $feature_object->dbID() . ", position " .$feature->seq_region_start() . "-".  $feature->seq_region_end() . "\n" if $DEBUG == 1;

      ## add evidence and variation_set_id to variation_feature
      if ($classification eq "germline") {
        update_variation_feature($feature, $attribs{clinvar_clin_sig});
      }

      my $phenofeat = Bio::EnsEMBL::Variation::PhenotypeFeature->new(
        -slice          => $feature->slice(),
        -start          => $feature->seq_region_start(),
        -strand         => $feature->seq_region_strand(),
        -end            => $feature->seq_region_end(),
        -phenotype      => $pheno,
        -is_significant => 1,
        -type           => $type,
        -object         => $feature_object,
        -source         => $source,
        -dna_type       => $dna_type,
        -attribs        => \%attribs
        );
        $pheno_feat_adaptor->store($phenofeat);

    }

  }

}


=head2 update_variation_set
- update variation set variation ph_omim for the new variation
=cut

sub update_variation_set{
  my $variation_id      = shift;

  my $vsv_ins_sth = $dbh->prepare(qq[ insert ignore into variation_set_variation
                                     (variation_id, variation_set_id)
                                      values (?,?)] );
  $vsv_ins_sth->execute( $variation_id, $omim_set_id );
  $omimVarSet{$variation_id} =1;

}

=head2 update_variation_feature
- update variation_feature evidence and variation_set_id
=cut

sub update_variation_feature{
  my $vf      = shift;
  my $clinsig = shift;

  my $vf_upd_sth = $dbh->prepare(qq[
      UPDATE variation_feature
      SET
        variation_set_id = CONCAT_WS(",",
                                     if(variation_set_id='',NULL,variation_set_id),
                                      ?),
        evidence_attribs = CONCAT_WS(",", evidence_attribs, '$pheno_evidence_id')
      WHERE variation_feature_id = ? ] );
  my $new_sets = "$pheno_set_id,$clinvar_set_id";
  if ($omimVarSet{$vf->variation->dbID}) {
    $new_sets .=",$omim_set_id";
  }
  $new_sets .=",$clinassoc_set_id " if ($clinsig =~ /pathogenic|drug-response|histocompatibility/i && $clinsig !~ /non/);
  $vf_upd_sth->execute( $new_sets,  $vf->dbID );

}


=head2 get_phenotype

 - retrieve existing or enter new phenotype object

=cut
sub get_phenotype{

  my $desc      = shift;
  my $accession = shift;

  $desc =~s /\\x2c|\\X2C/\,/g; ## decode commas
  $desc =~s /\'//g;            ## remove '

  my $pheno = $phenotype_adaptor->fetch_by_description( $desc )->[0];

  # try to fetch the phenotype using the phenotype cache
  if(!$pheno) {
    my $description = $desc;
    $description = clean_phenotype_desc($description);
    my @parse_desc = parse_line('\s+', 0, $description);
    my @parse_desc_sorted = sort @parse_desc;
    my $parse_desc_sorted_join = join(',', @parse_desc_sorted);
    if($phenotype_cache{$parse_desc_sorted_join}) {
      my @phenos = @{$phenotype_cache{$parse_desc_sorted_join}};
      # use the first match
      $pheno = $phenotype_adaptor->fetch_by_dbID($phenos[0]->{id});
    }
  }

  unless ( ref($pheno) eq 'Bio::EnsEMBL::Variation::Phenotype' ) {

    $pheno = Bio::EnsEMBL::Variation::Phenotype->new(-description => $desc );
    $pheno->class_attrib_id($non_specified_class_id) if exists $non_specified_pheno{$pheno};
    $phenotype_adaptor->store($pheno);
  }

  if($accession){
    $pheno->add_ontology_accession({ accession      => $accession,
                                     mapping_source => 'Data source',
                                     mapping_type   => 'is'
                                     } );
    $phenotype_adaptor->store_ontology_accessions($pheno);
  }

  return $pheno;
}

=head2 check_mtmp_table

 - check if MTMP_transcript_variation exists

=cut

sub check_mtmp_table {
  my $dba = shift;

  my $table_sth = $dba->dbc->prepare(qq[ show tables like 'MTMP_transcript_variation' ]);
  $table_sth->execute() or die ("ERROR: cannot run command to check MTMP_transcript_variation\n");

  my $check_table = $table_sth->fetchall_arrayref();
  die "ERROR: MTMP_transcript_variation table does not exist. Create table before ClinVar import\n" if(!$check_table->[0]);
}

=head2 get_submitter_ids

 - retrieve existing ids held for assertation submitters

=cut
sub get_submitter_ids{

  my $dba = shift;

  my %submitters;

  my $submitter_ext_sth = $dba->dbc->prepare(qq[ select submitter_id, description from submitter]);
  $submitter_ext_sth->execute() or die ("ERROR: cannot select from submitter table\n");

  my $dat = $submitter_ext_sth->fetchall_arrayref();
  foreach my $l (@{$dat}){
    $submitters{$l->[1]} = $l->[0];
  }

  return \%submitters;
}

=head2 add_submitter

 - add ids for new assertation submitters

=cut

sub add_submitter{

  my $submitter_name = shift;

  my $submitter_ins_sth = $dba->dbc->prepare(qq[ INSERT INTO submitter (description) values (?) ]);
  $submitter_ins_sth->execute($submitter_name);

  my $submitter_ext_sth = $dba->dbc->prepare(qq[ select submitter_id from submitter where description=?]);
  $submitter_ext_sth->execute($submitter_name) or die;
  my $dat = $submitter_ext_sth->fetchall_arrayref();
  print "added submitter : $submitter_name & $dat->[0]->[0]\n" if $DEBUG == 1;
  return $dat->[0]->[0];

}


=head2 get_variant 

  - look up or enter variation
  - returns variation & variation_feature objects & associated allele (from HGVS)
  - TODO: return $ref_allele

=cut
sub get_variant{

  my $record = shift;
  my $rs_id  = shift;

  my $dbSNP = defined $rs_id ? "rs" . $rs_id : "rs_NA";

  $record->{feature_info}->{hgvs_g} |= "";
  ## need alleles to input for standard variation & for risk allele attribute
  ## take from HGVS string; should there be multiple rsIDs for the same feature_info, they all will use the same one hgvs_g string
  if ($record->{feature_info}->{hgvs_g} !~ /.*:.*/) {
    defined $record->{feature_info}->{Chr} && $record->{feature_info}->{hgvs_g}  ne "" ?
      $record->{feature_info}->{hgvs_g} = $record->{feature_info}->{Chr} . ":" . $record->{feature_info}->{hgvs_g} :
      ($record->{feature_info}->{hgvs_g} = "unknown" . ":" . $record->{feature_info}->{hgvs_g} );
  }
  print "Seeking $dbSNP ". $record->{feature_info}->{hgvs_g} . " for RCV:". $record->{Acc} ." VCV:". $record->{clinvar_variant_id} ."\n" if $DEBUG ==1;

  my ($ref_allele, $alt_allele);
  eval{
    ($ref_allele, $alt_allele) = get_hgvs_alleles( $record->{feature_info}->{hgvs_g} ) unless $record->{feature_info}->{hgvs_g} eq "unknown:" ;
  };
  ## not printing bulky error message
  print "Problem finding allele for hgvs ". $record->{feature_info}->{hgvs_g} ." \n" unless $@ eq '';

  unless (defined $ref_allele && defined $alt_allele && $ref_allele ne $alt_allele){
    print "Ref + Alt alleles not available for $dbSNP (" . $record->{feature_info}->{hgvs_g} .
          ") RCV:". $record->{Acc} ." VCV:". $record->{clinvar_variant_id} ."\n";
  }

  if (defined $rs_id) {
    ## look for existing variation object to return
    my $var_ob = $variation_adaptor->fetch_by_name($dbSNP);

    if (defined $var_ob){
      my @features = $var_ob->get_all_VariationFeatures();
      return ($var_ob, @features , $alt_allele);
    }
    print "No record found for $dbSNP\n" if $DEBUG ==1;

    ## ClinVar can be ahead of dbSNP - is there enough data to create a variation record?

    if( !defined $record->{feature_info}->{hgvs_g} || !defined $record->{feature_info}->{Chr} ||
        !defined $ref_allele || !defined $alt_allele ||
        (defined $ref_allele && $ref_allele eq '') ||
        (defined $alt_allele && $alt_allele eq '') ||
        (defined $ref_allele && defined $alt_allele && $ref_allele eq $alt_allele) ) {
      print "Not entering new refSNP: $rs_id as no parsable HGVS available for alleles ($record->{feature_info}->{hgvs_g})\n" if $DEBUG == 1;
      return undef;
    }

    ## enter new records
    my ($new_var_ob, $var_feat) = enter_var($record,  $ref_allele, $alt_allele, $rs_id, $record->{inheritance_type});
    return ($new_var_ob,  $var_feat, $alt_allele);

  }

  ## no rs id - try to look up based on positon & alleles : shortest allele format
  if (defined $ref_allele && defined $alt_allele &&
      defined $record->{feature_info}->{Chr} && defined $record->{feature_info}->{start}) {
    # try to match existing rsIDs in variation DB on location and allele string
    my $location_str = $record->{feature_info}->{Chr} . ":".
                    $record->{feature_info}->{start} . ":".
                    $ref_allele."_".$alt_allele ;
    my $var_feats = $var_feat_adaptor->fetch_all_by_location_identifier($location_str);

    if ( defined $var_feats && scalar @{$var_feats} == 1 ){
      return ($var_feats->[0]->variation(), $var_feats, $alt_allele);
    } elsif (defined $var_feats && scalar @{$var_feats} > 1 && $DEBUG == 1) {
      # check that variation will always have same name for multiple variation features!
      print "multiple variation_features found for $location_str : \n";
      foreach my $vf (@$var_feats){
        print "vf:". $vf->name . " allele string: ". $vf->allele_string . "\n";
      }
    } elsif($DEBUG == 1) {
      print "no variation feature found via search by location identifier: $location_str\n";
    }
  } elsif($DEBUG == 1) {
    print "no ref or alt or chr or start, skipping lookup by location identifier\n";
  }

  #try to match by SPDI: longest allele format
  #returns undef for ref==alt SPDIs
  my ($var, $var_feats) = get_variant_via_SPDI ($record);
  return ($var, $var_feats, $alt_allele) unless !defined($var);

  ## try to match to multi-allelic variants by location
  return get_variant_by_slice_allele($record, $ref_allele, $alt_allele);
}


=head2 get_variant_via_SPDI

  - create a variation feature object based on ClinVar canonical SPDI
  - use the variation feature object to create a location identifier
  - search using the location identifier for any matching entries in the DB
  - returns variation & variation_feature objects & associated allele (same as caller)

=cut
sub get_variant_via_SPDI{
  my $record     = shift;

  return undef unless defined $record;

  if (defined $record->{feature_info}->{canonicalSPDI} ) {
    print "search via SPDI + location\n" if $DEBUG == 1;
    my $spdi_str = $record->{feature_info}->{canonicalSPDI};
    my $var_feat;
    eval {
      #NOTE: this var_feat will have source 'Parsed from SPDI notation' and dbID undef
      $var_feat = $var_feat_adaptor->fetch_by_spdi_notation($spdi_str);
    };
    if ($@) {
      warn "SPDI error: " . $@;
      return undef;
    }

    my $location_identifier = $var_feat->location_identifier();
    print "search SPDI_location_identifier: $location_identifier\n" if $DEBUG == 1;

    my $vfs_from_db = $var_feat_adaptor->fetch_all_by_location_identifier($location_identifier);
    # check for > 1 here
    if ( scalar @$vfs_from_db == 1 ){
      return ($vfs_from_db->[0]->variation(), $vfs_from_db);
    } elsif (scalar @$vfs_from_db > 1 && $DEBUG == 1) {
      # check that variation will always have same name for multiple variation features!
      print "multiple variation_features found for $spdi_str : \n";
      foreach my $vf (@$vfs_from_db){
        print "vf:". $vf->name . " allele string: ". $vf->allele_string . "\n";
      }
    } elsif($DEBUG == 1) {
      print "no variation feature found via search by SPDI + location identifier\n";
    }
  } elsif($DEBUG == 1) {
    print "no SPDI available for entry, skipping lookup by SPDI + location identifier\n";
  }
  return undef;
}

=head2 get_variant_by_slice_allele

  - look up variants at the same location as a ClinVar assertion
  - only handles single base variants
  - compare ClinVar allele to variation_feature.allele_string
  - returns variation & variation_feature objects & associated allele (same as caller)

=cut
sub get_variant_by_slice_allele{

  my $record    = shift;
  my $cv_ref    = shift;
  my $cv_allele = shift;

  return undef unless defined $cv_allele;

  if(!defined $record->{feature_info}->{start} || !defined $record->{feature_info}->{end} || ($record->{feature_info}->{start} != $record->{feature_info}->{end})) {
     print "too long for slice look up: ", $record->{feature_info}->{hgvs_g}, "\n" if $DEBUG == 1;
     return undef;
   }

  my $location_string = $record->{feature_info}->{Chr} .":" . $record->{feature_info}->{start}."-".$record->{feature_info}->{end};

  my $slice =  $slice_adaptor->fetch_by_region( 'chromosome',
                                                  $record->{feature_info}->{Chr},
                                                  $record->{feature_info}->{start},
                                                  $record->{feature_info}->{end} );
   unless ($slice){
     print "No slice found at $location_string\n" if $DEBUG == 1;
     return undef;
   }

  my $var_feats = $var_feat_adaptor->fetch_all_by_Slice($slice);
  if (scalar @{$var_feats} > 1 && $DEBUG == 1) {
     print "multiple variation_features found for slice $location_string - picking first with matching alleles\n";
  }

  foreach my $vf (@$var_feats){

    my %dballeles;
    my @alleles = split/\//, $vf->allele_string;
    my $db_ref = shift @alleles;

    foreach my $a(@alleles){ $dballeles{$a} = 1;}

    if( $dballeles{$cv_allele}){
      if ($cv_ref eq $db_ref){
        return ($vf->variation(), [$vf], $cv_allele);
      } else {
        warn "Variation feature with REF missmatch for $location_string, $cv_ref/$cv_allele\n";
      }
    }
  }

  ## if we get here there is no matching variant
  print "No var feats for this slice $location_string\n" if $DEBUG == 1;
  return undef;
}



=head2 enter_var

  - ClinVar releases more frequently than dbSNP so may have new data
  - enter variation, alleles and variation_feature
  - returns variation & variation_feature objects

=cut
sub enter_var{

  my $data       = shift;
  my $ref_allele = shift;
  my $alt_allele = shift;
  my $rs_id      = shift;
  my $inherit    = shift;

  unless (defined $rs_id) {
    print "ERROR: no rs_id for ". $data->{feature_info}->{hgvs_g} ."\n" if $DEBUG == 1;
    return undef;
  }

  my $dbSNP = 'rs'.$rs_id;

   unless (defined $ref_allele && defined $alt_allele){
     print "ERROR: missing alleles for $data->{feature_info}->{hgvs_g} / $dbSNP\n" if $DEBUG == 1;
     return undef;
   }

  my $somatic = 0;
  $somatic = 1 if defined $inherit && $inherit =~ /Somatic/i;

  my $allele_str = $ref_allele ."/". $alt_allele;
  my $vf_start = $data->{feature_info}->{start};
  my $vf_end = $data->{feature_info}->{end};

  ## get slice for new variationfeature
  ## strand not reported - assumes forward
  my $slice = $slice_adaptor->fetch_by_region( 'chromosome', $data->{feature_info}->{Chr} );

  # Get the allele_string, vf start/end based on the type of variant
  ($allele_str, $vf_start, $vf_end) = format_coords($ref_allele, $alt_allele, $allele_str, $vf_start, $vf_end, $data->{feature_info});

  # Skip duplications that don't have a start coinciding with the hgvs parsed elements
  if(!$allele_str || !$vf_start || !$vf_end) {
    return undef;
  }

  my $so_term = SO_variation_class($allele_str);

  # MTMP transcript variation - biotypes to skip
  my %biotypes_to_skip = (
    'lncRNA' => 1,
    'processed_pseudogene' => 1,
    'unprocessed_pseudogene' => 1,
  );

  my $var = Bio::EnsEMBL::Variation::Variation->new
    ( -name              => $dbSNP,
      -source            => $source,
      -is_somatic        => $somatic,
      -adaptor           => $variation_adaptor,
      -class_SO_term     => $so_term,
    );
  $variation_adaptor->store($var);

  my @vfs; #return vfs to attach phenotype feature to

  # Create the variation feature
  my $vf = Bio::EnsEMBL::Variation::VariationFeature->new
    (-start           => $vf_start,
     -end             => $vf_end,
     -strand          => 1,
     -slice           => $slice,
     -variation_name  => $dbSNP,
     -map_weight      => 1,
     -allele_string   => $allele_str,
     -variation       => $var,
     -source          => $source,
     -is_somatic      => 0,
     -adaptor         => $var_feat_adaptor,
     -class_so_term    => $so_term,
    );

  $var_feat_adaptor->store($vf);
  push @vfs, $vf;

  # If variant in Y PAR region store the two variation features
  #  One of the vf has already been checked
  #  The other vf can have different chr, start, end and/or alleles (from hgvs genomic)
  if($data->{feature_info}->{ypar} && $data->{feature_info}->{ypar}->{hgvs_g} ne "unknown:") {
    $slice = $slice_adaptor->fetch_by_region( 'chromosome', $data->{feature_info}->{ypar}->{Chr} );

    my ($ref_allele_ypar, $alt_allele_ypar) = get_hgvs_alleles( $data->{feature_info}->{ypar}->{Chr}.":".$data->{feature_info}->{ypar}->{hgvs_g} );
    $allele_str = $ref_allele_ypar ."/". $alt_allele_ypar;

    ($allele_str, $vf_start, $vf_end) = format_coords($ref_allele_ypar, $alt_allele_ypar, $allele_str, $data->{feature_info}->{ypar}->{start}, $data->{feature_info}->{ypar}->{end}, $data->{feature_info}->{ypar});

    # Skip duplications that don't have a start coinciding with the hgvs parsed elements
    if(!$allele_str || !$vf_start || !$vf_end) {
      return undef;
    }

    $so_term = SO_variation_class($allele_str);

    my $vf_par = Bio::EnsEMBL::Variation::VariationFeature->new(
        -start           => $vf_start,
        -end             => $vf_end,
        -strand          => 1,
        -slice           => $slice,
        -variation_name  => $dbSNP,
        -map_weight      => 1,
        -allele_string   => $allele_str,
        -variation       => $var,
        -source          => $source,
        -is_somatic      => 0,
        -adaptor         => $var_feat_adaptor,
        -class_so_term   => $so_term,
        );

      $var_feat_adaptor->store($vf_par);
      push @vfs, $vf_par;
  }

  if($insert_tv) {
    for my $var_feat (@vfs) {
      my $count_tv = 0;
      my $all_tv = $var_feat->get_all_TranscriptVariations();
      foreach my $tv (@{$all_tv}) {
        # Do not include upstream and downstream consequences
        next unless overlap($var_feat->start, $var_feat->end, $tv->transcript->start - 0, $tv->transcript->end + 0);

        $count_tv += 1;

        # only include valid biotypes in MTMP_transcript_variation
        my $write_biotype = $biotypes_to_skip{$tv->transcript->biotype} ? 0 : 1;

        # write to MTMP table if transcript is MANE (GRCh38)
        my $write_mane = $tv->transcript->is_mane ? 1 : 0;
        my $mtmp = $write_mane && $write_biotype;

        # MANE is not available for GRCh37
        if($assembly =~ /GRCh37/) {
          $mtmp = $write_biotype;
        }

        $tva->store($tv, $mtmp);
      }

      # get variation_feature_id
      my $vf_dbid = $var_feat->dbID;

      my $update_vf_smt = qq { UPDATE variation_feature
                              SET consequence_types = ?
                              WHERE variation_feature_id = ?
                            };

      # If variation_feature has no entry in transcript_variation then we need to set the consequence_types to default
      if(!$count_tv) {
        my $vf_sth = $dba->dbc()->prepare($update_vf_smt);
        $vf_sth->execute('intergenic_variant', $var_feat->dbID) or die "Error updating consequence_types to default in table variation_feature\n";
      }

      # By default group_concat has maximum length 1024
      # some variants have consequence_types longer than 1024
      my $stmt_len = qq {set session group_concat_max_len = 100000};
      my $sth_len = $dbh->prepare($stmt_len);
      $sth_len->execute();

      # Update consequence_types in variation_feature table
      my $tv_sth    = $dba->dbc()->prepare(qq[ SELECT variation_feature_id, GROUP_CONCAT(DISTINCT(consequence_types))
                                                        FROM transcript_variation
                                                        WHERE variation_feature_id = ?
                                                        GROUP BY variation_feature_id;
                                                  ]);

      $tv_sth->execute($vf_dbid) or die "Error selecting consequence_types from transcript_variation\n";
      my $data_tv = $tv_sth->fetchall_arrayref();
      if (defined $data_tv->[0]->[0]) {
        my $update_vf_sth = $dba->dbc()->prepare($update_vf_smt);
        $update_vf_sth->execute($data_tv->[0]->[1], $data_tv->[0]->[0]) or die "Error updating consequence_types in table variation_feature\n";
      }
    }
  }

  return ($var, \@vfs);
}

=head2 format_coords

  - Return the allele string, vf start and vf end based on the HGVS genomic
  - Return undef if a duplication doesn't have a start coinciding with the hgvs parsed elements
=cut
sub format_coords {
  my $ref_allele = shift;
  my $alt_allele = shift;
  my $allele_str = shift;
  my $vf_start = shift;
  my $vf_end = shift;
  my $data = shift;

  if ($ref_allele eq '-' && $allele_str ne '-/') {
    if ($vf_start == $vf_end){
      $vf_start = $vf_end+1 ;
    } elsif ( $vf_start +1 == $vf_end) {
      $vf_start = $data->{end};
      $vf_end = $data->{start};
    } elsif ($data->{hgvs_g} =~ /\d+dup/){ # "3-6dupGAGA"
      $vf_start = $vf_end + 1;
    }
  } elsif ($allele_str eq '-/' && $data->{hgvs_g} =~ /g\.\d+dup/){ #new style formated "g.123dup"
    my $ref_slice = $slice_adaptor->fetch_by_region( 'chromosome', $data->{Chr}, $data->{end}, $data->{end});
    my $ref_seq = $ref_slice->seq();
    $allele_str = "-/".$ref_seq;
    $vf_start= $vf_end + 1;
  } elsif ($allele_str eq '-/' && $data->{hgvs_g} =~ /g\.\d+\_\d+dup/){ #new style formated "g.123_126dup"
    my $ref_slice = $slice_adaptor->fetch_by_region( 'chromosome', $data->{Chr}, $data->{start}, $data->{end});
    my $ref_seq = $ref_slice->seq();
    $allele_str = "-/".$ref_seq;
    $vf_start= $vf_end + 1;
  } elsif ($alt_allele =~/^$ref_allele/ && $data->{hgvs_g} =~ m/\[/i ) {
    # check if HGVS was a repeat and reference already contains the a repeat and the number of inserted repeats has to be adjusted
    # eg. rs1555092425 (11:g.108282799A[5]) -> get_hgvs_alleles produces -> A(1) -> A(5) while correct is: ref AAA(3) -> alt AAAAA(5)
    my $refSlice = $slice_adaptor->fetch_by_region( 'chromosome', $data->{Chr}, $vf_start, $vf_start + length($alt_allele) - 1);
    my @refSeq = split //, $refSlice->seq;
    my @altSeq = split //, $alt_allele;
    my ($i,$match) = (0,1);

    # skip duplications that don't have start coinciding with hgvs parsed elements
    if ($refSeq[0] ne $altSeq[0]) {
      print "INFO: skipping variation (duplications) with ref[0] ne alt[0] for $data->{hgvs_g} \n" if $DEBUG == 1;
      return undef;
    }

    while ($i< scalar @altSeq && $match){
      $match=0 if ($refSeq[$i] ne $altSeq[$i]);
      $i++;
    }
    $ref_allele = substr($refSlice->seq, 0, $i-1);
    $allele_str = $ref_allele ."/". $alt_allele;
  }
  # case eg. NC_000009.12:g.137233961_137234061del
  elsif ($allele_str eq '/-' && $data->{hgvs_g} =~ /\d+_\d+del$/){
      my ($start, $end) = $data->{hgvs_g} =~ m/(\d+)_(\d+)del$/i;
      print "WARNING: HGVS contains different start/end for deletion ($data->{hgvs_g})\n" if ($start ne $vf_start || $end != $vf_end );
      my $refSlice = $slice_adaptor->fetch_by_region( 'chromosome', $data->{Chr}, $vf_start, $vf_end);
      $ref_allele = $refSlice->seq;
      $allele_str = $ref_allele ."/". $alt_allele;
  }
  # case eg. NC_000007.14:g.5978687_5978689delinsC
  elsif ($data->{hgvs_g} =~ /\d+_\d+delins[A-Z]+/i){
      my ($start, $end, $alt) = $data->{hgvs_g} =~ m/(\d+)_(\d+)delins([A-Z]+)$/i;
      print "WARNING: HGVS contains different start/end/alt for delins ($data->{hgvs_g})\n" if ($start ne $vf_start || $end != $vf_end  || $alt ne $alt_allele);
      my $refSlice = $slice_adaptor->fetch_by_region( 'chromosome', $data->{Chr}, $vf_start, $vf_end);
      $ref_allele = $refSlice->seq;
      $allele_str = $ref_allele ."/". $alt_allele;
  }

  return ($allele_str, $vf_start, $vf_end);
}


=head2 get_structural_variant

  - get structural variant record to use in phenotype feature
  - new data is not entered
  - returns variation & variation_feature objects

=cut
sub get_structural_variant{

  my $record = shift;

  ## sort to get struct var not genotype
  my @ids = sort @{$record->{feature_info}->{dbVar}};
  my $dbvar = pop @ids;

  ## look for existing structural variation object
  my $struct_var_ob = $structvar_adaptor->fetch_by_name($dbvar);

  unless (defined $struct_var_ob && $struct_var_ob ne ''){
    print "Not entering SV: $record->{feature_info}->{dbVar}->[0] as not in db \n" if $DEBUG == 1;
    return undef;
  }
  my @features = $struct_var_ob->get_all_StructuralVariationFeatures();

  return ($struct_var_ob, @features );
}

=head2 get_source

  - retrieve source object

=cut

sub get_source{
  my $file = shift;  ##/path_to_file/ClinVarRCVRelease_2024-01.xml
  my $source_name = shift;

  my $version = $file;
  # The file name can include the full path - remove everything until file name
  # File can be compressed (.xml.gz) or not (.xml)
  $version =~ s/.*ClinVarRCVRelease_|.xml|.gz|-//g;
  warn "VERSION looks suspicious: $version\n" if length($version) >6;

  my $source_adaptor  = $reg->get_adaptor('homo_sapiens', 'variation', 'source');

  if (defined $source_name) {
    my $source = $source_adaptor->fetch_by_name( $source_name );
    die ("Source information not held for ",$source_name, "\n") unless defined $source ;

    $source->version($version);
    $source_adaptor->update_version($source);

    return $source;
  }

  return;
}

=head2 save_source_version

  - updates source object in db

=cut

sub save_source_version {
  my $source = shift;

  my $source_adaptor  = $reg->get_adaptor('homo_sapiens', 'variation', 'source');
  $source_adaptor->update_version($source);

  return $source;
}

sub add_synonyms{

  my $var               = shift;
  my $synonym_accession = shift;
  my $synonym_source    = shift;

  if ($synonym_source->name eq 'ClinVar') {
    $synonym_accession =~ s/\.\d+$//; ##remove version for synonym
  }

  ## multiple rs id can be attached to the same ClinVar id - usually identical duplicates
  ## but we cannot support 2 variants with the same synonym/source
  my $syn_ins_sth = $dba->dbc->prepare(qq[ insert ignore into variation_synonym  
                                           (variation_id, source_id, name)  
                                           values (?,?,?)
                                          ]);

  $syn_ins_sth->execute($var->dbID(), $synonym_source->dbID(), $synonym_accession);


}

## get the identifier for a specific attrib_type and value
## eg. 'Phenotype_or_Disease' attrib of type 'evidence'
sub get_attrib_id {
  my ($type, $value) = @_;

  my $attrib_id = $attrib_adaptor->attrib_id_for_type_value($type, $value);

  if (!$attrib_id){
    die("Couldn't find the $value attrib of $type type\n");
  } else {
    return $attrib_id;
  }
}


## delete previous ClinVar data
## a few entries are withdrawn each time

sub clean_old_data{

  my $clinvar_source_id_sth = $dba->dbc->prepare(qq[ select source_id from source where name = 'ClinVar' ]);
  my $omim_source_id_sth = $dba->dbc->prepare(qq[ select source_id from source where name = 'OMIM' ]);

  $clinvar_source_id_sth->execute() or die "Error selecting ClinVar from source\n";
  my $clinvar_data = $clinvar_source_id_sth->fetchall_arrayref();
  my $clinvar_id = $clinvar_data->[0]->[0] ? $clinvar_data->[0]->[0] : die "Source ClinVar is not defined\n";
  
  $omim_source_id_sth->execute() or die "Error selecting OMIM from source\n";
  my $omim_data = $omim_source_id_sth->fetchall_arrayref();
  my $omim_id = $omim_data->[0]->[0] ? $omim_data->[0]->[0] : die "Source OMIM is not defined\n";

  print "Deleting old phenotype, synonym, citation and clinical_significance data\n";

  my $phenfeatat_del_sth = $dba->dbc->prepare(qq[ delete from phenotype_feature_attrib where phenotype_feature_id in
                                             ( select phenotype_feature_id from phenotype_feature where source_id = ? )
                                            ]);
  $phenfeatat_del_sth->execute($clinvar_id) or die "Error deleting entries from phenotype_feature_attrib for source ClinVar\n";

  my $phenfeat_del_sth = $dba->dbc->prepare(qq[ delete from phenotype_feature where source_id = ? ]);
  $phenfeat_del_sth->execute($clinvar_id) or die "Error deleting entries from phenotype_feature for source ClinVar\n";

  # remove previously inserted variation from ClinVar + associated records
  # remove from allele_synonym
  # first check which entries to remove
  print "Deleting old allele synonym\n";
  my $var_syn_sel_sth    = $dba->dbc()->prepare(qq[ select allele_synonym_id from allele_synonym where variation_id in
                                                    (select variation_id from variation where source_id = ? )
                                                ]);
  $var_syn_sel_sth->execute($clinvar_id) or die "Error selecting allele synonyms to be removed\n";
  my $data = $var_syn_sel_sth->fetchall_arrayref();
  # remove entries from allele_synonym
  foreach my $allele_syn (@{$data}){
    my $allele_syn_id = $allele_syn->[0];
    my $allele_syn_delete_sth = $dba->dbc()->prepare(qq[ delete from allele_synonym where allele_synonym_id = ? ]);
    $allele_syn_delete_sth->execute($allele_syn_id) or die "Could not delete entry allele_synonym_id = $allele_syn_id from allele_synonym\n";
  }

  if($insert_tv) {
    print "Deleting old variation feature, transcript variation and MTMP transcript variation\n";
  }
  else {
    print "Deleting old variation feature\n";
  }

  # remove from variation_feature
  my $tv_del_sth = $dba->dbc()->prepare(qq[ select variation_feature_id from variation_feature where source_id = ? ]);
  $tv_del_sth->execute($clinvar_id) or die "Error selecting variation_feature entries for source ClinVar\n";
  my $tv_to_del = $tv_del_sth->fetchall_arrayref();
  foreach my $to_del (@{$tv_to_del}){
    my $vf_id_del = $to_del->[0];
    my $del_vf_sth = $dba->dbc()->prepare(qq[ delete from variation_feature where variation_feature_id = ? ]);
    $del_vf_sth->execute($vf_id_del) or die "Could not delete entry with variation_feature_id = $vf_id_del from variation_feature\n";

    # remove from transcript_variation and MTMP_transcript_variation
    if($insert_tv) {
      my $del_sth = $dba->dbc()->prepare(qq[ delete from transcript_variation where variation_feature_id = ? ]);
      my $del_mtmp_sth = $dba->dbc()->prepare(qq[ delete from MTMP_transcript_variation where variation_feature_id = ? ]);
      $del_sth->execute($vf_id_del) or die "Could not delete entry with variation_feature_id = $vf_id_del from transcript_variation\n";
      $del_mtmp_sth->execute($vf_id_del) or die "Could not delete entry with variation_feature_id = $vf_id_del from MTMP_transcript_variation\n";
    }
  }

  print "Deleting old data from other tables\n";
  # remove from other tables
  my @tables = qw/variation_synonym failed_variation variation_set_variation variation_citation variation_hgvs/;
  for my $table (@tables){
    my $var_recs_del_sth = $dba->dbc->prepare(qq[ delete from $table where variation_id in
                                                    (select variation_id from variation where source_id = ?)
                                                 ]);
    $var_recs_del_sth->execute($clinvar_id) or die "Error deleting entries from $table for source ClinVar\n";
  }
  my $var_del_sth        = $dba->dbc->prepare(qq[ delete from variation where source_id = ? ]);
  my $synonym_del_sth    = $dba->dbc->prepare(qq[ delete from variation_synonym where source_id = ? ]);
  $var_del_sth->execute($clinvar_id) or die "Error deleting entries from variation for source ClinVar\n";
  $synonym_del_sth->execute($omim_id) or die "Error deleting entries from variation_synonym for source OMIM\n";

  my $var_updt_sth       = $dba->dbc->do(qq[ update variation set clinical_significance=NULL ]);
  my $var_feat_updt_sth  = $dba->dbc->do(qq[ update variation_feature set clinical_significance=NULL ]);

  print "Old data deleted!\n";
}

# Check if the clinical significance is known for each type of classification
sub check_known_clinsig {
  my $current_classifications = shift;

  my $found;
  my $term = undef;

  foreach my $type (keys %{$current_classifications}) {
    my $tmpClinsig = $current_classifications->{$type}->{clin_sign};

    # Specific checks for germline classification
    if ($type eq "germline") {
      $tmpClinsig  =~ s/\//\,/ ; # convert 'pathogenic/likely pathogenic' to 'pathogenic,likely pathogenic'
      $tmpClinsig  =~ s/\,\s+/\,/ ; # convert 'likely benign, other' to 'likely benign,other'
      #replace 'conflicting interpretations of pathogenicity' with 'uncertain significance'
      #for the purpose of variation, variation_feature clin_sig entry
      $tmpClinsig  =~ s/conflicting interpretations of pathogenicity/uncertain significance/g;
      #similar replace of 'association not found' to 'other'
      $tmpClinsig  =~ s/association not found/other/g;
      #remove comma from accepted values
      $tmpClinsig  =~ s/pathogenic,low penetrance/pathogenic low penetrance/;

      my @tmpClinsigs = split(/,\s*|;\s*/, $tmpClinsig);
      my $found = 1;
      foreach my $clinsig (@tmpClinsigs) {
        if (!defined $known_clinsig{$clinsig}) {
          $found = 0;
          $term = $clinsig;
        }
      }
    }
    # Somatic classifications and somatic oncogenicity classifications
    elsif ($type eq "somatic" || $type eq "oncogenicity") {
      my @tmpClinsigs = split(/,\s*|;\s*/, $tmpClinsig);
      my $found = 1;
      foreach my $clinsig (@tmpClinsigs) {
        my @clinsig_clean = split(/:/, $clinsig);
        if (!defined $known_clinsig_somatic{$clinsig_clean[0]}) {
          $found = 0;
          $term = $clinsig_clean[0];
        }
      }
    }
  }

  return $found, $term;
}

sub get_phenotype_cache {
  my %new_phenotype_cache;

  my %phenotype_cache = map {$_->description() => $_->dbID()} @{$phenotype_adaptor->fetch_all};

  foreach my $phenotype_desc (keys %phenotype_cache) {
    my $original_desc = $phenotype_desc;
    $phenotype_desc = clean_phenotype_desc($phenotype_desc);
    my @parse_desc = parse_line('\s+', 0, $phenotype_desc);
    my @parse_desc_sorted = sort @parse_desc;
    my $parse_desc_sorted_join = join(',', @parse_desc_sorted);

    my %aux;
    $aux{original_desc} = $original_desc;
    $aux{id} = $phenotype_cache{$original_desc};
    push @{$new_phenotype_cache{$parse_desc_sorted_join}}, \%aux;
  }

  return \%new_phenotype_cache;
}

sub clean_phenotype_desc {
  my ($description) = @_;

  $description =~ s/^\s+|\s+$//g; # Remove spaces at the beginning and the end of the description
  $description =~ s/\n//g; # Remove 'new line' characters
  $description =~ s/[\(\)]//g; # Remove characters ( )

  # Replace special characters in the phenotype description
  $description = replace_char($description);

  $description = lc($description);
  $description =~ s/\“//g;
  $description =~ s/\”//g;
  $description =~ s/\s+/ /g; # Remove extra space

  # remove a few extra characters
  $description =~ s/, / /g; # remove commas
  $description =~ s/-/ /g;
  $description =~ s/\'//g;

  return $description;
}

sub usage{

    die "\n\tUsage: import_clinvar_xml -data_file [ClinVar xml] -registry [registry file] -assembly [GRCh37/GRCh38]

\t\toptions: -structvar  (only import ClinVar statuses for structural variations)
\t\toptions: -clean      ( delete old phenotype_feature, phenotype_feature_attrib, variation(ClinVar), variation_feature (ClinVar), transcript_variation (ClinVar), variation_set_variation (sets: ClinVar, OMIM, All phenotypes set ), clinical_significance and synonym data)
\t\toptions: -insert_tv  (populate transcript_variation)
\t\toptions: -done_file  (tsv file containing RCVs to be skipped, expected format: header and 2nd column containing RCVs)
\n\n";

}
