/*===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
*/

/*==========================================================================
 * VDB Alignment types, functions and tables
 */
version 1;

include 'vdb/vdb.vschema';
include 'ncbi/seq.vschema';
include 'ncbi/sra.vschema';
include 'ncbi/stats.vschema';
include 'align/seq.vschema';
include 'align/qstat.vschema';
include 'sra/abi.vschema';
include 'align/mate-cache.vschema';


/*--------------------------------------------------------------------------
 * data types
 */

/* ploidy
 *  the number of sets of chromosomes in a cell
 *
 *  represented as an unsigned 32-bit value
 */
typedef U32 NCBI:align:ploidy;

/* ro_type
 *  the type of event causing ref-offset
 *
 *  represented as an unsigned 8-bit code; the named codes are listed below
 */
typedef U8 NCBI:align:ro_type;

const NCBI:align:ro_type NCBI:align:ro_normal            = 0; // normal ref-offset
const NCBI:align:ro_type NCBI:align:ro_soft_clip         = 1; // soft-clipping
const NCBI:align:ro_type NCBI:align:ro_intron_plus       = 2; // intron on positive strand
const NCBI:align:ro_type NCBI:align:ro_intron_minus      = 3; // intron on negative strand
const NCBI:align:ro_type NCBI:align:ro_intron_unknown    = 4; // intron strand not specified
const NCBI:align:ro_type NCBI:align:ro_complete_genomics = 5; // NOTE(review): meaning not documented here - presumably an offset specific to Complete Genomics data; confirm


/*--------------------------------------------------------------------------
 * functions
 */


/* cigar
 *  build the "cigar" description of an alignment, either as a string
 *  or as arrays of lengths
 *
 *  "ctype" [ CONST ] - chooses the flavor of the output
 *    0 => matches and mismatches are both written as M
 *    1 => matches are written as '=' and mismatches as 'X'
 *
 *  "has_mismatch" [ DATA ] - one boolean per base of the aligned sequence;
 *   false means the base aligned to the reference
 *
 *  "has_ref_offset" [ DATA ] - one boolean per base of the aligned sequence;
 *   true means an offset to the position on the reference exists for that base
 *
 *  "ref_offset" [ DATA ] - packed signed offsets to the aligned position,
 *   one entry for each true element of "has_ref_offset"
 *
 *  "ref_len" [ DATA, OPTIONAL ] - declared after the '*' marker
 *
 *  "read_len" [ DATA ] - version 2 only: elem_count defines PLOIDY and the
 *   values are the actual lengths of the reads in the spot
 *
 *  "ref_offset_type" [ DATA, OPTIONAL ] - version 2 only
 */
extern function ascii NCBI:align:cigar #1 < U8 ctype > (
    bool has_mismatch,
    bool has_ref_offset,
    I32 ref_offset,
    * INSDC:coord:len ref_len
) = ALIGN:cigar;

/* cigar, version 2
 *  the result type "T" is selected by the caller
 *
 * history:
 *  2.1 - added "ref_offset_type" optional parameter
 *  NB - reverting to 2.0 due to linker bug in older code
 */
extern function < type T > T NCBI:align:cigar #2.0 < U8 ctype > (
    bool has_mismatch,
    bool has_ref_offset,
    I32 ref_offset,
    INSDC:coord:len read_len,
    * INSDC:coord:len ref_len,
    NCBI:align:ro_type ref_offset_type
) = ALIGN:cigar_2;

/* edit_distance
 *  yields a U32 computed from the alignment descriptors
 *  NOTE(review): the metric itself lives in the external implementation
 *  and is not described in this file
 *
 *  version 1 - "has_mismatch", "has_ref_offset", "ref_offset"
 *  version 2 - adds "ref_len" and an optional "read_len"
 *  version 3 - takes "ref_offset_type" and a required "read_len";
 *              has no "ref_len"
 */
extern function U32 NCBI:align:edit_distance #1 (
    bool has_mismatch,
    bool has_ref_offset,
    I32 ref_offset
);

extern function U32 NCBI:align:edit_distance #2 (
    bool has_mismatch,
    bool has_ref_offset,
    I32 ref_offset,
    INSDC:coord:len ref_len,
    * INSDC:coord:len read_len
) = NCBI:align:edit_distance_2;

extern function U32 NCBI:align:edit_distance #3 (
    bool has_mismatch,
    bool has_ref_offset,
    I32 ref_offset,
    NCBI:align:ro_type ref_offset_type,
    INSDC:coord:len read_len
) = NCBI:align:edit_distance_3;

/* rna_orientation
 *  derives a strand indication from the values of column REF_OFFSET_TYPE
 *
 *  the result is '+' when the input holds
 *      one or more NCBI:align:ro_intron_plus, and
 *      no NCBI:align:ro_intron_minus
 *  the result is '-' when the input holds
 *      one or more NCBI:align:ro_intron_minus, and
 *      no NCBI:align:ro_intron_plus
 *  in every other case the result is an empty string
 */
extern function ascii NCBI:align:rna_orientation #1 (
    NCBI:align:ro_type ref_offset_type
);

/* project_from_sequence
 *  projects a column from SEQUENCE
 *
 *  "T" [ TYPE ] - type of the projected data
 *
 *  "col" [ CONST ] - the column to project; this file passes values such as
 *   'READ_FILTER' and '( INSDC:quality:phred ) QUALITY'
 *
 *  "seq_spot_id" [ DATA ]
 *
 *  "seq_read_id" [ DATA ]
 *
 *  NB - earlier notes also listed a "use_read_len" [ CONST ] ( whether to
 *   subset by read_len or by read_id only ); no such parameter is declared
 *   in this signature
 */
extern function < type T > T NCBI:align:project_from_sequence #1 < ascii col > (
    I64 seq_spot_id,
    INSDC:coord:one seq_read_id
) = ALIGN:project_from_sequence;


/* align_restore_read
 *  rebuilds a read by applying the alignment-based difference to ref_read
 *
 *  "ref_read" [ DATA ]
 *
 *  "has_mismatch" [ DATA ] together with "mismatch" [ DATA ]
 *
 *  "has_ref_offset" [ DATA ] together with "ref_offset" [ DATA ]
 *
 *  "read_len" [ DATA, OPTIONAL ] - declared after the '*' marker
 */
extern function INSDC:4na:bin NCBI:align:align_restore_read #1 (
    INSDC:4na:bin ref_read,
    bool has_mismatch,
    INSDC:4na:bin mismatch,
    bool has_ref_offset,
    I32 ref_offset
    * INSDC:coord:len read_len
) = ALIGN:align_restore_read;


/* raw_restore_read
 *  rebuilds a read by applying the alignment-based difference to align_read
 *
 *  "align_read" [ DATA ]
 *
 *  "ref_orientation" [ DATA ]
 */
extern function INSDC:4na:bin NCBI:align:raw_restore_read #1 (
    INSDC:4na:bin align_read,
    bool ref_orientation
) = ALIGN:raw_restore_read;


/* raw_restore_qual
 *  rebuilds quality by applying the alignment-based difference to align_qual
 *
 *  "align_qual" [ DATA ]
 *
 *  "ref_orientation" [ DATA ]
 */
extern function INSDC:quality:phred NCBI:align:raw_restore_qual #1 (
    INSDC:quality:phred align_qual,
    bool ref_orientation
);


/* ref_sub_select
 *  projects reference from sequence
 *
 *  "id" [ DATA ]
 *
 *  "start" [ DATA ] together with "len" [ DATA ]
 *
 *  "ref_ploidy" [ DATA, OPTIONAL ] - declared after the '*' marker
 */
extern function INSDC:4na:bin NCBI:align:ref_sub_select #1 (
    I64 id,
    INSDC:coord:zero start,
    INSDC:coord:len len
    * U32 ref_ploidy
) = ALIGN:ref_sub_select;


/* ref_restore_read
 *  rebuilds a read from central storage
 *
 *  "cmp_rd" [ DATA ]
 *
 *  "seq_id" [ DATA ]
 *
 *  "seq_start" [ DATA ] together with "seq_len" [ DATA ]
 */
extern function INSDC:4na:bin NCBI:align:ref_restore_read #1 (
    INSDC:4na:bin cmp_rd,
    ascii seq_id,
    INSDC:coord:one seq_start,
    INSDC:coord:len seq_len
) = ALIGN:ref_restore_read;


/* seq_restore_read
 *  projects a read from the align_deflate table to SEQUENCE
 *
 *  "cmp_rd" [ DATA ]
 *
 *  "align_id" [ DATA ]
 *
 *  "read_len" [ DATA ]
 *
 *  "rd_type" [ DATA ]
 */
extern function INSDC:4na:bin NCBI:align:seq_restore_read #1 (
    INSDC:4na:bin cmp_rd,
    I64 align_id,
    INSDC:coord:len read_len,
    INSDC:SRA:xread_type rd_type
) = ALIGN:seq_restore_read;


/* seq_restore_linkage_group
 *  projects LINKAGE_GROUP from the PRIMARY_ALIGNMENT table to SEQUENCE
 *
 *  "cmp_linkage_group" [ DATA ]
 *
 *  "align_id" [ DATA ]
 */
extern function ascii NCBI:align:seq_restore_linkage_group #1 (
    ascii cmp_linkage_group,
    I64 align_id
) = ALIGN:seq_restore_linkage_group;


/* generate_has_mismatch
 *  produces has-mismatch flags by actually comparing reference and subject;
 *  the ref_offsets move the comparison along the reference
 *
 *  "reference" [ DATA ]
 *
 *  "subject" [ DATA ]
 *
 *  "has_ref_offset" [ DATA ]
 *
 *  "ref_offset" [ DATA ]
 */
extern function bool NCBI:align:generate_has_mismatch #1 (
    INSDC:4na:bin reference,
    INSDC:4na:bin subject,
    bool has_ref_offset,
    I32 ref_offset
) = ALIGN:generate_has_mismatch;


/* generate_mismatch
 *  takes the same inputs as generate_has_mismatch and yields INSDC:4na:bin
 *  NOTE(review): presumably the mismatching bases themselves - confirm
 *
 *  "reference" [ DATA ]
 *
 *  "subject" [ DATA ]
 *
 *  "has_ref_offset" [ DATA ]
 *
 *  "ref_offset" [ DATA ]
 */
extern function INSDC:4na:bin NCBI:align:generate_mismatch #1 (
    INSDC:4na:bin reference,
    INSDC:4na:bin subject,
    bool has_ref_offset,
    I32 ref_offset
) = ALIGN:generate_mismatch;


/* ref_pos
 *  looks up the positions of the alignment on the reference,
 *  one per PLOIDY
 *
 *  "ref_id" [ DATA ]
 *
 *  "ref_start" [ DATA ] - one per PLOIDY
 */
extern function INSDC:coord:zero NCBI:align:ref_pos #1 (
    I64 ref_id,
    INSDC:coord:zero ref_start
);


/* ref_name
 *  looks up the name from the reference
 *
 *  "ref_id" [ DATA ]
 */
extern function ascii NCBI:align:ref_name #1 ( I64 ref_id );


/* ref_seq_id
 *  looks up the seq_id from the reference
 *
 *  "ref_id" [ DATA ]
 */
extern function ascii NCBI:align:ref_seq_id #1 ( I64 ref_id );


/* local_ref_id
 *  converts a global ref_start into a ref_id
 *
 *  "global_ref_start" [ DATA ]
 */
extern function I64 NCBI:align:local_ref_id #1 ( U64 global_ref_start );


/* local_ref_start
 *  converts a global ref_start into a zero-based local ref_start
 *  ( the companion of local_ref_id, which yields the ref_id )
 *
 *  "global_ref_start" [ DATA ]
 */
extern function INSDC:coord:zero NCBI:align:local_ref_start #1 ( U64 global_ref_start );

/* not_my_row
 *  drops the current row_id from the list
 *
 *  "list" [ DATA ] - row ids
 */
extern function
I64 NCBI:align:not_my_row #1 ( I64 list );

/* template_len
 *  computes the template length, i.e. the distance from the left-most to
 *  the right-most matching reference position
 *
 *  inputs come in pairs for the alignment and its mate:
 *  position, reference length and reference name, followed by "read_id"
 */
extern function
I32 NCBI:align:template_len #1 (
    INSDC:coord:zero pos,
    INSDC:coord:zero mate_pos,
    INSDC:coord:len reflen,
    INSDC:coord:len mate_reflen,
    ascii ref_name,
    ascii mate_ref_name,
    INSDC:coord:one read_id
);

/* get_sam_flags
 *  computes the flags that would appear in a SAM file
 *
 *  version 1 works with full Alignment databases; its first input is "read_len"
 *  version 2 works with Alignment databases that have had SEQUENCE removed;
 *   its first input is "mate_id"
 *
 *  "filter" [ DATA, OPTIONAL ] - declared after the '*' marker in both versions
 */
extern function
U32 NCBI:align:get_sam_flags #1 (
    INSDC:coord:len read_len,
    INSDC:coord:one read_id,
    I32 template_len,
    bool strand,
    bool mate_strand,
    bool is_secondary,
    * INSDC:SRA:read_filter filter
);

extern function
U32 NCBI:align:get_sam_flags #2 (
    I64 mate_id,
    INSDC:coord:one read_id,
    I32 template_len,
    bool strand,
    bool mate_strand,
    bool is_secondary,
    * INSDC:SRA:read_filter filter
) = NCBI:align:get_sam_flags_2;

/* get_left_soft_clip
 *  computes the length of the soft clip at the left edge of the alignment
 *
 *  version 2 additionally takes "read_len"
 */
extern function
INSDC:coord:len NCBI:align:get_left_soft_clip #1 (
    bool has_ref_offset,
    I32 ref_offset
);

extern function
INSDC:coord:len NCBI:align:get_left_soft_clip #2 (
    bool has_ref_offset,
    I32 ref_offset,
    INSDC:coord:len read_len
) = NCBI:align:get_left_soft_clip_2;

/* get_right_soft_clip
 *  computes the length of the soft clip at the right edge of the alignment
 *
 *  the versions differ only in their inputs:
 *   1 - "has_mismatch", "left_clip", optional "has_ref_offset"
 *   2 - "has_mismatch", "left_clip", "has_ref_offset", "ref_offset"
 *   3 - "has_ref_offset", "ref_offset", "ref_len"
 *   4 - "has_ref_offset", "ref_offset", "read_len", "ref_len"
 *   5 - "has_ref_offset", "ref_offset", "ref_offset_type", "read_len"
 */
extern function
INSDC:coord:len NCBI:align:get_right_soft_clip #1 (
    bool has_mismatch,
    INSDC:coord:len left_clip
    * bool has_ref_offset
);

extern function
INSDC:coord:len NCBI:align:get_right_soft_clip #2 (
    bool has_mismatch,
    INSDC:coord:len left_clip,
    bool has_ref_offset,
    I32 ref_offset
) = NCBI:align:get_right_soft_clip_2;

extern function
INSDC:coord:len NCBI:align:get_right_soft_clip #3 (
    bool has_ref_offset,
    I32 ref_offset,
    INSDC:coord:len ref_len
) = NCBI:align:get_right_soft_clip_3;

extern function
INSDC:coord:len NCBI:align:get_right_soft_clip #4 (
    bool has_ref_offset,
    I32 ref_offset,
    INSDC:coord:len read_len,
    INSDC:coord:len ref_len
) = NCBI:align:get_right_soft_clip_4;

extern function
INSDC:coord:len NCBI:align:get_right_soft_clip #5 (
    bool has_ref_offset,
    I32 ref_offset,
    NCBI:align:ro_type ref_offset_type,
    INSDC:coord:len read_len
) = NCBI:align:get_right_soft_clip_5;

/* get_clipped_cigar
 *  computes the CIGAR string with the soft clipping removed
 *
 *  version 2 also takes "cigar_len" and lets the caller pick the result
 *  type "T"; this file instantiates it with ascii and INSDC:coord:len
 */
extern function
ascii NCBI:align:get_clipped_cigar #1 ( ascii cigar );

extern function < type T >
T NCBI:align:get_clipped_cigar #2 (
    ascii cigar,
    INSDC:coord:len cigar_len
) = NCBI:align:get_clipped_cigar_2;

/* get_clipped_ref_offset
 *  computes the reference offsets with the soft clipping removed
 */
extern function
I32 NCBI:align:get_clipped_ref_offset #1 (
    bool has_ref_offset,
    I32 ref_offset
);

/* clip
 *  removes the soft clipped bases ( or qualities, or has_mismatch, et cetera )
 *  works with things whose lengths are the same as SEQUENCE.READ
 *
 *  "T" [ TYPE ] - type of "object", which is also the result type
 *
 *  version 2 additionally takes "read_len"
 */
extern function < type T >
T NCBI:align:clip #1 (
    T object,
    INSDC:coord:len left_clip,
    INSDC:coord:len right_clip
);

extern function < type T >
T NCBI:align:clip #2 (
    T object,
    INSDC:coord:len read_len,
    INSDC:coord:len left_clip,
    INSDC:coord:len right_clip
) = NCBI:align:clip_2;

/* get_ref_len
 *  computes the reference length from alignment information
 *
 *  "right_clip" [ DATA, OPTIONAL ] - declared after the '*' marker
 *
 *  NB - the second declaration carries the distinct name get_ref_len_2
 *   ( at version 2 ) rather than being a second version of get_ref_len;
 *   the table code in this file refers to it by that name
 */
extern function
INSDC:coord:len NCBI:align:get_ref_len #1 (
    bool has_ref_offset,
    I32 ref_offset,
    * INSDC:coord:len right_clip
);

extern function
INSDC:coord:len NCBI:align:get_ref_len_2 #2 (
    bool has_ref_offset,
    I32 ref_offset
) = NCBI:align:get_ref_len_2;


/* get_mismatch_read
 *  produces the READ with every matching base replaced by '='
 */
extern function
ascii NCBI:align:get_mismatch_read #1 (
    bool has_mismatch,
    INSDC:dna:text mismatch
);

/* get_ref_mismatch
 *  marks the positions of mismatches in reference space
 */
function
bool NCBI:align:get_ref_mismatch #1 (
    bool has_mismatch,
    bool has_ref_offset,
    I32 ref_offset,
    INSDC:coord:len ref_len
);

/* get_ref_insert
 *  marks the positions of inserts in reference space,
 *  i.e. an insert occurs between each pair of true values
 */
function
bool NCBI:align:get_ref_insert #1 (
    bool has_mismatch,
    bool has_ref_offset,
    I32 ref_offset,
    INSDC:coord:len ref_len
);

/* get_ref_delete
 *  marks the positions of deleted bases in reference space
 */
function
bool NCBI:align:get_ref_delete #1 (
    bool has_mismatch,
    bool has_ref_offset,
    I32 ref_offset,
    INSDC:coord:len ref_len
);

/* compress_quality
 *  takes phred qualities plus a "preserved" flag and yields phred qualities
 *  NOTE(review): presumably keeps only the values flagged as preserved - confirm
 */
extern function
INSDC:quality:phred NCBI:align:compress_quality #1 (
    INSDC:quality:phred quality,
    bool preserved
);

/* decompress_quality
 *  counterpart of compress_quality: takes compressed qualities plus the
 *  "preserved" flag and yields phred qualities
 *
 *  "restored_qual_value" [ CONST ]
 *  NOTE(review): presumably the value substituted where the original
 *  quality was not preserved - confirm
 */
extern function
INSDC:quality:phred NCBI:align:decompress_quality #1
    < INSDC:quality:phred restored_qual_value > (
    INSDC:quality:phred cmp_quality,
    bool preserved
);

/* make_read_start
 *  takes read lengths and yields zero-based coordinates
 *  NOTE(review): presumably the start of each read derived from the
 *  preceding lengths - confirm
 */
extern function
INSDC:coord:zero NCBI:align:make_read_start #1 ( INSDC:coord:len read_len );

/* make_cmp_read_desc
 *  decides whether each element of "operand" is aligned by inspecting
 *  the corresponding element of "align_id"
 *
 *  unaligned elements of operand are zeroed out; when "invert" is true
 *  the aligned elements are zeroed out instead
 *
 *  "T" [ TYPE ] - type of operand
 *
 *  "invert" [ CONST ] - if true, reverses the choice of which elements
 *  are zeroed out
 *
 *  "operand" [ DATA ] - uncompressed data
 *
 *  "align_id" [ DATA ] - indication of alignment
 */
extern function < type T >
T NCBI:align:make_cmp_read_desc #1 < bool invert > (
    T operand,
    I64 align_id
);

/* seq_construct_read
 *  puts a read together from its aligned and unaligned parts
 *
 *  "T" [ TYPE ] - type of both parts and of the result
 *
 *  each part is accompanied by its own read lengths
 */
extern function < type T > T NCBI:align:seq_construct_read #1 (
    T aligned,
    INSDC:coord:len aligned_read_len,
    T unaligned,
    INSDC:coord:len unaligned_read_len
);

/* get_mate_align_id
 *  takes a spot id and yields an I64
 *  NOTE(review): by its name, the alignment id of the mate - confirm
 */
extern function
I64 NCBI:align:get_mate_align_id #1 ( I64 spot_id );

/*--------------------------------------------------------------------------
 * tables
 */


/* ref_block_cmn
 *  common implementation ancestor for reference block
 *
 *  declares the reference-related columns shared by inheriting tables.
 *  the productions out_ref_id, out_ref_start and out_global_ref_start
 *  are used here but not defined here; global_ref_block and
 *  local_ref_block supply out_ref_id and out_ref_start.
 *  out_has_ref_offset and out_ref_offset are likewise expected from an
 *  inheriting table.
 */
table NCBI:align:tbl:ref_block_cmn #1.0.0
{
    // REF_TABLE names the reference table: read from the "CONFIG/REF_TABLE"
    // metadata node, with the literal 'REFERENCE' as the alternative
    readonly column ascii REF_TABLE
        = < ascii > meta:read < "CONFIG/REF_TABLE" > ()
        | < ascii > echo < 'REFERENCE' > ();

    // REF_ID is rowid in Reference Table REF_TABLE
    extern column I64 REF_ID
        = out_ref_id;

    // this is a redefinition of REF_START
    // REF_START is the offset within REFERENCE.READ
    extern column INSDC:coord:zero REF_START
        = out_ref_start;

    // global REF_START
    extern column U64 GLOBAL_REF_START
        = out_global_ref_start;

    // REF_LEN the length of a read projection on reference
    // computed form: get_ref_len_2 is listed first, get_ref_len second
    INSDC:coord:len out_ref_len_internal
        = NCBI:align:get_ref_len_2 ( out_has_ref_offset, out_ref_offset )
        | NCBI:align:get_ref_len   ( out_has_ref_offset, out_ref_offset );

    // the physically stored .REF_LEN is listed ahead of the computed value
    INSDC:coord:len out_ref_len
        = .REF_LEN
/*      | NCBI:align:get_ref_len ( out_has_ref_offset, out_ref_offset, out_right_clip ) */
        | out_ref_len_internal;

    physical column < INSDC:coord:len > izip_encoding .REF_LEN = REF_LEN;
    extern column INSDC:coord:len REF_LEN = out_ref_len;

    // REF_ORIENTATION - relative orientation of original raw read to the reference
    // false -> same orientation, true -> opposite orientation
    // alignment and reference are always in the same orientation
    extern column bool_encoding REF_ORIENTATION;

    // REF_PLOIDY
    extern column < U32 > izip_encoding REF_PLOIDY;

    /* REF_POS
     *  per PLOIDY
     */
    readonly column INSDC:coord:zero REF_POS
        = NCBI:align:ref_pos ( out_ref_id, out_ref_start );

    /* REF_NAME
     *  the name of the reference
     */
    readonly column ascii REF_NAME
        = NCBI:align:ref_name ( out_ref_id );

    /* REF_SEQ_ID
     *  the seq_id of the reference, with an empty string as the alternative
     */
    readonly column ascii REF_SEQ_ID
        = NCBI:align:ref_seq_id ( out_ref_id )
        | < ascii > echo < '' > ();
};


/* global_ref_block
 *  reference block favoring global ref-start
 *
 *  stores GLOBAL_REF_START physically and derives the out_ref_id and
 *  out_ref_start productions required by ref_block_cmn from it
 */
table NCBI:align:tbl:global_ref_block #1.0.0
    = NCBI:align:tbl:ref_block_cmn #1.0.0
{
    // the stored global start is handed back as is
    U64 out_global_ref_start = .GLOBAL_REF_START;
    physical < U64 > izip_encoding .GLOBAL_REF_START = GLOBAL_REF_START;

    // local id and local start are both computed from the stored global start
    I64 out_ref_id = NCBI:align:local_ref_id ( .GLOBAL_REF_START );
    INSDC:coord:zero out_ref_start = NCBI:align:local_ref_start ( .GLOBAL_REF_START );
};


/* local_ref_block
 *  reference block favoring local ref-start
 *
 *  stores REF_ID and REF_START physically and hands them back unchanged
 *  as the out_ref_id and out_ref_start productions required by
 *  ref_block_cmn; out_global_ref_start is not defined in this table
 */
table NCBI:align:tbl:local_ref_block #1.0.0
    = NCBI:align:tbl:ref_block_cmn #1.0.0
{
    // row id of the reference, as stored
    I64 out_ref_id = .REF_ID;
    physical < I64 > izip_encoding .REF_ID = REF_ID;

    // start within the reference, as stored
    INSDC:coord:zero out_ref_start = .REF_START;
    physical < INSDC:coord:zero > izip_encoding .REF_START = REF_START;
};


/* align_cmn
 *  common interface and implementation for alignment object
 *
 *  several productions are used in this body without being defined in it
 *  and must come from a parent or an inheriting table, among them:
 *  out_is_secondary, out_has_mismatch, out_4na_bin, out_raw_read,
 *  out_mismatch_dna_text, out_cmp_quality, out_template_len,
 *  out_mate_align_id, out_mate_ref_id, out_mate_ref_pos, out_mate_ref_len
 *  and out_mate_ref_orientation
 *
 * History:
 *  2.1 - added REF_OFFSET_TYPE and RNA_ORIENTATION columns
 *        updated all cigar calculations
 */
table NCBI:align:tbl:align_cmn #2.1
    = NCBI:tbl:base_space_common #1.0.3
    , NCBI:SRA:tbl:stats #1.2.0
    , NCBI:align:tbl:ref_block_cmn #1.0.0
{
    // passed to get_sam_flags below
    bool is_secondary = out_is_secondary;
// temporary key
    extern column < U32 > izip_encoding TMP_KEY_ID;

    extern column <ascii> zip_encoding LINKAGE_GROUP;


/* Raw Sequence Block */
    // Points to sequence table, which may contain more information about the raw sequence.
    // row id in SEQUENCE table; 0 if not linked
    extern column < I64 > izip_encoding SEQ_SPOT_ID;

    // read number in SEQUENCE table; { SEQ_SPOT_ID, SEQ_READ_ID } is the unique link to the sequence
    extern column < INSDC:coord:one > izip_encoding SEQ_READ_ID;


/* Soft-Clipped data block */

    readonly column INSDC:coord:len LEFT_SOFT_CLIP
        = NCBI:align:get_left_soft_clip ( HAS_REF_OFFSET, REF_OFFSET, out_read_len );

    // alternatives are listed from the newest version of
    // get_right_soft_clip ( #5 ) down to #2
    INSDC:coord:len out_right_clip
        = NCBI:align:get_right_soft_clip #5 ( out_has_ref_offset, out_ref_offset, out_ro_type, out_read_len )
        | NCBI:align:get_right_soft_clip #4 ( out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len )
        | NCBI:align:get_right_soft_clip #3 ( out_has_ref_offset, out_ref_offset, out_ref_len )
        | NCBI:align:get_right_soft_clip #2 ( out_has_mismatch, LEFT_SOFT_CLIP, out_has_ref_offset, out_ref_offset );
    readonly column INSDC:coord:len RIGHT_SOFT_CLIP = out_right_clip;

    // clipped CIGARs: each string column has a companion *_LEN column
    // obtained from the same function instantiated with INSDC:coord:len
    readonly column ascii CLIPPED_CIGAR_LONG
        = < ascii > NCBI:align:get_clipped_cigar ( CIGAR_LONG, CIGAR_LONG_LEN );

    readonly column INSDC:coord:len CLIPPED_CIGAR_LONG_LEN
        = < INSDC:coord:len > NCBI:align:get_clipped_cigar ( CIGAR_LONG, CIGAR_LONG_LEN );

    readonly column ascii CLIPPED_CIGAR_SHORT
        = < ascii > NCBI:align:get_clipped_cigar ( CIGAR_SHORT, CIGAR_SHORT_LEN );

    readonly column INSDC:coord:len CLIPPED_CIGAR_SHORT_LEN
        = < INSDC:coord:len > NCBI:align:get_clipped_cigar ( CIGAR_SHORT, CIGAR_SHORT_LEN );

    bool out_clipped_has_mismatch
        = < bool > NCBI:align:clip (out_has_mismatch, out_read_len, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP);

    // text form: 0 and 1 are mapped to the characters '0' and '1'
    readonly column ascii CLIPPED_HAS_MISMATCH
        = < U8 , ascii > map < [ 0 , 1 ] , '01'  > ( out_clipped_has_mismatch );

    readonly column bool CLIPPED_HAS_MISMATCH = out_clipped_has_mismatch;

    bool out_clipped_has_ref_offset
        = < bool > NCBI:align:clip (HAS_REF_OFFSET, out_read_len, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP);

    // text form: 0 and 1 are mapped to the characters '0' and '1'
    readonly column ascii CLIPPED_HAS_REF_OFFSET
        = < U8 , ascii > map < [ 0 , 1 ] , '01'  > ( out_clipped_has_ref_offset );

    readonly column bool CLIPPED_HAS_REF_OFFSET = out_clipped_has_ref_offset;

    // TBD cannot be computed right unless HAS_MISMATCH and! READ_LEN is used
    // ( this is the one place that uses version 1 of clip, without read_len )
    readonly column INSDC:dna:text CLIPPED_MISMATCH
        = < INSDC:dna:text > NCBI:align:clip #1 ( out_mismatch_dna_text, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP);

    readonly column I32 CLIPPED_REF_OFFSET
        = NCBI:align:get_clipped_ref_offset ( HAS_REF_OFFSET, REF_OFFSET );

    readonly column INSDC:quality:phred CLIPPED_QUALITY
        = < INSDC:quality:phred > NCBI:align:clip (out_qual_phred, out_read_len, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP);

    readonly column INSDC:dna:text CLIPPED_READ
        = < INSDC:dna:text > NCBI:align:clip (READ, out_read_len, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP);

/* Sequence Block */

    extern column < NCBI:align:ploidy > izip_encoding PLOIDY;

    // Number of reads per spot; corresponds to the number of alternative alignments
    // all alternative alignments are computed against the same reference region
    // ( the stored .PLOIDY, with the constant 1 as the alternative )
    U32 out_nreads
        = .PLOIDY
        | < U32 > echo < 1 > ();

    // READ_START and READ_LEN are position and length of the sequence
    physical < INSDC:coord:zero > izip_encoding .READ_START = READ_START;
    INSDC:coord:zero out_read_start
        = .READ_START
        | < INSDC:coord:zero > echo < 0 > ();

    physical < INSDC:coord:len > izip_encoding .READ_LEN = READ_LEN;

    // row length of out_has_ref_offset serves as the alternative to a
    // stored .READ_LEN
    INSDC:coord:len align_spot_len = ( INSDC:coord:len ) row_len ( out_has_ref_offset );
    INSDC:coord:len out_read_len
        = .READ_LEN
        | align_spot_len;

    // associated qualities
    extern column INSDC:quality:phred CMP_QUALITY
        = .CMP_QUALITY
        | out_cmp_quality;
    physical column < INSDC:quality:phred > zip_encoding .CMP_QUALITY = CMP_QUALITY;

    // qualities projected from SEQUENCE and restored with the stored
    // orientation; the alternative is the constant 30, echoed against
    // out_4na_bin
    INSDC:quality:phred out_raw_qual = < INSDC:quality:phred >
        NCBI:align:project_from_sequence < '( INSDC:quality:phred ) QUALITY'> ( .SEQ_SPOT_ID, .SEQ_READ_ID );
    INSDC:quality:phred out_qual_phred
        = NCBI:align:raw_restore_qual ( out_raw_qual, .REF_ORIENTATION )
        | < INSDC:quality:phred > echo < 30 > ( out_4na_bin );
    readonly column INSDC:quality:text:phred_33 SAM_QUALITY = QUALITY ;

    // project read group and name
    ascii out_spot_group = < ascii > simple_sub_select < 'SEQUENCE','SPOT_GROUP'> (.SEQ_SPOT_ID);


    INSDC:SRA:spotid_t tmp_seq_spot_id
        = cast ( .SEQ_SPOT_ID )
        ;
    physical <ascii> zip_encoding .SEQ_NAME = SEQ_NAME;
    // SEQ_NAME: the stored name, then NAME selected from SEQUENCE, then the
    // spot id formatted with "%u"
    extern column ascii SEQ_NAME
        = .SEQ_NAME
        | < ascii > simple_sub_select < 'SEQUENCE','NAME'> (.SEQ_SPOT_ID)
        | sprintf < "%u" > ( tmp_seq_spot_id );

    // compute sam flags
    /* blows up parser: starts at schema-tbl.c:2138
    readonly column U32 SAM_FLAGS = NCBI:align:get_sam_flags(MATE_ALIGN_ID,
        .SEQ_READ_ID, out_template_len, REF_ORIENTATION,
        out_mate_ref_orientation, is_secondary);
    */
    INSDC:coord:len projected_read_len
        = < INSDC:coord:len > simple_sub_select < 'SEQUENCE', 'READ_LEN' > ( .SEQ_SPOT_ID );

    // version 1 relies on READ_LEN selected from SEQUENCE; version 2 takes
    // out_mate_align_id in its place
    readonly column U32 SAM_FLAGS
        = NCBI:align:get_sam_flags #1 (projected_read_len,
            .SEQ_READ_ID, out_template_len, REF_ORIENTATION,
            out_mate_ref_orientation, is_secondary, out_rd_filter)
        | NCBI:align:get_sam_flags #2 (out_mate_align_id,
            .SEQ_READ_ID, out_template_len, REF_ORIENTATION,
            out_mate_ref_orientation, is_secondary, out_rd_filter);

    ascii out_name_fmt = < ascii > echo < '$R' > ();

    // trimming: starts at 0 and spans align_spot_len
    INSDC:coord:zero trim_start
        = < INSDC:coord:zero > echo < 0 > ();
    INSDC:coord:len trim_len
        = align_spot_len;

    // label alternatives: the text "ploidy1", start 0, length 7
    ascii out_label
        = .LABEL
        | < ascii > echo < "ploidy1" > ();
    INSDC:coord:zero out_label_start
        = .LABEL_START
        | < INSDC:coord:zero > echo < 0 > ();
    INSDC:coord:len out_label_len
        = .LABEL_LEN
        | < INSDC:coord:len > echo < 7 > ();

    // read filter: stored value, then projection from SEQUENCE, then the
    // constant SRA_READ_FILTER_PASS echoed against out_read_len
    physical < INSDC:SRA:read_filter > zip_encoding .RD_FILTER = READ_FILTER;
    INSDC:SRA:read_filter out_rd_filter
        = .RD_FILTER
        | < INSDC:SRA:read_filter > NCBI:align:project_from_sequence < 'READ_FILTER' > ( .SEQ_SPOT_ID, .SEQ_READ_ID )
        | < INSDC:SRA:read_filter > echo < SRA_READ_FILTER_PASS > ( out_read_len );

    // platform: stored value, then PLATFORM selected from SEQUENCE, then
    // SRA_PLATFORM_UNDEFINED
    INSDC:SRA:platform_id out_platform
        = .PLATFORM
        | < INSDC:SRA:platform_id > simple_sub_select < 'SEQUENCE','PLATFORM'> (.SEQ_SPOT_ID)
        | < INSDC:SRA:platform_id > echo < SRA_PLATFORM_UNDEFINED > ();

    U8 out_alignment_count = <U8> NCBI:align:project_from_sequence < 'ALIGNMENT_COUNT' > ( .SEQ_SPOT_ID, .SEQ_READ_ID );

    /* out_read_type
     *  set to SRA_READ_TYPE_FORWARD + SRA_READ_TYPE_BIOLOGICAL
     *  which has a constant value of 3
     */
    INSDC:SRA:xread_type out_read_type
        = < INSDC:SRA:xread_type > echo < 3 > ( out_read_len );

    // stats inputs
    bool in_stats_bin = HAS_REF_OFFSET;

    INSDC:coord:len _alt_in_read_len
        = READ_LEN
        | ( INSDC:coord:len ) row_len #1 ( HAS_REF_OFFSET );

    INSDC:SRA:xread_type _alt_in_read_type
        = READ_TYPE
        | < INSDC:SRA:xread_type > echo < SRA_READ_TYPE_BIOLOGICAL > (_alt_in_read_len);

    readonly column ascii MISMATCH_READ
        = NCBI:align:get_mismatch_read ( out_has_mismatch, out_mismatch_dna_text );

/* Alignment block */

    // MAPQ - single value quality of the mapping; the scale is submitter specific
    extern column < I32 > izip_encoding MAPQ;

    // mate information; every out_mate_* production and out_template_len
    // is defined outside this table body
    extern column INSDC:coord:zero MATE_REF_POS = out_mate_ref_pos;
    extern column INSDC:coord:len MATE_REF_LEN = out_mate_ref_len;
    extern column I64 MATE_REF_ID = out_mate_ref_id;
    extern column I32 TEMPLATE_LEN = out_template_len;
    extern column bool MATE_REF_ORIENTATION = out_mate_ref_orientation;
    readonly column ascii MATE_REF_NAME = NCBI:align:ref_name ( out_mate_ref_id );
    readonly column ascii MATE_REF_SEQ_ID = NCBI:align:ref_seq_id( out_mate_ref_id );
    readonly column U8 ALIGNMENT_COUNT  = out_alignment_count;


/********************************
* Columns representing CIGARs
********************************/


    // one value per base i.e. length is same as sum of READ_LEN
    // partitioned by READ_START and READ_LEN into alternative alignments
    // flags the shifts in reference position preceding the base
    // if sequence of a partitioned read starts with a ref_offset and one or more mismatches
    // then it represents a left soft clip
    // any run of mismatches at the end represents a right soft clip

    readonly column ascii HAS_REF_OFFSET =  < U8 , ascii > map < [ 0 , 1 ] , '01'  > ( out_has_ref_offset );
    extern column bool_encoding HAS_REF_OFFSET;
    bool out_has_ref_offset = .HAS_REF_OFFSET;

    // has number of elements equal to number of true elements in HAS_REF_OFFSET
    extern column < I32 > izip_encoding REF_OFFSET;
    I32 out_ref_offset = .REF_OFFSET;

    // the type of offset recorded in REF_OFFSET
    extern column < NCBI:align:ro_type > izip_encoding REF_OFFSET_TYPE;
    NCBI:align:ro_type out_ro_type = .REF_OFFSET_TYPE;

    // DISPLAY Columns

    readonly column I64 ALIGN_ID = row_id ();

    // get projection of the reference
    // ( text form is mapped from the INSDC:4na:bin form declared next )
    readonly column INSDC:dna:text REF_READ
        = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( REF_READ );

    // selected with .REF_PLOIDY when that alternative resolves, otherwise without it
    readonly column INSDC:4na:bin REF_READ
        = NCBI:align:ref_sub_select (out_ref_id, out_ref_start, out_ref_len, .REF_PLOIDY)
        | NCBI:align:ref_sub_select (out_ref_id, out_ref_start, out_ref_len );

    // same selection, but using out_ref_len_internal instead of out_ref_len
    INSDC:4na:bin ref_read_internal
        = NCBI:align:ref_sub_select (out_ref_id, out_ref_start, out_ref_len_internal, .REF_PLOIDY)
        | NCBI:align:ref_sub_select (out_ref_id, out_ref_start, out_ref_len_internal);

    // text forms of reads
    INSDC:dna:text out_dna_text
        = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin );
    readonly column INSDC:dna:text RAW_READ
        = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_raw_read );
    readonly column INSDC:4na:bin RAW_READ
        = out_raw_read;

    // CIGARs
    // LONG uses ctype 1 ( '=' and 'X' ), SHORT uses ctype 0 ( M ); each
    // column lists cigar #2 with progressively fewer optional arguments
    readonly column ascii CIGAR_LONG
        = < ascii > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len, out_ro_type)
        | < ascii > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len)
        | < ascii > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len)
        ;
    readonly column INSDC:coord:len CIGAR_LONG_LEN
        = < INSDC:coord:len > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len, out_ro_type)
        | < INSDC:coord:len > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len)
        | < INSDC:coord:len > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len)
        ;
    readonly column ascii CIGAR_SHORT
        = < ascii > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len, out_ro_type)
        | < ascii > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len)
        | < ascii > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len)
        ;
    readonly column INSDC:coord:len CIGAR_SHORT_LEN
        = < INSDC:coord:len > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len, out_ro_type)
        | < INSDC:coord:len > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len)
        | < INSDC:coord:len > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len)
        ;

    readonly column ascii RNA_ORIENTATION
        = NCBI:align:rna_orientation ( out_ro_type )
        ;

    // alternatives are listed from version 3 of edit_distance down to version 1
    readonly column U32 EDIT_DISTANCE
        = NCBI:align:edit_distance #3 (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ro_type, out_read_len)
        | NCBI:align:edit_distance #2 (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ref_len, out_read_len)
        | NCBI:align:edit_distance #2 (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ref_len)
        | NCBI:align:edit_distance #1 (out_has_mismatch, out_has_ref_offset, out_ref_offset);

    readonly column ascii HAS_MISMATCH = < U8 , ascii > map < [ 0 , 1 ] , '01'  > ( out_has_mismatch );

    // needed for backward compatibility
    readonly column ascii SEQ_SPOT_GROUP = out_spot_group;


/* These columns are purely informational. */
    // each one is offered both as text ( '0' / '1' characters ) and as bool
    bool out_ref_mismatch = NCBI:align:get_ref_mismatch ( out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ref_len );
    readonly column ascii REF_MISMATCH = < U8 , ascii > map < [ 0 , 1 ] , '01'  > ( out_ref_mismatch );
    readonly column bool REF_MISMATCH = out_ref_mismatch;

    bool out_ref_insert = NCBI:align:get_ref_insert ( out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ref_len );
    readonly column ascii REF_INSERT = < U8 , ascii > map < [ 0 , 1 ] , '01'  > ( out_ref_insert );
    readonly column bool REF_INSERT = out_ref_insert;

    bool out_ref_delete = NCBI:align:get_ref_delete ( out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ref_len );
    readonly column ascii REF_DELETE = < U8 , ascii > map < [ 0 , 1 ] , '01'  > ( out_ref_delete );
    readonly column bool REF_DELETE = out_ref_delete;

};


/* align_full
 *  alignment of an externally stored sequence against the reference
 *  the alignment transcript is calculated
 *
 * History:
 *  1.1 - respond to changes in base table
 */
table NCBI:align:tbl:align_full #1.1 =
    NCBI:align:tbl:align_cmn #2.1
{
    // constant "true" for every row of this table
    bool out_is_secondary
        = < bool > echo < true > ();

    // read in its raw form (orientation is restored):
    // RAW_READ selected from 'PRIMARY_ALIGNMENT' via .PRIMARY_ALIGNMENT_ID,
    // otherwise READ projected from the sequence via .SEQ_SPOT_ID / .SEQ_READ_ID
    INSDC:4na:bin out_raw_read
        = < INSDC:4na:bin > simple_sub_select < 'PRIMARY_ALIGNMENT', '( INSDC:4na:bin ) RAW_READ' > ( .PRIMARY_ALIGNMENT_ID )
        | < INSDC:4na:bin > NCBI:align:project_from_sequence < '( INSDC:4na:bin ) READ' > ( .SEQ_SPOT_ID, .SEQ_READ_ID );

    // aligned read: rebuilt from the reference plus the temporary mismatch data
    // (with or without .READ_LEN), or else derived from the raw read
    INSDC:4na:bin out_4na_bin
        = NCBI:align:align_restore_read ( ref_read_internal, out_has_mismatch, tmp_out_mismatch_4na_bin, out_has_ref_offset, out_ref_offset, .READ_LEN )
        | NCBI:align:align_restore_read ( ref_read_internal, out_has_mismatch, tmp_out_mismatch_4na_bin, out_has_ref_offset, out_ref_offset )
        | NCBI:align:raw_restore_read ( out_raw_read, .REF_ORIENTATION );

    // flags mismatches with the reference;
    // produced by actual comparison of REF_READ and READ.
    // TMP_HAS_MISMATCH is a hack to speed up retrieval during coverage recalculation
    column bool_encoding TMP_HAS_MISMATCH;
    bool out_has_mismatch
        = .TMP_HAS_MISMATCH
        | NCBI:align:generate_has_mismatch ( REF_READ, READ, out_has_ref_offset, out_ref_offset );
    readonly column bool HAS_MISMATCH = out_has_mismatch;

    // mismatching bases, generated from REF_READ and READ
    INSDC:4na:bin out_mismatch_4na_bin
        = NCBI:align:generate_mismatch ( REF_READ, READ, out_has_ref_offset, out_ref_offset );

    // binary form of the temporary mismatch column
    INSDC:4na:bin tmp_out_mismatch_4na_bin
        = < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( .TMP_MISMATCH );

    // temporary column for reference coverage calculation
    column < INSDC:dna:text > zip_encoding TMP_MISMATCH;

    // text form of the mismatches: stored temporary value first, generated value second
    INSDC:dna:text out_mismatch_dna_text
        = .TMP_MISMATCH
        | < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_mismatch_4na_bin );

    readonly column INSDC:dna:text MISMATCH = out_mismatch_dna_text;
    readonly column INSDC:4na:bin MISMATCH = out_mismatch_4na_bin;

    // mate information: the physically stored value is tried first, then a
    // sub-select keyed by MATE_ALIGN_ID
    // NOTE(review): the empty table name '' presumably designates this same table - confirm
    physical column < INSDC:coord:zero > izip_encoding .MATE_REF_POS = MATE_REF_POS;
    INSDC:coord:zero out_mate_ref_pos
        = .MATE_REF_POS
        | < INSDC:coord:zero > simple_sub_select < '', 'REF_POS' > ( MATE_ALIGN_ID );

    physical column < I64 > izip_encoding .MATE_REF_ID = MATE_REF_ID;
    I64 out_mate_ref_id
        = .MATE_REF_ID
        | < I64 > simple_sub_select < '', 'REF_ID' > ( MATE_ALIGN_ID );

    INSDC:coord:len out_mate_ref_len
        = < INSDC:coord:len > simple_sub_select < '', 'REF_LEN' > ( MATE_ALIGN_ID );

    // template length: stored value, or computed from this alignment and its mate
    physical column < I32 > izip_encoding .TEMPLATE_LEN = TEMPLATE_LEN;
    I32 out_template_len
        = .TEMPLATE_LEN
        | NCBI:align:template_len ( REF_POS, out_mate_ref_pos, out_ref_len, out_mate_ref_len, REF_NAME, MATE_REF_NAME, SEQ_READ_ID );

    physical column < bool > izip_encoding .MATE_REF_ORIENTATION = MATE_REF_ORIENTATION;
    bool out_mate_ref_orientation
        = .MATE_REF_ORIENTATION
        | < bool > simple_sub_select < '', 'REF_ORIENTATION' > ( MATE_ALIGN_ID );

    // MATE_ALIGN_ID is stored physically and read back as is
    I64 out_mate_align_id = .MATE_ALIGN_ID;
    physical column < I64 > izip_encoding .MATE_ALIGN_ID = MATE_ALIGN_ID;
    extern column I64 MATE_ALIGN_ID = out_mate_align_id;

    physical column < I64 > izip_encoding .PRIMARY_ALIGNMENT_ID = PRIMARY_ALIGNMENT_ID;

    I32 read_idx = < I32 > cast ( .SEQ_READ_ID );

    // id of the primary alignment: stored value, or the one recorded in
    // 'SEQUENCE' for this spot and read
    extern column I64 PRIMARY_ALIGNMENT_ID
        = .PRIMARY_ALIGNMENT_ID
        | < I64 > simple_sub_select < 'SEQUENCE', 'PRIMARY_ALIGNMENT_ID' > ( .SEQ_SPOT_ID, .SEQ_READ_ID );

};


/* compressed_by_reference
 *  alignment of an internally represented sequence against the reference
 *  the alignment transcript is stored
 *  the original sequence is reconstructed
 *
 * History:
 *  1.2 - respond to changes in base table
 */
table NCBI:align:tbl:compressed_by_reference #1.2 =
    NCBI:align:tbl:align_cmn #2.1
{
    // constant "false" for every row of this table
    bool out_is_secondary
        = < bool > echo < false > ();

    // flags mismatches with the reference:
    // one value per base, i.e. length is the same as the sum of READ_LEN,
    // partitioned by READ_START and READ_LEN into alternative alignments
    extern default column bool_encoding HAS_MISMATCH;
    bool out_has_mismatch = .HAS_MISMATCH;

    // number of elements equals the number of true elements in HAS_MISMATCH
    extern column INSDC:dna:text MISMATCH
    {
        read = out_mismatch_dna_text;
        validate = < INSDC:dna:text > compare ( in_mismatch_dna_text, out_mismatch_dna_text );
    }

    // incoming text is mapped to upper case ('.' becomes 'N') before conversion to binary
    INSDC:dna:text in_mismatch_dna_text
        = < INSDC:dna:text, INSDC:dna:text > map < '.acmgrsvtwyhkdbn', 'NACMGRSVTWYHKDBN' > ( MISMATCH );

    INSDC:4na:bin in_mismatch_4na_bin
        = < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( in_mismatch_dna_text );

    extern column < ascii > zip_encoding ALIGN_GROUP;

    // mismatches are stored in binary form
    physical column < INSDC:4na:bin > zip_encoding .MISMATCH = in_mismatch_4na_bin;

    INSDC:4na:bin out_mismatch_4na_bin = .MISMATCH;
    INSDC:dna:text out_mismatch_dna_text
        = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_mismatch_4na_bin );

    // mate alignment id: stored value, or derived from .SEQ_SPOT_ID
    I64 out_mate_align_id
        = .MATE_ALIGN_ID
        | NCBI:align:get_mate_align_id ( .SEQ_SPOT_ID );

    physical column < I64 > izip_encoding .MATE_ALIGN_ID = MATE_ALIGN_ID;
    extern column I64 MATE_ALIGN_ID = out_mate_align_id;

    // restore reads from alignment columns and the reference
    // optional .READ_LEN size defines PLOIDY
    INSDC:4na:bin out_4na_bin
        = NCBI:align:align_restore_read ( ref_read_internal, out_has_mismatch, .MISMATCH, out_has_ref_offset, out_ref_offset, .READ_LEN )
        | NCBI:align:align_restore_read ( ref_read_internal, out_has_mismatch, .MISMATCH, out_has_ref_offset, out_ref_offset );

    // restore reads to their raw form (orientation is restored)
    INSDC:4na:bin out_raw_read
        = NCBI:align:raw_restore_read ( out_4na_bin, .REF_ORIENTATION );

    I64 primary_align_pair
        = < I64 > simple_sub_select < 'SEQUENCE', 'PRIMARY_ALIGNMENT_ID' > ( .SEQ_SPOT_ID );

    // mate information is obtained by sub-select keyed by MATE_ALIGN_ID
    // NOTE(review): the empty table name '' presumably designates this same table - confirm
    I64 out_mate_ref_id
        = < I64 > simple_sub_select < '', 'REF_ID' > ( MATE_ALIGN_ID );
    bool out_mate_ref_orientation
        = < bool > simple_sub_select < '', 'REF_ORIENTATION' > ( MATE_ALIGN_ID );
    INSDC:coord:zero out_mate_ref_pos
        = < INSDC:coord:zero > simple_sub_select < '', 'REF_POS' > ( MATE_ALIGN_ID );
    INSDC:coord:len out_mate_ref_len
        = < INSDC:coord:len > simple_sub_select < '', 'REF_LEN' > ( MATE_ALIGN_ID );

    readonly column U32 MATE_EDIT_DISTANCE
        = < U32 > simple_sub_select < '', 'EDIT_DISTANCE' > ( MATE_ALIGN_ID );
    readonly column ascii MATE_CIGAR_LONG
        = < ascii > simple_sub_select < '', 'CIGAR_LONG' > ( MATE_ALIGN_ID );
    readonly column ascii MATE_CIGAR_SHORT
        = < ascii > simple_sub_select < '', 'CIGAR_SHORT' > ( MATE_ALIGN_ID );
    readonly column INSDC:coord:len MATE_CIGAR_LONG_LEN
        = < INSDC:coord:len > simple_sub_select < '', 'CIGAR_LONG_LEN' > ( MATE_ALIGN_ID );
    readonly column INSDC:coord:len MATE_CIGAR_SHORT_LEN
        = < INSDC:coord:len > simple_sub_select < '', 'CIGAR_SHORT_LEN' > ( MATE_ALIGN_ID );

    // template length is always computed from this alignment and its mate
    I32 out_template_len
        = NCBI:align:template_len ( REF_POS, out_mate_ref_pos, out_ref_len, out_mate_ref_len, REF_NAME, MATE_REF_NAME, SEQ_READ_ID );
};


/* align_sorted
 *  deflated alignment data, sorted against the reference
 *  combines compressed_by_reference with global_ref_block
 *
 * History:
 *  1.2 - respond to changes in base table
 */
table NCBI:align:tbl:align_sorted #1.2 =
    NCBI:align:tbl:compressed_by_reference #1.2,
    NCBI:align:tbl:global_ref_block #1.0.0
{
    // default column limit: 128K
    column default limit = 131072;
};


/* align_unsorted
 *  deflated alignment data, unsorted
 *  combines compressed_by_reference with local_ref_block
 *
 * History:
 *  1.2 - respond to changes in base table
 */
table NCBI:align:tbl:align_unsorted #1.2 =
    NCBI:align:tbl:compressed_by_reference #1.2,
    NCBI:align:tbl:local_ref_block #1.0.0
{
    // default column limit: 128K
    column default limit = 131072;
};


/* align_mate_sorted
 *  combines align_full with global_ref_block
 *
 * History:
 *  1.1 - respond to changes in base table
 */
table NCBI:align:tbl:align_mate_sorted #1.1 =
    NCBI:align:tbl:align_full #1.1,
    NCBI:align:tbl:global_ref_block #1.0.0
{
    // default column limit: 128K
    column default limit = 131072;
};


/* align_mate_unsorted
 *  combines align_full with local_ref_block
 *
 * History:
 *  1.1 - respond to changes in base table
 */
table NCBI:align:tbl:align_mate_unsorted #1.1 =
    NCBI:align:tbl:align_full #1.1,
    NCBI:align:tbl:local_ref_block #1.0.0
{
    // default column limit: 128K
    column default limit = 131072;
};

/* align_allele
 *  alleles coverage extension of align_unsorted
 *
 * History:
 *  1.2 - respond to changes in base table
 */
table NCBI:align:tbl:align_allele #1.2 =
    NCBI:align:tbl:align_unsorted #1.2
{
    // the only addition to the base table: a list of alignment ids
    extern column < I64 > izip_encoding EVIDENCE_ALIGNMENT_IDS;

    /* disabled: constant quality production
    INSDC:quality:phred out_qual_phred
        = < INSDC:quality:phred > echo < 30 > ( out_4na_bin );
    */
};

/*--------------------------------------------------------------------------
 * seq
 *  alignment sequence table
 */
/* alignment_id_encoding
 *  physical encoding for I64 values:
 *  writing applies outlier_encode < 0 > and then izip,
 *  reading applies iunzip and then outlier_decode < 0 >
 */
physical
I64 NCBI:align:sorted:alignment_id_encoding #1.0
{
    decode
    {
        // undo the two encoding steps in reverse order
        I64 unzipped = iunzip ( @ );
        return < I64 > outlier_decode < 0 > ( unzipped );
    }

    encode
    {
        // outlier encoding first, integer compression second
        I64 outlier_encoded = < I64 > outlier_encode < 0 > ( @ );
        return izip ( outlier_encoded );
    }
}


table NCBI:align:tbl:seq #1.1
    = NCBI:tbl:base_space #2.0.3
    , NCBI:tbl:phred_quality #2.0.4
    , NCBI:align:tbl:cmp_base_space #1
    , NCBI:SRA:tbl:spotdesc #1.0.2
    , NCBI:SRA:tbl:stats #1.2.0
{
    // default column limit: 128K
    column default limit = 131072;

    // primary record in the alignment table (size of column is NREADS)
    // if sorted - a special encoding should be used
    extern column < I64 > izip_encoding PRIMARY_ALIGNMENT_ID;

    // no trimming: starts at 0 and spans _spot_len
    INSDC:coord:zero trim_start
        = < INSDC:coord:zero > echo < 0 > ();
    INSDC:coord:len trim_len
        = _spot_len;

    // size is NREADS
    extern column < U8 > zip_encoding ALIGNMENT_COUNT;

    // auto-generate name from row-id
    ascii out_name_fmt
        = < ascii > echo < '$R' > ();

    // temporary column
    extern column < U64 > izip_encoding TMP_KEY_ID;

    // restored READ
    INSDC:4na:bin out_dcmp_4na_bin
        = NCBI:align:seq_restore_read ( out_cmp_4na_bin, .PRIMARY_ALIGNMENT_ID, .READ_LEN, .READ_TYPE );

    extern column < U64 > izip_encoding TI;

    extern column < ascii > zip_encoding CMP_LINKAGE_GROUP;

    // restored LINKAGE_GROUP; the stored compressed value is the fallback
    readonly column ascii LINKAGE_GROUP
        = NCBI:align:seq_restore_linkage_group ( .CMP_LINKAGE_GROUP, .PRIMARY_ALIGNMENT_ID )
        | .CMP_LINKAGE_GROUP;
};


/* cs_seq
 *  color-space sequence table: declares the writable columns, the rules
 *  that pack CMP_CSREAD for storage, the rules that unpack it for reading,
 *  and the statistics triggers
 */
table NCBI:align:tbl:cs_seq #1.2
{
    /* ---- writable columns ---- */

    // compressed color-space read, read back through the rules below
    extern column INSDC:color:text CMP_CSREAD = out_cmp_color_text;

    extern column < INSDC:dna:text > zip_encoding CS_KEY;

    extern default column < INSDC:quality:phred > zip_encoding QUALITY;

    extern column < I64 > izip_encoding PRIMARY_ALIGNMENT_ID;

    extern column < U8 > zip_encoding ALIGNMENT_COUNT;

    extern column < INSDC:SRA:platform_id > zip_encoding PLATFORM;

    extern column < ascii > zip_encoding LABEL;
    extern column < INSDC:coord:zero > izip_encoding LABEL_START;
    extern column < INSDC:coord:len > izip_encoding LABEL_LEN;

    extern column < INSDC:SRA:xread_type > zip_encoding READ_TYPE;
    extern column < INSDC:coord:zero > izip_encoding READ_START;
    extern column < INSDC:coord:len > izip_encoding READ_LEN;
    extern column < INSDC:SRA:read_filter > zip_encoding READ_FILTER;

    extern column < U64 > izip_encoding TMP_KEY_ID;

    extern column < ascii > zip_encoding SPOT_GROUP;

    extern column < U64 > izip_encoding TI;

    /* ---- writing rules ---- */

    // text -> x2cs binary
    INSDC:x2cs:bin in_cmp_x2cs_bin
        = < INSDC:color:text, INSDC:x2cs:bin > map < INSDC:x2cs:map:CHARSET, INSDC:x2cs:map:BINSET > ( CMP_CSREAD );

    // main stream: the fifth BINSET entry is written as 0
    INSDC:2cs:bin in_cmp_2cs_bin
        = < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( in_cmp_x2cs_bin );

    // alternate stream: only the fifth BINSET entry survives, as 4
    INSDC:x2cs:bin in_cmp_alt_x2cs_bin
        = < INSDC:x2cs:bin, INSDC:x2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 0, 0, 0, 4 ] > ( in_cmp_x2cs_bin );

    // main stream is stored packed
    physical column INSDC:2cs:packed .CMP_CSREAD
        = ( INSDC:2cs:packed ) pack ( in_cmp_2cs_bin );

    // alternate stream is stored trimmed
    physical column < INSDC:x2cs:bin > zip_encoding .CMP_ALTCSREAD
        = < INSDC:x2cs:bin > trim < ALIGN_LEFT, 0 > ( in_cmp_alt_x2cs_bin );

    /* ---- reading rules ---- */

    // physical sources
    INSDC:2cs:packed phys_cmp_2cs_packed = .CMP_CSREAD;
    INSDC:x2cs:bin phys_cmp_alt_x2cs_bin = .CMP_ALTCSREAD;
    INSDC:2cs:packed phys_2cs_packed = .CSREAD;
    INSDC:x2cs:bin phys_alt_x2cs_bin = .ALTCSREAD;

    // unpacked main streams
    INSDC:2cs:bin out_cmp_2cs_bin
        = ( INSDC:2cs:bin ) unpack ( phys_cmp_2cs_packed );
    INSDC:2cs:bin out_2cs_bin
        = ( INSDC:2cs:bin ) unpack ( phys_2cs_packed );

    // main and alternate streams are or-ed together;
    // the main stream alone is the fallback
    INSDC:x2cs:bin out_cmp_x2cs_bin
        = ( INSDC:x2cs:bin ) < U8 > bit_or < ALIGN_RIGHT > ( out_cmp_2cs_bin, phys_cmp_alt_x2cs_bin )
        | ( INSDC:x2cs:bin ) out_cmp_2cs_bin;
    INSDC:x2cs:bin out_x2cs_bin
        = ( INSDC:x2cs:bin ) < U8 > bit_or < ALIGN_RIGHT > ( out_2cs_bin, phys_alt_x2cs_bin )
        | ( INSDC:x2cs:bin ) out_2cs_bin;

    // x2cs binary -> text
    INSDC:color:text out_cmp_color_text
        = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_cmp_x2cs_bin );
    INSDC:color:text out_color_text
        = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_x2cs_bin );

    /* ---- triggers from stats ---- */

    // inputs handed to the triggers
    INSDC:quality:phred in_qual_phred = QUALITY;
    INSDC:coord:len in_read_len = READ_LEN;
    INSDC:SRA:xread_type in_read_type = READ_TYPE;
    ascii in_spot_group = SPOT_GROUP;

    // the variant without in_spot_group is the fallback
    trigger meta_stats
        = NCBI:SRA:cmp_stats_trigger ( in_cmp_x2cs_bin, in_qual_phred, in_read_len, in_read_type, in_spot_group )
        | NCBI:SRA:cmp_stats_trigger ( in_cmp_x2cs_bin, in_qual_phred, in_read_len, in_read_type );
    trigger qual_stats
        = NCBI:SRA:phred_stats_trigger #1 ( in_qual_phred );

    extern column < ascii > zip_encoding CMP_LINKAGE_GROUP;

    // restored LINKAGE_GROUP; the stored compressed value is the fallback
    readonly column ascii LINKAGE_GROUP
        = NCBI:align:seq_restore_linkage_group ( .CMP_LINKAGE_GROUP, .PRIMARY_ALIGNMENT_ID )
        | .CMP_LINKAGE_GROUP;
};

/* view:cs_seq
 *  read-only layer over NCBI:align:tbl:cs_seq
 *  exposes READ (base space) and CSREAD (color space) in several
 *  representations, plus quality, length, trim and statistics columns.
 *  Most columns list the restored ("dcmp") production first and the
 *  production built from .CSREAD / .ALTCSREAD second.
 */
table NCBI:align:view:cs_seq #1.1 = NCBI:align:tbl:cs_seq #1.2
{
    // various READ columns
    default readonly column INSDC:dna:text READ
        = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_dcmp_4na_bin )
        | < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin )
        ;
    readonly column INSDC:4na:bin READ = out_dcmp_4na_bin | out_4na_bin;
    readonly column INSDC:4na:packed READ = pack ( out_dcmp_4na_bin ) | pack ( out_4na_bin );
    readonly column INSDC:x2na:bin READ = out_dcmp_x2na_bin | out_x2na_bin;
    readonly column INSDC:2na:bin READ = out_dcmp_2na_bin | out_2na_bin;
    INSDC:2na:bin out_dcmp_2na_bin
        = < INSDC:x2na:bin, INSDC:2na:bin > map < INSDC:x2na:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_dcmp_x2na_bin )
        ;
    INSDC:2na:bin out_2na_bin
        = < INSDC:x2na:bin, INSDC:2na:bin > map < INSDC:x2na:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_x2na_bin )
        ;
    readonly column INSDC:2na:packed READ = pack ( out_dcmp_2na_bin ) | pack ( out_2na_bin );

    // decompression in base space
    // cmp_read_len / cmp_read_start describe the layout of the compressed read
    INSDC:coord:len cmp_read_len
        = < INSDC:coord:len > NCBI:align:make_cmp_read_desc #1 < true > ( .READ_LEN, .PRIMARY_ALIGNMENT_ID )
        ;
    INSDC:coord:zero cmp_read_start
        = NCBI:align:make_read_start #1 ( cmp_read_len )
        ;
    // color space -> base space, for the compressed and the plain stream
    INSDC:x2na:bin out_cmp_x2na_bin
        = NCBI:dna_from_color #1 ( out_cmp_x2cs_bin, cmp_read_start, cmp_read_len, .CS_KEY, color_matrix )
        ;
    INSDC:x2na:bin out_x2na_bin
        = NCBI:dna_from_color #1 ( out_x2cs_bin, .READ_START, .READ_LEN, .CS_KEY, color_matrix )
        ;
    INSDC:4na:bin out_cmp_4na_bin
        = < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( out_cmp_x2na_bin )
        ;
    INSDC:4na:bin out_4na_bin
        = < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( out_x2na_bin )
        ;
    // full read restored from the compressed read and the primary alignments
    INSDC:4na:bin out_dcmp_4na_bin
        = NCBI:align:seq_restore_read ( out_cmp_4na_bin, .PRIMARY_ALIGNMENT_ID, .READ_LEN, .READ_TYPE )
        ;


    // various CSREAD columns
    default readonly column INSDC:color:text CSREAD
        = < INSDC:x2cs:bin, INSDC:color:text > map <  INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_dcmp_x2cs_bin )
        | out_color_text;
    readonly column INSDC:x2cs:bin CSREAD = out_dcmp_x2cs_bin | out_x2cs_bin;
    readonly column INSDC:2cs:bin CSREAD = out_dcmp_2cs_bin | out_2cs_bin;
    INSDC:2cs:bin out_dcmp_2cs_bin
        = < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_dcmp_x2cs_bin )
        ;
    // both alternatives must yield packed data, as for the packed READ columns;
    // out_2cs_bin itself is the unpacked INSDC:2cs:bin form
    readonly column INSDC:2cs:packed CSREAD = pack ( out_dcmp_2cs_bin ) | pack ( out_2cs_bin );


    // decompression in color space
    INSDC:x2na:bin out_dcmp_x2na_bin
        = < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_dcmp_4na_bin )
        ;
    INSDC:x2cs:bin out_dcmp_x2na_x2cs_bin
        = NCBI:color_from_dna #1 ( out_dcmp_x2na_bin, .READ_START, .READ_LEN, .CS_KEY, color_matrix )
        ;
    INSDC:coord:len aligned_read_len
        = < INSDC:coord:len > NCBI:align:make_cmp_read_desc #1 < false > ( .READ_LEN, .PRIMARY_ALIGNMENT_ID )
        ;
    INSDC:x2cs:bin out_dcmp_x2cs_bin
        = < INSDC:x2cs:bin > NCBI:align:seq_construct_read #1 ( out_dcmp_x2na_x2cs_bin, .READ_LEN, out_cmp_x2cs_bin, cmp_read_len )
        ;

    // CS_NATIVE - dynamic
    // true when the stored compressed color-space read of the row is not empty:
    // the row length is clipped to [ 0, 1 ] and then mapped to a bool
    U32 cmp_csread_row_len
        = row_len #1 ( phys_cmp_2cs_packed )
        ;
    U32 cmp_csread_not_zero
        = < U32 > clip < 0, 1 > ( cmp_csread_row_len )
        ;
    readonly column bool CS_NATIVE
        = < U32, bool > map < [ 0, 1 ], [ false, true ] > ( cmp_csread_not_zero )
        ;

    // COLOR_MATRIX
    readonly column U8 COLOR_MATRIX
        = color_matrix
        ;
    U8 color_matrix
        = < U8 > echo < INSDC:color:default_matrix > ()
        ;

    // various QUALITY types
    // fallback: add the text offset ( 33 or 64 ) to the stored phred values
    readonly column INSDC:quality:text:phred_33 QUALITY
        = out_qual_text_phred_33
        | ( INSDC:quality:text:phred_33 ) < B8 > sum < 33 > ( .QUALITY );
    readonly column INSDC:quality:text:phred_64 QUALITY
        = out_qual_text_phred_64
        | ( INSDC:quality:text:phred_64 ) < B8 > sum < 64 > ( .QUALITY );

    // SPOT_LEN
    INSDC:coord:len spot_len
        = ( INSDC:coord:len ) row_len ( out_dcmp_4na_bin )
        | ( INSDC:coord:len ) row_len ( out_4na_bin )
        ;
    readonly column INSDC:coord:len SPOT_LEN = spot_len;

    // TRIM_START
    readonly column INSDC:coord:zero TRIM_START
        = < INSDC:coord:zero > echo < 0 > ()
        ;
    readonly column INSDC:coord:one TRIM_START
        = < INSDC:coord:one > echo < 1 > ()
        ;
    // TRIM_LEN
    readonly column INSDC:coord:len TRIM_LEN = spot_len;

    // MIN_SPOT_ID
    readonly column INSDC:SRA:spotid_t MIN_SPOT_ID
        = < INSDC:SRA:spotid_t > meta:value < "STATS/TABLE/SPOT_MIN" > ()
        ;
    // MAX_SPOT_ID
    readonly column INSDC:SRA:spotid_t MAX_SPOT_ID
        = < INSDC:SRA:spotid_t > meta:value < "STATS/TABLE/SPOT_MAX" > ()
        ;
    // SPOT_COUNT
    readonly column U64 SPOT_COUNT
        = < U64 > meta:value < "STATS/TABLE/SPOT_COUNT" > ()
        ;
    // BASE_COUNT
    U64 base_count
        = < U64 > meta:value < "STATS/TABLE/BASE_COUNT" > ()
        ;
    readonly column U64 BASE_COUNT = base_count;
    // BIO_BASE_COUNT
    readonly column U64 BIO_BASE_COUNT
        = < U64 > meta:value < "STATS/TABLE/BIO_BASE_COUNT" > ()
        ;
    // CMP_BASE_COUNT - falls back to BASE_COUNT when the metadata node cannot be read
    readonly column U64 CMP_BASE_COUNT
        = < U64 > meta:value < "STATS/TABLE/CMP_BASE_COUNT" > ()
        | base_count
        ;

    // various PLATFORM
    // TBD

    // SPOT_ID - the row id
    I64 rowid_64 = row_id ();
    readonly column INSDC:SRA:spotid_t SPOT_ID
        = cast ( rowid_64 )
        ;

    // NAME - SPOT_ID printed as an unsigned decimal
    readonly column ascii NAME
        = sprintf < "%u" > ( SPOT_ID )
        ;

};


/***********************************
* Reference table - stores reference sequences
* Sequences are divided into chunks. Two sequences never share a chunk.
* SEQ_LEN     - real size of a chunk; should never exceed MAX_SEQ_LEN when it is set
* READ        - inherited from NCBI:tbl:base_space
* CMP_READ, CMP_ALTREAD - inherited from NCBI:align:tbl:cmp_base_space
* SEQ_ID, SEQ_START, SEQ_LEN - inherited from NCBI:tbl:seqloc
* .skey contains NAME of the chunk - it corresponds to the actual name used in BAM (chr1, chr2, etc.)
*
* SEQ_START, SEQ_LEN, MAX_SEQ_LEN, SEQ_ID and rowlen(READ) interact in the following way:
* - SEQ_LEN < MAX_SEQ_LEN - should only happen on the last chunk of the sequence
* - .READ is absent - the sequence should be retrieved from external services by SEQ_ID, SEQ_START, SEQ_LEN
* - rowlen(.READ) = 0 && SEQ_START == 0 (used as flag) - the sequence is SEQ_LEN repetitions of 'N'
* - rowlen(.READ) = 0 && SEQ_START >= 1 - the sequence has to be fetched from external sources
* - 0 < rowlen(.READ) < SEQ_LEN - the sequence has to be filled with 'N's
*
***********************************/
table NCBI:align:tbl:reference #2
    = NCBI:align:tbl:cmp_base_space #1
    , NCBI:tbl:base_space #2.0.3
    , NCBI:tbl:seqloc #1
    , NCBI:SRA:tbl:stats #1.2.0
{
    // constant quality of 30, produced against the restored read
    INSDC:quality:phred out_qual_phred
        = < INSDC:quality:phred > echo < 30 > ( out_dcmp_4na_bin );

    // MAX_SEQ_LEN - should be a constant == static column
    extern column < U32 > izip_encoding MAX_SEQ_LEN;

    // indicates if the sequence has a circular structure
    // copied from refSeq
    extern column bool_encoding CIRCULAR;

    // make CS_KEY writable; lower-case input is stored as upper case
    INSDC:dna:text in_cs_key
        = < INSDC:dna:text, INSDC:dna:text > map < 'acgtn', 'ACGTN' > ( CS_KEY );
    physical column < INSDC:dna:text > zip_encoding .CS_KEY = in_cs_key;

    // spot length on input is SEQ_LEN
    U32 in_spot_len = SEQ_LEN;

    // alternates used when READ_LEN / READ_TYPE are not supplied
    INSDC:coord:len _alt_in_read_len
        = READ_LEN
        | SEQ_LEN;
    INSDC:SRA:xread_type _alt_in_read_type
        = READ_TYPE
        | < INSDC:SRA:xread_type > echo < SRA_READ_TYPE_BIOLOGICAL > ();

    // extra columns needed for CS conversion
    INSDC:coord:zero out_read_start
        = < INSDC:coord:zero > echo < 0 > ();
    INSDC:coord:len out_read_len
        = .SEQ_LEN;

    // NAME is written into and projected from the text index 'i_name'
    extern column utf8 NAME = out_spot_name_utf8;
    physical utf8 .NAME
        = idx:text:insert #1.0 < 'i_name' > ( NAME );
    utf8 out_spot_name_utf8
        = idx:text:project #1.0 < 'i_name' > ( .NAME );
    ascii out_spot_name
        = cast ( out_spot_name_utf8 );

    // no trimming: starts at 0 and spans base_space_spot_len
    INSDC:coord:zero trim_start
        = < INSDC:coord:zero > echo < 0 > ();
    INSDC:coord:len trim_len
        = base_space_spot_len;

    // constant label "reference": start 0, length 9
    ascii out_label
        = < ascii > echo < "reference" > ();
    INSDC:coord:zero out_label_start
        = < INSDC:coord:zero > echo < 0 > ();
    INSDC:coord:len out_label_len
        = < INSDC:coord:len > echo < 9 > ();

    // constant read description: one read per row
    U32 out_nreads
        = < U32 > echo < 1 > ();
    INSDC:SRA:xread_type out_read_type
        = < INSDC:SRA:xread_type > echo < 3 > ();
    INSDC:SRA:read_filter out_rd_filter
        = < INSDC:SRA:read_filter > echo < SRA_READ_FILTER_PASS > ();


// Columns of computed coverages by alignment

    // TBD: use percentiles instead of min/max?
    // maximum value, clipped at 255, of the coverage density for a chunk
    extern column < U8 > izip_encoding CGRAPH_HIGH;

    // minimum value, clipped at 255, of the coverage density for a chunk
    extern column < U8 > izip_encoding CGRAPH_LOW;

    // count of the number of mismatches in the chunk
    extern column < U32 > izip_encoding CGRAPH_MISMATCHES;

    // count of the number of inserts and deletes in the chunk
    extern column < U32 > izip_encoding CGRAPH_INDELS;

    // lists of row ids from alignment tables
    extern column < I64 > izip_encoding PRIMARY_ALIGNMENT_IDS;
    extern column < I64 > izip_encoding SECONDARY_ALIGNMENT_IDS;
    extern column < I64 > izip_encoding EVIDENCE_INTERVAL_IDS;

    // both OVERLAP_REF_* columns are arrays of three elements, matching the number of *_IDS columns above.
    // points back to an offset where the alignments to this chunk start
    extern column < INSDC:coord:zero > izip_encoding OVERLAP_REF_POS;
    // indicates the length of the longest tail of an alignment to this chunk which starts in previous chunks
    // if the value of an element in this column is zero, the corresponding value of OVERLAP_REF_POS is meaningless
    extern column < INSDC:coord:len > izip_encoding OVERLAP_REF_LEN;

    // mechanism to search for NAME
    readonly column vdb:row_id_range NAME_RANGE
        = idx:text:lookup #1.0 < 'i_name', 'QUERY_SEQ_NAME' > ();

    // fully instantiates READ
    INSDC:4na:bin out_dcmp_4na_bin
        = NCBI:align:ref_restore_read ( out_cmp_4na_bin, .SEQ_ID, .SEQ_START, .SEQ_LEN );
}

// THE DATABASES
// sorted alignments: both alignment tables use the *_sorted table types
database NCBI:align:db:alignment_sorted #1.3
{
    table NCBI:align:tbl:reference #2           REFERENCE;
    table NCBI:align:tbl:align_sorted #1.2      PRIMARY_ALIGNMENT;
    table NCBI:align:tbl:align_mate_sorted #1.1 SECONDARY_ALIGNMENT;
    table NCBI:align:tbl:seq #1.1               SEQUENCE;
    table NCBI:align:view:cs_seq #1.1           CS_SEQUENCE;
    table NCBI:align:tbl:qstat #1.0             QUAL_STAT;
};

// unsorted alignments: both alignment tables use the *_unsorted table types
database NCBI:align:db:alignment_unsorted #1.3
{
    table NCBI:align:tbl:reference #2             REFERENCE;
    table NCBI:align:tbl:align_unsorted #1.2      PRIMARY_ALIGNMENT;
    table NCBI:align:tbl:align_mate_unsorted #1.1 SECONDARY_ALIGNMENT;
    table NCBI:align:tbl:seq #1.1                 SEQUENCE;
    table NCBI:align:view:cs_seq #1.1             CS_SEQUENCE;
    table NCBI:align:tbl:qstat #1.0               QUAL_STAT;
};

// unsorted alignments plus the two evidence tables
database NCBI:align:db:alignment_evidence #1.3
{
    table NCBI:align:tbl:reference #2             REFERENCE;
    table NCBI:align:tbl:align_unsorted #1.2      PRIMARY_ALIGNMENT;
    table NCBI:align:tbl:align_mate_unsorted #1.1 SECONDARY_ALIGNMENT;
    table NCBI:align:tbl:align_allele #1.2        EVIDENCE_INTERVAL;
    table NCBI:align:tbl:align_mate_unsorted #1.1 EVIDENCE_ALIGNMENT;
    table NCBI:align:tbl:seq #1.1                 SEQUENCE;
    table NCBI:align:view:cs_seq #1.1             CS_SEQUENCE;
    table NCBI:align:tbl:qstat #1.0               QUAL_STAT;
};

// sorted alignments plus the two evidence tables
// (EVIDENCE_ALIGNMENT keeps the unsorted table type)
database NCBI:align:db:alignment_evidence_sorted #1.2
{
    table NCBI:align:tbl:reference #2             REFERENCE;
    table NCBI:align:tbl:align_sorted #1.2        PRIMARY_ALIGNMENT;
    table NCBI:align:tbl:align_mate_sorted #1.1   SECONDARY_ALIGNMENT;
    table NCBI:align:tbl:align_allele #1.2        EVIDENCE_INTERVAL;
    table NCBI:align:tbl:align_mate_unsorted #1.1 EVIDENCE_ALIGNMENT;
    table NCBI:align:tbl:seq #1.1                 SEQUENCE;
    table NCBI:align:view:cs_seq #1.1             CS_SEQUENCE;
    table NCBI:align:tbl:qstat #1.0               QUAL_STAT;
};

// no REFERENCE and no alignment tables: sequences and quality statistics only
database NCBI:align:db:unaligned #1
{
    table NCBI:align:tbl:seq #1.1       SEQUENCE;
    table NCBI:SRA:ABI:tbl:v2 #1.0.4    CS_SEQUENCE;
    table NCBI:align:tbl:qstat #1.0     QUAL_STAT;
};