#include <ncbi_pch.hpp>
#include "cdalignproc_biodata.hpp"
#if defined(__DB_OFFLINE__)
#include "biodata_blast.hpp"
#include "basealgo.hpp"
#include "compactstore.hpp"
#include "ptrmap.hpp"
#include "objutils.hpp"
#else
#include <shlu2/DataUtils/biodata_blast.hpp>
#include <shlu2/BasicUtils/basealgo.hpp>
#include <shlu2/BasicUtils/compactstore.hpp>
#include <shlu2/BasicUtils/ptrmap.hpp>
#include <StructIfx/objutils.hpp>
#endif
#include <objects/seqfeat/seqfeat__.hpp>
#include <objects/seqset/seqset__.hpp>

USING_NCBI_SCOPE;
using namespace objects;


// Drop from `aligns` every alignment whose "e_value" score is greater than
// dEValCutoff.
//
// For each alignment only the first score whose id is the string "e_value"
// is examined; alignments that carry no such score are kept.
void AlignEValueFilter(CSeq_annot::TData::TAlign &aligns, double dEValCutoff)
{
	CSeq_annot::TData::TAlign::iterator itCur = aligns.begin();
	const CSeq_annot::TData::TAlign::iterator itStop = aligns.end();

	while (itCur != itStop)
	{
		bool bDiscard = false;

		const CSeq_align::TScore &scores = (*itCur)->GetScore();	// vector< CRef< CScore > >
		for (CSeq_align::TScore::const_iterator itScore = scores.begin(); itScore != scores.end(); ++itScore)
		{
			const CScore &score = **itScore;

			if (!score.IsSetId() || !score.CanGetId())
				continue;

			const CScore::TId &scoreId = score.GetId();	// CObject_id
			if (CObject_id::e_Str != scoreId.Which() || scoreId.GetStr() != "e_value")
				continue;

			// First "e_value" entry decides; later scores are not looked at.
			bDiscard = score.GetValue().GetReal() > dEValCutoff;
			break;
		}

		// erase() hands back the element after the removed one, so the
		// traversal continues without touching the erased node.
		if (bDiscard)
			itCur = aligns.erase(itCur);
		else
			++itCur;
	}
}

// -- only work for proteins
int ParseCdAlign(const CSeq_align &cdAlign, TCDDHitInfo &dst, double dEValCutoff)
{
	dst.cutoffscore = dEValCutoff;
	dst.pssmid = 0;
	const CSeq_align::TSegs & segs = cdAlign.GetSegs();
	
	if (CSeq_align::TSegs::e_Denseg == segs.Which())
	{
		if (cdAlign.IsSetScore() && cdAlign.CanGetScore())
		{
			const CSeq_align::TScore& rScoreTab = cdAlign.GetScore();  //vector< CRef< CScore > >
			for (CSeq_align::TScore::const_iterator j = rScoreTab.begin(); j != rScoreTab.end(); ++j)
			{
				const CScore& rScore = **j;  //CScore
				
				if (rScore.IsSetId() && rScore.CanGetId())
				{
					const CScore::TId& rObjectId = rScore.GetId();  //CObject_id
						
					if (CObject_id::e_Str == rObjectId.Which())
					{
						const CObject_id::TStr& rStr = rObjectId.GetStr();    //string
						if (rStr == "score")
						{
							if (rScore.IsSetValue() && rScore.CanGetValue())
							{
								dst.score = rScore.GetValue().GetInt();
							}
						}
						else if (rStr == "e_value")
						{
							if (rScore.IsSetValue() && rScore.CanGetValue())
							{
								dst.evalue = rScore.GetValue().GetReal();
								
								if (dst.evalue > dEValCutoff)
									return 0;
							}
						}
						else if (rStr == "bit_score")
						{
							if (rScore.IsSetValue() && rScore.CanGetValue())
							{
								dst.bitscore = rScore.GetValue().GetReal();
							}
						}
						//else if (rStr == "num_ident")
						//{
						//	if (rScore.IsSetValue() && rScore.CanGetValue())
						//	{
						//		iterNewAlign->m_iNumIdent = rScore.GetValue().GetInt();
						//	}
						//}
					}
				}
			}
		}
		const CSeq_align::TSegs::TDenseg& rDense_seg = cdAlign.GetSegs().GetDenseg();
		const CSeq_id& rCdSeqId = *(rDense_seg.GetIds()[1]);
		if (rCdSeqId.Which() == CSeq_id::e_General)	//supposed to be general
		{
			const CSeq_id::TGeneral& rDbTag = rCdSeqId.GetGeneral();  //CDbtag
			const CDbtag::TTag& rObjectId = rDbTag.GetTag();  //CObject_id
  		 
			if (rObjectId.Which() == CObject_id::e_Id)	//correct pssmid
				dst.pssmid = rObjectId.GetId();
			else if (rObjectId.Which() == CObject_id::e_Str)	//assume accession
			{
				dst.acxn = rObjectId.GetStr();
				dst.pssmid = INVALIDPSSMID + (PssmId_t)TDomSrcCount::DomAccType(dst.acxn) * DOMSRC_LEAD;
				
				size_t len = dst.acxn.size(), nums = len;
				
				while (nums > 0)
				{
					if (!isdigit(dst.acxn[nums - 1]))
						break;
					--nums;
				}
					
				if (nums < len)	//has digits
					dst.pssmid += NStr::StringToNumeric<PssmId_t> (dst.acxn.substr(nums));
			}

			if (dst.pssmid > 0)	//have a usable pssmid
			{
				const vector<TSignedSeqPos> &rStarts = rDense_seg.GetStarts();
				const vector<TSeqPos> &rLens = rDense_seg.GetLens();
				vector<TSignedSeqPos> mStarts, sStarts;
				vector<TSeqPos> lens;

				CleanAlignment(rStarts, rLens, mStarts, sStarts, lens);
				size_t ttlsegs = lens.size();
				if (ttlsegs > 0)
				{
					dst.hitstart = mStarts[0];
					dst.hitstop = mStarts[ttlsegs - 1] + lens[ttlsegs - 1] - 1;
					dst.pssmstart = sStarts[0];
					dst.pssmstop = sStarts[ttlsegs - 1] + lens[ttlsegs - 1] - 1;
				}
				else	//no hits actually
					return 0;
			}
			else
				return 0;
		}
	}
	return 1;		
}


// Derive a PSSM id from the tag of a General seq-id.
//
// seqid - id of the domain row; GetGeneral() is called on it unconditionally,
//         no check of the choice is made here.
// acxn  - cleared, then set to the tag string when the tag is a string.
//
// Returns the integer tag as is, the value of
// TDomSrcCount::ComputePseudoPssmId() for a string tag, or 0 when the tag is
// neither.
PssmId_t CNcbiCdAlignProcessor::x_GetPssmId(const CSeq_id &seqid, string & acxn) const
{
	const CDbtag::TTag &tag = seqid.GetGeneral().GetTag();	// CObject_id

	acxn.clear();

	switch (tag.Which())
	{
	case CObject_id::e_Id:	// real pssmid
		return tag.GetId();

	case CObject_id::e_Str:	// accession
		acxn = tag.GetStr();
		return TDomSrcCount::ComputePseudoPssmId(acxn);

	default:
		return 0;
	}
}



void CNcbiCdAlignProcessor::ProcessCDAlign(const list<CRef<CSeq_align> > &rAligns, vector<TDomSeqAlignment> &dst) const
{
	for (list<CRef<CSeq_align> > :: const_iterator iterSrc = rAligns.begin(); iterSrc != rAligns.end(); ++iterSrc)
	{

		TDomSeqAlignment newalign;

		CRef<CSeq_id> hit_id = ParseAlignSegs((*iterSrc)->GetSegs(), newalign);
		
		if (!hit_id.IsNull())
		{
			string acxn;
			PssmId_t uiPssmId = x_GetPssmId(*hit_id, acxn);

			if (uiPssmId > 0)	//pseudo or not
			{
				newalign.m_uiPssmId = uiPssmId;

				ParseAlignScores(**iterSrc, newalign);

				dst.emplace_back(move(newalign));
			}
		}
	}
}

// Parse the alignments of one query into dst.m_vecAlignments and run
// Calculate() on them.
//
// rAligns - alignments belonging to this query.
// dst     - query record; m_bIsNa selects between a single Calculate() call
//           into m_dimSplitAligns[0] and one call per non-empty reading frame.
// missed  - optional collector handed to Calculate(); when null a local
//           scratch vector is used instead.
void CNcbiCdAlignProcessor::ProcessCDQuery(const list<ncbi::CRef<ncbi::objects::CSeq_align> > &rAligns, TDomQuery &dst, vector<PssmId_t> *missed) const
{
	ProcessCDAlign(rAligns, dst.m_vecAlignments);

	vector<PssmId_t> vecMissed;
	vector<PssmId_t> &missed_pssmids = (nullptr != missed) ? *missed : vecMissed;

	if (dst.m_bIsNa)
	{
		// Bucket alignment indices per reading frame, then handle each
		// populated frame separately.
		vector<size_t> rfIndice[READINGFRAME::TOTAL_RFS];
		CCdAlignProcessor::SortReadingFrames(dst.m_vecAlignments, rfIndice, dst.m_uiSeqLen);

		for (int rf = 0; rf < READINGFRAME::TOTAL_RFS; ++rf)
		{
			if (rfIndice[rf].empty())
				continue;

			// Only the local scratch vector is reset; a caller-supplied
			// collector keeps accumulating across frames.
			vecMissed.clear();
			Calculate(dst.m_vecAlignments, rfIndice[rf], dst.m_dimSplitAligns[rf], missed_pssmids);
		}
		return;
	}

	Calculate(dst.m_vecAlignments, dst.m_dimSplitAligns[0], missed_pssmids);
}


//void CNcbiCdAlignProcessor::ProcessCDQuery(const list<ncbi::CRef<ncbi::objects::CSeq_align> > &rAligns, TDomAnnot &dst, vector<PssmId_t> *missed) const
//{
//	
//	ProcessCDAlign(rAligns, dst.m_vecAlignments);
//	//for (list<CRef<CSeq_align> > :: const_iterator iterSrc = rAligns.begin(); iterSrc != rAligns.end(); ++iterSrc)
//	//
//	//	const CSeq_align& rSeq_align = **iterSrc;
//	//
//	//	const CSeq_align::TSegs & segs = rSeq_align.GetSegs();
//	//	
//	//	TDomSeqAlignment newalign;
//	//	
//	//	CRef<CSeq_id> hit_id = ParseAlignSegs(segs, newalign);
//	//	
//	//	if (!hit_id.IsNull())	//invalid aligns
//	//	{
//	//		PssmId_t uiPssmId = x_GetPssmId(*hit_id);
//	//		
//	//		if (uiPssmId > 0)
//	//		{
//	//			newalign.m_uiPssmId = uiPssmId;
//	//			ParseAlignScores(**iterSrc, newalign);
//	//			dst.m_vecAlignments.emplace_back(move(newalign));
//	//		}
//	//	}
//	//
//	
//	vector<PssmId_t> vecMissed;
//	
//	vector<PssmId_t> &missed_pssmids = nullptr == missed ? vecMissed : *missed;
//	
//	if (!dst.m_bIsNa)
//	{
//		Calculate(dst.m_vecAlignments, dst.m_dimSplitAligns[0], missed_pssmids);
//	}
//	else
//	{
//		vector<size_t> rfIndice[READINGFRAME::TOTAL_RFS];
//		CCdAlignProcessor::SortReadingFrames(rfIndice, dst);
//		for (int i = 0; i < READINGFRAME::TOTAL_RFS; ++i)
//		{
//
//			if (!rfIndice[i].empty())
//			{
//				vecMissed.clear();
//
//				Calculate(dst.m_vecAlignments, rfIndice[i], dst.m_dimSplitAligns[i], missed_pssmids);
//
//			}
//		}
//	}
//}



// -- TSeqLocInfoVector = vector< TMaskedQueryRegions >
// Build one TDomQuery per query Seq-entry and attach its masks and alignments.
//
// dst    - receives one new element per entry of qseqs, in order, whether or
//          not any alignment belongs to it.
// qseqs  - query sequences; each entry is taken to be a Bioseq (GetSeq() is
//          called without checking).
// masks  - consumed in parallel with qseqs, one element per query while any
//          remain.
// aligns - walked once, front to back: the leading run of alignments whose
//          first row matches the current query is handed to ProcessCDQuery().
//          The walk stops at the first non-matching alignment, so alignments
//          must be grouped by query in the order of qseqs.
// gcode  - not used by this overload.
//          NOTE(review): an earlier, since removed, version of this body
//          stored it in the query's m_iGenCode - confirm it is set elsewhere.
// missed - forwarded to ProcessCDQuery().
void CNcbiCdAlignProcessor::ProcessBlastResults(list<TDomQuery> &dst, const list<CRef<CSeq_entry> > &qseqs, const TSeqLocInfoVector & masks, const list< CRef<CSeq_align > > &aligns, int gcode, vector<PssmId_t> *missed) const
{
	if (qseqs.empty())
		return;

	list< CRef<CSeq_align> > :: const_iterator iterAlign = aligns.begin(), iterAlignEnd = aligns.end();
	TSeqLocInfoVector :: const_iterator iterMask = masks.begin(), iterMaskEnd = masks.end();

	for (list<CRef<CSeq_entry> > :: const_iterator iterSeq = qseqs.begin(), iterSeqEnd = qseqs.end(); iterSeqEnd != iterSeq; ++iterSeq)
	{
		const CBioseq &seq = (*iterSeq)->GetSeq();	//offline sequence, assume to be seq, no more check
		list<TDomQuery> :: iterator iterDQ = dst.emplace(dst.end());

		FillSequenceFromBioseq(seq, *iterDQ);

		const CSeq_id* curr_seq_id = seq.GetLocalId();
		if (nullptr == curr_seq_id)	//added because Bioseq from command line rpsblast may not contain a local seq-id.
			curr_seq_id = seq.GetFirstId();

		// Masks run parallel to the queries: always consume one element per
		// query so later queries stay aligned with their own masks.
		if (iterMaskEnd != iterMask)
		{
			size_t totalMasks = iterMask->size();
			if (totalMasks > 0)
			{
				iterDQ->m_vecMaskedRegions.reserve(totalMasks);
				for (TMaskedQueryRegions::const_iterator iterReg = iterMask->begin(); iterReg != iterMask->end(); ++iterReg)
				{
					const CSeq_interval &rInt = (*iterReg)->GetInterval();
					iterDQ->m_vecMaskedRegions.emplace_back(rInt.GetFrom(), rInt.GetTo(), (*iterReg)->GetFrame());
				}
			}
			++iterMask;
		}

		// A Bioseq without any id cannot be matched against alignments;
		// keep its (alignment-less) query record and move on instead of
		// dereferencing a null id below.
		if (nullptr == curr_seq_id)
			continue;

		// -- collect aligns for current sequence
		list< CRef< CSeq_align > > lstAligns;
		while (iterAlignEnd != iterAlign && curr_seq_id->Match((*iterAlign)->GetSeq_id(0)))
		{
			lstAligns.emplace_back(*iterAlign);
			++iterAlign;
		}

		if (!lstAligns.empty())
			ProcessCDQuery(lstAligns, *iterDQ, missed);
	}
}

// Build one TDomQuery per query Seq-loc and attach its masks and alignments.
//
// dst    - receives one new element per entry of qseqs, in order.
// qseqs  - query locations; each is read as an interval (GetInt() is called
//          without checking the choice).
// masks  - consumed in parallel with qseqs, one element per query while any
//          remain.
// aligns - walked once, front to back: the leading run of alignments whose
//          first row matches the current query id is handed to
//          ProcessCDQuery(); the walk pauses at the first non-matching one.
// gcode  - not used by this overload.
// missed - forwarded to ProcessCDQuery().
void CNcbiCdAlignProcessor::ProcessBlastResults(list<TDomQuery> &dst, const list<CRef<CSeq_loc> > &qseqs, const TSeqLocInfoVector & masks, const list< CRef<CSeq_align > > &aligns, int gcode, vector<PssmId_t> *missed) const
{
	if (qseqs.empty())
		return;

	list< CRef<CSeq_align> >::const_iterator itAln = aligns.begin();
	const list< CRef<CSeq_align> >::const_iterator itAlnStop = aligns.end();
	TSeqLocInfoVector::const_iterator itMask = masks.begin();	// advances in step with the queries
	const TSeqLocInfoVector::const_iterator itMaskStop = masks.end();

	for (const CRef<CSeq_loc> &rQueryLoc : qseqs)
	{
		// -- no choice check: the location is read as an interval
		const CSeq_interval &queryInt = rQueryLoc->GetInt();
		const CSeq_id &queryId = queryInt.GetId();

		dst.emplace_back();
		TDomQuery &query = dst.back();

		if (queryId.IsGi())
			query.m_iGi = queryId.GetGi();
		else
			queryId.GetLabel(&query.m_strAccession, CSeq_id::eContent, CSeq_id::fLabel_GeneralDbIsContent);

		// -- m_strNcbiId is essential for sparclbl to generate seqid
		queryId.GetLabel(&(query.m_strNcbiId), CSeq_id::eBoth);
		query.m_uiSeqLen = queryInt.GetLength();

		if (itMask != itMaskStop)
		{
			const size_t nMasks = itMask->size();
			if (nMasks > 0)
			{
				query.m_vecMaskedRegions.reserve(nMasks);
				for (const auto &rRegion : *itMask)
				{
					const CSeq_interval &maskInt = rRegion->GetInterval();
					query.m_vecMaskedRegions.emplace_back(maskInt.GetFrom(), maskInt.GetTo(), rRegion->GetFrame());
				}
			}
			++itMask;
		}

		// -- gather the leading run of alignments that belong to this query
		list< CRef< CSeq_align > > batch;
		while (itAln != itAlnStop && queryId.Match((*itAln)->GetSeq_id(0)))
		{
			batch.emplace_back(*itAln);
			++itAln;
		}

		if (!batch.empty())
			ProcessCDQuery(batch, query, missed);
	}
}




