#include <ncbi_pch.hpp>
#include "common/structifx.hpp"
#include "common/offl_cd_align_proc.hpp"
#include "common/offl_sparcle_data.hpp"
#include "common/objutils.hpp"
#include "common/ptrmap.hpp"
#include "common/compactstore.hpp"
#include "common/basealgo.hpp"
#include "common/enumlit.hpp"
#include "common/argwrapper.hpp"
#include "common/datanode.hpp"
#include "common/lxml.hpp"
#include "common/ljson.hpp"
#include "common/ustring.hpp"
#include "common/combistream.hpp"
#include "common/segset.hpp"
#include "common/biodata_blast.hpp"

#include <objects/blast/Blast4_archive.hpp>
#include <objects/blast/Blast4_request.hpp>
#include <objects/blast/Blast4_request_body.hpp>
#include <objects/blast/Blast4_queue_search_reques.hpp>
#include <objects/blast/Blast4_queries.hpp>
#include <objects/blast/Blas_get_searc_resul_reply.hpp>
#include <objects/blast/Blast4_parameters.hpp>
#include <objects/blast/Blast4_parameter.hpp>
#include <objects/seqset/seqset__.hpp>
#include <objects/seqset/seqset__.hpp>


#include ClusterAssignment_HPP


// #if defined(__BLAST_EMBEDDED)
// #include <app/SparcleLabel/ClusterAssignment__.hpp>
// #else
// #include <internal/structure/offline/SparcleLabel/ClusterAssignment__.hpp>
// #endif

#include <corelib/ncbiapp.hpp>
#include <corelib/ncbifile.hpp>
#include <corelib/ncbireg.hpp>

// One-line summary handed to ProcessArgDefinitions() for the generated help.
static const char *PROGRAM_USAGE =	//multiline
"Process rpsblast output and try to assign a name based on "
"computed domain architecture.";

// Long description handed to ProcessArgDefinitions() for the generated help.
// The wording matches what the reader actually does: input is deserialized
// as a CBlast4_archive in ASN.1 text form.
static const char *PROGRAM_DESCRIPTION =	//multiline
"This program accepts output from rpsblast command line utility with '-outfmt 11' option, "
"basically the Blast4-archive object serialized as ASN.1 text. For each query in the archive, "
"the program will compute the domain architecture of the query sequence and assign the associated "
"name to it.\n\n"
"The output will be an xml document defined by cluster_assignment.dtd from gpipe project";

// Version triple as a comma list: major, minor, patch level.
#define VERDEF 0,1,3

// Application version; passed to SetVersion() by the application constructor.
struct TVER
{
	int major;
	int minor;
	int patch_level;
};

static constexpr TVER VER{VERDEF};

// -- Error sub-codes. They are used as the subcode in LOG_POST_X and are
// -- stored in the application status that Run()/DryRun() return.
enum EErrorSubCode
{
	e_OK = 0,                       // no error
	e_InitFileError = 1,            // configuration (registry) content missing
	e_DataFileError = 2,            // a data/precedence/output file or directory problem
	e_PrecedenceUndefined = 3,
	e_NamingSourceUndefined = 4,
	e_InputDataError = 5,           // malformed or unexpected input object
	e_ProcessError = 6,             // processing environment problem
	e_AbortReq = 7,                 // abort flag file detected
	//-----------------------
	eErr_TotalErrorCodes = 8        // count of codes above; keep last
};


// Registers the error-code name "Sparclbl_job" used by the *_POST_X macros in
// this file (selected below through NCBI_USE_ERRCODE_X).
// The last argument is the largest sub-code, i.e. the highest EErrorSubCode.
// NOTE(review): 134 is presumably the module's reserved error code - confirm.
// This macro must be used inside the ncbi namespace.
BEGIN_NCBI_SCOPE
NCBI_DEFINE_ERRCODE_X(Sparclbl_job, 134, eErr_TotalErrorCodes - 1);
END_NCBI_SCOPE

// -- bring the toolkit namespaces used throughout this file into scope
USING_NCBI_SCOPE;
using namespace objects;
using namespace LXML;
// -- must be defined for ERR_POST_X
#define NCBI_USE_ERRCODE_X Sparclbl_job



// One entry of the precedence file: a naming source tag and the confidence
// value that is reported for names coming from that source.
struct TNamingSource2Confidence
{
	string m_NameSource;	// naming source tag (first token of a precedence line)
	int m_Confidence;	// confidence value (second token of a precedence line)
};

// All precedence entries, in file order.
using TNameSources = std::vector<TNamingSource2Confidence>;

// Enum/literal descriptor for the output format option. It is consumed by the
// GetDefaultLit<>, AllowEnumLits<> and GetIdx<> helpers used in this file.
// NOTE(review): those helpers presumably rely on the member names
// eEnumStart / eEnumStop / eDefault / dimLits - confirm in enumlit.hpp.
struct TOutputFormat
{
	enum EIndex: TENUMIDX
	{
		eEnumStart = 0,
		e_xml = eEnumStart,	// XML document output
		e_fasta = e_xml + 1,	// FASTA output with the assigned name appended to the defline
		//
		eEnumStop = e_fasta + 1	// one past the last valid value
	};
	
	static const EIndex eDefault = e_xml;	// format used when the option is not given
	static const char* dimLits[eEnumStop - eEnumStart];	// command-line literal for each EIndex value
};

// -- define the literals; the order must match EIndex
const char* TOutputFormat::dimLits[] = {"xml", "fasta"};
	


// -- Symbolic indices into dimValidArgs. The order of the enumerators must
// -- match the order of the entries in that table.
enum EArgIndice: unsigned int
{
	// -- enumerators without an explicit value start at 0 and increase by 1
	argListIn,	//path/name of a file that contains a list of input data files
	argInfileMask,	//path/name_pattern to search for input files
	argDoneDump,	//target location to move processed input files to
	argOutFile,	//output file name
	argOutFormat,	//output format literal (see TOutputFormat)
	argEVCutoff,	//E-value cut off
	argDataDir,	//directory that holds the CDD/SPARCLE data files
	argPrecedenceFile,	//precedence definition file (mandatory argument)
	argSessExitFlag,	//flag file that signals exit
	argSessAbortFlag,	//flag file that signals abort
	// ------------------------
	TOTALARGS	//Naturally serves as the argument count.
};

// Table of named command-line arguments, indexed by EArgIndice (keep both in
// the same order). Each entry lists, positionally:
//   name, category, value type, description, optional?, synopsis, alias,
//   flags, environment variable, default value, constraint, constraint mode.
// For the string fields use TArgDefinition::EMPTYSTR, not nullptr, to mean
// "empty".
static TArgDefinition dimValidArgs[] = 
{
	{
		"l",	//argListIn
		TArgDefinition::eKey,	//category
		ncbi::CArgDescriptions::eString,	//value type
		"Specify the name of the list file that contains path/names of input data files",	//description
		true,	//optional
		TArgDefinition::EMPTYSTR,	//synopsis
		"-inlist",	//alias
		0,	//flags
		TArgDefinition::EMPTYSTR,	//environment variable
		TArgDefinition::EMPTYSTR,	//default value
		nullptr,	//constraint
		ncbi::CArgDescriptions::eConstraint	//constraint mode
	},
	{
		"m",	//argInfileMask
		TArgDefinition::eKey,	//category
		ncbi::CArgDescriptions::eString,	//value type
		"Specify mask to collect input files.",	//description
		true,	//optional
		TArgDefinition::EMPTYSTR,	//synopsis
		"-inp-mask",	//alias
		0,	//flags
		TArgDefinition::EMPTYSTR,	//environment variable
		TArgDefinition::EMPTYSTR,	//default value
		nullptr,	//constraint
		ncbi::CArgDescriptions::eConstraint	//constraint mode
	},
	{
		"k",	//argDoneDump
		TArgDefinition::eKey,	//category
		ncbi::CArgDescriptions::eString,	//value type
		"Specify locations to dump processed input files. If omitted, files will be deleted",	//description
		true,	//optional
		TArgDefinition::EMPTYSTR,	//synopsis
		"-inp-dump",	//alias
		0,	//flags
		TArgDefinition::EMPTYSTR,	//environment variable
		TArgDefinition::EMPTYSTR,	//default value
		nullptr,	//constraint
		ncbi::CArgDescriptions::eConstraint	//constraint mode
	},
	{
		"o",	//argOutFile
		TArgDefinition::eKey,	//category
		ncbi::CArgDescriptions::eString,	//value type
		"Specify output file name. If omitted, output to stdout",	//description
		true,	//optional
		TArgDefinition::EMPTYSTR,	//synopsis
		"-outfile",	//alias
		0,	//flags
		TArgDefinition::EMPTYSTR,	//environment variable
		TArgDefinition::EMPTYSTR,	//default value
		nullptr,	//constraint
		ncbi::CArgDescriptions::eConstraint	//constraint mode
	},
	{
		"t",	//argOutFormat
		TArgDefinition::eKey,	//category
		ncbi::CArgDescriptions::eString,	//value type
		"Specify output format.",	//description
		true,	//optional
		TArgDefinition::EMPTYSTR,	//synopsis
		"-outfmt",	//alias
		0,	//flags
		TArgDefinition::EMPTYSTR,	//environment variable
		GetDefaultLit<TOutputFormat> (),	//default value
		AllowEnumLits<TOutputFormat> (),	//constraint: only the TOutputFormat literals
		ncbi::CArgDescriptions::eConstraint	//constraint mode
	},
	{
		"e",	//argEVCutoff
		TArgDefinition::eKey,	//category
		ncbi::CArgDescriptions::eDouble,	//value type
		"Specify E-Value cut off. Default is 0.01",	//description
		true,	//optional
		TArgDefinition::EMPTYSTR,	//synopsis
		"-evcut",	//alias
		0,	//flags
		TArgDefinition::EMPTYSTR,	//environment variable
		NumericDefVal(DEF_EVALCUTOFF),	//default value
		nullptr,	//constraint
		ncbi::CArgDescriptions::eConstraint	//constraint mode
	},
	{
		"d",	//argDataDir
		TArgDefinition::eKey,	//category
		ncbi::CArgDescriptions::eString,	//value type
		"Specify the directory where CDD/SPARCLE data reside. If omitted, looking for a directory named 'data' in the same directory where this binary is.",	//description
		true,	//optional
		TArgDefinition::EMPTYSTR,	//synopsis
		"-datadir",	//alias
		0,	//flags
		TArgDefinition::EMPTYSTR,	//environment variable
		TArgDefinition::EMPTYSTR,	//default value
		nullptr,	//constraint
		ncbi::CArgDescriptions::eConstraint	//constraint mode
	},
	{
		"p",	//argPrecedenceFile
		TArgDefinition::eKey,	//category
		ncbi::CArgDescriptions::eString,	//value type
		"Specify the full path of a precedence definition file.",	//description
		false,	//optional: this one is mandatory
		TArgDefinition::EMPTYSTR,	//synopsis
		"-precedence",	//alias
		0,	//flags
		TArgDefinition::EMPTYSTR,	//environment variable
		TArgDefinition::EMPTYSTR,	//default value
		nullptr,	//constraint
		ncbi::CArgDescriptions::eConstraint	//constraint mode
	},
	{
		"x",	//argSessExitFlag
		TArgDefinition::eKey,	//category
		ncbi::CArgDescriptions::eString,	//value type
		"Specify the name of flag file to signal exit. If omitted, program exits after processing one batch of input files",	//description
		true,	//optional
		TArgDefinition::EMPTYSTR,	//synopsis
		"-exit-flg",	//alias
		0,	//flags
		TArgDefinition::EMPTYSTR,	//environment variable
		TArgDefinition::EMPTYSTR,	//default value
		nullptr,	//constraint
		ncbi::CArgDescriptions::eConstraint	//constraint mode
	},
	{
		"b",	//argSessAbortFlag
		TArgDefinition::eKey,	//category
		ncbi::CArgDescriptions::eString,	//value type
		"Specify the name of flag file to signal abort. Will not produce any output",	//description
		true,	//optional
		TArgDefinition::EMPTYSTR,	//synopsis
		"-abort-flg",	//alias
		0,	//flags
		TArgDefinition::EMPTYSTR,	//environment variable
		TArgDefinition::EMPTYSTR,	//default value
		nullptr,	//constraint
		ncbi::CArgDescriptions::eConstraint	//constraint mode
	}
	
};

// -- positional (unnamed) arguments: any number of input files.
// -- They are appended to the combined input and parsed like every other
// -- input file, i.e. as rpsblast '-outfmt 11' archives, so the help text
// -- says so. stdin is used only when no input file and no mask is given.
static const size_t TOTALEXTRAARGS = 1;
static TExtraArg dimValidExtraArgs[] = 
{
	{0, kMax_UInt, "Files that contain rpsblast output produced with '-outfmt 11'. If no input file, list or mask is specified, read the data from stdin.", ncbi::CArgDescriptions::eInputFile, 0}
};

// Application class: reads rpsblast archives, derives a domain architecture
// for every query and emits the assigned names either as an XML document or
// as FASTA records, sorted by a key derived from the sequence id/defline.
class CSparcLabelApp: public CNcbiApplication
{
public:
	// Pointer to the member function that formats one annotated query
	// (OutputNamedFasta or OutputNamingXml).
	typedef void (CSparcLabelApp :: * TLPFNFormatOut) (const TDomQuery &q, const TDomArch * pArch, bool is_specific);

	/**********************************************************************
	*	Define constants
	**********************************************************************/
	// -- default data directory name and registry section/entry names
	constexpr static const char * __default_datadir = "data";
	constexpr static const char * __reg_section_cdddata = "cdddata";
	constexpr static const char * __reg_section_nametypes = "nametypes";
	constexpr static const char * __reg_tag_datadir = "datadir";
	constexpr static const char * __reg_tag_cddids = "cddids";
	constexpr static const char * __reg_tag_bscores = "bscores";
	constexpr static const char * __reg_tag_spfeats = "spfeats";
	constexpr static const char * __reg_tag_genfeats = "genfeats";
	constexpr static const char * __reg_tag_cdtrack = "cdtrack";
	constexpr static const char * __reg_tag_famlinks = "famlinks";
	constexpr static const char * __reg_tag_sparchs = "sparchs";
	constexpr static const char * __reg_tag_genarchs = "genarchs";
	constexpr static const char * __reg_tag_namesrc = "namesrc";
	// -- for naming source and confidence level
	constexpr static const char * __name_src_file = "reviewlevel_namesrc.txt";
	// -- xml output, PGAP DTD
	constexpr static const char * __PGAP_DTD = "http://www.ncbi.nlm.nih.gov/gpipe/protein_naming/cluster_assignment.dtd";
	constexpr static const size_t __fasta_width = 70;	//70 characters per line for sequence output
	
	// -- added 5/14/2018: GPIPE asked to sort xml nodes by seq-id
	constexpr static const size_t NODE_SORT_BLOCK_SIZE = 8192;	//assume 8192 sequences. allocate sorting vector by this block.

	constexpr static const char * SPARCH = "SpecificArch";
	constexpr static const char * SFARCH = "SuperfamArch";
	
	// *******************************************************************/
	
	static string m_st_exec_name;	//binary full path
	static string m_st_launch_path;	//binary launch path
	static string m_st_real_path;	//binary true path
	
	CSparcLabelApp(void);
	virtual void Init(void);	//parse arguments, load data; failures are recorded in m_app_status
	virtual int Run(void);	//process input and write output; returns m_app_status
	virtual int DryRun(void);	//report whether Init() succeeded; returns m_app_status
	virtual void Exit(void);
	~CSparcLabelApp(void);
private:
	
	// -- common data members
	CNcbiRegistry& m_reg;	//application registry, bound to GetConfig() in the constructor
	int m_app_status;	//an EErrorSubCode value; e_OK unless something failed
	stringstream m_init_msg;
	// -- parameters data
	istream *m_istream;	//current input stream; nullptr when there is nothing to read yet
	CCombFileBuf m_infiles;	//combination of input files
	std::string m_infmask;	//search for input mask
	std::string m_done_dump;	//directory processed mask files are moved to; empty means delete them
	
	
	ostream *m_ostream;	//output target: stdout, or m_outfile when it could be opened
	ofstream m_outfile;
	
	TENUMIDX m_outmode;	//xml or fasta
	double m_evcutoff;	//evalue cut off
	//CGXMLRef m_xroot;	//for xml output
	string m_exit_flg;	//exit flag file, placed by sparclbl flag
	string m_abort_flg;	//abort flag file, placed by sparclbl flag
	
	COfflDomClusterData m_dom_cluster_src;	//CDD data loaded in Init()
	COfflArchData m_arch_src;	//architecture data loaded in Init()
	
	TNameSources m_namesrc_cfd;	//entries read from the precedence file
	// review level -> entry in m_namesrc_cfd. Holds iterators into that
	// vector, so the vector must not be modified once this map is filled.
	map<string, TNameSources::const_iterator> m_review_levels;

	
	
	// -- added 5/14/2018: GPIPE asked to sort xml nodes by seq-id
	// Sort record: the key plus the index of the matching element in
	// m_fasta_data / m_xml_data.
	struct xSortData
	{
		string m_key;	//seq-id
		size_t m_idx;
		
		xSortData(const string &k = k_strEmptyString, size_t i = 0):
			m_key(k), m_idx(i) {};
	};
	
	// Comparator for std::sort: ascending by key.
	struct xSortFacility
	{
		bool operator () (const xSortData &d1, const xSortData &d2)
		{
			return d1.m_key < d2.m_key;
		}
	};
	
	vector<string> m_fasta_data;	//formatted FASTA records, in processing order
	vector<CRef<bct::CProtein> > m_xml_data;	//XML protein nodes, in processing order
	vector<xSortData> m_sort_array;	//sort keys for whichever of the two vectors above is in use
	
	size_t m_num_seqs_annotated;	//number of annotated sequences, reported in the final log line
	
	// -- output formatters; both append one record and its sort key
	
	void OutputNamedFasta(const TDomQuery &q, const TDomArch * pArch, bool is_specific);
	void OutputNamingXml(const TDomQuery &q, const TDomArch * pArch, bool is_specific);




	// Read archives from m_istream and pass every query with a known
	// architecture to the given formatter.
	void ProcessBlastOut11(TLPFNFormatOut lpfnFormatOut);
	// Process the explicit input first, then poll src_dir for files matching
	// fmask until the exit condition is met.
	void ProcessInputFiles(CSparcLabelApp::TLPFNFormatOut lpfnFormatOut, const CDir &src_dir, const string &fmask);
};


// Out-of-class definitions of the static path members; they start out empty.
string CSparcLabelApp::m_st_exec_name;
string CSparcLabelApp::m_st_launch_path;
string CSparcLabelApp::m_st_real_path;

// Puts the application into its pre-Init state: status OK, no input stream,
// output to stdout, default output format and E-value cut off. Also registers
// the program version.
CSparcLabelApp::CSparcLabelApp(void):
	CNcbiApplication(),
	m_reg(GetConfig()),
	m_app_status(e_OK),
	m_init_msg(),
	m_istream(nullptr),
	m_infiles(),
	m_infmask(),
	m_done_dump(),
	m_ostream(&cout),
	m_outfile(),
	m_outmode(TOutputFormat::eDefault),
	m_evcutoff(DEF_EVALCUTOFF),
	m_exit_flg(),
	m_abort_flg(),
	m_dom_cluster_src(),
	m_arch_src(),
	m_namesrc_cfd(),
	m_review_levels(),
	m_fasta_data(),
	m_xml_data(),
	m_sort_array(),
	m_num_seqs_annotated(0)
{
	const CVersionInfo version(VER.major, VER.minor, VER.patch_level);
	SetVersion(version);
}

/**
 * Parses the command line and loads every resource Run() depends on.
 *
 * Steps, in order:
 *   1. register the argument descriptions and read the simple option values;
 *   2. make sure the dump directory for processed files can be used;
 *   3. pick the data directory: the data-dir argument if given, else the
 *      registry entry [cdddata] datadir if set, else "<real path>/data";
 *   4. read the precedence file (lines of "<name-source> <confidence>",
 *      lines starting with '#' are skipped) and map every review level
 *      listed in registry section [nametypes] to its precedence entry;
 *   5. load the CDD and the architecture data files;
 *   6. collect the input files and select the input and output streams.
 *
 * Nothing is thrown to the caller: failures are logged and recorded in
 * m_app_status, and the function returns right away.
 */
void CSparcLabelApp::Init(void)
{
	// -- first
	CNcbiApplication::Init();
	SetupArgDescriptions(ProcessArgDefinitions(CSparcLabelApp::m_st_exec_name, PROGRAM_USAGE, PROGRAM_DESCRIPTION, dimValidArgs, TOTALARGS, dimValidExtraArgs, TOTALEXTRAARGS));
	
	// -- start to process parameters
	const CArgs & args = GetArgs();
	
	const CArgValue
		&AV_InList = args[dimValidArgs[argListIn].name],
		&AV_InfMask = args[dimValidArgs[argInfileMask].name],
		&AV_DumpLoc = args[dimValidArgs[argDoneDump].name],
		&AV_ExitFlag = args[dimValidArgs[argSessExitFlag].name],
		&AV_AbortFlag = args[dimValidArgs[argSessAbortFlag].name],
		&AV_OutFile = args[dimValidArgs[argOutFile].name],
		&AV_EvalCutoff = args[dimValidArgs[argEVCutoff].name],
		&AV_DataDir = args[dimValidArgs[argDataDir].name],
		&AV_OutFormat = args[dimValidArgs[argOutFormat].name];
		
	m_evcutoff = AV_EvalCutoff.AsDouble();
	m_outmode = GetIdx<TOutputFormat>(AV_OutFormat.AsString());
	
	if (AV_ExitFlag.HasValue())
		m_exit_flg = AV_ExitFlag.AsString();
	
	if (AV_AbortFlag.HasValue())
		m_abort_flg = AV_AbortFlag.AsString();
	
	if (AV_InfMask.HasValue())
		m_infmask = AV_InfMask.AsString();
		
	if (AV_DumpLoc.HasValue())
		m_done_dump = AV_DumpLoc.AsString();
	
	if (!m_done_dump.empty() && !CDir(m_done_dump).Create())	//non-exist dir, try to create
	{
		LOG_POST_X(e_ProcessError, "Done file dumping location specified but does not exist and cannot be created: " << m_done_dump);
		m_app_status = e_ProcessError;
		return;
	}
	
	// -- data directory: command line, then registry, then default
	string datadir = CSparcLabelApp::m_st_real_path + "/" + __default_datadir;
	
	if (AV_DataDir.HasValue())
		datadir = AV_DataDir.AsString();
	else
	{
		const string & dataloc = m_reg.Get(__reg_section_cdddata, __reg_tag_datadir);
		if (!dataloc.empty())
			datadir = dataloc;
	}
	
	// -- read the precedence definition file (mandatory argument)
	const string &PrecedenceFile = args[dimValidArgs[argPrecedenceFile].name].AsString();
	ifstream precf(PrecedenceFile.c_str());
	if (!precf.good())
	{
		LOG_POST_X(e_DataFileError, "Cannot open " << PrecedenceFile);
		m_app_status = e_DataFileError;
		return;
	}
	
	string buf;
	while (getline(precf, buf))
	{
		TrimString(buf);
		if (buf.empty() || '#' == buf[0])
			continue;	//blank or comment line
		
		const size_t len = buf.size();
		size_t pos = 0;
		
		// -- first token: the name source tag
		while (pos < len && buf[pos] > ' ') ++pos;
		if (pos == len) continue;	//no second token
		
		const string tag = buf.substr(0, pos);
		
		// -- skip the separator, the rest is the confidence value
		while (pos < len && buf[pos] <= ' ') ++pos;
		if (pos == len) continue;
		
		int confidence = 0;
		try
		{
			confidence = NStr::StringToNumeric<int> (buf.substr(pos));
		}
		catch (...)
		{
			continue;	//just skip invalid lines
		}
		
		m_namesrc_cfd.emplace_back(TNamingSource2Confidence{tag, confidence});
	}
	precf.close();
	
	// -- build the reverse map: review level -> precedence entry.
	// -- m_namesrc_cfd is complete at this point and must not change
	// -- afterwards, because the map stores iterators into it.
	list<string> namesrcs;
	m_reg.EnumerateEntries(__reg_section_nametypes, &namesrcs);
	
	if (namesrcs.empty())
	{
		LOG_POST_X(e_InitFileError, "Cannot read name sources from configuration");
		m_app_status = e_InitFileError;
		return;
	}
	
	for (const auto & nsrc : namesrcs)
	{
		for (auto iter_tag = m_namesrc_cfd.begin(), iter_tag_end = m_namesrc_cfd.end(); iter_tag_end != iter_tag; ++iter_tag)
		{
			if (nsrc != iter_tag->m_NameSource)
				continue;
			
			// -- the registry value is a comma separated list of review levels
			const string rl = m_reg.Get(__reg_section_nametypes, nsrc);
			const size_t len = rl.size();
			size_t pos = 0;
			
			while (pos < len)
			{
				size_t pos1 = rl.find(',', pos);
				if (string::npos == pos1)
					pos1 = len;
				if (pos < pos1)	//skip empty items
					m_review_levels.emplace(rl.substr(pos, pos1 - pos), iter_tag);
				pos = pos1 + 1;
			}
			
			break;	//first matching entry wins
		}
	}
	
	if (!CDir(datadir).Exists())
	{
		LOG_POST_X(e_DataFileError, "Data file directory " << datadir << " does not exist or unaccessible");
		m_app_status = e_DataFileError;
		return;
	}
	
	// -- optional file names from the registry; nullptr is passed to the
	// -- loaders for every entry that is not configured
	const char
		*cddids = nullptr,
		*bscores = nullptr,
		*spfeats = nullptr,
		*genfeats = nullptr,
		*cdtrack = nullptr,
		*famlinks = nullptr,
		*sparchs = nullptr,
		*genarchs = nullptr;
	
	const string & reg_cddids = m_reg.Get(__reg_section_cdddata, __reg_tag_cddids);
	if (!reg_cddids.empty())
		cddids = reg_cddids.c_str();
	
	const string & reg_bscores = m_reg.Get(__reg_section_cdddata, __reg_tag_bscores);
	if (!reg_bscores.empty())
		bscores = reg_bscores.c_str();
	
	const string & reg_spfeats = m_reg.Get(__reg_section_cdddata, __reg_tag_spfeats);
	if (!reg_spfeats.empty())
		spfeats = reg_spfeats.c_str();
	
	const string & reg_genfeats = m_reg.Get(__reg_section_cdddata, __reg_tag_genfeats);
	if (!reg_genfeats.empty())
		genfeats = reg_genfeats.c_str();
	
	const string & reg_cdtrack = m_reg.Get(__reg_section_cdddata, __reg_tag_cdtrack);
	if (!reg_cdtrack.empty())
		cdtrack = reg_cdtrack.c_str();
	
	const string & reg_famlinks = m_reg.Get(__reg_section_cdddata, __reg_tag_famlinks);
	if (!reg_famlinks.empty())
		famlinks = reg_famlinks.c_str();
	
	// -- exceptions are caught by reference so that the dynamic type, and
	// -- with it the real what() text, is preserved
	try
	{
		m_dom_cluster_src.LoadData(datadir, cddids, spfeats, genfeats, bscores, cdtrack, famlinks);
	}
	catch (CSimpleException &e)
	{
		LOG_POST_X(e_DataFileError, e.GetFile() << ':' << e.GetLine() << ": " << e.what());
		m_app_status = e_DataFileError;
		return;
	}
	catch (const std::exception &e)
	{
		LOG_POST_X(e_DataFileError, e.what());
		m_app_status = e_DataFileError;
		return;
	}
	catch (...)
	{
		LOG_POST_X(e_DataFileError, "Unknown error loading CDD data files");
		m_app_status = e_DataFileError;
		return;
	}
	
	const string & reg_sparchs = m_reg.Get(__reg_section_cdddata, __reg_tag_sparchs);
	if (!reg_sparchs.empty())
		sparchs = reg_sparchs.c_str();
	
	const string & reg_genarchs = m_reg.Get(__reg_section_cdddata, __reg_tag_genarchs);
	if (!reg_genarchs.empty())
		genarchs = reg_genarchs.c_str();
	
	try
	{
		m_arch_src.LoadData(datadir, sparchs, genarchs);
	}
	catch (CSimpleException &e)
	{
		LOG_POST_X(e_DataFileError, e.GetFile() << ':' << e.GetLine() << ": " << e.what());
		m_app_status = e_DataFileError;
		return;
	}
	catch (const std::exception &e)
	{
		LOG_POST_X(e_DataFileError, e.what());
		m_app_status = e_DataFileError;
		return;
	}
	catch (...)
	{
		LOG_POST_X(e_DataFileError, "Unknown error loading Architecture data files");
		m_app_status = e_DataFileError;
		return;
	}
	
	// -- input files named in the list file; entries that do not exist are skipped
	if (AV_InList.HasValue())
	{
		const CFile listfile(AV_InList.AsString());
		if (listfile.Exists())
		{
			ifstream lsfile(listfile.GetPath().c_str());
			string line;
			while (getline(lsfile, line))
			{
				if (!line.empty() && CFile(line).Exists())
					m_infiles.AppendFile(line);
			}
		}
	}
	
	// -- input files given as extra (positional) arguments, indexed from 1
	const size_t extrafiles = args.GetNExtra();
	for (size_t i = 1; i <= extrafiles; ++i)
	{
		const string & exfile = args[i].AsString();
		if (!exfile.empty() && CFile(exfile).Exists())
			m_infiles.AppendFile(exfile);
	}
	
	// -- explicit files first; stdin only when there is no mask to poll either
	if (m_infiles.NLeft() > 0)
		m_istream = m_infiles.OpenStream();
	else if (m_infmask.empty())	//no mask as well
		m_istream = &cin;
	
	if (AV_OutFile.HasValue())
	{
		const string & ofilename = AV_OutFile.AsString();
		m_outfile.open(ofilename.c_str(), ios::out);	// write. 
		if (m_outfile.good())
			m_ostream = &m_outfile;
		else	//keep going, m_ostream still points to stdout
			LOG_POST_X(e_DataFileError, "Output file " << ofilename << " open failure. Direct to stdout");
	}
}

/**
 * Processes all input and writes the sorted result to the output stream.
 *
 * Does nothing when Init() left an error in m_app_status. Otherwise the
 * input mask (if any) is split into a directory and a file name pattern,
 * the input is processed with the formatter that matches the output mode,
 * and the collected records are written in ascending sort-key order.
 * Nothing is written when processing ended with an error or an abort request.
 *
 * @return m_app_status (e_OK on success).
 */
int CSparcLabelApp::Run(void)
{
	if (e_OK != m_app_status)
		return m_app_status;
	
	CDir src_dir;
	string fmask;
	
	if (!m_infmask.empty())
	{
		const size_t pos = m_infmask.rfind('/');
		if (string::npos != pos)
		{
			// -- keep the "/" itself when the mask sits in the root directory
			src_dir = m_infmask.substr(0, 0 == pos ? 1 : pos);
			fmask = m_infmask.substr(pos + 1);
		}
		else	//no path, search the cwd with the whole argument as the pattern
		{
			src_dir = CDir::GetCwd();
			fmask = m_infmask;
		}
	}
	
	if (TOutputFormat::e_fasta == m_outmode)
	{
		ProcessInputFiles(&CSparcLabelApp::OutputNamedFasta, src_dir, fmask);
		if (e_OK == m_app_status && !m_sort_array.empty())
		{
			sort(m_sort_array.begin(), m_sort_array.end(), xSortFacility());
			// -- same bytes as before, but flushed once instead of per record
			for (const auto & v : m_sort_array)
				(*m_ostream) << m_fasta_data[v.m_idx] << '\n';
			m_ostream->flush();
		}
	}
	else
	{
		m_sort_array.reserve(NODE_SORT_BLOCK_SIZE);
		m_xml_data.reserve(NODE_SORT_BLOCK_SIZE);
		
		ProcessInputFiles(&CSparcLabelApp::OutputNamingXml, src_dir, fmask);
		
		if (e_OK == m_app_status)
		{
			// -- the document is written even when no protein was annotated
			bct::CNameAssignment name_assign;
			auto & prot_lst = name_assign.SetProtein();
			
			if (!m_sort_array.empty())
			{
				sort(m_sort_array.begin(), m_sort_array.end(), xSortFacility());
				for (const auto & v : m_sort_array)
					prot_lst.emplace_back(m_xml_data[v.m_idx]);
			}
			
			ObjStreamOut<bct::CNameAssignment>(*m_ostream, name_assign, ncbi::eSerial_Xml);
			(*m_ostream) << endl;
		}
	}
	
	return m_app_status;
}

// Reports on stderr whether Init() completed without error.
// Returns m_app_status unchanged.
int CSparcLabelApp::DryRun(void)
{
	const bool failed = (e_OK != m_app_status);
	
	if (failed)
	{
		cerr << "Dryrun failed, error subcode = " << m_app_status << endl;
		return m_app_status;
	}
	
	cerr << "Dryrun passed, all resources are present and loaded successfully." << endl;
	return m_app_status;
}

void CSparcLabelApp::Exit(void)
{
	// -- last
	CNcbiApplication::Exit();
}

// Nothing to release by hand: all members clean up after themselves.
CSparcLabelApp::~CSparcLabelApp(void) = default;



void CSparcLabelApp::OutputNamedFasta(const TDomQuery &q, const TDomArch *pArch, bool is_specific)
{
	static stringstream ss;
	
	string defl(q.m_OriDefline);
	if (defl.empty())
		defl = ">" + q.m_strNcbiId;

	if ('|' == defl.back())
		defl.pop_back();
		
	ss.str(k_strEmptyString);
	
	ss << defl << "|sparcle|" << pArch->m_uiArchId << '|' << pArch->m_strName << endl;
	StreamOutWithWrap(ss, __fasta_width, q.m_strSeqData.begin(), q.m_strSeqData.end()) << endl << endl;
	
	size_t sz = m_sort_array.size(), cap = m_sort_array.capacity();
	if (sz >= cap)
	{
		m_sort_array.reserve(cap + NODE_SORT_BLOCK_SIZE);
		m_fasta_data.reserve(cap + NODE_SORT_BLOCK_SIZE);
	}
	
	m_fasta_data.emplace_back(ss.str());
	size_t tokpos = defl.find('|');
	if (string::npos == tokpos)
		tokpos = defl.find(' ');
	m_sort_array.emplace_back(string::npos == tokpos ? defl.substr(1) : defl.substr(1, tokpos - 1) , sz);
}

void CSparcLabelApp::OutputNamingXml(const TDomQuery &q, const TDomArch *pArch, bool is_specific)
{
	map<string, TNameSources::const_iterator> :: const_iterator iterReviewLevel = m_review_levels.find(is_specific ? pArch->m_strReviewLevel : pArch->m_strReviewLevel + "0");

	// -- review level used
	if (m_review_levels.end() != iterReviewLevel)
	{
		CRef<bct::CProtein> protein(new bct::CProtein());
		auto & attrs = protein->SetAttlist();
		attrs.SetSeq_id(q.m_strNcbiId);

		protein->SetAssignedName(pArch->m_strName);

		auto & support = protein->SetSupport();
		auto & sparcle = support.SetSPARCLE();

		sparcle.SetName(pArch->m_strName);
		sparcle.SetLabel(pArch->m_strLabel);

		sparcle.SetArchId(to_string(pArch->m_uiArchId));

		auto & ssf = sparcle.SetSSF();
		auto & precedence = protein->SetPrecedence();

		precedence.SetEvidence_source(iterReviewLevel->second->m_NameSource);
		precedence.SetConfidence(to_string(iterReviewLevel->second->m_Confidence));

		if (is_specific)
			ssf.SetSpecificArch(pArch->m_strArchString);
		else
			ssf.SetSuperfamArch(pArch->m_strArchString);

		size_t sz = m_sort_array.size(), cap = m_sort_array.capacity();
		if (sz >= cap)
		{
			m_sort_array.reserve(cap + NODE_SORT_BLOCK_SIZE);
			m_xml_data.reserve(cap + NODE_SORT_BLOCK_SIZE);
		}
		m_xml_data.emplace_back(protein);
		m_sort_array.emplace_back(q.m_strNcbiId.substr(q.m_strNcbiId.find('|') + 1), sz);	
	}

}

/**
 * Reads BLAST archives (ASN.1 text) from m_istream until the stream is
 * exhausted and formats every query for which an architecture is found.
 *
 * For each archive the masks, alignments and queries are handed to the
 * alignment processor. For each resulting query the specific architecture
 * string is looked up first, and the general one only when no specific
 * string exists. Archives of an unexpected shape are logged and skipped.
 *
 * m_num_seqs_annotated is brought up to date on every way out of the
 * function. A null m_istream is treated as "nothing to read".
 *
 * @param lpfnFormatOut  member function that formats one annotated query
 */
void CSparcLabelApp::ProcessBlastOut11(CSparcLabelApp::TLPFNFormatOut lpfnFormatOut)
{
	// -- callers can get here without a stream, e.g. when no file could be collected
	if (nullptr == m_istream)
	{
		m_num_seqs_annotated = m_sort_array.size();
		return;
	}
	
	COfflCdAlignProcessor proc(&m_dom_cluster_src);
	
	while (m_istream->good())
	{
		CBlast4_archive objBlastOutput;
		try
		{
			ObjStreamIn<CBlast4_archive> (*m_istream, objBlastOutput, eSerial_AsnText);
		}
		catch (...)	//any read failure, including end of data, ends the processing
		{
			break;
		}
		
		int gcode = 1;	//standard
		
		list<TDomQuery> qobjs;
		// -- need to translate to this format
		// -- assume one TMaskedQueryRegions for each query
		
		if (!(objBlastOutput.IsSetRequest() && objBlastOutput.CanGetRequest()))
		{
			stringstream ss;
			ObjStreamOut<CBlast4_archive>(ss, objBlastOutput);
			LOG_POST_X(e_InputDataError, "Unable to get request object from input data. " << move(ss).str());
			continue;
		}
		
		if (!(objBlastOutput.IsSetResults() && objBlastOutput.CanGetResults()))
		{
			stringstream ss;
			ObjStreamOut<CBlast4_archive>(ss, objBlastOutput);
			LOG_POST_X(e_InputDataError, "Unable to get results object from input data. " << move(ss).str());
			continue;
		}
		
		TSeqLocInfoVector vecMasks;
		const CBlast4_archive::TResults & resobj = objBlastOutput.GetResults();
		
		if (resobj.IsSetMasks() && resobj.CanGetMasks())
		{
			const list< CRef< CBlast4_mask > > & blmasks = resobj.GetMasks();
			
			if (!blmasks.empty())
				ConvertBlastMaskListToSeqLocInfoVector(blmasks, vecMasks);
		}
		
		if (!(resobj.IsSetAlignments() && resobj.CanGetAlignments()))
		{
			stringstream ss;
			ObjStreamOut<CBlast4_archive::TResults>(ss, resobj);
			LOG_POST_X(e_InputDataError, "Wrong type of results data: CBlast4_get_search_results_reply::e_alignments expected. " << move(ss).str());
			continue;
		}
		
		const CSeq_align_set &rAligns = resobj.GetAlignments();
		const CBlast4_archive::TRequest::TBody &reqbody = objBlastOutput.GetRequest().GetBody();
		
		if (CBlast4_request_body::e_Queue_search != reqbody.Which())
		{
			stringstream ss;
			ObjStreamOut<CBlast4_archive::TRequest::TBody>(ss, reqbody);
			LOG_POST_X(e_InputDataError, "Wrong type of request data: CBlast4_request_body::e_Queue_search expected. " << move(ss).str());
			continue;
		}
		
		const CBlast4_queue_search_request &qsreq = reqbody.GetQueue_search();
		
		// -- take the genetic code from the request when it carries one
		if (qsreq.IsSetAlgorithm_options() && qsreq.CanGetAlgorithm_options())
		{
			const CBlast4_parameters &algoParams = qsreq.GetAlgorithm_options();
			CRef<CBlast4_parameter> refGCode = algoParams.GetParamByName("QueryGeneticCode");
			if (!refGCode.IsNull())
			{
				const CBlast4_value &gcval = refGCode->GetValue();
				if (CBlast4_value::e_Integer == gcval.Which())
					gcode = gcval.GetInteger();
			}
		}
		
		const CBlast4_queries &blast_queries = qsreq.GetQueries();
		bool queriesProcessed = true;
		
		switch (blast_queries.Which())
		{
		case CBlast4_queries::e_Bioseq_set:
			proc.ProcessBlastResults(qobjs, blast_queries.GetBioseq_set().GetSeq_set(), vecMasks, rAligns.Get(), gcode);
			break;
		case CBlast4_queries::e_Seq_loc_list:
			proc.ProcessBlastResults(qobjs, blast_queries.GetSeq_loc_list(), vecMasks, rAligns.Get(), gcode);
			break;
		default:
			{
				stringstream ss;
				ObjStreamOut<CBlast4_queries>(ss, blast_queries);
				
				LOG_POST_X(e_InputDataError, "Wrong type of request data: CBlast4_queries::e_Bioseq_set expected. " << move(ss).str());
				queriesProcessed = false;
			}
			break;
		}	//switch (blast_queries.Which())
		
		if (!queriesProcessed)
			continue;	//skip this archive
		
		for (const auto & v: qobjs)
		{
			string archStr, spArchStr;
			
			CreateArchStrings(m_dom_cluster_src, v.m_dimSplitAligns[0].m_vecConciseIndice, v.m_vecAlignments, archStr, spArchStr);
			const TDomArch * pArch = nullptr;
			bool is_spec = false;
			
			if (!spArchStr.empty())
			{
				pArch = m_arch_src.FindArch(spArchStr);
				is_spec = true;
			}
			else if (!archStr.empty())
			{
				archStr = ConvertDartString(archStr);
				pArch = m_arch_src.FindArch(archStr);
				is_spec = false;
			}
			
			// -- queries without a known architecture get no output
			if (nullptr != pArch)
				(this->*lpfnFormatOut)(v, pArch, is_spec);
		}
	}
	
	m_num_seqs_annotated = m_sort_array.size();
}

// Process the already-open input stream (if any), then poll a directory for
// input files matching a mask and process them batch by batch.
//
// lpfnFormatOut - member-function pointer forwarded to ProcessBlastOut11().
// src_dir       - directory that is scanned (non-recursively) for input files.
// fmask         - file-name mask handed to CDir::GetEntries().
//
// Termination:
//  * If the abort-flag file (m_abort_flg) exists, m_app_status is set to
//    e_AbortReq and the function returns at once. This is checked on entry
//    and at the top of every polling round.
//  * If no mask is configured (m_infmask empty), or the pre-opened stream was
//    processed and the exit condition already holds, no directory scan is done.
//  * Otherwise the directory is polled every 15 seconds until a round finds
//    no input file while the exit condition (m_exit_flg empty, or the file it
//    names exists) was already true *before* that round's directory listing.
//
// After a batch is processed, its files are moved to m_done_dump when that is
// set, otherwise removed.
void CSparcLabelApp::ProcessInputFiles(CSparcLabelApp::TLPFNFormatOut lpfnFormatOut, const CDir &src_dir, const string &fmask)
{
	if (!m_abort_flg.empty() && CFile(m_abort_flg).Exists())
	{
		m_app_status = e_AbortReq;
		return;
	}

	bool hasFlag = false;

	// -- a stream prepared by the caller is handled first
	if (nullptr != m_istream)
	{
		ProcessBlastOut11(lpfnFormatOut);
		hasFlag = (m_exit_flg.empty() || CFile(m_exit_flg).Exists());
	}

	if (m_infmask.empty() || hasFlag)	//no files to search
		return;

	CCombFileBuf::FileList InFilesFromMask;	//original files do not move or delete

	for (;;)
	{
		if (!m_abort_flg.empty() && CFile(m_abort_flg).Exists())
		{
			m_app_status = e_AbortReq;
			return;
		}

		// -- Snapshot the exit condition BEFORE listing the directory and only
		// -- act on it afterwards. Re-querying the flag after the listing would
		// -- let files that arrive together with the flag slip through unprocessed.
		hasFlag = (m_exit_flg.empty() || CFile(m_exit_flg).Exists());

		m_infiles.Reset();
		m_istream = nullptr;

		CDir::TEntries entries = src_dir.GetEntries(fmask, CDir::fIgnoreRecursive);
		for (const auto &v : entries)
		{
			if (!v->IsDir())	//ignore dir
				m_infiles.AppendFile(v->GetPath());
		}

		InFilesFromMask = m_infiles.GetFiles();

		if (!InFilesFromMask.empty())
		{
			// -- Only process when there is really something to read. A listing
			// -- that contains nothing but directories must not reach
			// -- ProcessBlastOut11() with a null m_istream.
			m_istream = m_infiles.OpenStream();
			ProcessBlastOut11(lpfnFormatOut);

			// -- move done files to dump location, or delete them if none is set
			// NOTE(review): the results of MoveToDir()/Remove() are not checked;
			// a file that cannot be moved or removed will be picked up again in
			// the next round.
			if (!m_done_dump.empty())
			{
				for (const auto &v : InFilesFromMask)
					CFile(v).MoveToDir(m_done_dump, CDirEntry::fRF_Overwrite);
			}
			else
			{
				for (const auto &v : InFilesFromMask)
					CFile(v).Remove();
			}

			InFilesFromMask.clear();
		}
		else if (hasFlag)  // nothing left to read and exit condition held before the listing: we are done
		{
			LOG_POST_X(e_OK, "Total sequences annotated: " << m_num_seqs_annotated);
			return;
		}

		SleepSec(15);
	}
}

int main(int argc, char * argv[])
{
	string launchAlias = CDirEntry::NormalizePath(argv[0]);
	string base, ext;
	CDirEntry::SplitPath(launchAlias, &CSparcLabelApp::m_st_launch_path, &base, &ext);
	if (!CSparcLabelApp::m_st_launch_path.empty())
		CSparcLabelApp::m_st_launch_path.pop_back();	//get rid of the tailing '/'
	// -- this is deliberately left alone
	CSparcLabelApp::m_st_exec_name = base + ext;
	
	CDir launchPathDir(launchAlias);
	launchPathDir.DereferencePath();
	
	CDirEntry::SplitPath(CDirEntry::CreateAbsolutePath(launchPathDir.GetPath(), CDirEntry::eRelativeToCwd), &CSparcLabelApp::m_st_real_path);
	if (!CSparcLabelApp::m_st_real_path.empty())
		CSparcLabelApp::m_st_real_path.pop_back();
	
	return CSparcLabelApp().AppMain(argc, argv);
	
}
