#!/bin/bash

echo "start run exporession module..."

binDir=$(cd `dirname $0`; pwd)
transDir=$1
species=$2
exprDir=$3
sample_name=`basename $exprDir`
dataDir=$4/$species/expression
annoDir=$binDir/../../data/$species/annotation
outDir=$transDir/expression
rm -rf $outDir
mkdir -p $outDir
cuffDir=$outDir/cuffcmp

mkdir -p $cuffDir

########## get the gtf file ###########
bedExonFile=$transDir/exon.bed
if [ ! -s $bedExonFile ]; then
	echo "[Exit 1] $bedExonFile is empty or not existed."
	exit 1
fi
gtfFile=$transDir/exon.gtf
python $binDir/../utils/bed2gtf.py < $bedExonFile > $gtfFile

############ run cuffcmp ###########
case $species in
	hg19)
		gmGTF=$dataDir/annolnc_v1_exon.gtf.gz
		anno=$annoDir/annolnc_v1_exon.gtf
		;;
	hg38)
		gmGTF=$dataDir/GENCODE.v32.exon.gtf.gz
		anno=$annoDir/gencode.v32.annotation.gtf
		;;
	mm10)
		gmGTF=$dataDir/GENCODE.vM23.exon.gtf.gz
		anno=$annoDir/gencode.vM23.annotation.gtf
		;;
	*)
        echo "Unsupported species: $species"
        exit 1
        ;;
esac

cd $cuffDir
cuffcompare -T -R -r <(zcat $gmGTF) $gtfFile

trackingFile=$cuffDir/cuffcmp.tracking
resultFile=$cuffDir/result.txt
# if no tracking file generated, treat this transcript as a novel locus
if [ ! -s $trackingFile ]; then
	echo novel_locus > $resultFile
else
	########## decode the cuffcompare result ##########
	python $binDir/../utils/decode_cuffcompare.py < $trackingFile > $resultFile
	if [ ! -s $resultFile ]; then
		echo "[Exit 1] $resultFile is empty or not existed."
		exit 1
	fi
fi

###### call expression #######
novelGTFfile=$transDir/novel_trans.gtf
novel=0
tp=`head -n 1 $resultFile`
sample_group=$(ls $exprDir/*.sample.list)
for sample in $sample_group;do
	sample_label=$(basename $sample | sed 's/.sample.list//')
	bamDir=$exprDir/$sample_label
	eDir=$outDir/$sample_label
	mkdir -p $eDir
	###### prepare for locexpress if not exist ######
	if [ ! -f $bamDir/normalized.fpkm.txt ] || [ ! -f $bamDir/geo_factors.txt ] || [ ! -f $bamDir/frag_stat.txt ]; then
		bash $binDir/prepare_for_LocExpress.sh $bamDir $anno
	fi
	if [ "$tp" = "known_trans" ]; then
		trans=`tail -n 1 $resultFile` #do not remove postfix, because transcripts in chromosome Y need postfix to distinguish them from paralogous genes in chrX
		expFile=$bamDir/normalized.fpkm.txt
		cat $expFile | awk -v trans="$trans" '$1==trans' | tr '\t' '\n' | sed '1 d' > $eDir/normalized.fpkm.tmp
		paste $sample $eDir/normalized.fpkm.tmp > $eDir/normalized.fpkm
		rm -f $eDir/normalized.fpkm.tmp
	elif [ "$tp" = "novel_trans" ]; then
		novel=1
		gene=`tail -n 1 $resultFile`
		cat $gtfFile | sed -r 's/(gene_id ")T[0-9]+/\1'$gene'/' > $novelGTFfile
	elif [ "$tp" = "novel_locus" ]; then
		novel=1
		cp $gtfFile $novelGTFfile
	fi

	# calcualte expression for novel trans
	if [ $novel -eq 1 ]; then
		cmd=" bash $binDir/locexpress.sh $transDir $bamDir $gmGTF"
		echo $cmd
		eval $cmd
	fi
	Rscript $binDir/final_output.R $eDir/normalized.fpkm $eDir/final.fpkm
	ls $eDir | sed "s:^:`echo $eDir`/:" | grep -v 'final' | grep -v 'normalized' | xargs rm -rf
done

###removew tmp files
#rm -rf $outDir/cuffcmp
echo "expression module is done!"
