"""Build a transcript-by-sample FPKM table from per-sample GTF files.

Usage:
    python SCRIPT SAMPLE_FILE TRANSCRIPT_FILE DATA_DIR OUT_FILE

SAMPLE_FILE and TRANSCRIPT_FILE are header-less, tab-separated files whose
first column holds sample names and transcript ids respectively.  For every
sample, ``DATA_DIR/<sample>.gtf`` is read, the FPKM value of each
``transcript`` record is extracted from the attribute column, and the value is
stored in the output table.  Transcripts that are not present in a sample's
GTF keep the initial value ``0``.  The table is written to OUT_FILE as
tab-separated text with a header row and the transcript ids as first column.
"""
import os
import sys

import pandas as pd


def parse_transcript_attributes(attributes):
    """Return ``(transcript_id, fpkm)`` parsed from one GTF attribute string.

    The attribute string is expected to look like
    ``gene_id "G"; transcript_id "T"; cov "1.0"; FPKM "2.0"; ...``.
    Values are looked up by key so that additional attributes placed before
    ``FPKM`` do not shift the result.  When a key is absent the function
    falls back to the fixed quote positions (4th and 8th quote-delimited
    token) that this script historically relied on.

    Both values are returned as strings, exactly as written in the file.

    Raises:
        IndexError: if a key is missing and the positional fallback is out
            of range for the given string.
    """
    tokens = attributes.split('"')
    # Splitting on the quote character yields alternating key/value tokens:
    # even positions hold '; key ' fragments, odd positions hold the values.
    fields = {
        key.strip(" ;\t"): value
        for key, value in zip(tokens[0::2], tokens[1::2])
    }
    transcript_id = (
        fields["transcript_id"] if "transcript_id" in fields else tokens[3]
    )
    fpkm = fields["FPKM"] if "FPKM" in fields else tokens[7]
    return transcript_id, fpkm


def extract_transcript_fpkm(gtf):
    """Return a Series of FPKM strings indexed by transcript id.

    Args:
        gtf: DataFrame of a GTF file read without header, so that column
            ``2`` is the feature type and column ``8`` the attribute string.

    Only rows whose feature type is ``"transcript"`` are considered.
    """
    attributes = gtf.loc[gtf[2] == "transcript", 8]
    parsed = [parse_transcript_attributes(text) for text in attributes.values]
    return pd.Series(
        [fpkm for _, fpkm in parsed],
        index=[transcript_id for transcript_id, _ in parsed],
        dtype=object,
    )


def fill_sample_column(out, sample, fpkm):
    """Write the FPKM values of one sample into column ``sample`` of ``out``.

    Only transcripts that are part of ``out.index`` are written; all other
    rows of the column are left untouched.  ``out`` is modified in place.
    """
    selected = fpkm[fpkm.index.isin(out.index)]
    if selected.empty:
        return
    # Index with an Index/list of labels: pandas >= 2.0 rejects ``set``
    # objects as ``.loc`` indexers.  The assignment aligns on the labels.
    out.loc[selected.index, sample] = selected


def build_fpkm_matrix(sample_ids, transcript_ids, data_dir):
    """Return the transcript-by-sample table for all samples.

    Args:
        sample_ids: sample names; ``<data_dir>/<name>.gtf`` is read for each.
        transcript_ids: transcript ids that form the rows of the result.
        data_dir: directory containing the per-sample GTF files.
    """
    # dtype=object keeps the initial integer 0 and the FPKM strings side by
    # side without an implicit (and, in recent pandas, deprecated) upcast.
    out = pd.DataFrame(0, index=transcript_ids, columns=sample_ids, dtype=object)
    # Original author's note: all gtf have the same gene order.
    for sample in sample_ids:
        print("load " + sample)
        gtf = pd.read_csv(
            os.path.join(data_dir, sample + ".gtf"),
            sep="\t",
            header=None,
            comment="#",
        )
        fill_sample_column(out, sample, extract_transcript_fpkm(gtf))
    return out


def main(argv=None):
    """Command-line entry point; ``argv`` defaults to ``sys.argv[1:]``."""
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 4:
        sys.exit(
            "usage: python "
            + os.path.basename(sys.argv[0])
            + " SAMPLE_FILE TRANSCRIPT_FILE DATA_DIR OUT_FILE"
        )
    sample_file, trans_file, data_dir, out_file = args[:4]

    # dtype=str: purely numeric names/ids must stay strings so that they can
    # be concatenated into file names and matched against the GTF ids.
    sample_list = pd.read_csv(sample_file, header=None, sep="\t", dtype=str)
    trans_list = pd.read_csv(trans_file, header=None, sep="\t", dtype=str)

    out = build_fpkm_matrix(sample_list[0], trans_list[0], data_dir)
    out.to_csv(out_file, sep="\t", header=True, index=True)


if __name__ == "__main__":
    main()