options(stringsAsFactors=F)
#suppressMessages(library(snowfall))
suppressMessages(library(data.table))
args <- commandArgs(trailingOnly = T)
dataDir <- args[1]
transExp <- args[2]
totalExp <- args[3]
outDir <- args[4]

#chech whether this lncRNA is expressed
trans_expr <- read.table(transExp, header=F, row.names=1)
if(sum(trans_expr[,2]) < 1){
    print("This transcript is not expressed or has very low expression, skip for co-expression analysis")
    quit(save="no", status=0)
}

#load data
BP_mat <- fread(paste0(dataDir, "/GOA.BP.mat.gz"), nThread=5)
MF_mat <- fread(paste0(dataDir, "/GOA.MF.mat.gz"), nThread=5)
ensg_has_go <- read.table(paste0(dataDir, "/ENSG_has_GO.txt"), header=F)
ensg_has_go <- ensg_has_go$V1
ensg2symbol <- read.table(paste0(dataDir, "/ENSG_to_symbol.txt"), header=T)
total_expr <- read.table(totalExp, header=T, row.names=1, check.names=F)
tran_expr_tmp <- as.numeric(trans_expr[colnames(total_expr),2])

# start 4 cpus
#suppressWarnings(sfInit(parallel = TRUE, cpus = 4))

# convert ensembl gene ID to gene symbol
convertID <- function(coTable) {
  # convert ensg ID to gene symbol for the co-expression table
  coTable <- as.data.frame(coTable)
  coTable[, 1] <- as.vector(coTable[, 1])
  tmp <- ensg2symbol[ensg2symbol[, 1] %in% coTable[, 1], ]
  convert <- tapply(as.vector(tmp[, 2]), as.factor(tmp[, 1]), function(x) paste(x, collapse=','))
  result <- cbind(coTable[, 1], convert[coTable[, 1]], coTable[, 2])
  result[is.na(result[, 2]), 2] <- 'NA'
  rownames(result) <- NULL
  result <- as.data.frame(result)
  return(result)
}

# fisher test
fisher_test <- function(x, ref.num, target.num){
  #x[refList]: how many genes in reference list were annotated to this term
  #x[geneList]: how many genes in target list were annotated to this term
  return(fisher.test(matrix(c(x["geneList"], x["refList"], target.num-x["geneList"], ref.num-x["refList"]), nrow=2))$p.value)
}

extract_GO_ID <- function(x){
  return(strsplit(x, "|", fixed=T)[[1]][1])
}
extract_GO_name <- function(x){
  return(strsplit(x, "|", fixed=T)[[1]][2])
}

# GO enrichment
GO_enrichment <- function(goa_mat, target_gene_list, cutoff, outFile){
  ref_gene_num <- dim(goa_mat)[2]-1 #first column is GO ID
  target_gene_num <- length(target_gene_list)
  dat_for_enrichment <- data.frame(refList=rowSums(goa_mat[,-1]), geneList=rowSums(goa_mat[, ..target_gene_list]))
  rownames(dat_for_enrichment) <- goa_mat[, go_id]
  dat_for_enrichment$pvalue <- apply(dat_for_enrichment, 1, fisher_test, ref.num=ref_gene_num, target.num=target_gene_num)
  dat_for_enrichment$qvalue <- p.adjust(dat_for_enrichment$pvalue, method = "BH")
  #output <- dat_for_enrichment[dat_for_enrichment$qvalue<=cutoff, ]
  output <- dat_for_enrichment[dat_for_enrichment$pvalue<=cutoff, ]
  if(dim(output)[1] > 1){
    output$GO <- unlist(lapply(rownames(output), extract_GO_ID))
    output$description <- unlist(lapply(rownames(output), extract_GO_name))
    output <- output[order(output$qvalue), c("GO", "description", "pvalue", "qvalue")]
    #output <- output[order(output$qvalue),]
    output$pvalue <- format(output$pvalue, scientific = T, digits = 3)
    output$qvalue <- format(output$qvalue, scientific = T, digits = 3)
    write.table(output, outFile, sep = '\t', row.names = F, col.names = T, quote = F)
   }else{
    print(paste0("no GO term passed the filtration: FDR < ", cutoff))
  }
}

Pcutoff <- 0.05
cutoffs <- c(0.9, 0.8, 0.7)

# calculate correlation
cors <- cor(tran_expr_tmp, t(total_expr))[1, ]
all_genes <- rownames(total_expr)

cat('---positive correlation---\n')
for(c in cutoffs) {
    enstp_out <- all_genes[cors >= c]
    enstp <- enstp_out[enstp_out %in% ensg_has_go]
    if (length(enstp) >= 10) break
}
cat(paste0('Final cutoff: ', c, '\n'))
enstp_out <- all_genes[cors >= c]
if(length(enstp_out) > 0){
    coNb <- cbind(enstp_out, cors[enstp_out])
    coNb[, 2] <- format(as.numeric(coNb[, 2]), digits = 4)
    coNb <- convertID(coNb)
    write.table(coNb, paste0(outDir, "/positive.correlated.gene.txt"), sep="\t", row.names = F, col.names = F, quote = F)
    # GO enrichment
    enstp <- enstp_out[enstp_out %in% ensg_has_go]
    if(length(enstp) > 1){
	GO_enrichment(BP_mat, enstp, Pcutoff, paste0(outDir, '/positive.correlated.gene.BP.txt'))
	GO_enrichment(MF_mat, enstp, Pcutoff, paste0(outDir, '/positive.correlated.gene.MF.txt'))
    }else{
	print("no protein coding genes with GO annotation in positive correlated gene list!")
    }
}else{
    print("no positive correlated genes found!")
}

cat('---negative correlation---\n')
for(c in cutoffs) {
    enstp_out <- all_genes[cors <= -c]
    enstp <- enstp_out[enstp_out %in% ensg_has_go]
    if (length(enstp) >= 10) break
}
cat(paste0('Final cutoff: ', -c, '\n'))
enstp_out <- all_genes[cors <= -c]
if(length(enstp_out) > 0){
    coNb <- cbind(enstp_out, cors[enstp_out])
    coNb[, 2] <- format(as.numeric(coNb[, 2]), digits = 4)
    coNb <- convertID(coNb)
    write.table(coNb, paste0(outDir, "/negative.correlated.gene.txt"), sep="\t", row.names = F, col.names = F, quote = F)
    # GO enrichment
    enstp <- enstp_out[enstp_out %in% ensg_has_go]
    if(length(enstp) > 1){
	GO_enrichment(BP_mat, enstp, Pcutoff, paste0(outDir, '/negative.correlated.gene.BP.txt'))
	GO_enrichment(MF_mat, enstp, Pcutoff, paste0(outDir, '/negative.correlated.gene.MF.txt'))
    }else{
	print("no protein coding genes with GO annotation in negative correlated gene list!")
    }
}else{
    print("no negative correlated genes found!")
}
