options(stringsAsFactors=F)
suppressMessages(library(data.table))
suppressMessages(library(GOstats))
suppressMessages(library(GSEABase))
suppressMessages(library(reshape2))
args <- commandArgs(T)
dataDir <- args[1]

# load data
load(paste0(dataDir, '/GO_set.Rdata'))
geneIDs <- geneIds(gsc)
ontology <- read.csv(paste0(dataDir, "/GO.ID.name.map"), header=F, row.names=1, sep="\t")
ontology.MF <- rownames(ontology)[ontology[,2]=="molecular_function"]
ontology.MF <- intersect(ontology.MF, names(geneIDs))
ontology.BP <- rownames(ontology)[ontology[,2]=="biological_process"]
ontology.BP <- intersect(ontology.BP, names(geneIDs))

# add GO name
add_name <- function(x){
  return(paste(x, ontology[x,1], sep="|"))
}

goa <- melt(geneIDs)
colnames(goa) <- c("ensembl_gene_id", "go_id")
goa$value <- 1
goa <- data.table(goa)
go_mat <- data.table::dcast(goa, go_id~ensembl_gene_id, mean, value.var="value", fill=0)

# set matrix for molecular function
print("set matrix for molecular function")
MF_mat <- go_mat[go_mat[,go_id] %in% ontology.MF,]
MF_mat$go_id <- unlist(lapply(MF_mat$go_id, add_name))
gz1 <- gzfile(paste0(dataDir, "/GOA.MF.mat.gz"), 'w')
write.table(MF_mat, gz1, sep=";", quote=F, row.names=F, col.names=T)
close(gz1)

# set matrix for biologic process
print("set matrix for biologic process")
BP_mat <- go_mat[go_mat[,go_id] %in% ontology.BP,]
BP_mat$go_id <- unlist(lapply(BP_mat$go_id, add_name))
gz2 <- gzfile(paste0(dataDir, "/GOA.BP.mat.gz"), 'w')
write.table(BP_mat, gz2, sep=";", quote=F, row.names=F, col.names=T)
close(gz2)
