options(stringsAsFactors=F) args <- commandArgs(T) geneListFile <- args[1] species <- args[2] outDir <- args[3] if(species %in% c('hg19', 'hg38')){ dataset <- "hsapiens_gene_ensembl" symbol <- "hgnc_symbol" organism <- "Homo sapiens" } if(species %in% c('mm10')){ dataset <- "mmusculus_gene_ensembl" symbol <- "mgi_symbol" organism <- "Mus musculus" } # =============== get the ENSG map to GO =================== suppressMessages(library(biomaRt)) gene_to_trans <- read.table(geneListFile, header=F, sep="\t") gene_id <- unlist(lapply(gene_to_trans$V1, function(x){return(strsplit(x, ".", fixed=T)[[1]][1])})) gene_id <- unique(gene_id) ensembl <- useMart("ensembl",dataset=dataset, host="uswest.ensembl.org") goids <- getBM(values = gene_id, filters = 'ensembl_gene_id', attributes=c('ensembl_gene_id','go_id', 'go_linkage_type'), mart = ensembl) goids <- goids[goids[, 2] != '', ] ensg_has_go <- unique(goids[, 1]) write.table(ensg_has_go, file = paste(outDir, 'ENSG_has_GO.txt', sep="/"), row.names=F, col.names=F, quote=F) # =============== get the gsc for GO ==================== suppressMessages(library(GOstats)) suppressMessages(library(GSEABase)) goframeData <- goids[, c(2, 3, 1)] goFrame <- GOFrame(goframeData,organism=organism) goAllFrame <- GOAllFrame(goFrame) gsc <- GeneSetCollection(goAllFrame, setType = GOCollection()) save(gsc, file = paste(outDir, 'GO_set.Rdata', sep="/")) # ================== get the ENSG to symbol =============== ensg2symbol <- getBM(values = gene_id, filters = 'ensembl_gene_id', attributes=c('ensembl_gene_id',symbol), mart = ensembl) ensg2symbol <- ensg2symbol[ensg2symbol[, 2] != '', ] write.table(ensg2symbol, file = paste(outDir, 'ENSG_to_symbol.txt', sep="/"), row.names=F, col.names=T, quote=F, sep="\t")