genomics: R code for gene to GO assignment

######################################################################################################
### GOHyperGAll: Global Hypergeometric Test Using Custom Gene-to-GO Mappings Plus GO Slim Analysis ###
######################################################################################################
# Author: Thomas Girke
# Last update: May 26, 2006
# Utility: To test a sample population of genes for over-representation of GO terms, the
# function 'GOHyperGAll' computes for all GO nodes a hypergeometric distribution test and
# returns the corresponding p-values. A subsequent filter function performs a GO Slim
# analysis using default or custom GO Slim categories.
# Note: GOHyperGAll provides similar utilities as the GOHyperG function in the GOstats package
# from BioConductor. The main difference is that GOHyperGAll simplifies the usage of custom
# chip-to-gene and gene-to-GO mappings.
#
# How it works:
# (A) Generate the required data objects (slow, but needs to be done only once)
# (B) Define GOhyperG_All function
# (C) Subsetting and plotting of results by assigned nodes or goSlim categories
# To demo the script and import all required functions, run the following source() command:
#         source("http://bioinfo.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/GOHyperGAll.txt")
# HTML Instructions:
#       http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/R_BioCondManual.html#GOHyperGAll

########################################
# (A) Generate the required data objects      
########################################
# (A.1) Generate sample data frames with assigned gene-to-GO mappings,
# one for MF, one for BP and one for CC mappings
# custom mappings can be used here, but need to have the same format as GO_XX_DFs in the following examples
# (A.1.1) Obtain mappings from geneontology.org
readGOorg <- function(myfile = "gene_association.tair", colno = c(5,3,9), org = "Arabidopsis") {        go_org <- read.delim(myfile, na.strings = "", header=F, comment.char = "!", sep="\t")        if(org == "Arabidopsis") {         go_org <- read.delim("gene_association.tair", na.strings = "", header=F, comment.char = "!", sep="\t")         id_vec <- gsub(".*(AT\\dG\\d\\d\\d\\d\\d).*", "\\1", as.character(go_org[,11]), perl=T)         go_org <- data.frame(go_org, GeneID=id_vec)         go_org <- go_org[grep("^AT\\dG\\d\\d\\d\\d\\d", as.character(go_org$GeneID), perl=T),]         go_org <- go_org[!duplicated(paste(go_org[,5], gsub("\\.\\d{1,}", "", as.character(go_org[,16]), perl=T), sep="_")),]         go_org <- go_org[,c(1:2,16,4:15,3)]        }        go_org <- go_org[ ,colno]        go_org <- na.omit(go_org)        names(go_org) <- c("GOID", "GeneID", "GOCAT")        GO_MF_DF <<- go_org[go_org[,3]=="F",]        write.table(GO_MF_DF, file="GO_MF_DF", quote=T, sep="\t")        cat("\n", "Object 'GO_MF_DF' created containing assigned gene-to-MFGO mappings. To use custom mappings, generate data frame with the same structure in col 1-2.", "\n")        GO_BP_DF <<- go_org[go_org[,3]=="P",]        write.table(GO_BP_DF, file="GO_BP_DF", quote=T, sep="\t")        cat("\n", "Object 'GO_BP_DF' created containing assigned gene-to-MFGO mappings. To use custom mappings, generate data frame with the same structure in col 1-2.", "\n")        GO_CC_DF <<- go_org[go_org[,3]=="C",]        write.table(GO_CC_DF, file="GO_CC_DF", quote=T, sep="\t")        cat("\n", "Object 'GO_CC_DF' created containing assigned gene-to-MFGO mappings. To use custom mappings, generate data frame with the same structure in col 1-2.", "\n") }  # (A.1.2) Obtain mappings from BioC sampleDFgene2GO <- function(lib="ath1121501") {        require(GOstats); require(lib, character.only=T)        affyGOMF <- eapply(get(paste(lib, "GO", sep=""), mode = "environment"), getOntology, "MF") # generates list with GeneID components containing MFGOs        GO_MF_DF <<- data.frame(GOID=unlist(affyGOMF), GeneID=rep(names(affyGOMF), as.vector(sapply(affyGOMF, length))), Count=rep(as.vector(sapply(affyGOMF, length)), as.vector(sapply(affyGOMF, length))))        write.table(GO_MF_DF, file="GO_MF_DF", quote=F, sep="\t")        cat("\n", "Object 'GO_MF_DF' created containing assigned gene-to-MFGO mappings. To use custom mappings, generate data frame with the same structure in col 1-2.", "\n")        affyGOBP <- eapply(get(paste(lib, "GO", sep=""), mode = "environment"), getOntology, "BP") # generates list with GeneID components containing BPGOs        GO_BP_DF <<- data.frame(GOID=unlist(affyGOBP), GeneID=rep(names(affyGOBP), as.vector(sapply(affyGOBP, length))), Count=rep(as.vector(sapply(affyGOBP, length)), as.vector(sapply(affyGOBP, length))))        write.table(GO_BP_DF, file="GO_BP_DF", quote=F, sep="\t")        cat("\n", "Object 'GO_BP_DF' created containing assigned gene-to-BPGO mappings. To use custom mappings, generate data frame with the same structure in col 1-2.", "\n")        affyGOCC <- eapply(get(paste(lib, "GO", sep=""), mode = "environment"), getOntology, "CC") # generates list with GeneID components containing CCGOs        GO_CC_DF <<- data.frame(GOID=unlist(affyGOCC), GeneID=rep(names(affyGOCC), as.vector(sapply(affyGOCC, length))), Count=rep(as.vector(sapply(affyGOCC, length)), as.vector(sapply(affyGOCC, length))))        write.table(GO_CC_DF, file="GO_CC_DF", quote=F, sep="\t")        cat("\n", "Object 'GO_CC_DF' created containing assigned gene-to-CCGO mappings. To use custom mappings, generate data frame with the same structure in col 1-2.", "\n") } cat("\n", "(A.1) To use the GOHyperGAll() function, one needs 3 data frames containing the gene-to-GO mappings MF, BP and CC. \n       Demo data sets from GO.org or BioC can be created with one of these commands: \n \t (A.1.1) For annotations from geneontology.org: \n \t readGOorg(myfile = \"gene_association.tair\", colno = c(5,3,9), org = \"Arabidopsis\") \n \t \t myfile: download annotation table from geneontology.org and unzip it. Then point function to file name. \n \t \t colno: required column numbers; default 'c(3,5,9)' should work in most cases \n \t \t org: needs to be specified only for Arabidopsis mappings \n \t (A.1.2) For annotations from BioC: \n \t sampleDFgene2GO(lib=\"ath1121501\") \n \t \t lib: defines annotation library, default is \"ath1121501\"", "\n")  # (A.2) Generate list containing gene-to-GO-OFFSPRING associations including assiged nodes # This is very slow (3x3 minutes), but needs to be done only once! gene2GOlist <- function() {        require(GOstats)        for(i in c("MF","BP","CC")) {                if(i=="MF") {                        go_offspr_list <- as.list(GOMFOFFSPRING) }                if(i=="BP") {                        go_offspr_list <- as.list(GOBPOFFSPRING) }                if(i=="CC") {                        go_offspr_list <- as.list(GOCCOFFSPRING) }                go_offspr_list <- lapply(go_offspr_list, unlist); go_offspr_list <- lapply(go_offspr_list, as.vector) # clean-up step for the list                go_offspr_list_temp <- lapply(names(go_offspr_list), function(x) c(x, go_offspr_list[[x]]) ) # include list component (GOID) names in corresponding (GOID) vectors                names(go_offspr_list_temp) <- names(go_offspr_list) # names list components after go_offspr_list                go_offspr_list <- go_offspr_list_temp                go_offspr_list <- lapply(go_offspr_list, function(x) x[!is.na(x)]) # remove NAs in vectors                if(i=="MF") {                        MF_node_affy_list <<- lapply(go_offspr_list, function(x) unique(as.vector(GO_MF_DF[GO_MF_DF$GOID %in% x, 2]))) # retrieve gene/affy IDs for GOID vectors                        save(MF_node_affy_list, file="MF_node_affy_list") }                if(i=="BP") {                        BP_node_affy_list <<- lapply(go_offspr_list, function(x) unique(as.vector(GO_BP_DF[GO_BP_DF$GOID %in% x, 2])))                        save(BP_node_affy_list, file="BP_node_affy_list") }                if(i=="CC") {                        CC_node_affy_list <<- lapply(go_offspr_list, function(x) unique(as.vector(GO_CC_DF[GO_CC_DF$GOID %in% x, 2])))                         save(CC_node_affy_list, file="CC_node_affy_list") }                cat("\n", paste("Object '", i, "_node_affy_list'", sep=""), "with gene-to-GO-OFFSPRING associations created and saved in your working directory.", "\n")        } } cat("\n", "(A.2) The corresponding gene-to-GO-OFFSPRING associations are created from the three data frames with the following \n       command. This creates 3 list objects with the required MF, BP and CC associations. \n \t gene2GOlist()", "\n")  # (A.3) Generate AffyID-to-GeneID mappings when working with chip feature IDs # This function creates a AffyID-to-GeneID mapping data frame using by default the TAIR mappings for the Arabidopsis ATH1 chip. # Once the decoding data frame 'affy2locusDF' is created, the function returns for a query set of AffyIDs the corresponding GeneIDs. # To use the function for the mappings of other chips, one needs to create the corresponding decoding data frame 'affy2locusDF'. AffyID2GeneID <- function(map = "ftp://ftp.arabidopsis.org/home/tair/Microarrays/Affymetrix/affy_ATH1_array_elements-2006-07-14.txt", affyIDs) {        if(!exists("affy2locusDF")) {                cat("\n", "Downloading AffyID-to-GeneID mappings, creating object 'affy2locusDF' and saving it in your working directory", "\n")                affy2locus <- read.delim(map, na.strings = "", fill=TRUE, header=T, sep="\t")[,-c(2:4,7:9)]                names(affy2locus) <- c("AffyID", "AGI", "Desc")                row.names(affy2locus) <- as.vector(affy2locus[,1])                my_list <- apply(affy2locus[,-c(3)], 1, list); my_list <- lapply(my_list, unlist)                my_list <- lapply(my_list, function(x) as.vector(x[-1]))                my_list <- lapply(my_list, strsplit, ";"); my_list <- lapply(my_list, unlist)                affy2locusDF <- data.frame(unlist(my_list))                affy2locusDF <- data.frame(rep(names(unlist(lapply(my_list, length))), as.vector(unlist(lapply(my_list, length)))), affy2locusDF)                names(affy2locusDF) <- c("AffyID", "GeneID")                affy2locusDF <<- affy2locusDF         write.table(affy2locusDF, file="affy2locusDF", quote=F, sep="\t")        }        if(!missing(affyIDs)) {         GeneIDs <- unique(as.vector(affy2locusDF[affy2locusDF[,1] %in% affyIDs, 2]))         return(GeneIDs)        } } cat("\n", "(A.3) To work with AffyIDs, the function AffyID2GeneID() can be used to import custom AffyID-to-GeneID mappings. \n \t AffyID2GeneID(map = \"ftp://ftp.arabidopsis.org/home/tair/Microarrays/Affymetrix/affy_ATH1_array_elements-2006-07-14.txt\") \n \t \t map: location of custom AffyID-to-GeneID mappings", "\n")  # (A.4) Next time things are much faster by reading the 6 data objects from file loadData <- function() {                need_affy2gene <- dir()         if(length(need_affy2gene[need_affy2gene=="affy2locusDF"]) > 0) {
        affy2locusDF <<- read.table(file="affy2locusDF", header=T, colClasses = "character")         }                GO_MF_DF <<- read.table(file="GO_MF_DF", header=T, colClasses = "character")                GO_BP_DF <<- read.table(file="GO_BP_DF", header=T, colClasses = "character")                GO_CC_DF <<- read.table(file="GO_CC_DF", header=T, colClasses = "character") } cat("\n", "(A.4) In future R sessions one can can omit the previous 3 steps (A.1-A.3) by importing all 6 (7) data objects like this: \n \t loadData(); load(file=\"MF_node_affy_list\"); load(file=\"BP_node_affy_list\"); load(file=\"CC_node_affy_list\")", "\n")  ########################### # (B) GOhyperG_All function ########################### # (B.1) Define GOhyperG_All function GOHyperGAll <- function(gocat="MF", sample) { # (go_df): Generates data frame containing the commonly used components for all GO nodes: GOID, GO Term and Ontology Type.        require(GOstats)        if(!exists("go_df")) {        go_df <- data.frame(GOID=unlist(eapply(GOTERM, function(x) x@GOID)), Term=unlist(eapply(GOTERM, function(x) x@Term)), Ont=unlist(eapply(GOTERM, function(x) x@Ontology)))        go_df <<- na.omit(go_df)        cat("\n", "Object 'go_df' created containing for all GO nodes the commonly used components: GOID, GO Term and Ontology Type", "\n")        } # (m): Obtain for every node in GO tree their number of associated genes or chip features        if(gocat=="MF") {node_affy_list <- MF_node_affy_list}        if(gocat=="BP") {node_affy_list <- BP_node_affy_list}        if(gocat=="CC") {node_affy_list <- CC_node_affy_list}        node_stats_df <- data.frame(NodeSize=sapply(node_affy_list, length))        node_stats_df <- data.frame(GOID=row.names(node_stats_df), node_stats_df)        row.names(node_stats_df) <- 1:length(node_stats_df[,1])        m <- as.vector(node_stats_df$NodeSize)                       # (x): Obtain for every node in GO tree the number of matching genes in sample set        node_sample_stats <- sapply(node_affy_list, function(x) { sum(unlist(x) %in% sample) } )        node_sample_stats <- as.vector(node_sample_stats)        x <- node_sample_stats         # (n): Obtain the number of unique genes associated at all GO nodes        if(gocat=="MF") { GO_DF <- GO_MF_DF }        if(gocat=="BP") { GO_DF <- GO_BP_DF }        if(gocat=="CC") { GO_DF <- GO_CC_DF }        n <- length(unique(GO_DF[, 2]))  # (k): Obtain number of unique genes in test sample that have GO mappings        k <- length(unique(GO_DF[GO_DF[,2] %in% sample, 2]))  # Obtain gene/chip keys matching at GO nodes        match_key <- sapply(node_affy_list, function(x) { x[unlist(x) %in% sample] } )        match_key <- sapply(match_key, function(x) { paste(x, collapse=" ") } )  match_key <- as.vector(match_key)  key <- match_key; key[key==""] <- "NA"   # Apply phyper function        phyp_v <- phyper(x-1, m, n-m , k, lower.tail = FALSE)         result_df <- data.frame(node_stats_df, SampleMatch=x, Phyper=phyp_v, SampleKeys=key)        result_df <- merge(result_df, go_df, x.by="GOID", y.by="GOID", all.x=T)        result_df <- result_df[order(result_df$Phyper), ]  result_df <- result_df[,c(1:4,6:7,5)]        result_df } cat("\n", "(B.1) The function GOHyperGAll() runs the phyper test against all nodes in the GO network. \n Usage: \n \t GOHyperGAll(gocat=\"MF\", sample=test_sample)[1:20,] \n \t \t gocat: \"MF\", \"BP\", \"CC\" \n \t \t test_sample <- unique(as.vector(GO_MF_DF[1:40,2])) # for GeneIDs\n \t \t test_sample <- AffyID2GeneID(affyIDs=affy_sample) # for AffyIDs \n \t \t affy_sample <- c(\"266592_at\", \"266703_at\", \"266199_at\", \"246949_at\", \"267370_at\", \"267115_s_at\", \"266489_at\", \"259845_at\", \"266295_at\", \"262632_at\")", "\n")  ################################################################################### # (C) Subsetting of results from GOHyperGAll by assigned nodes or goSlim categories ################################################################################### # (C.1) Define subsetting function        GOHyperGAll_Subset <- function(GOHyperGAll_result, sample=test_sample, type="goSlim", myslimv) { # type: "goSlim" or "assigned"; optional argument "myslimv" to privde custom goSlim vector                if(type=="goSlim") {                        if(missing(myslimv)) {                                slimv <- c("GO:0030246","GO:0008289","GO:0003676","GO:0000166","GO:0019825","GO:0005515","GO:0003824","GO:0030234","GO:0005554","GO:0003774","GO:0004871","GO:0005198","GO:0030528","GO:0045182","GO:0005215","GO:0000004","GO:0006519","GO:0007154","GO:0007049","GO:0016043","GO:0006412","GO:0006464","GO:0006810","GO:0007275","GO:0007049","GO:0006519","GO:0005975","GO:0006629","GO:0006139","GO:0019748","GO:0015979","GO:0005618","GO:0005829","GO:0005783","GO:0005768","GO:0005794","GO:0005739","GO:0005777","GO:0009536","GO:0005840","GO:0005773","GO:0005764","GO:0005856","GO:0005634","GO:0005886","GO:0008372","GO:0005576")                                } else {                                slimv <- myslimv }                        GOHyperGAll_subset <- GOHyperGAll_result[GOHyperGAll_result[,1] %in% slimv, ]                }                if(type=="assigned") {                        termGO <- c(as.vector(GO_MF_DF[GO_MF_DF$GeneID %in% test_sample, 1]),                                as.vector(GO_BP_DF[GO_BP_DF$GeneID %in% test_sample, 1]),                                as.vector(GO_CC_DF[GO_CC_DF$GeneID %in% test_sample, 1]))                        subset_v <- unique(termGO)                        GOHyperGAll_subset <- GOHyperGAll_result[GOHyperGAll_result[,1] %in% subset_v, ]                }                GOHyperGAll_subset        } cat("\n", "(C.1) The function GOHyperGAll_Subset() allows subsetting of the GOHyperGAll() results by assigned GO nodes or custom goSlim categories.", "\n", "Usage:", "\n", "\t GOHyperGAll_result <- GOHyperGAll(gocat=\"MF\", sample=test_sample)", "\n", "\t GOHyperGAll_Subset(GOHyperGAll_result, sample=test_sample, type=\"goSlim\")", "\n", "\t \t type: \"goSlim\" or \"assigned\"", "\n", "\t \t myslimv: optional argument allows usage of a custom goSlim vector", "\n")  # Apply subsetting function        # GOHyperGAll_Subset(GOHyperGAll_result, sample=test_sample, type="goSlim")  # (C.2) Plotting of subsetted results        # subset <- GOHyperGAll_Subset(GOHyperGAll_result, sample=test_sample, type="goSlim")        # pie(subset[subset$SampleMatch>0 ,3], labels=as.vector(subset[subset$SampleMatch>0 ,1]), main=unique(as.vector(subset[subset$SampleMatch>0 ,6])))
cat("\n", "(C.2) Plot pie chart of subsetted results: \n \t subset <- GOHyperGAll_Subset(GOHyperGAll_result, sample=test_sample, type=\"goSlim\") \n \t pie(subset[subset$SampleMatch>0 ,3], labels=as.vector(subset[subset$SampleMatch>0 ,1]), main=unique(as.vector(subset[subset$SampleMatch>0 ,6])))", "\n")

########################################################
# (D) Reduce GO Term Redundancy in 'GOHyperGAll_results'
########################################################
# (D.1) The function 'GOHyperGAll_Simplify' subsets the data frame 'GOHyperGAll_result' by a user
# specified p-value cutoff and removes from it all GO nodes with overlapping children sets
# (OFFSPRING). Only the best scoring nodes remain in the data frame.
# The argument 'correct' is experimental. It aims to favor the selection of distal (information rich)
# GO terms that have at the same time a large number of sample matches. The following calculation is used
# for this adjustment: phyper x Number_of_children / SampleMatch
# Define GOHyperGAll_Simplify()
GOHyperGAll_Simplify <- function(GOHyperGAll_result, gocat="MF", cutoff=0.001, correct=T) { # gocat: "MF", "BP" or "CC"; cutoff: p-value cutoff; correct: TRUE or FALSE  if(gocat!=as.vector(GOHyperGAll_result$Ont[!is.na(GOHyperGAll_result$Ont)])[1]) { stop("The GO categories in GOHyperGAll_Simplify() and GOHyperGAll_result need to match") }  testDF <- GOHyperGAll_result[GOHyperGAll_result$Phyper<=cutoff,]  testDF <- data.frame(testDF, test=rep(0, times=length(testDF[,1])))  testDF <- testDF[!is.na(testDF$Ont),]  GOIDv <- NULL  GO_OL_Matchv <- NULL  while(sum(testDF$test==0)>0) {
  clusterv <- NULL   test <- as.vector(testDF[,1])   for(j in 1:length(test)) {    if(gocat=="MF") { mymatch <- sum(unique(na.omit(c(test[j], as.list(GOMFOFFSPRING)[[test[j]]])) %in% na.omit(c(test[1], as.list(GOMFOFFSPRING)[[test[1]]]))))     if(mymatch==1) { mymatch <- length(as.list(GOMFOFFSPRING)[[test[j]]])  }    }    if(gocat=="BP") { mymatch <- sum(unique(na.omit(c(test[j], as.list(GOBPOFFSPRING)[[test[j]]])) %in% na.omit(c(test[1], as.list(GOBPOFFSPRING)[[test[1]]]))))     if(mymatch==1) { mymatch <- length(as.list(GOBPOFFSPRING)[[test[j]]])  }    }    if(gocat=="CC") { mymatch <- sum(unique(na.omit(c(test[j], as.list(GOCCOFFSPRING)[[test[j]]])) %in% na.omit(c(test[1], as.list(GOCCOFFSPRING)[[test[1]]]))))     if(mymatch==1) { mymatch <- length(as.list(GOCCOFFSPRING)[[test[j]]])  }    }    clusterv <- c(clusterv, mymatch)   }   clusterv[clusterv==0] <- NA   testDF <- data.frame(testDF[,-8], test=clusterv)   if(correct==T) {    testDF <- data.frame(testDF, decide=testDF$Phyper * (testDF$test/testDF$SampleMatch))    } else {    testDF <- data.frame(testDF, decide=testDF$Phyper) }   GOIDv <- c(GOIDv, as.vector(testDF[order(testDF[,9]),][1,1]))   GO_OL_Matchv <- c(GO_OL_Matchv, length(unique(unlist(strsplit(as.vector(testDF[!is.na(testDF$test),7]), " ")))))   testDF <- testDF[is.na(testDF$test),]   testDF <- testDF[order(testDF[,4]),-c(8,9)]   testDF <- data.frame(testDF, test=rep(0, times=length(testDF[,1])))   cat(GOIDv, "\n")  }  simplifyDF <- data.frame(GOID=GOIDv, GO_OL_Match=GO_OL_Matchv)  simplifyDF }  # Apply GOHyperGAll_Simplify # simplifyDF <- GOHyperGAll_Simplify(GOHyperGAll_result, gocat="MF", cutoff=0.001, correct=T) # data.frame(GOHyperGAll_result[GOHyperGAll_result[,1] %in% simplifyDF[,1], -7], GO_OL_Match=simplifyDF[,2])   ####################################### # (D.2) Batch Analysis of Gene Clusters ####################################### # The function 'GOCluster_Report' performs a 'GOHyperGAll_Simplify' analysis in batch mode. # It processes many groups of genes (e.g. from gene expression clusters) and organizes the results # in a single data frame. # The gene sets or clusters need to be provided in a data frame of this structure: #  probeID/geneID ClusterID ClusterSize # id1  CL1  2 # id2  CL1  2 # id3  CL2  1 # ...  ...  ...   # # Define 'GOCluster_Report()'  GOCluster_Report <- function(CL_DF=CL_DF, CLSZ=10, cutoff=0.001, gocat=c("MF", "BP", "CC"), recordSpecGO=NULL) { # CLSZ: cluster size cutoff; gocat: "MF", "BP" or "CC"; cutoff: p-value cutoff; recordSpecGO: argument to include one specific GOID in each of the 3 ontologies, e.g: recordSpecGO=c("GO:0005554", "GO:0000004", "GO:0008372")        cluster_loop <- unique(as.vector(CL_DF[CL_DF[,3]>CLSZ,2]))
      if(length(cluster_loop[grep("CL", cluster_loop)])>0) {
              cluster_loop <- paste("CL", sort(as.numeric(gsub("CL","", as.character(cluster_loop)))), sep="")                containerDF <- data.frame(CLID=NULL, CLSZ=NULL, GOID=NULL, NodeSize=NULL, SampleMatch=NULL, Phyper=NULL, Term=NULL, Ont=NULL, GO_OL_Match=NULL)        }        for(i in cluster_loop) {                cat("\n", "Processing cluster no", i, "\n")                affy_sample <- CL_DF[CL_DF[,2]==i, 1]                test_sample <- AffyID2GeneID(affyIDs=affy_sample)                containerDF2 <- data.frame(GOID=NULL, NodeSize=NULL, SampleMatch=NULL, Phyper=NULL, Term=NULL, Ont=NULL, GO_OL_Match=NULL)                count <- 0   for(j in c("MF", "BP", "CC")) {                        count <- count+1    GOHyperGAll_result <- GOHyperGAll(gocat=j, sample=test_sample)                        simplifyDF <- GOHyperGAll_Simplify(GOHyperGAll_result, gocat=j, cutoff=cutoff, correct=T)                        if(length(simplifyDF)==0) { simplifyDF <- GOHyperGAll_Simplify(GOHyperGAll_result[1:2,], gocat=j, cutoff=1, correct=T) }                        tempDF <- data.frame(GOHyperGAll_result[GOHyperGAll_result[,1] %in% simplifyDF[,1], -7], GO_OL_Match=simplifyDF[,2])                        containerDF2 <- rbind(containerDF2, tempDF)    if(length(recordSpecGO)>0) {
    containerDF2 <- rbind(containerDF2, data.frame(GOHyperGAll_result[GOHyperGAll_result[,1]==recordSpecGO[count],-7], GO_OL_Match=GOHyperGAll_result[GOHyperGAll_result[,1]==recordSpecGO[count],3]))    }    no_annot <- test_sample[!test_sample %in% eval(parse(text=paste("GO_", j, "_DF", sep="")))[,2]]; no_annot <- length(no_annot[no_annot!="no_match"])    containerDF2 <- rbind(containerDF2, data.frame(GOID=paste("no_annot_", j, sep=""), NodeSize=NA, SampleMatch=no_annot, Phyper=NA, Term=NA, Ont=j, GO_OL_Match=no_annot))                }                tempDF2 <- data.frame(CLID=rep(i, times=length(containerDF2[,1])), CLSZ=rep(unique(as.vector(CL_DF[CL_DF[,2]==i,3])), times=length(containerDF2[,1])), containerDF2)                containerDF <- rbind(containerDF, tempDF2)        }        containerDF }  # Apply GOCluster_Report # BatchResult <- GOCluster_Report(CL_DF=CL_DF, CLSZ=10, cutoff=0.001, gocat=c("MF", "BP", "CC"), recordSpecGO=c("GO:0005554", "GO:0000004", "GO:0008372"))
2 comments:

kyle said...: Do u have anything on how to retrieve enzyme information (EC number e.g 3.6.1.17) from KEGG for a particular reaction (e.g R00184) or from set of genes/compounds (e.g from C01260 to C00002).. thanks alot; 11:42 PM
Business news said...: I like to read an article that express interesting views. Many points the author made are unique and are things I wouldn't have thought about. One in particular (mentioned by a certain post-punk group) leads a normal life, and nobody notices he's a goid, and he's happy that way.; 9:44 AM
Wednesday, August 30, 2006

R code for gene to GO assignment

2 comments: