Friday, December 15, 2006

Writing Background correction code for a large number of CEL files

# This is the working back ground correction script
# Author: Sucheta
# Date: 9th Dec 2006

library(affy)

library(gcrma)

# Affinity info need not be calculated all the time, since it is
#mostly preserved with the workspace
#affinfo <- compute.affinities("soybean")


celfile<-read.table("list.txt") # read the elements of list.txt as a list

celfile<-as.matrix(celfile) # convert celfile from a list to a matrix

n<-dim(celfile)[1] # Getting the number of cel files

m <- 18;

dir<-"/home/XXX/cel/"

outdir<-"/home/XXX/bgCorrect/"




print(n);


k <- (m - 2*m)+1;

i<- k;

while( { i = i + m } < n){ # Replace for loop with while loop

print(i);

file <- NULL;
obj <- NULL;
tmp1 <- NULL;

for(j in 0:(m-1)){

tmpFile <- paste(dir, celfile[i+j], sep="");

file <- c(file, list(tmpFile)) # appending a list to the existing list

}

obj<-ReadAffy(filenames=file)


tmp1<-bg.adjust.gcrma(obj,optical.correct=T,affinity.info=affinfo)

outfile<-paste(outdir, celfile[i,1], "_bg.txt", sep="")

write.table(pm(tmp1), file=outfile, row.names=FALSE, col.names=TRUE, sep="\t")


}

Wednesday, August 30, 2006

R script for KEGG analysis

###########################################################
### R Script to Map Arabidpsis AGI Keys KEGG Pathways ###
###########################################################
# Author: Thomas Girke, Mar 2006
# (A) Usage
# (A.1) Assign your locus IDs (AGI Keys) to vector
# (A.2) Load kegg_stat_fct as shown under demo (A.4)
# (A.3) Run kegg_stat_fct like this:
# kegg_stat_fct(locus_ids, type=2)
# argument 'type':
# 2 = Pathway
# 4 = PathwayClass
# (A.4) To demo what the script does, run it like this:
# source("http://bioinfo.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/KEGG_ATH.txt")
# (B) Dependencies
# (B.1) Pathway category file "KEGG_ATH.annot" created from these pages:
# http://www.genome.jp/kegg/KGML/KGML_v0.6/ath/index.html
# http://www.genome.jp/kegg/KGML/KGML_v0.6/ath/index2.html
# (B.2) Script can be used for other organisms after adjusting the regular expressions in part (1),
# and creating the corresponding pathway category file (KEGG_ATH.annot)

# (C) Script
# (C.1) Import ATH KEGG annotations: the following commands are only necessary to update the info in kegg data frame (see below)
# Import pathway category file "KEGG_ATH.annot"
# kegg <- read.table("http://bioinfo.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/KEGG_ATH.annot", h=T, sep="\t")
# If an object "mylist" is present then remove it, since the following loop will append to it.
# if(exists("mylist")) { rm(mylist) }
# for loop to import and reformat remote pathway xml files one-by-one by 'PathID' in 'kegg' (103 iterations)
# for(i in 1:length(kegg[,3])) {
# zzz <- readLines(paste("http://www.genome.jp/kegg/KGML/KGML_v0.6/ath/", kegg[i,3], ".xml", sep="")) # read lines into vector
# zzz <- zzz[grep("at\\dg\\d\\d\\d\\d\\d", as.character(zzz), ignore.case = T, perl=T)] # obtain fields with AGI keys
# zzz <- zzz[-grep("link=", as.character(zzz), ignore.case = T, perl=T)] # remove duplications
# zzz <- gsub(".*name=\\\"|\".*", "", as.character(zzz), ignore.case = T, perl=T) # remove xml tags
# zzz <- gsub("ath:", "", as.character(zzz), ignore.case = T, perl=T) # obtain clean AGI keys
# zzz <- unlist(strsplit(zzz, " ")) # split fields with several keys; strsplit generates list of vectors and unlist converts it into one vector
# zzz <- unique(zzz) # makes AGIs unique within each PathID
# if(!exists("mylist")) # if "mylist" does not exist then create it in next line, otherwise append to "mylist" in second line
# { mylist <- list(zzz) } else
# { mylist <- c(mylist, list(zzz)) }
# names(mylist)[i] <- as.vector(kegg[i,3]) # assign names by 'PathID'
# cat(i, as.vector(kegg[i,3]), "\n", sep = " ") # print iteration number and 'PathID' to monitor loop status
# }
# keggat <- data.frame(unlist(mylist)) # transform list of vectors into data frame, tracking of one-to-many relationship is resolved by appending counts to PathIDs
# keggat <- data.frame(PathID=gsub("(ath\\d\\d\\d\\d\\d).*", "\\1", row.names(keggat), perl=T), AGI=keggat[,1]) # remove from PathIDs appended counts
# keggat <- data.frame(PathID=keggat$PathID, AGI=gsub("([A-Z])", "\\U\\1", as.character(keggat$AGI), perl=T, ignore.case=T)) # make AGI keys uniform by setting them to upper case
# kegg <- merge(keggat, kegg, by.x="PathID", by.y="PathID", all.x=T) # merge data frames 'keggat' and 'kegg' (KEGG_ATH.annot) by PathIDs

# If no update is required, then read the Arab KEGG mappings from this file:
kegg <- read.table("http://bioinfo.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/kegg_ath", h=T, sep="\t")

# (C.2) Define function: "kegg_stat_fct"
kegg_stat_fct <- function(locus_ids, type=2) { # argument 'type': 2 = Pathway, 4 = PathwayClass (see kegg)
my_select <- merge(kegg, as.data.frame(locus_ids), by.x="AGI", by.y=1, all.y=T) # obtain kegg data frame that cotains only rows for AGI test keys (locus_ids)
x <- as.character(my_select[,type]) # create vector with Pathway (2) or PathwayClass (4) strings in my_select
x[is.na(x)] <- "no_annot" # fill NA fields with "no_annot"
my_select <- data.frame(count_col=x, my_select[,-type]) # combine x and my_select
kegg_stat <- data.frame(table(my_select$count_col)) # obtain counts for count_col
kegg_stat <- data.frame(kegg_stat, Percent=round(kegg_stat$Freq/sum(kegg_stat$Freq)*100,2)) # append percent calculation for counts
names(kegg_stat)[1] <- "Cat" # give frist column a descriptive title
if(type==2) # if type==2 then merge kegg_stat with kegg data frame on column 1, if type==4 then merge kegg_stat with kegg data frame on column 4
{ kegg_stat <- merge(kegg_stat, kegg[!duplicated(kegg[,type-1]),], by.x="Cat", by.y=type-1, all.x=T) } else
{ kegg_stat <- merge(kegg_stat, kegg[!duplicated(kegg[,type]),], by.x="Cat", by.y=type, all.x=T) }
kegg_stat
}

# (C.3) Use function: "kegg_stat_fct"
cat("\n", "You have imported the function 'kegg_stat_fct'.", "\n\n", "Here is a short demo for AGI keys:", "\n") # print usage of function
AGI_vec <- c("AT1G23800", "AT5G63620", "AT1G30120", "AT2G07050", "AT2G22830", "AT2G38700", "AT1G29910")
print(AGI_vec)
kegg_stat <- kegg_stat_fct(locus_ids=AGI_vec, type=4)
print(kegg_stat)
pie(kegg_stat[kegg_stat[,2]>0 ,3], labels=as.vector(kegg_stat[kegg_stat[,2]>0 ,1]), main="KEGG") # plot final result

cat("\n", "Usage:", "\n", "\t kegg_stat <- kegg_stat_fct(locus_ids=AGI_vec, type=4)", "\n", "\t Arguments: 'locus_ids', vector of locus keys; 'type=2', Pathway; 'type=4', PathwayClass", "\n", "\t pie(kegg_stat[kegg_stat[,2]>0 ,3], labels=as.vector(kegg_stat[kegg_stat[,2]>0 ,1]), main=\"KEGG\")", "\n")

R code for gene to GO assignment

######################################################################################################
### GOHyperGAll: Global Hypergeometric Test Using Custom Gene-to-GO Mappings Plus GO Slim Analysis ###
######################################################################################################
# Author: Thomas Girke
# Last update: May 26, 2006
# Utility: To test a sample population of genes for over-representation of GO terms, the
# function 'GOHyperGAll' computes for all GO nodes a hypergeometric distribution test and
# returns the corresponding p-values. A subsequent filter function performs a GO Slim
# analysis using default or custom GO Slim categories.
# Note: GOHyperGAll provides similar utilities as the GOHyperG function in the GOstats package
# from BioConductor. The main difference is that GOHyperGAll simplifies the usage of custom
# chip-to-gene and gene-to-GO mappings.
#
# How it works:
# (A) Generate the required data objects (slow, but needs to be done only once)
# (B) Define GOhyperG_All function
# (C) Subsetting and plotting of results by assigned nodes or goSlim categories
# To demo the script and import all required functions, run the following source() command:
# source("http://bioinfo.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/GOHyperGAll.txt")
# HTML Instructions:
# http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/R_BioCondManual.html#GOHyperGAll

########################################
# (A) Generate the required data objects
########################################
# (A.1) Generate sample data frames with assigned gene-to-GO mappings,
# one for MF, one for BP and one for CC mappings
# custom mappings can be used here, but need to have the same format as GO_XX_DFs in the following examples
# (A.1.1) Obtain mappings from geneontology.org
readGOorg <- function(myfile = "gene_association.tair", colno = c(5,3,9), org = "Arabidopsis") { go_org <- read.delim(myfile, na.strings = "", header=F, comment.char = "!", sep="\t") if(org == "Arabidopsis") { go_org <- read.delim("gene_association.tair", na.strings = "", header=F, comment.char = "!", sep="\t") id_vec <- gsub(".*(AT\\dG\\d\\d\\d\\d\\d).*", "\\1", as.character(go_org[,11]), perl=T) go_org <- data.frame(go_org, GeneID=id_vec) go_org <- go_org[grep("^AT\\dG\\d\\d\\d\\d\\d", as.character(go_org$GeneID), perl=T),] go_org <- go_org[!duplicated(paste(go_org[,5], gsub("\\.\\d{1,}", "", as.character(go_org[,16]), perl=T), sep="_")),] go_org <- go_org[,c(1:2,16,4:15,3)] } go_org <- go_org[ ,colno] go_org <- na.omit(go_org) names(go_org) <- c("GOID", "GeneID", "GOCAT") GO_MF_DF <<- go_org[go_org[,3]=="F",] write.table(GO_MF_DF, file="GO_MF_DF", quote=T, sep="\t") cat("\n", "Object 'GO_MF_DF' created containing assigned gene-to-MFGO mappings. To use custom mappings, generate data frame with the same structure in col 1-2.", "\n") GO_BP_DF <<- go_org[go_org[,3]=="P",] write.table(GO_BP_DF, file="GO_BP_DF", quote=T, sep="\t") cat("\n", "Object 'GO_BP_DF' created containing assigned gene-to-MFGO mappings. To use custom mappings, generate data frame with the same structure in col 1-2.", "\n") GO_CC_DF <<- go_org[go_org[,3]=="C",] write.table(GO_CC_DF, file="GO_CC_DF", quote=T, sep="\t") cat("\n", "Object 'GO_CC_DF' created containing assigned gene-to-MFGO mappings. To use custom mappings, generate data frame with the same structure in col 1-2.", "\n") } # (A.1.2) Obtain mappings from BioC sampleDFgene2GO <- function(lib="ath1121501") { require(GOstats); require(lib, character.only=T) affyGOMF <- eapply(get(paste(lib, "GO", sep=""), mode = "environment"), getOntology, "MF") # generates list with GeneID components containing MFGOs GO_MF_DF <<- data.frame(GOID=unlist(affyGOMF), GeneID=rep(names(affyGOMF), as.vector(sapply(affyGOMF, length))), Count=rep(as.vector(sapply(affyGOMF, length)), as.vector(sapply(affyGOMF, length)))) write.table(GO_MF_DF, file="GO_MF_DF", quote=F, sep="\t") cat("\n", "Object 'GO_MF_DF' created containing assigned gene-to-MFGO mappings. To use custom mappings, generate data frame with the same structure in col 1-2.", "\n") affyGOBP <- eapply(get(paste(lib, "GO", sep=""), mode = "environment"), getOntology, "BP") # generates list with GeneID components containing BPGOs GO_BP_DF <<- data.frame(GOID=unlist(affyGOBP), GeneID=rep(names(affyGOBP), as.vector(sapply(affyGOBP, length))), Count=rep(as.vector(sapply(affyGOBP, length)), as.vector(sapply(affyGOBP, length)))) write.table(GO_BP_DF, file="GO_BP_DF", quote=F, sep="\t") cat("\n", "Object 'GO_BP_DF' created containing assigned gene-to-BPGO mappings. To use custom mappings, generate data frame with the same structure in col 1-2.", "\n") affyGOCC <- eapply(get(paste(lib, "GO", sep=""), mode = "environment"), getOntology, "CC") # generates list with GeneID components containing CCGOs GO_CC_DF <<- data.frame(GOID=unlist(affyGOCC), GeneID=rep(names(affyGOCC), as.vector(sapply(affyGOCC, length))), Count=rep(as.vector(sapply(affyGOCC, length)), as.vector(sapply(affyGOCC, length)))) write.table(GO_CC_DF, file="GO_CC_DF", quote=F, sep="\t") cat("\n", "Object 'GO_CC_DF' created containing assigned gene-to-CCGO mappings. To use custom mappings, generate data frame with the same structure in col 1-2.", "\n") } cat("\n", "(A.1) To use the GOHyperGAll() function, one needs 3 data frames containing the gene-to-GO mappings MF, BP and CC. \n Demo data sets from GO.org or BioC can be created with one of these commands: \n \t (A.1.1) For annotations from geneontology.org: \n \t readGOorg(myfile = \"gene_association.tair\", colno = c(5,3,9), org = \"Arabidopsis\") \n \t \t myfile: download annotation table from geneontology.org and unzip it. Then point function to file name. \n \t \t colno: required column numbers; default 'c(3,5,9)' should work in most cases \n \t \t org: needs to be specified only for Arabidopsis mappings \n \t (A.1.2) For annotations from BioC: \n \t sampleDFgene2GO(lib=\"ath1121501\") \n \t \t lib: defines annotation library, default is \"ath1121501\"", "\n") # (A.2) Generate list containing gene-to-GO-OFFSPRING associations including assiged nodes # This is very slow (3x3 minutes), but needs to be done only once! gene2GOlist <- function() { require(GOstats) for(i in c("MF","BP","CC")) { if(i=="MF") { go_offspr_list <- as.list(GOMFOFFSPRING) } if(i=="BP") { go_offspr_list <- as.list(GOBPOFFSPRING) } if(i=="CC") { go_offspr_list <- as.list(GOCCOFFSPRING) } go_offspr_list <- lapply(go_offspr_list, unlist); go_offspr_list <- lapply(go_offspr_list, as.vector) # clean-up step for the list go_offspr_list_temp <- lapply(names(go_offspr_list), function(x) c(x, go_offspr_list[[x]]) ) # include list component (GOID) names in corresponding (GOID) vectors names(go_offspr_list_temp) <- names(go_offspr_list) # names list components after go_offspr_list go_offspr_list <- go_offspr_list_temp go_offspr_list <- lapply(go_offspr_list, function(x) x[!is.na(x)]) # remove NAs in vectors if(i=="MF") { MF_node_affy_list <<- lapply(go_offspr_list, function(x) unique(as.vector(GO_MF_DF[GO_MF_DF$GOID %in% x, 2]))) # retrieve gene/affy IDs for GOID vectors save(MF_node_affy_list, file="MF_node_affy_list") } if(i=="BP") { BP_node_affy_list <<- lapply(go_offspr_list, function(x) unique(as.vector(GO_BP_DF[GO_BP_DF$GOID %in% x, 2]))) save(BP_node_affy_list, file="BP_node_affy_list") } if(i=="CC") { CC_node_affy_list <<- lapply(go_offspr_list, function(x) unique(as.vector(GO_CC_DF[GO_CC_DF$GOID %in% x, 2]))) save(CC_node_affy_list, file="CC_node_affy_list") } cat("\n", paste("Object '", i, "_node_affy_list'", sep=""), "with gene-to-GO-OFFSPRING associations created and saved in your working directory.", "\n") } } cat("\n", "(A.2) The corresponding gene-to-GO-OFFSPRING associations are created from the three data frames with the following \n command. This creates 3 list objects with the required MF, BP and CC associations. \n \t gene2GOlist()", "\n") # (A.3) Generate AffyID-to-GeneID mappings when working with chip feature IDs # This function creates a AffyID-to-GeneID mapping data frame using by default the TAIR mappings for the Arabidopsis ATH1 chip. # Once the decoding data frame 'affy2locusDF' is created, the function returns for a query set of AffyIDs the corresponding GeneIDs. # To use the function for the mappings of other chips, one needs to create the corresponding decoding data frame 'affy2locusDF'. AffyID2GeneID <- function(map = "ftp://ftp.arabidopsis.org/home/tair/Microarrays/Affymetrix/affy_ATH1_array_elements-2006-07-14.txt", affyIDs) { if(!exists("affy2locusDF")) { cat("\n", "Downloading AffyID-to-GeneID mappings, creating object 'affy2locusDF' and saving it in your working directory", "\n") affy2locus <- read.delim(map, na.strings = "", fill=TRUE, header=T, sep="\t")[,-c(2:4,7:9)] names(affy2locus) <- c("AffyID", "AGI", "Desc") row.names(affy2locus) <- as.vector(affy2locus[,1]) my_list <- apply(affy2locus[,-c(3)], 1, list); my_list <- lapply(my_list, unlist) my_list <- lapply(my_list, function(x) as.vector(x[-1])) my_list <- lapply(my_list, strsplit, ";"); my_list <- lapply(my_list, unlist) affy2locusDF <- data.frame(unlist(my_list)) affy2locusDF <- data.frame(rep(names(unlist(lapply(my_list, length))), as.vector(unlist(lapply(my_list, length)))), affy2locusDF) names(affy2locusDF) <- c("AffyID", "GeneID") affy2locusDF <<- affy2locusDF write.table(affy2locusDF, file="affy2locusDF", quote=F, sep="\t") } if(!missing(affyIDs)) { GeneIDs <- unique(as.vector(affy2locusDF[affy2locusDF[,1] %in% affyIDs, 2])) return(GeneIDs) } } cat("\n", "(A.3) To work with AffyIDs, the function AffyID2GeneID() can be used to import custom AffyID-to-GeneID mappings. \n \t AffyID2GeneID(map = \"ftp://ftp.arabidopsis.org/home/tair/Microarrays/Affymetrix/affy_ATH1_array_elements-2006-07-14.txt\") \n \t \t map: location of custom AffyID-to-GeneID mappings", "\n") # (A.4) Next time things are much faster by reading the 6 data objects from file loadData <- function() { need_affy2gene <- dir() if(length(need_affy2gene[need_affy2gene=="affy2locusDF"]) > 0) {
affy2locusDF <<- read.table(file="affy2locusDF", header=T, colClasses = "character") } GO_MF_DF <<- read.table(file="GO_MF_DF", header=T, colClasses = "character") GO_BP_DF <<- read.table(file="GO_BP_DF", header=T, colClasses = "character") GO_CC_DF <<- read.table(file="GO_CC_DF", header=T, colClasses = "character") } cat("\n", "(A.4) In future R sessions one can can omit the previous 3 steps (A.1-A.3) by importing all 6 (7) data objects like this: \n \t loadData(); load(file=\"MF_node_affy_list\"); load(file=\"BP_node_affy_list\"); load(file=\"CC_node_affy_list\")", "\n") ########################### # (B) GOhyperG_All function ########################### # (B.1) Define GOhyperG_All function GOHyperGAll <- function(gocat="MF", sample) { # (go_df): Generates data frame containing the commonly used components for all GO nodes: GOID, GO Term and Ontology Type. require(GOstats) if(!exists("go_df")) { go_df <- data.frame(GOID=unlist(eapply(GOTERM, function(x) x@GOID)), Term=unlist(eapply(GOTERM, function(x) x@Term)), Ont=unlist(eapply(GOTERM, function(x) x@Ontology))) go_df <<- na.omit(go_df) cat("\n", "Object 'go_df' created containing for all GO nodes the commonly used components: GOID, GO Term and Ontology Type", "\n") } # (m): Obtain for every node in GO tree their number of associated genes or chip features if(gocat=="MF") {node_affy_list <- MF_node_affy_list} if(gocat=="BP") {node_affy_list <- BP_node_affy_list} if(gocat=="CC") {node_affy_list <- CC_node_affy_list} node_stats_df <- data.frame(NodeSize=sapply(node_affy_list, length)) node_stats_df <- data.frame(GOID=row.names(node_stats_df), node_stats_df) row.names(node_stats_df) <- 1:length(node_stats_df[,1]) m <- as.vector(node_stats_df$NodeSize) # (x): Obtain for every node in GO tree the number of matching genes in sample set node_sample_stats <- sapply(node_affy_list, function(x) { sum(unlist(x) %in% sample) } ) node_sample_stats <- as.vector(node_sample_stats) x <- node_sample_stats # (n): Obtain the number of unique genes associated at all GO nodes if(gocat=="MF") { GO_DF <- GO_MF_DF } if(gocat=="BP") { GO_DF <- GO_BP_DF } if(gocat=="CC") { GO_DF <- GO_CC_DF } n <- length(unique(GO_DF[, 2])) # (k): Obtain number of unique genes in test sample that have GO mappings k <- length(unique(GO_DF[GO_DF[,2] %in% sample, 2])) # Obtain gene/chip keys matching at GO nodes match_key <- sapply(node_affy_list, function(x) { x[unlist(x) %in% sample] } ) match_key <- sapply(match_key, function(x) { paste(x, collapse=" ") } ) match_key <- as.vector(match_key) key <- match_key; key[key==""] <- "NA" # Apply phyper function phyp_v <- phyper(x-1, m, n-m , k, lower.tail = FALSE) result_df <- data.frame(node_stats_df, SampleMatch=x, Phyper=phyp_v, SampleKeys=key) result_df <- merge(result_df, go_df, x.by="GOID", y.by="GOID", all.x=T) result_df <- result_df[order(result_df$Phyper), ] result_df <- result_df[,c(1:4,6:7,5)] result_df } cat("\n", "(B.1) The function GOHyperGAll() runs the phyper test against all nodes in the GO network. \n Usage: \n \t GOHyperGAll(gocat=\"MF\", sample=test_sample)[1:20,] \n \t \t gocat: \"MF\", \"BP\", \"CC\" \n \t \t test_sample <- unique(as.vector(GO_MF_DF[1:40,2])) # for GeneIDs\n \t \t test_sample <- AffyID2GeneID(affyIDs=affy_sample) # for AffyIDs \n \t \t affy_sample <- c(\"266592_at\", \"266703_at\", \"266199_at\", \"246949_at\", \"267370_at\", \"267115_s_at\", \"266489_at\", \"259845_at\", \"266295_at\", \"262632_at\")", "\n") ################################################################################### # (C) Subsetting of results from GOHyperGAll by assigned nodes or goSlim categories ################################################################################### # (C.1) Define subsetting function GOHyperGAll_Subset <- function(GOHyperGAll_result, sample=test_sample, type="goSlim", myslimv) { # type: "goSlim" or "assigned"; optional argument "myslimv" to privde custom goSlim vector if(type=="goSlim") { if(missing(myslimv)) { slimv <- c("GO:0030246","GO:0008289","GO:0003676","GO:0000166","GO:0019825","GO:0005515","GO:0003824","GO:0030234","GO:0005554","GO:0003774","GO:0004871","GO:0005198","GO:0030528","GO:0045182","GO:0005215","GO:0000004","GO:0006519","GO:0007154","GO:0007049","GO:0016043","GO:0006412","GO:0006464","GO:0006810","GO:0007275","GO:0007049","GO:0006519","GO:0005975","GO:0006629","GO:0006139","GO:0019748","GO:0015979","GO:0005618","GO:0005829","GO:0005783","GO:0005768","GO:0005794","GO:0005739","GO:0005777","GO:0009536","GO:0005840","GO:0005773","GO:0005764","GO:0005856","GO:0005634","GO:0005886","GO:0008372","GO:0005576") } else { slimv <- myslimv } GOHyperGAll_subset <- GOHyperGAll_result[GOHyperGAll_result[,1] %in% slimv, ] } if(type=="assigned") { termGO <- c(as.vector(GO_MF_DF[GO_MF_DF$GeneID %in% test_sample, 1]), as.vector(GO_BP_DF[GO_BP_DF$GeneID %in% test_sample, 1]), as.vector(GO_CC_DF[GO_CC_DF$GeneID %in% test_sample, 1])) subset_v <- unique(termGO) GOHyperGAll_subset <- GOHyperGAll_result[GOHyperGAll_result[,1] %in% subset_v, ] } GOHyperGAll_subset } cat("\n", "(C.1) The function GOHyperGAll_Subset() allows subsetting of the GOHyperGAll() results by assigned GO nodes or custom goSlim categories.", "\n", "Usage:", "\n", "\t GOHyperGAll_result <- GOHyperGAll(gocat=\"MF\", sample=test_sample)", "\n", "\t GOHyperGAll_Subset(GOHyperGAll_result, sample=test_sample, type=\"goSlim\")", "\n", "\t \t type: \"goSlim\" or \"assigned\"", "\n", "\t \t myslimv: optional argument allows usage of a custom goSlim vector", "\n") # Apply subsetting function # GOHyperGAll_Subset(GOHyperGAll_result, sample=test_sample, type="goSlim") # (C.2) Plotting of subsetted results # subset <- GOHyperGAll_Subset(GOHyperGAll_result, sample=test_sample, type="goSlim") # pie(subset[subset$SampleMatch>0 ,3], labels=as.vector(subset[subset$SampleMatch>0 ,1]), main=unique(as.vector(subset[subset$SampleMatch>0 ,6])))
cat("\n", "(C.2) Plot pie chart of subsetted results: \n \t subset <- GOHyperGAll_Subset(GOHyperGAll_result, sample=test_sample, type=\"goSlim\") \n \t pie(subset[subset$SampleMatch>0 ,3], labels=as.vector(subset[subset$SampleMatch>0 ,1]), main=unique(as.vector(subset[subset$SampleMatch>0 ,6])))", "\n")

########################################################
# (D) Reduce GO Term Redundancy in 'GOHyperGAll_results'
########################################################
# (D.1) The function 'GOHyperGAll_Simplify' subsets the data frame 'GOHyperGAll_result' by a user
# specified p-value cutoff and removes from it all GO nodes with overlapping children sets
# (OFFSPRING). Only the best scoring nodes remain in the data frame.
# The argument 'correct' is experimental. It aims to favor the selection of distal (information rich)
# GO terms that have at the same time a large number of sample matches. The following calculation is used
# for this adjustment: phyper x Number_of_children / SampleMatch
# Define GOHyperGAll_Simplify()
GOHyperGAll_Simplify <- function(GOHyperGAll_result, gocat="MF", cutoff=0.001, correct=T) { # gocat: "MF", "BP" or "CC"; cutoff: p-value cutoff; correct: TRUE or FALSE if(gocat!=as.vector(GOHyperGAll_result$Ont[!is.na(GOHyperGAll_result$Ont)])[1]) { stop("The GO categories in GOHyperGAll_Simplify() and GOHyperGAll_result need to match") } testDF <- GOHyperGAll_result[GOHyperGAll_result$Phyper<=cutoff,] testDF <- data.frame(testDF, test=rep(0, times=length(testDF[,1]))) testDF <- testDF[!is.na(testDF$Ont),] GOIDv <- NULL GO_OL_Matchv <- NULL while(sum(testDF$test==0)>0) {
clusterv <- NULL test <- as.vector(testDF[,1]) for(j in 1:length(test)) { if(gocat=="MF") { mymatch <- sum(unique(na.omit(c(test[j], as.list(GOMFOFFSPRING)[[test[j]]])) %in% na.omit(c(test[1], as.list(GOMFOFFSPRING)[[test[1]]])))) if(mymatch==1) { mymatch <- length(as.list(GOMFOFFSPRING)[[test[j]]]) } } if(gocat=="BP") { mymatch <- sum(unique(na.omit(c(test[j], as.list(GOBPOFFSPRING)[[test[j]]])) %in% na.omit(c(test[1], as.list(GOBPOFFSPRING)[[test[1]]])))) if(mymatch==1) { mymatch <- length(as.list(GOBPOFFSPRING)[[test[j]]]) } } if(gocat=="CC") { mymatch <- sum(unique(na.omit(c(test[j], as.list(GOCCOFFSPRING)[[test[j]]])) %in% na.omit(c(test[1], as.list(GOCCOFFSPRING)[[test[1]]])))) if(mymatch==1) { mymatch <- length(as.list(GOCCOFFSPRING)[[test[j]]]) } } clusterv <- c(clusterv, mymatch) } clusterv[clusterv==0] <- NA testDF <- data.frame(testDF[,-8], test=clusterv) if(correct==T) { testDF <- data.frame(testDF, decide=testDF$Phyper * (testDF$test/testDF$SampleMatch)) } else { testDF <- data.frame(testDF, decide=testDF$Phyper) } GOIDv <- c(GOIDv, as.vector(testDF[order(testDF[,9]),][1,1])) GO_OL_Matchv <- c(GO_OL_Matchv, length(unique(unlist(strsplit(as.vector(testDF[!is.na(testDF$test),7]), " "))))) testDF <- testDF[is.na(testDF$test),] testDF <- testDF[order(testDF[,4]),-c(8,9)] testDF <- data.frame(testDF, test=rep(0, times=length(testDF[,1]))) cat(GOIDv, "\n") } simplifyDF <- data.frame(GOID=GOIDv, GO_OL_Match=GO_OL_Matchv) simplifyDF } # Apply GOHyperGAll_Simplify # simplifyDF <- GOHyperGAll_Simplify(GOHyperGAll_result, gocat="MF", cutoff=0.001, correct=T) # data.frame(GOHyperGAll_result[GOHyperGAll_result[,1] %in% simplifyDF[,1], -7], GO_OL_Match=simplifyDF[,2]) ####################################### # (D.2) Batch Analysis of Gene Clusters ####################################### # The function 'GOCluster_Report' performs a 'GOHyperGAll_Simplify' analysis in batch mode. # It processes many groups of genes (e.g. from gene expression clusters) and organizes the results # in a single data frame. # The gene sets or clusters need to be provided in a data frame of this structure: # probeID/geneID ClusterID ClusterSize # id1 CL1 2 # id2 CL1 2 # id3 CL2 1 # ... ... ... # # Define 'GOCluster_Report()' GOCluster_Report <- function(CL_DF=CL_DF, CLSZ=10, cutoff=0.001, gocat=c("MF", "BP", "CC"), recordSpecGO=NULL) { # CLSZ: cluster size cutoff; gocat: "MF", "BP" or "CC"; cutoff: p-value cutoff; recordSpecGO: argument to include one specific GOID in each of the 3 ontologies, e.g: recordSpecGO=c("GO:0005554", "GO:0000004", "GO:0008372") cluster_loop <- unique(as.vector(CL_DF[CL_DF[,3]>CLSZ,2]))
if(length(cluster_loop[grep("CL", cluster_loop)])>0) {
cluster_loop <- paste("CL", sort(as.numeric(gsub("CL","", as.character(cluster_loop)))), sep="") containerDF <- data.frame(CLID=NULL, CLSZ=NULL, GOID=NULL, NodeSize=NULL, SampleMatch=NULL, Phyper=NULL, Term=NULL, Ont=NULL, GO_OL_Match=NULL) } for(i in cluster_loop) { cat("\n", "Processing cluster no", i, "\n") affy_sample <- CL_DF[CL_DF[,2]==i, 1] test_sample <- AffyID2GeneID(affyIDs=affy_sample) containerDF2 <- data.frame(GOID=NULL, NodeSize=NULL, SampleMatch=NULL, Phyper=NULL, Term=NULL, Ont=NULL, GO_OL_Match=NULL) count <- 0 for(j in c("MF", "BP", "CC")) { count <- count+1 GOHyperGAll_result <- GOHyperGAll(gocat=j, sample=test_sample) simplifyDF <- GOHyperGAll_Simplify(GOHyperGAll_result, gocat=j, cutoff=cutoff, correct=T) if(length(simplifyDF)==0) { simplifyDF <- GOHyperGAll_Simplify(GOHyperGAll_result[1:2,], gocat=j, cutoff=1, correct=T) } tempDF <- data.frame(GOHyperGAll_result[GOHyperGAll_result[,1] %in% simplifyDF[,1], -7], GO_OL_Match=simplifyDF[,2]) containerDF2 <- rbind(containerDF2, tempDF) if(length(recordSpecGO)>0) {
containerDF2 <- rbind(containerDF2, data.frame(GOHyperGAll_result[GOHyperGAll_result[,1]==recordSpecGO[count],-7], GO_OL_Match=GOHyperGAll_result[GOHyperGAll_result[,1]==recordSpecGO[count],3])) } no_annot <- test_sample[!test_sample %in% eval(parse(text=paste("GO_", j, "_DF", sep="")))[,2]]; no_annot <- length(no_annot[no_annot!="no_match"]) containerDF2 <- rbind(containerDF2, data.frame(GOID=paste("no_annot_", j, sep=""), NodeSize=NA, SampleMatch=no_annot, Phyper=NA, Term=NA, Ont=j, GO_OL_Match=no_annot)) } tempDF2 <- data.frame(CLID=rep(i, times=length(containerDF2[,1])), CLSZ=rep(unique(as.vector(CL_DF[CL_DF[,2]==i,3])), times=length(containerDF2[,1])), containerDF2) containerDF <- rbind(containerDF, tempDF2) } containerDF } # Apply GOCluster_Report # BatchResult <- GOCluster_Report(CL_DF=CL_DF, CLSZ=10, cutoff=0.001, gocat=c("MF", "BP", "CC"), recordSpecGO=c("GO:0005554", "GO:0000004", "GO:0008372"))

R code for vector and phred score trimming

####################################################################################################
### R & BioConductor Script for Trimming Adaptor and Low Quality Positions of 454 sRNA Sequences ###
####################################################################################################
# Author: Thomas Girke (thomas.girke@ucr.edu), UC Riverside
# Utility: This script brings 454 sequences into the proper orientation, trims off the
# adaptor segments and tags the low quality calls (default score <14) with Ns. The
# current draft implementation is not very fast. The processing of 50,000 sequences
# will take 15-25 minutes.
# Requirements: cross_match program from the Phred/Phrap/Consed package
# Code tag where one can adjust filter settings: '### adjust'
# How it works:
# (A) Some format editing on the command-line
# Requirements:
# clean directory containing
# (a) 454.seq and 454.seq.qual files
# (b) The 2 adaptor files are exprected to be in parent directory under the names
# adaptfor.txt
# adaptrev.txt
# Make sure the extension of *.qual matches the one expected by cross_match, e.g.
# $ mv 454.qual 454.seq.qual
# For the import of the *.qual data into R, the *.qual file needs to have spaces at end of lines:
# $ perl -p -i -w -e 's/(^.*)/$1 /g' 454.seq.qual
# (B) Processing in R
# Run the following script from the directory with your 454 sequences like this:
# source("454_sRNA_Trim.R")
# or
# R CMD BATCH --no-save 454_sRNA_Trim.R

# Start of R script
# (1) Split sequence batch into smaller sets of 50,000 per file
# Define Sequence Import Function 'seq_imp_fct'
seq_imp_fct <- function(fileloc) {
my_fasta <- readLines(fileloc) # reads file line-wise into vector
y <- regexpr("^[^>]", my_fasta, perl=T) # identifies all fields that do not start with a '>' sign
y <- as.vector(y); y[y==-1] <- 0
index <- which(y==0)
distance <- data.frame(start=index[1:(length(index)-1)], end=index[2:length(index)])
distance <- rbind(distance, c(distance[length(distance[,1]),2], length(y)+1)) # gets data for last entry
distance <- data.frame(distance, dist=distance[,2]-distance[,1])
seq_no <- 1:length(y[y==0])
index <- rep(seq_no, as.vector(distance[,3]))
my_fasta <- data.frame(index, y, my_fasta)
my_fasta[my_fasta[,2]==0,1] <- 0
seq <- tapply(as.vector(my_fasta[,3]), factor(my_fasta[,1]), paste, collapse="", simplify=F)
seq <- as.vector(seq[2:length(seq)])
Desc <- as.vector(my_fasta[c(grep("^>", as.character(my_fasta[,3]), perl = TRUE)),3])
my_fasta <- data.frame(Desc, Length=nchar(seq), seq)
my_fasta
}
# Generate smaller sequence batches of max 50,000 sequences per file. This is required for cross_match
seq <- seq_imp_fct(fileloc="454.seq")
export_seq <- data.frame(Desc=seq[,1], seq=seq[,3])
slice <- data.frame(FROM=seq(1,length(seq[,1]), by=50000), TO=seq(1,length(seq[,1]), by=50000)+49999)
slice[length(slice[,1]),2] <- length(seq[,1])
for (i in 1:length(slice[,1])) {
write.table(as.vector(as.character(t(export_seq[slice[i,1]:slice[i,2],]))), file=paste(i, ".seq", sep=""), quote=F, row.names=F, col.names=F)
}
qual <- seq_imp_fct(fileloc="454.seq.qual")
export_qual <- data.frame(Desc=qual[,1], seq=qual[,3])
for (i in 1:length(slice[,1])) {
write.table(as.vector(as.character(t(export_qual[slice[i,1]:slice[i,2],]))), file=paste(i, ".seq.qual", sep=""), quote=F, row.names=F, col.names=F)
}

# (2) Run cross_match in 2 steps to tag forward adaptors with Ns and reverse adaptors with Xs
# Run cross_match on all generated *.seq files:
for (i in 1:length(slice[,1])) {
if(i==1) { system(paste("rm -f ", "all.seq.screen", sep="")) } # removes in fist loop an already existing "all.seq.screen" file, since data slices will be appended to a file of this name
# next import steps protect Ns that are inserted by 454.com for positions with score=0
seq <- seq_imp_fct(fileloc=paste(i, ".seq", sep=""))
export_seq <- data.frame(Desc=seq[,1], seq=seq[,3])
export_seq$seq <- gsub("N", "Z", as.character(seq$seq))
write.table(as.vector(as.character(t(export_seq))), file=paste(i, ".seq", sep=""), quote=F, row.names=F, col.names=F)
### adjust
system(paste("cross_match ", paste(i, ".seq ", sep=""), "../adaptfor.txt -minmatch 15 -minscore 14 -screen > screen.out", sep="")) # runs cross_match for forward adaptor only
# next import steps tag forward adaptor with N's
seq <- seq_imp_fct(fileloc=paste(i, ".seq.screen", sep=""))
export_seq <- data.frame(Desc=seq[,1], seq=seq[,3])
export_seq$seq <- gsub("X", "N", as.character(seq$seq))
write.table(as.vector(as.character(t(export_seq))), file=paste(i, ".seq.screen", sep=""), quote=F, row.names=F, col.names=F)
system(paste("mv ", paste(i, ".seq", sep=""), ".screen ", paste(i, ".seq", sep=""), sep=""))
### adjust
system(paste("cross_match ", paste(i, ".seq ", sep=""), "../adaptrev.txt -minmatch 15 -minscore 14 -screen > screen.out", sep="")) # runs cross_match for reverse adaptor only, gets tagged with X's
system(paste("cat ", paste(i, ".seq", sep=""), ".screen ", ">> ", "all.seq.screen", sep=""))
system(paste("rm ", "-f ", paste(i, ".seq ", sep=""), paste(i, ".seq.qual ", sep=""), paste(i, ".seq.screen ", sep=""), paste(i, ".seq.log", sep=""), sep=""))
}

# (3) Import of cross_matched sequences into R
# Import of 'all.seq.screen'
seq <- seq_imp_fct(fileloc="all.seq.screen")
cat("\n", "Sequence batch imported containing ", length(seq[,1]), " entries", "\n")
# Tag forward adapter with "F" and reverse adapter with "R", and revert Zs (454 Ns) back to Ns
seqedit <- gsub("N", "F", as.character(seq$seq))
seqedit <- gsub("X", "R", as.character(seqedit))
seqedit <- gsub("Z", "N", as.character(seqedit))
seq <- data.frame(seq[,-3], seq=seqedit)
seq <- data.frame(Desc=gsub(" .*", "", as.character(seq[,1]), perl=T), seq[,-1])
qual <- seq_imp_fct(fileloc="454.seq.qual")
cat("\n", "Quality batch imported containing ", length(qual[,1]), " entries", "\n")
qual <- data.frame(qual[,-3], seq=gsub("[ ]{2,}", " ", as.character(qual$seq)))
# qual[qual[,3]=="N",3] <- 0 # removed May 08, 06
qual <- data.frame(Desc=gsub(" .*", "", as.character(qual[,1]), perl=T), qual[,-1])

# (4) For larger data sets, loop over slices of 10,000 sequences
# In this loop the sequences are (1) vector trimmed, (2) quality trimmed and (3) orientation adjusted.
slicedf <- data.frame(FROM=seq(1, length(seq[,1]), by=10000),
TO=c(seq(0, length(seq[,1]), by=10000)[2:((as.integer(length(seq[,1])/10000))+1)],
length(seq[,1])))
appendDF <- data.frame(Seq=NULL, Phred=NULL)
for(i in 1:length(slicedf[,1])){
cat("\n", "Start of 10K slice no: ", i, "\n")
seqslice <- seq[slicedf[i,1]:slicedf[i,2],]
qualslice <- qual[slicedf[i,1]:slicedf[i,2],]
# Write sequences and qual data in vectorized format into list files
seqslicel <- strsplit(as.character(seqslice[,3]),"")
qualslicel <- strsplit(as.character(qualslice[,3])," ")
names(seqslicel) <- seqslice[,1]
names(qualslicel) <- qualslice[,1]
qualslicel <- lapply(qualslicel, as.numeric)
# Replaces all NTs with Phred < 14 by Ns; this includes vector/adaptor positions
### adjust
seqtriml <- lapply(names(seqslicel), function(x) replace(seqslicel[[x]], which(qualslicel[[x]]<14), "N"))
names(seqtriml) <- names(seqslicel)
# Convert all Ns that fall into adaptor regions back to Fs and Rs
seqtriml <- lapply(names(seqtriml), function(x) replace(seqtriml[[x]], which(seqslicel[[x]]=="F"), "F"))
names(seqtriml) <- names(seqslicel)
seqtriml <- lapply(names(seqtriml), function(x) replace(seqtriml[[x]], which(seqslicel[[x]]=="R"), "R"))
names(seqtriml) <- names(seqslicel)

# To determine orientation of sequences, generate data frame with adaptor positions
zzz <- lapply(seqtriml, paste, collapse="")
posF <- lapply(names(zzz), function(x) gregexpr("(F){1,}", as.character(zzz[[x]]), perl=T))
posR <- lapply(names(zzz), function(x) gregexpr("(R){1,}", as.character(zzz[[x]]), perl=T))
names(posF) <- names(seqtriml); names(posR) <- names(seqtriml)
myposF <- unlist(lapply(names(posF), function(x) as.vector(unlist(posF[[x]][[1]]))[1]))
myposR <- unlist(lapply(names(posR), function(x) as.vector(unlist(posR[[x]][[1]]))[1]))
myposF[myposF==-1] <- 0; myposR[myposR==-1] <- 0
mylengthF <- lapply(names(posF), function(x) as.vector(unlist(attributes(posF[[x]][[1]])))[1])
mylengthF <- unlist(mylengthF)
mylengthF[mylengthF==-1] <- 0
mylengthR <- lapply(names(posR), function(x) as.vector(unlist(attributes(posR[[x]][[1]])))[1])
mylengthR <- unlist(mylengthR)
mylengthR[mylengthR==-1] <- 0
orientationDF <- data.frame(F1=myposF, F2=myposF+mylengthF-1, R1=myposR, R2=myposR+mylengthR-1, Length=seqslice[,2])
orientationDF <- data.frame(orientationDF,
Orient=(orientationDF[,3]-orientationDF[,2])/abs(orientationDF[,3]-orientationDF[,2]))
# The following steps are for sequences that contain only one adaptor. This approximation seems to give fairly good results.
orientationDF[orientationDF[,2]==-1 & orientationDF[,3] < (orientationDF[,5]/2), 6] <- -1
orientationDF[orientationDF[,4]==-1 & orientationDF[,1] < ((orientationDF[,5]/2)*0.4), 6] <- 1
orientationDF <- data.frame(ID=names(seqtriml), orientationDF)

# Generate data frame with insert parsing positions
zzz <- lapply(seqtriml, paste, collapse="")
# fix for gregexpr version change between R 2.2.0 and 2.3.0
# R version 2.0.0:# pos <- lapply(names(zzz), function(x) gregexpr("[ATGCN]{1,}", as.character(zzz[[x]]), perl=T))
pos <- lapply(names(zzz), function(x) gregexpr("[^ATGCN][ATGCN]+", paste("", as.character(zzz[[x]])), perl=T))
names(pos) <- names(seqtriml)
mypos <- as.vector(unlist(pos))
mypos[mypos==-1] <- 0
mylength <- lapply(names(pos), function(x) as.vector(unlist(attributes(pos[[x]][[1]]))))
mylength <- unlist(mylength)
# fix for gregexpr version change between R 2.2.0 and 2.3.0
mylength[mylength==-1] <- 0
mylength <- mylength-1
mylength[mylength==-1] <- 0
count <- lapply(names(pos), function(x) length(pos[[x]][[1]]))
count <- as.vector(unlist(count))
IDs <- rep(names(pos), count)
InsertFrame <- data.frame(ID=IDs, UniID=paste(IDs, unlist(sapply(count, function(x) seq(1,x))), sep="."),
InsertCount=rep(count,count), Pos=mypos, Length=mylength)
InsertFrame <- merge(InsertFrame, orientationDF, by.x="ID", by.y="ID", all.x=T)

# Position parsing of inserts using InsertFrame
InsertFrame <- data.frame(InsertFrame, End=InsertFrame[,4]+InsertFrame[,5]-1)
InsertFrame <- InsertFrame[,c(1:5, length(InsertFrame), 6:(length(InsertFrame)-1))]
InsertFrame <- InsertFrame[!InsertFrame$Pos==0, ] # added May 8, 06 to remove no-insert sequences
mylist <- apply(InsertFrame[,c(4,6)], 1, list)
mylist <- lapply(mylist, function(x) as.vector(as.matrix(unlist(x))))
names(mylist) <- as.vector(InsertFrame$UniID)
myindex <- as.vector(InsertFrame$ID)
seqtriml_long <- seqtriml[myindex]; names(seqtriml_long) <- as.vector(InsertFrame$UniID) # creates long version of 'seqtriml' that contains entries with several inserts in duplicates
quall_long <- qualslicel[myindex]; names(quall_long) <- as.vector(InsertFrame$UniID) # creates long version of 'quall' that contains entries with several inserts in duplicates
insert_list <- lapply(names(mylist), function(x) seqtriml_long[[x]][(mylist[[x]][1]):(mylist[[x]][2])])
insert_qual_list <- lapply(names(mylist), function(x) quall_long[[x]][(mylist[[x]][1]):(mylist[[x]][2])])
names(insert_list) <- as.vector(InsertFrame$UniID)
names(insert_qual_list) <- as.vector(InsertFrame$UniID)

# Reverse and complement of antisense sequences
insert_list_rev <- lapply(as.vector(InsertFrame[InsertFrame$Orient==-1,2]), function(x) rev(insert_list[[x]]))
insert_list_rev <- lapply(insert_list_rev, paste, collapse="")
names(insert_list_rev) <- as.vector(InsertFrame[InsertFrame$Orient==-1,2])
insert_list_rev <- lapply(insert_list_rev, function(x) gsub("A", "1", x))
insert_list_rev <- lapply(insert_list_rev, function(x) gsub("T", "2", x))
insert_list_rev <- lapply(insert_list_rev, function(x) gsub("C", "3", x))
insert_list_rev <- lapply(insert_list_rev, function(x) gsub("G", "4", x))
insert_list_rev <- lapply(insert_list_rev, function(x) gsub("1", "T", x))
insert_list_rev <- lapply(insert_list_rev, function(x) gsub("2", "A", x))
insert_list_rev <- lapply(insert_list_rev, function(x) gsub("3", "G", x))
insert_list_rev <- lapply(insert_list_rev, function(x) gsub("4", "C", x))
insert_qual_list_rev <- lapply(as.vector(InsertFrame[InsertFrame$Orient==-1,2]), function(x) rev(insert_qual_list[[x]]))
names(insert_qual_list_rev) <- as.vector(InsertFrame[InsertFrame$Orient==-1,2])
insert_qual_list_rev <- lapply(insert_qual_list_rev, paste, collapse=" ")

# Combine everything in final sequence object
insert_list_for <- insert_list[as.vector(InsertFrame[InsertFrame$Orient==1,2])]
insert_list_for <- lapply(insert_list_for, paste, collapse="")
insert_list_final <- c(insert_list_rev, insert_list_for)
insert_list_final <- insert_list_final[names(insert_list)]
insert_qual_list_for <- insert_qual_list[as.vector(InsertFrame[InsertFrame$Orient==1,2])]
insert_qual_list_for <- lapply(insert_qual_list_for, paste, collapse=" ")
insert_qual_list_final <- c(insert_qual_list_rev, insert_qual_list_for)
insert_qual_list_final <- insert_qual_list_final[names(insert_list)]

# Remove terminal Ns. Most of the code here is to adjust qual data set.
tempDF <- data.frame(as.data.frame(unlist(insert_list_final)), as.data.frame(unlist(insert_qual_list_final)))
names(tempDF)[1:2] <- c("Seq", "Phred")
remove_start <- nchar(as.character(tempDF$Seq)) - nchar(gsub("^N{1,}", "", as.character(tempDF$Seq), perl=T))
remove_end <- nchar(as.character(tempDF$Seq)) - nchar(gsub("N{1,}$", "", as.character(tempDF$Seq), perl=T))
NremoveDF <- data.frame(remove_start, remove_end)
tempDF2 <- data.frame(row.names=row.names(tempDF), Seq=gsub("^N{1,}|N{1,}$", "", as.character(tempDF$Seq), perl=T), tempDF[,2])
qual_removel <- strsplit(as.character(tempDF2[,2]), " ")
names(qual_removel) <- row.names(tempDF2)
NremoveDF <- data.frame(remove_start, remove_end)
Nremovel <- apply(NremoveDF, 1, list)
Nremovel <- lapply(Nremovel, function(x) as.vector(as.matrix(unlist(x))))
names(Nremovel) <- row.names(tempDF2)
qual_removel <- lapply(names(qual_removel), function(x) qual_removel[[x]][(1+Nremovel[[x]][1]):(length(as.vector(qual_removel[[x]]))-Nremovel[[x]][2])])
qual_removel <- lapply(qual_removel, paste, collapse=" ")
tempDF <- data.frame(row.names=row.names(tempDF2), Seq=tempDF2[,1], Phred=unlist(qual_removel))

appendDF <- rbind(appendDF, tempDF)
cat("\n", "End of 10K slice no: ", i, "\n")
}
# (5) Create final result data frame
finalDF <- appendDF
zzz <- gregexpr("N", as.character(finalDF[,1]), perl=T)
Ncount <- lapply(zzz, function(x) length(unlist(x))); Ncount <- unlist(Ncount)
# Obtain Ncount: for_loop here necessary for very large sequence sets (>100,000)
names(zzz) <- rownames(finalDF)
incDF <- data.frame(FROM=seq(1, length(zzz), by=10000),
TO=c(seq(0, length(zzz), by=10000)[2:((as.integer(length(zzz)/10000))+1)], length(zzz)))
Ncountfix <- NULL
for(i in 1:length(incDF[,1])) {
Ncountfix <- c(Ncountfix, unlist(lapply(names(zzz[incDF[i,1]:incDF[i,2]]), function(x) as.vector(unlist(zzz[[x]][[1]]))[1])))
cat("\n", "N count loop no: ", i, "\n")
}
Ncount[which(Ncountfix==-1)] <- 0

finalDF <- data.frame(finalDF,
InsertLength=nchar(as.character(finalDF[,1])),
mPhred=as.vector(apply(finalDF, 1, function(x) mean(as.numeric(unlist(strsplit((as.character(x[2]))," ")))))),
Ncount=Ncount
)
names(finalDF)[1:2] <- c("Seq", "Phred")
finalDF <- data.frame(ID=rownames(finalDF), finalDF)

# (6) Filtering steps
# Remove all sequences that have insert lengths of less than 15bp
### adjust
finalDF <- finalDF[finalDF$InsertLength>=15,]
# Remove inserts with >=20% Ns
### adjust
finalDF <- finalDF[(finalDF$Ncount/finalDF$InsertLength)<=0.2,]

# Adjust name extensions to filter results
namev <- gsub("\\..*$", "", as.character(rownames(finalDF)), perl=T)
namev <- paste(sort(namev), unlist(lapply(as.vector(table(sort(namev))), function(x) 1:x)), sep=".")
finalDF <- data.frame(finalDF, sort=1:length(finalDF[,1]))
finalDF <- data.frame(ID=sort(namev), finalDF[order(rownames(finalDF)),-1])
finalDF <- finalDF[order(finalDF$sort),]

# Export sequences and qual data to two fasta batch files
export_seq <- data.frame(Acc=paste(finalDF[,1], "InsertLength:", finalDF[,4], "mPhred:", round(finalDF[,5], 1), "Ncount", finalDF[,6], sep=" "), seq=finalDF[,2])
write.table(as.vector(as.character(t(export_seq))), file="final_seq.txt", quote=F, row.names=F, col.names=F)
export_qual <- data.frame(Acc=paste(finalDF[,1], "InsertLength:", finalDF[,4], "mPhred:", round(finalDF[,5], 1), "Ncount", finalDF[,6], sep=" "), seq=finalDF[,3])
write.table(as.vector(as.character(t(export_qual))), file="final_qual.txt", quote=F, row.names=F, col.names=F)

# # Optional steps
# # Average Phred scores of inserts
# mylist <- strsplit(as.character(finalDF[,3]), " "); mylist <- lapply(mylist, as.numeric); mean(unlist(lapply(mylist, mean)))


parsing sequence by positional co-ordinates using R

##############################################
# Sequence Parsing by Positional Coordinates #
##############################################
# Utility:
# Script parses sequences based on positional information provided
# by a coordinate matrix like the one below. The parsing is performed
# on vectorized sequences in a list container.
# Format of coordinate matrix
# ID start end
# ID001 20 60
# ID001 120 190
# ID002 20 60
# ID002 120 190
#
# To demo what the script does, run it like this:
# source("http://bioinfo.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/positionParsing.txt")

# Create a sample set of sequences (here random sequences)
fx <- function(test) {x <- as.integer(runif(200, min=1, max=5)) # sets length of each random sequence to '200'
x[x==1] <- "A"; x[x==2] <- "T"; x[x==3] <- "G"; x[x==4] <- "C"; paste(x, sep = "", collapse ="")
}
z1 <- c()
for(i in 1:5) { # sets number of generated sequences to '5'
z1 <- c(fx(i), z1)
}
SeqID <- c("ID001","ID002","ID003","ID004","ID005")
z1 <- data.frame(SeqID=SeqID, Seq=z1)
cat("Source sequences:", "\n")
print(z1)

# Write the sequence data frame into a list object.
# This is an important step for handling sets of vectorized sequences of variable length, since
# list objects are more versatile in handling them than data frames.
seq_list <- apply(as.matrix(z1[,2]), 1, list) # generates list with list components
seq_list <- lapply(seq_list, unlist) # transforms list components to character vectors
seq_list <- lapply(seq_list, strsplit, "") # vectorizes sequences by splitting them into separate vector fields
seq_list <- lapply(seq_list, unlist) # transforms list components to character vectors
names(seq_list) <- SeqID # names sequence entries, e.g. by their IDs

# Generate coordinate data frame with positions to be parsed from sequences.
pos_matrix <- data.frame(ID=sort(rep(SeqID, 2)), start=rep(c(20,120),5), end=rep(c(60,190),5))
cat("Position matrix:", "\n")
print(pos_matrix)

# Define sequence parse function
seq_parse_fct <- function(seq_list, pos_matrix) {
# Parse sequences in nested 'for loop'
# nesting required to parse more than one fragment per sequence ID
parse_list_all <- list(NULL) # create empty list as list container
if (length(setdiff(names(seq_list), as.vector(pos_matrix[,1]))) > 1) stop("The number of sequences and coordinate groups differ") # Check if all sequence IDs are contained in the sequence list and in the coordinate matrix
for(i in names(seq_list)) { # loop over each sequence ID
sub_pos_matrix <- pos_matrix[pos_matrix[,1]==i,]
parse_list <- list(NULL) # creates empty list of vectorslength container
for(j in 1:length(sub_pos_matrix[,1])) { # loop over all fragments of a sequence
index <- eval(parse(text=paste(sub_pos_matrix[j,2],":", sub_pos_matrix[j,3])))
parse_list <- c(parse_list, list(seq_list[[i]][index]))
if (length(parse_list[[1]][[1]])==0) { parse_list <- parse_list[-1] } # removes in first loop first (empty) component
}
parse_list_all <- c(parse_list_all, list(parse_list))
if (length(parse_list_all[[1]][[1]])==0) { parse_list_all <- parse_list_all[-1] } # removes in first loop first (empty) component
}
names(parse_list_all) <- SeqID # names sequence entries, e.g. by their IDs
parse_list_all
}
parse_list_all <- seq_parse_fct(seq_list, pos_matrix)
cat("Vectorized sequence fragments are managed in the list object 'parse_list_all'.", "\n", "Print first fragment set as sample:", "\n")
print(parse_list_all[[1]])

# Collapse sequence vectors into strings
parse_seq_list <- lapply(parse_list_all, unlist) # joins sequence vectors in each list
parse_seq_list <- lapply(parse_seq_list, paste, collapse="") # collapses vectorised sequences into strings
cat("Concatenated sequence fragments:", "\n")
parsed_result <- data.frame(SeqID, Parsed_Seq=unlist(parse_seq_list))
print(parsed_result)

Role of R in sequence Analysis

######################################
# (A) Sequence Batch Import Function #
######################################
# This script provides the following functionality for sequence management and analysis:
# (1) Batch sequence import into R data frame
# (2) Motif searching with hit statistics
# (3) Analysis of sequence composition
# (4) All-against-all sequence comparisons
# (5) Generation of phylogenetic trees

# Download sample proteome from NCBI in fasta format. The chosen example is from Halobacterium sp. which contains 2058 proteins:
myfileloc <- "ftp://ftp.ncbi.nih.gov/genbank/genomes/Bacteria/Halobacterium_sp/AE004437.faa"
# Larger example from Arabidopsis with 30690 proteins:
# ftp://ftp.arabidopsis.org/home/tair/home/tair/Sequences/blast_datasets/TAIR6_pep_20051108

# (A1) Define Sequence Import Function 'seq_imp_fct'
# This function does the following:
# (a) reads fasta batch file into a single vector
# (b) writes each sequence into a single field using a grouping vector
# (c) provides length information for each sequence
# (d) organizes description lines and sequences in a single data frame

seq_imp_fct <- function(fileloc) {
my_fasta <- readLines(fileloc) # reads file line-wise into vector
y <- regexpr("^[^>]", my_fasta, perl=T) # identifies all fields that do not start with a '>' sign
y <- as.vector(y); y[y==-1] <- 0

# The 'rep' function is used here to generate a grouping vector/column for writing each sequence into a single field
# This 'indirect' approach runs much faster than the alternative 'for loop' below.
index <- which(y==0)
distance <- data.frame(start=index[1:(length(index)-1)], end=index[2:length(index)])
distance <- rbind(distance, c(distance[length(distance[,1]),2], length(y)+1)) # gets data for last entry
distance <- data.frame(distance, dist=distance[,2]-distance[,1])
seq_no <- 1:length(y[y==0])
index <- rep(seq_no, as.vector(distance[,3]))
my_fasta <- data.frame(index, y, my_fasta)
my_fasta[my_fasta[,2]==0,1] <- 0

# Alternative 'for loop' that generates above grouping function, but is 100 times slower
# counter <- 0
# index <- NULL
# for(i in y) {
# if(i==1) { index <- c(index, counter) } else
# { index <- c(index, 0); counter <- counter + 1 }
# }
# my_fasta <- data.frame(index, y, my_fasta)

# Use the 'paste' function in a 'tapply' loop to write each sequence into a single field
# This runs much faster than the alternative 'for loop' below.
seq <- tapply(as.vector(my_fasta[,3]), factor(my_fasta[,1]), paste, collapse="", simplify=F)
seq <- as.vector(seq[2:length(seq)])

# Alternative 'for loop' that does the same as above 'tapply' function, but is >20 times slower
# seq <- NULL
# for(i in 1:max(my_fasta[,1])) {
# seq <- c(seq, paste(my_fasta[my_fasta[,1]==i,3], collapse=""))
# }
Desc <- as.vector(my_fasta[c(grep("^>", as.character(my_fasta[,3]), perl = TRUE)),3])
my_fasta <- data.frame(Desc, Length=nchar(seq), seq)
my_fasta
}
# (A2) Apply 'seq_imp_fct' to import Halobacterium proteome
Halobact <- seq_imp_fct(fileloc=myfileloc)
# Generate separate column with accession numbers. This regular expression needs to be adjusted for other title line formats
Acc <- gsub(".*gb\\|(.*)\\|.*" , "\\1", as.character(Halobact[,1]), perl=T) # regular expression specific for this example
Halobact <- data.frame(Acc, Halobact)

#################################
# (B) Pattern Matching Function #
#################################
# (B1) Define pattern function
# This function counts the matches of a user defined pattern (reg expr) in a sequence data frame generated by above 'seq_imp_fct'.
# In addition to the pattern counts, the function provides the positions of the pattern matches.
pattern_fct <- function(pattern, sequences) { # expects sequences in fourth column of 'sequence' data frame
pos <- gregexpr(pattern, as.character(sequences[,4]), perl=T) # 'gregexpr' returns list with pattern positions
posv <- unlist(lapply(pos, paste, collapse=", ")); posv[posv==-1] <- 0 # retrieves positions where pattern matches in each sequence
hitsv <- unlist(lapply(pos, function(x) if(x[1]==-1) { 0 } else { length(x) })) # counts the number of hits per sequence
sequences <- data.frame(sequences[,1:3], Position=as.vector(posv), Hits=hitsv, sequences[,4])
sequences
}
# (B2) Apply pattern function
patterndf <- pattern_fct(pattern="H..H{1,2}", sequences=Halobact)
# Print a title message to explain the following output
cat("These sequences in the Halobacterium proteome contain the motif \"H..H{1,2}\" more than two times:","\n")
print(patterndf[patterndf[,5]>2, 1:5])
# Tag pattern in sequence by setting pattern to upper case and remaining sequence to lower case
tag <- gsub("([A-Z])", "\\L\\1", as.character(patterndf[patterndf[,5]>2, 6]), perl=T, ignore.case=T)
tag <- gsub("(H..H{1,2})", "\\U\\1", as.character(tag), perl=T, ignore.case=T)
export <- data.frame(patterndf[patterndf[,5]>2,-6], tag)
export <- data.frame(Acc=paste(">", export[,1], sep=""), seq=export[,6])
# Possibility to export any sequence subset in fasta format to an external file
write.table(as.vector(as.character(t(export))), file="export.txt", quote=F, row.names=F, col.names=F)
# Print a title message to explain the output
cat("The sequences with the \"H..H{1,2}\" pattern in upper case have been written to a file 'export.txt' in your current working directory", "\n", "\n")

# (B3) Check AA distribution in sequences
AA <- data.frame(AA=c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"))
AAstat <- lapply(strsplit(as.character(patterndf[patterndf[,5]>2, 6]),""), table)
for(i in 1:length(AAstat)) {
AAperc <- AAstat[[i]] / patterndf[patterndf[,5]>2, 3][i] *100 # calculates AA content in percent using length information
AA <- merge(AA, as.data.frame(AAperc), by.x="AA", by.y="Var1", all=T)
}
names(AA)[2:length(AA)] <- as.vector(patterndf[patterndf[,5]>2, 1])
AASummary <- data.frame(AA, Mean=apply(AA[,2:length(AA)], 1, mean, na.rm=T))
for(i in 1:length(patterndf[patterndf[,5]>2, 3])) {
patterndf[patterndf[,5]>2, 3][i]
}
# Print a title message to explain the following output
cat("The identified sequences with two or more \"H..H{1,2}\" motifs have the following AA composition:", "\n")
print(AASummary)
cat("If you encounter in the next steps error messages, then Needle and/or Phylip may not be available on your system!!!!!!!", "\n")

######################################
# Run Needle in All-Against-All Mode #
######################################
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# This part is for UNIX-like OSs that have EMBOSS installed
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

# The following code runs the Needle program from EMBOSS in all-against-all mode
# to generate a distance matrix based on scores obtained in each pairwise alignment.
# The same code can be used to run Water, BLAST2 or other pairwise sequence analysis
# programs in all-against-all mode.
# In the last step the obtained distance matrix is used to plot a similarity tree in R.
# The same matrix can also be used to calculate a phylogenetic tree with the Phylip
# software (see below).
patterndf <- patterndf[patterndf[,5]>2,] # start with above patterndf
system("rm -f my_needle_file")
for(i in 1:length(patterndf[,1])) {
cat(as.character(paste(">", as.vector(patterndf[i,1]),sep="")), as.character(as.vector(patterndf[i,6])), file="file1", sep="\n")
for(j in 1:length(patterndf[,1])) {
cat(as.character(paste(">", as.vector(patterndf[j,1]),sep="")), as.character(as.vector(patterndf[j,6])), file="file2", sep="\n")
system("needle file1 file2 stdout -gapopen 10.0 -gapextend 0.5 >> my_needle_file")
}
}

# Import needle results and plot them as tree
score <- readLines("my_needle_file")
score <- score[grep("^# Score", score, perl=T)]
score <- gsub(".* " , "", as.character(score), perl=T)
score <- as.numeric(score)
scorem <- matrix(score, length(patterndf[,1]), length(patterndf[,1]), dimnames=list(as.vector(patterndf[,1]),as.vector(patterndf[,1])))
scorem <- as.dist(1/scorem)
hc <- hclust(scorem, method="complete")
plot(hc, hang = -1, main="Distance Tree Based on Needle All-Against-All Comparison")
# Print a title message to explain the generated output
cat("The created all-against-all distance matrix ('scorem') of Needle scores is plotted in form of a distance tree.", "\n")

###################
# PHYLIP Analysis #
###################
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# This part is for UNIX-like OSs that have PHYLIP installed
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

# In this step the obtained distance matrix is exported in Phylip format to a file named 'infile'.
# This file is used to generate a phylogenetic tree with the 'neighbor' program in Phylip. After this
# step the retree module is used to view and edit the tree.
scorem <- matrix(score, length(patterndf[,1]), length(patterndf[,1]), dimnames=list(as.vector(patterndf[,1]),as.vector(patterndf[,1])))
z <- 1/scorem
zv <- as.vector(z)
zv[seq(1,64, by=9)] <- 0
z <- matrix(zv, 8, 8, dimnames=list(names(as.data.frame(scorem)), names(as.data.frame(scorem))))
z <- round(z, 7)
write.table(as.data.frame(z), file="infile", quote=F, col.names=F, sep="\t")
# Print a title message to explain the generated output
cat("The distance matrix was exported in Phylip format to a file named 'infile'. Follow the instructions in the next comment lines to continue with the Phylip anaylsis.", "\n")
# system("vim infile") # add number of sequences in first line
# system("phylip neighbor") # choose: r, n, y
# system("mv -f outtree intree")
# system("phylip retree") # choose: y, m, x, y, r

genomics: June 2006

genomics: June 2006

Thursday, June 15, 2006

role of exon arrays in alternate splicing

genomics: May 2006

Research indicates that 60 percent or more of the best characterized genes in the human genome comprise multiple exons, or small blocks of RNA that can be rearranged, or "spliced" together to create different transcripts from the same gene. Each of those transcripts can potentially be translated into a different protein carrying out different functions, which greatly increases the diversity of proteins generated from the genome. The GeneChip exon arrays offer a new and high resolution view to examine these alternatively expressed exons, enabling researchers to conduct experiments that were not possible before. More than a dozen leading academic and industrial research organizations are already using the exon arrays through an early technology access program.

Since each Affymetrix exon array has a capacity of more than six million data points, whole-genome exon expression analysis is possible on a single array. Other platforms with significantly lower data capacity require as many as 50 arrays, which demand more sample, reagent and labor to conduct a comparable experiment.

"Biology occurs at the transcript level and any one of these transcripts could hold the key to a new treatment or diagnostic. Affymetrix exon arrays offer scientists the only solution for accessing genome-wide exon data on a single array," said John E. Blume, Ph.D., vice president of RNA Products at Affymetrix. "We anticipate that access to this data could dramatically accelerate research, publication and, ultimately, bring new tests and therapies to market."

Exon Array Design

The first-generation genome-wide exon arrays from Affymetrix are designed to include all annotated exons. In addition, these new arrays include computationally predicted and empirically identified exon content, enabling researchers to study known splicing events as well as to discover novel splice variants. All of the probes are based on genomic sequence and annotated on the genome, making it easier for researchers to correlate expression and alternative splicing data with genetic variations or mutations related to the disease.

Exon Array Applications

Exon-level analysis can provide important new data in a number of different research areas. For example, the whole-genome, exon-level detail these arrays provide makes them ideal for identifying novel biomarkers. The array's alternative splicing and gene expression information could also be used to help find a precise, predictive signature for drug response or pathological mechanism. As a discovery tool, the exon array's high resolution view of coding sequence gene expression will help researchers validate new gene content, providing in-depth analysis for drug-discovery targets and biological mechanisms. In addition, the exon arrays provide a possible fast-track for linking genetic information such as SNPs to functional effects at the expression level.

Exon Array System

Affymetrix exon arrays are built on the same GeneChip technology that has been the industry standard in microarray research for the past decade. The Affymetrix exon array system launched today includes:

-- GeneChip Human Exon 1.0 ST (Sense Target) Array

-- Optimized and validated reagents that are based on a whole transcript assay

-- The Exon Array Computational Tool (ExACT), which incorporates new algorithms for detection and signal estimation. Together with the existing NetAffx(TM) Analysis Center and the Integrated Genome Browser, these tools provide basic analysis and array design information and annotation.

For more information on the Affymetrix GeneChip exon arrays, please visit the company's website at www.affymetrix.com/genechip/exon.

About Affymetrix

Affymetrix scientists invented the world's first microarray in 1989 and began selling the first commercial microarray in 1994. Since then, Affymetrix GeneChip(R) technology has become the industry standard in molecular biology research. Affymetrix technology is used by the world's top pharmaceutical, diagnostic and biotechnology companies as well as leading academic, government and not-for-profit research institutes. More than 1,200 systems have been shipped around the world and more than 3,000 peer-reviewed papers have been published using the technology. Affymetrix' patented photolithographic manufacturing process provides the most information capacity available today on an array, enabling researchers to use a whole-genome approach to analyze the relationship between genetics and health. Headquartered in Santa Clara, Calif., Affymetrix has subsidiaries in Europe and Asia in addition to manufacturing facilities in Sacramento, Calif. and Bedford, Mass. The company has about 900 employees worldwide. For more information about Affymetrix, please visit the company's website at www.Affymetrix.com

All statements in this press release that are not historical are "forward-looking statements" within the meaning of Section 21E of the Securities Exchange Act as amended, including statements regarding Affymetrix' "expectations," "beliefs," "hopes," "intentions," "strategies" or the like. Such statements are subject to risks and uncertainties that could cause actual results to differ materially for Affymetrix from those projected, including, but not limited to risks of the Company's ability to achieve and sustain higher levels of revenue, higher gross margins, reduced operating expenses, uncertainties relating to technological approaches, manufacturing, product development (including uncertainties relating to commercial and technological success of the GeneChip Human Exon 1.0 ST Arrays discussed in this press release and the timing of the Company's release of mouse and rat exon arrays), personnel retention, uncertainties related to cost and pricing of Affymetrix products, dependence on collaborative partners, uncertainties relating to sole source suppliers, uncertainties relating to FDA and other regulatory approvals, competition, risks relating to intellectual property of others and the uncertainties of patent protection and litigation. These and other risk factors are discussed in Affymetrix' Form 10-K for the year ended December 31, 2004 and other SEC reports, including its Quarterly Reports on Form 10-Q for subsequent quarterly periods. Affymetrix expressly disclaims any obligation or undertaking to release publicly any updates or revisions to any forward-looking statements contained herein to reflect any change in Affymetrix' expectations with regard thereto or any change in events, conditions, or circumstances on which any such statements are based.