Wednesday, August 30, 2006

R code for vector and phred score trimming

####################################################################################################
### R & BioConductor Script for Trimming Adaptor and Low Quality Positions of 454 sRNA Sequences ###
####################################################################################################
# Author: Thomas Girke (thomas.girke@ucr.edu), UC Riverside
# Utility: This script brings 454 sequences into the proper orientation, trims off the
# adaptor segments and tags the low quality calls (default score <14) with Ns. The
# current draft implementation is not very fast. The processing of 50,000 sequences
# will take 15-25 minutes.
# Requirements: cross_match program from the Phred/Phrap/Consed package
# Code tag where one can adjust filter settings: '### adjust'
# How it works:
# (A) Some format editing on the command-line
# Requirements:
# clean directory containing
# (a) 454.seq and 454.seq.qual files
# (b) The 2 adaptor files are exprected to be in parent directory under the names
# adaptfor.txt
# adaptrev.txt
# Make sure the extension of *.qual matches the one expected by cross_match, e.g.
# $ mv 454.qual 454.seq.qual
# For the import of the *.qual data into R, the *.qual file needs to have spaces at end of lines:
# $ perl -p -i -w -e 's/(^.*)/$1 /g' 454.seq.qual
# (B) Processing in R
# Run the following script from the directory with your 454 sequences like this:
# source("454_sRNA_Trim.R")
# or
# R CMD BATCH --no-save 454_sRNA_Trim.R

# Start of R script
# (1) Split sequence batch into smaller sets of 50,000 per file
# Define Sequence Import Function 'seq_imp_fct'
seq_imp_fct <- function(fileloc) {
my_fasta <- readLines(fileloc) # reads file line-wise into vector
y <- regexpr("^[^>]", my_fasta, perl=T) # identifies all fields that do not start with a '>' sign
y <- as.vector(y); y[y==-1] <- 0
index <- which(y==0)
distance <- data.frame(start=index[1:(length(index)-1)], end=index[2:length(index)])
distance <- rbind(distance, c(distance[length(distance[,1]),2], length(y)+1)) # gets data for last entry
distance <- data.frame(distance, dist=distance[,2]-distance[,1])
seq_no <- 1:length(y[y==0])
index <- rep(seq_no, as.vector(distance[,3]))
my_fasta <- data.frame(index, y, my_fasta)
my_fasta[my_fasta[,2]==0,1] <- 0
seq <- tapply(as.vector(my_fasta[,3]), factor(my_fasta[,1]), paste, collapse="", simplify=F)
seq <- as.vector(seq[2:length(seq)])
Desc <- as.vector(my_fasta[c(grep("^>", as.character(my_fasta[,3]), perl = TRUE)),3])
my_fasta <- data.frame(Desc, Length=nchar(seq), seq)
my_fasta
}
# Generate smaller sequence batches of max 50,000 sequences per file. This is required for cross_match
seq <- seq_imp_fct(fileloc="454.seq")
export_seq <- data.frame(Desc=seq[,1], seq=seq[,3])
slice <- data.frame(FROM=seq(1,length(seq[,1]), by=50000), TO=seq(1,length(seq[,1]), by=50000)+49999)
slice[length(slice[,1]),2] <- length(seq[,1])
for (i in 1:length(slice[,1])) {
write.table(as.vector(as.character(t(export_seq[slice[i,1]:slice[i,2],]))), file=paste(i, ".seq", sep=""), quote=F, row.names=F, col.names=F)
}
qual <- seq_imp_fct(fileloc="454.seq.qual")
export_qual <- data.frame(Desc=qual[,1], seq=qual[,3])
for (i in 1:length(slice[,1])) {
write.table(as.vector(as.character(t(export_qual[slice[i,1]:slice[i,2],]))), file=paste(i, ".seq.qual", sep=""), quote=F, row.names=F, col.names=F)
}

# (2) Run cross_match in 2 steps to tag forward adaptors with Ns and reverse adaptors with Xs
# Run cross_match on all generated *.seq files:
for (i in 1:length(slice[,1])) {
if(i==1) { system(paste("rm -f ", "all.seq.screen", sep="")) } # removes in fist loop an already existing "all.seq.screen" file, since data slices will be appended to a file of this name
# next import steps protect Ns that are inserted by 454.com for positions with score=0
seq <- seq_imp_fct(fileloc=paste(i, ".seq", sep=""))
export_seq <- data.frame(Desc=seq[,1], seq=seq[,3])
export_seq$seq <- gsub("N", "Z", as.character(seq$seq))
write.table(as.vector(as.character(t(export_seq))), file=paste(i, ".seq", sep=""), quote=F, row.names=F, col.names=F)
### adjust
system(paste("cross_match ", paste(i, ".seq ", sep=""), "../adaptfor.txt -minmatch 15 -minscore 14 -screen > screen.out", sep="")) # runs cross_match for forward adaptor only
# next import steps tag forward adaptor with N's
seq <- seq_imp_fct(fileloc=paste(i, ".seq.screen", sep=""))
export_seq <- data.frame(Desc=seq[,1], seq=seq[,3])
export_seq$seq <- gsub("X", "N", as.character(seq$seq))
write.table(as.vector(as.character(t(export_seq))), file=paste(i, ".seq.screen", sep=""), quote=F, row.names=F, col.names=F)
system(paste("mv ", paste(i, ".seq", sep=""), ".screen ", paste(i, ".seq", sep=""), sep=""))
### adjust
system(paste("cross_match ", paste(i, ".seq ", sep=""), "../adaptrev.txt -minmatch 15 -minscore 14 -screen > screen.out", sep="")) # runs cross_match for reverse adaptor only, gets tagged with X's
system(paste("cat ", paste(i, ".seq", sep=""), ".screen ", ">> ", "all.seq.screen", sep=""))
system(paste("rm ", "-f ", paste(i, ".seq ", sep=""), paste(i, ".seq.qual ", sep=""), paste(i, ".seq.screen ", sep=""), paste(i, ".seq.log", sep=""), sep=""))
}

# (3) Import of cross_matched sequences into R
# Import of 'all.seq.screen'
seq <- seq_imp_fct(fileloc="all.seq.screen")
cat("\n", "Sequence batch imported containing ", length(seq[,1]), " entries", "\n")
# Tag forward adapter with "F" and reverse adapter with "R", and revert Zs (454 Ns) back to Ns
seqedit <- gsub("N", "F", as.character(seq$seq))
seqedit <- gsub("X", "R", as.character(seqedit))
seqedit <- gsub("Z", "N", as.character(seqedit))
seq <- data.frame(seq[,-3], seq=seqedit)
seq <- data.frame(Desc=gsub(" .*", "", as.character(seq[,1]), perl=T), seq[,-1])
qual <- seq_imp_fct(fileloc="454.seq.qual")
cat("\n", "Quality batch imported containing ", length(qual[,1]), " entries", "\n")
qual <- data.frame(qual[,-3], seq=gsub("[ ]{2,}", " ", as.character(qual$seq)))
# qual[qual[,3]=="N",3] <- 0 # removed May 08, 06
qual <- data.frame(Desc=gsub(" .*", "", as.character(qual[,1]), perl=T), qual[,-1])

# (4) For larger data sets, loop over slices of 10,000 sequences
# In this loop the sequences are (1) vector trimmed, (2) quality trimmed and (3) orientation adjusted.
slicedf <- data.frame(FROM=seq(1, length(seq[,1]), by=10000),
TO=c(seq(0, length(seq[,1]), by=10000)[2:((as.integer(length(seq[,1])/10000))+1)],
length(seq[,1])))
appendDF <- data.frame(Seq=NULL, Phred=NULL)
for(i in 1:length(slicedf[,1])){
cat("\n", "Start of 10K slice no: ", i, "\n")
seqslice <- seq[slicedf[i,1]:slicedf[i,2],]
qualslice <- qual[slicedf[i,1]:slicedf[i,2],]
# Write sequences and qual data in vectorized format into list files
seqslicel <- strsplit(as.character(seqslice[,3]),"")
qualslicel <- strsplit(as.character(qualslice[,3])," ")
names(seqslicel) <- seqslice[,1]
names(qualslicel) <- qualslice[,1]
qualslicel <- lapply(qualslicel, as.numeric)
# Replaces all NTs with Phred < 14 by Ns; this includes vector/adaptor positions
### adjust
seqtriml <- lapply(names(seqslicel), function(x) replace(seqslicel[[x]], which(qualslicel[[x]]<14), "N"))
names(seqtriml) <- names(seqslicel)
# Convert all Ns that fall into adaptor regions back to Fs and Rs
seqtriml <- lapply(names(seqtriml), function(x) replace(seqtriml[[x]], which(seqslicel[[x]]=="F"), "F"))
names(seqtriml) <- names(seqslicel)
seqtriml <- lapply(names(seqtriml), function(x) replace(seqtriml[[x]], which(seqslicel[[x]]=="R"), "R"))
names(seqtriml) <- names(seqslicel)

# To determine orientation of sequences, generate data frame with adaptor positions
zzz <- lapply(seqtriml, paste, collapse="")
posF <- lapply(names(zzz), function(x) gregexpr("(F){1,}", as.character(zzz[[x]]), perl=T))
posR <- lapply(names(zzz), function(x) gregexpr("(R){1,}", as.character(zzz[[x]]), perl=T))
names(posF) <- names(seqtriml); names(posR) <- names(seqtriml)
myposF <- unlist(lapply(names(posF), function(x) as.vector(unlist(posF[[x]][[1]]))[1]))
myposR <- unlist(lapply(names(posR), function(x) as.vector(unlist(posR[[x]][[1]]))[1]))
myposF[myposF==-1] <- 0; myposR[myposR==-1] <- 0
mylengthF <- lapply(names(posF), function(x) as.vector(unlist(attributes(posF[[x]][[1]])))[1])
mylengthF <- unlist(mylengthF)
mylengthF[mylengthF==-1] <- 0
mylengthR <- lapply(names(posR), function(x) as.vector(unlist(attributes(posR[[x]][[1]])))[1])
mylengthR <- unlist(mylengthR)
mylengthR[mylengthR==-1] <- 0
orientationDF <- data.frame(F1=myposF, F2=myposF+mylengthF-1, R1=myposR, R2=myposR+mylengthR-1, Length=seqslice[,2])
orientationDF <- data.frame(orientationDF,
Orient=(orientationDF[,3]-orientationDF[,2])/abs(orientationDF[,3]-orientationDF[,2]))
# The following steps are for sequences that contain only one adaptor. This approximation seems to give fairly good results.
orientationDF[orientationDF[,2]==-1 & orientationDF[,3] < (orientationDF[,5]/2), 6] <- -1
orientationDF[orientationDF[,4]==-1 & orientationDF[,1] < ((orientationDF[,5]/2)*0.4), 6] <- 1
orientationDF <- data.frame(ID=names(seqtriml), orientationDF)

# Generate data frame with insert parsing positions
zzz <- lapply(seqtriml, paste, collapse="")
# fix for gregexpr version change between R 2.2.0 and 2.3.0
# R version 2.0.0:# pos <- lapply(names(zzz), function(x) gregexpr("[ATGCN]{1,}", as.character(zzz[[x]]), perl=T))
pos <- lapply(names(zzz), function(x) gregexpr("[^ATGCN][ATGCN]+", paste("", as.character(zzz[[x]])), perl=T))
names(pos) <- names(seqtriml)
mypos <- as.vector(unlist(pos))
mypos[mypos==-1] <- 0
mylength <- lapply(names(pos), function(x) as.vector(unlist(attributes(pos[[x]][[1]]))))
mylength <- unlist(mylength)
# fix for gregexpr version change between R 2.2.0 and 2.3.0
mylength[mylength==-1] <- 0
mylength <- mylength-1
mylength[mylength==-1] <- 0
count <- lapply(names(pos), function(x) length(pos[[x]][[1]]))
count <- as.vector(unlist(count))
IDs <- rep(names(pos), count)
InsertFrame <- data.frame(ID=IDs, UniID=paste(IDs, unlist(sapply(count, function(x) seq(1,x))), sep="."),
InsertCount=rep(count,count), Pos=mypos, Length=mylength)
InsertFrame <- merge(InsertFrame, orientationDF, by.x="ID", by.y="ID", all.x=T)

# Position parsing of inserts using InsertFrame
InsertFrame <- data.frame(InsertFrame, End=InsertFrame[,4]+InsertFrame[,5]-1)
InsertFrame <- InsertFrame[,c(1:5, length(InsertFrame), 6:(length(InsertFrame)-1))]
InsertFrame <- InsertFrame[!InsertFrame$Pos==0, ] # added May 8, 06 to remove no-insert sequences
mylist <- apply(InsertFrame[,c(4,6)], 1, list)
mylist <- lapply(mylist, function(x) as.vector(as.matrix(unlist(x))))
names(mylist) <- as.vector(InsertFrame$UniID)
myindex <- as.vector(InsertFrame$ID)
seqtriml_long <- seqtriml[myindex]; names(seqtriml_long) <- as.vector(InsertFrame$UniID) # creates long version of 'seqtriml' that contains entries with several inserts in duplicates
quall_long <- qualslicel[myindex]; names(quall_long) <- as.vector(InsertFrame$UniID) # creates long version of 'quall' that contains entries with several inserts in duplicates
insert_list <- lapply(names(mylist), function(x) seqtriml_long[[x]][(mylist[[x]][1]):(mylist[[x]][2])])
insert_qual_list <- lapply(names(mylist), function(x) quall_long[[x]][(mylist[[x]][1]):(mylist[[x]][2])])
names(insert_list) <- as.vector(InsertFrame$UniID)
names(insert_qual_list) <- as.vector(InsertFrame$UniID)

# Reverse and complement of antisense sequences
insert_list_rev <- lapply(as.vector(InsertFrame[InsertFrame$Orient==-1,2]), function(x) rev(insert_list[[x]]))
insert_list_rev <- lapply(insert_list_rev, paste, collapse="")
names(insert_list_rev) <- as.vector(InsertFrame[InsertFrame$Orient==-1,2])
insert_list_rev <- lapply(insert_list_rev, function(x) gsub("A", "1", x))
insert_list_rev <- lapply(insert_list_rev, function(x) gsub("T", "2", x))
insert_list_rev <- lapply(insert_list_rev, function(x) gsub("C", "3", x))
insert_list_rev <- lapply(insert_list_rev, function(x) gsub("G", "4", x))
insert_list_rev <- lapply(insert_list_rev, function(x) gsub("1", "T", x))
insert_list_rev <- lapply(insert_list_rev, function(x) gsub("2", "A", x))
insert_list_rev <- lapply(insert_list_rev, function(x) gsub("3", "G", x))
insert_list_rev <- lapply(insert_list_rev, function(x) gsub("4", "C", x))
insert_qual_list_rev <- lapply(as.vector(InsertFrame[InsertFrame$Orient==-1,2]), function(x) rev(insert_qual_list[[x]]))
names(insert_qual_list_rev) <- as.vector(InsertFrame[InsertFrame$Orient==-1,2])
insert_qual_list_rev <- lapply(insert_qual_list_rev, paste, collapse=" ")

# Combine everything in final sequence object
insert_list_for <- insert_list[as.vector(InsertFrame[InsertFrame$Orient==1,2])]
insert_list_for <- lapply(insert_list_for, paste, collapse="")
insert_list_final <- c(insert_list_rev, insert_list_for)
insert_list_final <- insert_list_final[names(insert_list)]
insert_qual_list_for <- insert_qual_list[as.vector(InsertFrame[InsertFrame$Orient==1,2])]
insert_qual_list_for <- lapply(insert_qual_list_for, paste, collapse=" ")
insert_qual_list_final <- c(insert_qual_list_rev, insert_qual_list_for)
insert_qual_list_final <- insert_qual_list_final[names(insert_list)]

# Remove terminal Ns. Most of the code here is to adjust qual data set.
tempDF <- data.frame(as.data.frame(unlist(insert_list_final)), as.data.frame(unlist(insert_qual_list_final)))
names(tempDF)[1:2] <- c("Seq", "Phred")
remove_start <- nchar(as.character(tempDF$Seq)) - nchar(gsub("^N{1,}", "", as.character(tempDF$Seq), perl=T))
remove_end <- nchar(as.character(tempDF$Seq)) - nchar(gsub("N{1,}$", "", as.character(tempDF$Seq), perl=T))
NremoveDF <- data.frame(remove_start, remove_end)
tempDF2 <- data.frame(row.names=row.names(tempDF), Seq=gsub("^N{1,}|N{1,}$", "", as.character(tempDF$Seq), perl=T), tempDF[,2])
qual_removel <- strsplit(as.character(tempDF2[,2]), " ")
names(qual_removel) <- row.names(tempDF2)
NremoveDF <- data.frame(remove_start, remove_end)
Nremovel <- apply(NremoveDF, 1, list)
Nremovel <- lapply(Nremovel, function(x) as.vector(as.matrix(unlist(x))))
names(Nremovel) <- row.names(tempDF2)
qual_removel <- lapply(names(qual_removel), function(x) qual_removel[[x]][(1+Nremovel[[x]][1]):(length(as.vector(qual_removel[[x]]))-Nremovel[[x]][2])])
qual_removel <- lapply(qual_removel, paste, collapse=" ")
tempDF <- data.frame(row.names=row.names(tempDF2), Seq=tempDF2[,1], Phred=unlist(qual_removel))

appendDF <- rbind(appendDF, tempDF)
cat("\n", "End of 10K slice no: ", i, "\n")
}
# (5) Create final result data frame
finalDF <- appendDF
zzz <- gregexpr("N", as.character(finalDF[,1]), perl=T)
Ncount <- lapply(zzz, function(x) length(unlist(x))); Ncount <- unlist(Ncount)
# Obtain Ncount: for_loop here necessary for very large sequence sets (>100,000)
names(zzz) <- rownames(finalDF)
incDF <- data.frame(FROM=seq(1, length(zzz), by=10000),
TO=c(seq(0, length(zzz), by=10000)[2:((as.integer(length(zzz)/10000))+1)], length(zzz)))
Ncountfix <- NULL
for(i in 1:length(incDF[,1])) {
Ncountfix <- c(Ncountfix, unlist(lapply(names(zzz[incDF[i,1]:incDF[i,2]]), function(x) as.vector(unlist(zzz[[x]][[1]]))[1])))
cat("\n", "N count loop no: ", i, "\n")
}
Ncount[which(Ncountfix==-1)] <- 0

finalDF <- data.frame(finalDF,
InsertLength=nchar(as.character(finalDF[,1])),
mPhred=as.vector(apply(finalDF, 1, function(x) mean(as.numeric(unlist(strsplit((as.character(x[2]))," ")))))),
Ncount=Ncount
)
names(finalDF)[1:2] <- c("Seq", "Phred")
finalDF <- data.frame(ID=rownames(finalDF), finalDF)

# (6) Filtering steps
# Remove all sequences that have insert lengths of less than 15bp
### adjust
finalDF <- finalDF[finalDF$InsertLength>=15,]
# Remove inserts with >=20% Ns
### adjust
finalDF <- finalDF[(finalDF$Ncount/finalDF$InsertLength)<=0.2,]

# Adjust name extensions to filter results
namev <- gsub("\\..*$", "", as.character(rownames(finalDF)), perl=T)
namev <- paste(sort(namev), unlist(lapply(as.vector(table(sort(namev))), function(x) 1:x)), sep=".")
finalDF <- data.frame(finalDF, sort=1:length(finalDF[,1]))
finalDF <- data.frame(ID=sort(namev), finalDF[order(rownames(finalDF)),-1])
finalDF <- finalDF[order(finalDF$sort),]

# Export sequences and qual data to two fasta batch files
export_seq <- data.frame(Acc=paste(finalDF[,1], "InsertLength:", finalDF[,4], "mPhred:", round(finalDF[,5], 1), "Ncount", finalDF[,6], sep=" "), seq=finalDF[,2])
write.table(as.vector(as.character(t(export_seq))), file="final_seq.txt", quote=F, row.names=F, col.names=F)
export_qual <- data.frame(Acc=paste(finalDF[,1], "InsertLength:", finalDF[,4], "mPhred:", round(finalDF[,5], 1), "Ncount", finalDF[,6], sep=" "), seq=finalDF[,3])
write.table(as.vector(as.character(t(export_qual))), file="final_qual.txt", quote=F, row.names=F, col.names=F)

# # Optional steps
# # Average Phred scores of inserts
# mylist <- strsplit(as.character(finalDF[,3]), " "); mylist <- lapply(mylist, as.numeric); mean(unlist(lapply(mylist, mean)))


No comments: