genomics: R code for vector and phred score trimming

####################################################################################################
### R & BioConductor Script for Trimming Adaptor and Low Quality Positions of 454 sRNA Sequences ###
####################################################################################################
# Author: Thomas Girke (thomas.girke@ucr.edu), UC Riverside
# Utility: This script brings 454 sequences into the proper orientation, trims off the
# adaptor segments and tags the low quality calls (default score <14) with Ns. The
# current draft implementation is not very fast. The processing of 50,000 sequences
# will take 15-25 minutes.
# Requirements: cross_match program from the Phred/Phrap/Consed package
# Code tag where one can adjust filter settings: '### adjust'
# How it works:
# (A) Some format editing on the command-line
 # Requirements:
  # clean directory containing
   # (a) 454.seq and 454.seq.qual files
   # (b) The 2 adaptor files are expected to be in the parent directory under the names
    # adaptfor.txt
    # adaptrev.txt
 # Make sure the extension of *.qual matches the one expected by cross_match, e.g.
 # $ mv 454.qual 454.seq.qual
 # For the import of the *.qual data into R, the *.qual file needs to have spaces at end of lines:
 # $ perl -p -i -w -e 's/(^.*)/$1 /g' 454.seq.qual
# (B) Processing in R
 # Run the following script from the directory with your 454 sequences like this:
 # source("454_sRNA_Trim.R")
 #  or
 # R CMD BATCH --no-save 454_sRNA_Trim.R

# Start of R script
# (1) Split sequence batch into smaller sets of 50,000 per file
 # Define Sequence Import Function 'seq_imp_fct' 
 # Import a FASTA-formatted file (sequences or space-separated quality values).
 #
 # Args:
 #   fileloc: path of the file to read.
 # Returns:
 #   data.frame with one row per record and the columns
 #     Desc   - the complete header line, including the leading '>'
 #     Length - number of characters in 'seq'
 #     seq    - all body lines of the record pasted together without separator
 # Empty lines are ignored, lines that precede the first header are dropped and
 # a header without body lines yields an empty string. Files with a single
 # record (or with no record at all) are handled as well.
 seq_imp_fct <- function(fileloc) {
   fasta_lines <- readLines(fileloc) # reads file line-wise into vector
   fasta_lines <- fasta_lines[nzchar(fasta_lines)]
   is_header <- grepl("^>", fasta_lines)
   # Running record number: every header line opens a new record
   record_id <- cumsum(is_header)
   is_body <- !is_header & record_id > 0
   n_records <- sum(is_header)
   seq <- rep("", n_records)
   if (any(is_body)) {
     pieces <- split(fasta_lines[is_body], record_id[is_body])
     joined <- vapply(pieces, paste, character(1), collapse = "")
     # names(joined) are the record numbers; records without body keep ""
     seq[as.integer(names(joined))] <- unname(joined)
   }
   data.frame(Desc = fasta_lines[is_header], Length = nchar(seq), seq = seq,
              stringsAsFactors = FALSE)
 }
 # Generate smaller sequence batches of max 50,000 sequences per file. This is required for cross_match 
 # Read the complete sequence batch and write it back in slices of at most
 # 50,000 records ("1.seq", "2.seq", ...); the quality file is sliced the same
 # way ("1.seq.qual", ...). 'slice' (columns FROM/TO) is reused by the
 # cross_match step below.
 seq <- seq_imp_fct(fileloc="454.seq")
 n_seq <- nrow(seq)
 if (n_seq == 0) stop("No sequences found in '454.seq'", call. = FALSE)
 # as.character() keeps this working no matter whether the import function
 # delivers character, factor or list columns
 export_seq <- data.frame(Desc=as.character(seq[,1]), seq=as.character(seq[,3]), stringsAsFactors=FALSE)
 # 'seq' is a data frame here; R still resolves seq(...) to the base function
 slice_from <- seq(1, n_seq, by=50000)
 slice <- data.frame(FROM=slice_from, TO=pmin(slice_from+49999, n_seq))
 for (i in seq_len(nrow(slice))) {
  rows <- slice[i,1]:slice[i,2]
  # interleave header and sequence lines: Desc1, seq1, Desc2, seq2, ...
  writeLines(as.vector(rbind(export_seq$Desc[rows], export_seq$seq[rows])), paste(i, ".seq", sep=""))
 }
 qual <- seq_imp_fct(fileloc="454.seq.qual")
 if (nrow(qual) != n_seq) {
  stop("'454.seq' contains ", n_seq, " records but '454.seq.qual' contains ", nrow(qual), call. = FALSE)
 }
 export_qual <- data.frame(Desc=as.character(qual[,1]), seq=as.character(qual[,3]), stringsAsFactors=FALSE)
 for (i in seq_len(nrow(slice))) {
  rows <- slice[i,1]:slice[i,2]
  writeLines(as.vector(rbind(export_qual$Desc[rows], export_qual$seq[rows])), paste(i, ".seq.qual", sep=""))
 }
 
# (2) Run cross_match in 2 steps to tag forward adaptors with Ns and reverse adaptors with Xs
#     Run cross_match on all generated *.seq files:
 # For every slice file written above: protect the Ns delivered by 454, screen
 # the forward adaptor (tagged with N), screen the reverse adaptor (tagged with
 # X) and append the screened slice to "all.seq.screen".
 # Start from an empty "all.seq.screen", since the slices are appended to it
 invisible(file.create("all.seq.screen"))
 for (i in seq_len(nrow(slice))) {
  seq_file <- paste(i, ".seq", sep="")
  screen_file <- paste(i, ".seq.screen", sep="")
  # next import steps protect Ns that are inserted by 454.com for positions with score=0
  seq <- seq_imp_fct(fileloc=seq_file)
  writeLines(as.vector(rbind(as.character(seq[,1]), gsub("N", "Z", as.character(seq[,3])))), seq_file)
  ### adjust
  # runs cross_match for forward adaptor only
  status <- system(paste("cross_match ", seq_file, " ../adaptfor.txt -minmatch 15 -minscore 14 -screen > screen.out", sep=""))
  if (status != 0) warning("cross_match (forward adaptor) returned status ", status, " for ", seq_file, call. = FALSE)
  # next import steps tag forward adaptor with N's
  seq <- seq_imp_fct(fileloc=screen_file)
  writeLines(as.vector(rbind(as.character(seq[,1]), gsub("X", "N", as.character(seq[,3])))), screen_file)
  # the N-tagged output becomes the input of the second cross_match run
  if (!file.rename(screen_file, seq_file)) stop("Could not rename ", screen_file, " to ", seq_file, call. = FALSE)
  ### adjust
  # runs cross_match for reverse adaptor only, gets tagged with X's
  status <- system(paste("cross_match ", seq_file, " ../adaptrev.txt -minmatch 15 -minscore 14 -screen > screen.out", sep=""))
  if (status != 0) warning("cross_match (reverse adaptor) returned status ", status, " for ", seq_file, call. = FALSE)
  if (!file.append("all.seq.screen", screen_file)) stop("Could not append ", screen_file, " to all.seq.screen", call. = FALSE)
  # remove the temporary slice files; files that do not exist are skipped silently
  unlink(c(seq_file, paste(i, ".seq.qual", sep=""), screen_file, paste(i, ".seq.log", sep="")))
 }
 
# (3) Import of cross_matched sequences into R
 # Import of 'all.seq.screen'
 # Import the screened sequences and the original quality values. Both end up
 # as data frames with the columns Desc (header up to the first space), Length
 # and seq.
 seq <- seq_imp_fct(fileloc="all.seq.screen")
 cat("\n", "Sequence batch imported containing ", length(seq[,1]), " entries", "\n")
 # Tag forward adapter with "F" and reverse adapter with "R", and revert Zs (454 Ns) back to Ns.
 # chartr() translates all three characters in one pass: N->F, X->R, Z->N
 seq <- data.frame(Desc=gsub(" .*", "", as.character(seq[,1]), perl=TRUE),
                   Length=seq[,2],
                   seq=chartr("NXZ", "FRN", as.character(seq[,3])),
                   stringsAsFactors=FALSE)
 qual <- seq_imp_fct(fileloc="454.seq.qual")
 cat("\n", "Quality batch imported containing ", length(qual[,1]), " entries", "\n")
 # qual[qual[,3]=="N",3] <- 0 # removed May 08, 06
 # Collapse runs of spaces so that a later split on single spaces yields one score per field
 qual <- data.frame(Desc=gsub(" .*", "", as.character(qual[,1]), perl=TRUE),
                    Length=qual[,2],
                    seq=gsub("[ ]{2,}", " ", as.character(qual[,3])),
                    stringsAsFactors=FALSE)
 # The following steps pair sequences and qualities by row index
 if (nrow(seq) != nrow(qual)) {
  stop("Sequence batch (", nrow(seq), ") and quality batch (", nrow(qual), ") differ in size", call. = FALSE)
 }
 if (!identical(seq$Desc, qual$Desc)) {
  warning("Sequence and quality identifiers are not in the same order", call. = FALSE)
 }

# (4) For larger data sets, loop over slices of 10,000 sequences
# In this loop the sequences are (1) vector trimmed, (2) quality trimmed and (3) orientation adjusted.
 # Row ranges (FROM/TO) of the slices of at most 10,000 sequences that are
 # processed per loop iteration. Also valid for batches smaller than 10,000
 # and for batch sizes that are exact multiples of 10,000.
 n_seq_total <- length(seq[,1])
 if (n_seq_total == 0) stop("No sequences available for trimming", call. = FALSE)
 # 'seq' is a data frame here; R still resolves seq(...) to the base function
 slicedf_from <- seq(1, n_seq_total, by=10000)
 slicedf <- data.frame(FROM=slicedf_from, TO=pmin(slicedf_from+9999, n_seq_total))
 # Collects the inserts (Seq, Phred) of all slices
 appendDF <- data.frame(Seq=NULL, Phred=NULL)
 # Per-slice processing. For every row range in 'slicedf' the reads are
 # (1) masked with N at low-quality positions, (2) split into inserts at the
 # adaptor tags (F = forward, R = reverse), (3) reverse-complemented where the
 # orientation is -1 and (4) stripped of terminal Ns together with the
 # corresponding Phred values. Reads 'seq', 'qual' and 'slicedf'; appends one
 # row per insert (columns Seq and Phred, row names = insert IDs) to 'appendDF'.
 for(i in seq_len(nrow(slicedf))){
  cat("\n", "Start of 10K slice no: ", i, "\n")
  seqslice <- seq[slicedf[i,1]:slicedf[i,2],]
  qualslice <- qual[slicedf[i,1]:slicedf[i,2],]
  # Write sequences and qual data in vectorized format into list files
  seqslicel <- strsplit(as.character(seqslice[,3]),"")
  qualslicel <- strsplit(as.character(qualslice[,3])," ")
  names(seqslicel) <- as.character(seqslice[,1])
  names(qualslicel) <- as.character(qualslice[,1])
  qualslicel <- lapply(qualslicel, as.numeric)
  # Replaces all NTs with Phred < 14 by Ns; this includes vector/adaptor positions
  ### adjust
  seqtriml <- lapply(names(seqslicel), function(x) replace(seqslicel[[x]], which(qualslicel[[x]]<14), "N"))
  names(seqtriml) <- names(seqslicel)
  # Convert all Ns that fall into adaptor regions back to Fs and Rs
  seqtriml <- lapply(names(seqtriml), function(x) replace(seqtriml[[x]], which(seqslicel[[x]]=="F"), "F"))
  names(seqtriml) <- names(seqslicel)
  seqtriml <- lapply(names(seqtriml), function(x) replace(seqtriml[[x]], which(seqslicel[[x]]=="R"), "R"))
  names(seqtriml) <- names(seqslicel)
  
  # To determine orientation of sequences, generate data frame with adaptor positions
  zzz <- lapply(seqtriml, paste, collapse="")
  posF <- lapply(names(zzz), function(x) gregexpr("(F){1,}", as.character(zzz[[x]]), perl=TRUE))
  posR <- lapply(names(zzz), function(x) gregexpr("(R){1,}", as.character(zzz[[x]]), perl=TRUE))
  names(posF) <- names(seqtriml); names(posR) <- names(seqtriml)
  # Start of the first F/R run per read; gregexpr reports -1 for "no match", stored as 0
  myposF <- unlist(lapply(names(posF), function(x) as.vector(unlist(posF[[x]][[1]]))[1]))
  myposR <- unlist(lapply(names(posR), function(x) as.vector(unlist(posR[[x]][[1]]))[1]))
  myposF[myposF==-1] <- 0; myposR[myposR==-1] <- 0
  # Length of the first F/R run per read. The "match.length" attribute is read
  # explicitly, because gregexpr results may carry further attributes that
  # must not be mixed into the lengths.
  mylengthF <- lapply(names(posF), function(x) attr(posF[[x]][[1]], "match.length")[1])
  mylengthF <- unlist(mylengthF)
  mylengthF[mylengthF==-1] <- 0
  mylengthR <- lapply(names(posR), function(x) attr(posR[[x]][[1]], "match.length")[1])
  mylengthR <- unlist(mylengthR)
  mylengthR[mylengthR==-1] <- 0
  # F1/F2 and R1/R2 are start/end of the first F and R run; a missing adaptor gives start 0 and end -1
  orientationDF <- data.frame(F1=myposF, F2=myposF+mylengthF-1, R1=myposR, R2=myposR+mylengthR-1, Length=seqslice[,2])
  # Orient is 1 if the R run starts behind the end of the F run, otherwise -1
  orientationDF <- data.frame(orientationDF,
    Orient=(orientationDF[,3]-orientationDF[,2])/abs(orientationDF[,3]-orientationDF[,2]))
  # The following steps are for sequences that contain only one adaptor. This approximation seems to give fairly good results.
  # No F adaptor and R adaptor starting in the first half of the read: -1
  orientationDF[orientationDF[,2]==-1 & orientationDF[,3] < (orientationDF[,5]/2), 6] <- -1
  # No R adaptor and F adaptor starting within the first 20% of the read: 1
  orientationDF[orientationDF[,4]==-1 & orientationDF[,1] < ((orientationDF[,5]/2)*0.4), 6] <- 1
  orientationDF <- data.frame(ID=names(seqtriml), orientationDF)
  
  # Generate data frame with insert parsing positions
  zzz <- lapply(seqtriml, paste, collapse="")
  # fix for gregexpr version change between R 2.2.0 and 2.3.0
  # R version 2.0.0:# pos <- lapply(names(zzz), function(x) gregexpr("[ATGCN]{1,}", as.character(zzz[[x]]), perl=T))
  # Each read is prefixed with a blank so that every insert is preceded by one
  # non-ATGCN character: the match start then equals the insert start in the
  # unpadded read and the match length is the insert length + 1.
  pos <- lapply(names(zzz), function(x) gregexpr("[^ATGCN][ATGCN]+", paste("", as.character(zzz[[x]])), perl=TRUE))
  names(pos) <- names(seqtriml)
  mypos <- as.vector(unlist(pos))
  mypos[mypos==-1] <- 0
  mylength <- lapply(names(pos), function(x) attr(pos[[x]][[1]], "match.length"))
  mylength <- unlist(mylength)
  # fix for gregexpr version change between R 2.2.0 and 2.3.0
  mylength[mylength==-1] <- 0
  mylength <- mylength-1
  mylength[mylength==-1] <- 0
  # Number of matches per read (1 also for reads without insert, see the Pos==0 filter below)
  count <- lapply(names(pos), function(x) length(pos[[x]][[1]]))
  count <- as.vector(unlist(count))
  IDs <- rep(names(pos), count)
  # UniID = read ID plus running insert number within the read
  InsertFrame <- data.frame(ID=IDs, UniID=paste(IDs, sequence(count), sep="."),
    InsertCount=rep(count,count), Pos=mypos, Length=mylength)
  InsertFrame <- merge(InsertFrame, orientationDF, by.x="ID", by.y="ID", all.x=TRUE)
  
  # Position parsing of inserts using InsertFrame
  InsertFrame <- data.frame(InsertFrame, End=InsertFrame[,4]+InsertFrame[,5]-1)
  # Move the new End column directly behind the insert length (column 5)
  InsertFrame <- InsertFrame[,c(1:5, length(InsertFrame), 6:(length(InsertFrame)-1))]
  InsertFrame <- InsertFrame[!InsertFrame$Pos==0, ] # added May 8, 06 to remove no-insert sequences
  # One c(start, end) vector per insert, named by UniID
  mylist <- apply(InsertFrame[,c(4,6)], 1, list)
  mylist <- lapply(mylist, function(x) as.vector(as.matrix(unlist(x))))
  names(mylist) <- as.vector(InsertFrame$UniID)
  myindex <- as.vector(InsertFrame$ID)
  seqtriml_long <- seqtriml[myindex]; names(seqtriml_long) <- as.vector(InsertFrame$UniID) # creates long version of 'seqtriml' that contains entries with several inserts in duplicates
  quall_long <- qualslicel[myindex]; names(quall_long) <- as.vector(InsertFrame$UniID) # creates long version of 'quall' that contains entries with several inserts in duplicates
  insert_list <- lapply(names(mylist), function(x) seqtriml_long[[x]][(mylist[[x]][1]):(mylist[[x]][2])])
  insert_qual_list <- lapply(names(mylist), function(x) quall_long[[x]][(mylist[[x]][1]):(mylist[[x]][2])])
  names(insert_list) <- as.vector(InsertFrame$UniID)
  names(insert_qual_list) <- as.vector(InsertFrame$UniID)
  
  # Reverse and complement of antisense sequences
  insert_list_rev <- lapply(as.vector(InsertFrame[InsertFrame$Orient==-1,2]), function(x) rev(insert_list[[x]]))
  insert_list_rev <- lapply(insert_list_rev, paste, collapse="") 
  names(insert_list_rev) <- as.vector(InsertFrame[InsertFrame$Orient==-1,2])
  # Complement via the intermediate codes 1-4 so that already exchanged bases are not exchanged again
  insert_list_rev <- lapply(insert_list_rev, function(x) gsub("A", "1", x))
  insert_list_rev <- lapply(insert_list_rev, function(x) gsub("T", "2", x))
  insert_list_rev <- lapply(insert_list_rev, function(x) gsub("C", "3", x))
  insert_list_rev <- lapply(insert_list_rev, function(x) gsub("G", "4", x))
  insert_list_rev <- lapply(insert_list_rev, function(x) gsub("1", "T", x))
  insert_list_rev <- lapply(insert_list_rev, function(x) gsub("2", "A", x))
  insert_list_rev <- lapply(insert_list_rev, function(x) gsub("3", "G", x))
  insert_list_rev <- lapply(insert_list_rev, function(x) gsub("4", "C", x))
  # The quality values of antisense inserts are reversed accordingly
  insert_qual_list_rev <- lapply(as.vector(InsertFrame[InsertFrame$Orient==-1,2]), function(x) rev(insert_qual_list[[x]]))
  names(insert_qual_list_rev) <- as.vector(InsertFrame[InsertFrame$Orient==-1,2])  
  insert_qual_list_rev <- lapply(insert_qual_list_rev, paste, collapse=" ")
  
  # Combine everything in final sequence object 
  insert_list_for <- insert_list[as.vector(InsertFrame[InsertFrame$Orient==1,2])]
  insert_list_for <- lapply(insert_list_for, paste, collapse="")
  insert_list_final <- c(insert_list_rev, insert_list_for)
  # Restore the original insert order
  insert_list_final <- insert_list_final[names(insert_list)]
  insert_qual_list_for <- insert_qual_list[as.vector(InsertFrame[InsertFrame$Orient==1,2])]
  insert_qual_list_for <- lapply(insert_qual_list_for, paste, collapse=" ")
  insert_qual_list_final <- c(insert_qual_list_rev, insert_qual_list_for)
  insert_qual_list_final <- insert_qual_list_final[names(insert_list)]
  
  # Remove terminal Ns. Most of the code here is to adjust qual data set.
  tempDF <- data.frame(as.data.frame(unlist(insert_list_final)), as.data.frame(unlist(insert_qual_list_final)))
  names(tempDF)[1:2] <- c("Seq", "Phred")
  # Number of Ns removed at the start and at the end of every insert
  remove_start <- nchar(as.character(tempDF$Seq)) - nchar(gsub("^N{1,}", "", as.character(tempDF$Seq), perl=TRUE))
  remove_end <- nchar(as.character(tempDF$Seq)) - nchar(gsub("N{1,}$", "", as.character(tempDF$Seq), perl=TRUE))
  tempDF2 <- data.frame(row.names=row.names(tempDF), Seq=gsub("^N{1,}|N{1,}$", "", as.character(tempDF$Seq), perl=TRUE), tempDF[,2]) 
  qual_removel <- strsplit(as.character(tempDF2[,2]), " ")
  names(qual_removel) <- row.names(tempDF2)
  NremoveDF <- data.frame(remove_start, remove_end)
  Nremovel <- apply(NremoveDF, 1, list)
  Nremovel <- lapply(Nremovel, function(x) as.vector(as.matrix(unlist(x))))
  names(Nremovel) <-  row.names(tempDF2)  
  # Drop the same number of Phred values at both ends as Ns were removed
  # NOTE(review): for an insert that consists of Ns only the index range below runs backwards - confirm whether such inserts can occur
  qual_removel <- lapply(names(qual_removel), function(x) qual_removel[[x]][(1+Nremovel[[x]][1]):(length(as.vector(qual_removel[[x]]))-Nremovel[[x]][2])])
  qual_removel <- lapply(qual_removel, paste, collapse=" ")
  tempDF <- data.frame(row.names=row.names(tempDF2), Seq=tempDF2[,1], Phred=unlist(qual_removel))
  
  appendDF <- rbind(appendDF, tempDF)
  cat("\n", "End of 10K slice no: ", i, "\n")
 }
# (5) Create final result data frame
 # Build the final result table from the inserts collected in 'appendDF'.
 # Resulting columns: ID (insert ID taken from the row names), Seq, Phred
 # (space separated scores), InsertLength, mPhred (mean score) and Ncount
 # (number of Ns in the insert).
 finalDF <- appendDF
 if (nrow(finalDF) == 0) stop("No inserts were extracted; nothing to summarize", call. = FALSE)
 insert_seq <- as.character(finalDF[,1])
 # Number of Ns = length difference after deleting all Ns (vectorized, no slicing required)
 Ncount <- nchar(insert_seq) - nchar(gsub("N", "", insert_seq, fixed=TRUE))
 # Mean Phred score per insert
 mPhred <- vapply(strsplit(as.character(finalDF[,2]), " "), function(x) mean(as.numeric(x)), numeric(1))
 
 finalDF <- data.frame(finalDF,
   InsertLength=nchar(insert_seq),
   mPhred=mPhred,
   Ncount=Ncount
  )
 names(finalDF)[1:2] <- c("Seq", "Phred")
 finalDF <- data.frame(ID=rownames(finalDF), finalDF)

# (6) Filtering steps
 # Remove all sequences that have insert lengths of less than 15bp
 ### adjust
 finalDF <- finalDF[finalDF$InsertLength>=15,]
 # Remove inserts with >=20% Ns
 ### adjust
 finalDF <- finalDF[(finalDF$Ncount/finalDF$InsertLength)<=0.2,]
 
 # Adjust name extensions to filter results: the inserts that are left for a
 # read are renumbered 1..k. The numbers are assigned in sorted row name order
 # and the original row order is restored afterwards via the 'sort' column.
 namev <- gsub("\\..*$", "", as.character(rownames(finalDF)), perl=TRUE)
 namev <- paste(sort(namev), sequence(as.vector(table(sort(namev)))), sep=".")
 # seq_len() keeps this valid when no insert passes the filters
 finalDF <- data.frame(finalDF, sort=seq_len(nrow(finalDF)))
 finalDF <- data.frame(ID=sort(namev), finalDF[order(rownames(finalDF)),-1])
 finalDF <- finalDF[order(finalDF$sort),]
 
 # Export sequences and qual data to two fasta batch files
 if (nrow(finalDF) > 0) {
  acc <- paste(finalDF[,1], "InsertLength:", finalDF[,4], "mPhred:", round(finalDF[,5], 1), "Ncount", finalDF[,6], sep=" ")
 } else {
  # paste() would return one bogus header for an empty table
  acc <- character(0)
  cat("\n", "No inserts passed the filters; empty result files are written", "\n")
 }
 export_seq <- data.frame(Acc=acc, seq=as.character(finalDF[,2]), stringsAsFactors=FALSE)
 # interleave header and data lines: Acc1, seq1, Acc2, seq2, ...
 writeLines(as.vector(rbind(export_seq$Acc, export_seq$seq)), "final_seq.txt")
 export_qual <- data.frame(Acc=acc, seq=as.character(finalDF[,3]), stringsAsFactors=FALSE)
 writeLines(as.vector(rbind(export_qual$Acc, export_qual$seq)), "final_qual.txt")

# # Optional steps
# # Average Phred scores of inserts
# mylist <- strsplit(as.character(finalDF[,3]), " "); mylist <- lapply(mylist, as.numeric); mean(unlist(lapply(mylist, mean)))
genomics

Wednesday, August 30, 2006

R code for vector and phred score trimming

No comments: