Wednesday, August 30, 2006

parsing sequence by positional co-ordinates using R

##############################################
# Sequence Parsing by Positional Coordinates #
##############################################
# Utility:
# Script parses sequences based on positional information provided
# by a coordinate matrix like the one below. The parsing is performed
# on vectorized sequences in a list container.
# Format of coordinate matrix
# ID start end
# ID001 20 60
# ID001 120 190
# ID002 20 60
# ID002 120 190
#
# To demo what the script does, run it like this:
# source("http://bioinfo.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/positionParsing.txt")

# Create a sample set of sequences (here random sequences)
fx <- function(test) {x <- as.integer(runif(200, min=1, max=5)) # sets length of each random sequence to '200'
x[x==1] <- "A"; x[x==2] <- "T"; x[x==3] <- "G"; x[x==4] <- "C"; paste(x, sep = "", collapse ="")
}
z1 <- c()
for(i in 1:5) { # sets number of generated sequences to '5'
z1 <- c(fx(i), z1)
}
SeqID <- c("ID001","ID002","ID003","ID004","ID005")
z1 <- data.frame(SeqID=SeqID, Seq=z1)
cat("Source sequences:", "\n")
print(z1)

# Write the sequence data frame into a list object.
# This is an important step for handling sets of vectorized sequences of variable length, since
# list objects are more versatile in handling them than data frames.
seq_list <- apply(as.matrix(z1[,2]), 1, list) # generates list with list components
seq_list <- lapply(seq_list, unlist) # transforms list components to character vectors
seq_list <- lapply(seq_list, strsplit, "") # vectorizes sequences by splitting them into separate vector fields
seq_list <- lapply(seq_list, unlist) # transforms list components to character vectors
names(seq_list) <- SeqID # names sequence entries, e.g. by their IDs

# Generate coordinate data frame with positions to be parsed from sequences.
pos_matrix <- data.frame(ID=sort(rep(SeqID, 2)), start=rep(c(20,120),5), end=rep(c(60,190),5))
cat("Position matrix:", "\n")
print(pos_matrix)

# Define sequence parse function
seq_parse_fct <- function(seq_list, pos_matrix) {
# Parse sequences in nested 'for loop'
# nesting required to parse more than one fragment per sequence ID
parse_list_all <- list(NULL) # create empty list as list container
if (length(setdiff(names(seq_list), as.vector(pos_matrix[,1]))) > 1) stop("The number of sequences and coordinate groups differ") # Check if all sequence IDs are contained in the sequence list and in the coordinate matrix
for(i in names(seq_list)) { # loop over each sequence ID
sub_pos_matrix <- pos_matrix[pos_matrix[,1]==i,]
parse_list <- list(NULL) # creates empty list of vectorslength container
for(j in 1:length(sub_pos_matrix[,1])) { # loop over all fragments of a sequence
index <- eval(parse(text=paste(sub_pos_matrix[j,2],":", sub_pos_matrix[j,3])))
parse_list <- c(parse_list, list(seq_list[[i]][index]))
if (length(parse_list[[1]][[1]])==0) { parse_list <- parse_list[-1] } # removes in first loop first (empty) component
}
parse_list_all <- c(parse_list_all, list(parse_list))
if (length(parse_list_all[[1]][[1]])==0) { parse_list_all <- parse_list_all[-1] } # removes in first loop first (empty) component
}
names(parse_list_all) <- SeqID # names sequence entries, e.g. by their IDs
parse_list_all
}
parse_list_all <- seq_parse_fct(seq_list, pos_matrix)
cat("Vectorized sequence fragments are managed in the list object 'parse_list_all'.", "\n", "Print first fragment set as sample:", "\n")
print(parse_list_all[[1]])

# Collapse sequence vectors into strings
parse_seq_list <- lapply(parse_list_all, unlist) # joins sequence vectors in each list
parse_seq_list <- lapply(parse_seq_list, paste, collapse="") # collapses vectorised sequences into strings
cat("Concatenated sequence fragments:", "\n")
parsed_result <- data.frame(SeqID, Parsed_Seq=unlist(parse_seq_list))
print(parsed_result)

No comments: