##############################################

# Sequence Parsing by Positional Coordinates #

##############################################

# Utility:

# Script parses sequences based on positional information provided

# by a coordinate matrix like the one below. The parsing is performed

# on vectorized sequences in a list container.

# Format of coordinate matrix

# ID start end

# ID001 20 60

# ID001 120 190

# ID002 20 60

# ID002 120 190

#

# To demo what the script does, run it like this:

# source("http://bioinfo.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/positionParsing.txt")

# Create a sample set of sequences (here random sequences)

fx <- function(test) {x <- as.integer(runif(200, min=1, max=5)) # sets length of each random sequence to '200'

x[x==1] <- "A"; x[x==2] <- "T"; x[x==3] <- "G"; x[x==4] <- "C"; paste(x, sep = "", collapse ="")

}

z1 <- c()

for(i in 1:5) { # sets number of generated sequences to '5'

z1 <- c(fx(i), z1)

}

SeqID <- c("ID001","ID002","ID003","ID004","ID005")

z1 <- data.frame(SeqID=SeqID, Seq=z1)

cat("Source sequences:", "\n")

print(z1)

# Write the sequence data frame into a list object.

# This is an important step for handling sets of vectorized sequences of variable length, since

# list objects are more versatile in handling them than data frames.

seq_list <- apply(as.matrix(z1[,2]), 1, list) # generates list with list components

seq_list <- lapply(seq_list, unlist) # transforms list components to character vectors

seq_list <- lapply(seq_list, strsplit, "") # vectorizes sequences by splitting them into separate vector fields

seq_list <- lapply(seq_list, unlist) # transforms list components to character vectors

names(seq_list) <- SeqID # names sequence entries, e.g. by their IDs

# Generate coordinate data frame with positions to be parsed from sequences.

pos_matrix <- data.frame(ID=sort(rep(SeqID, 2)), start=rep(c(20,120),5), end=rep(c(60,190),5))

cat("Position matrix:", "\n")

print(pos_matrix)

# Define sequence parse function

seq_parse_fct <- function(seq_list, pos_matrix) {

# Parse sequences in nested 'for loop'

# nesting required to parse more than one fragment per sequence ID

parse_list_all <- list(NULL) # create empty list as list container

if (length(setdiff(names(seq_list), as.vector(pos_matrix[,1]))) > 1) stop("The number of sequences and coordinate groups differ") # Check if all sequence IDs are contained in the sequence list and in the coordinate matrix

for(i in names(seq_list)) { # loop over each sequence ID

sub_pos_matrix <- pos_matrix[pos_matrix[,1]==i,]

parse_list <- list(NULL) # creates empty list of vectorslength container

for(j in 1:length(sub_pos_matrix[,1])) { # loop over all fragments of a sequence

index <- eval(parse(text=paste(sub_pos_matrix[j,2],":", sub_pos_matrix[j,3])))

parse_list <- c(parse_list, list(seq_list[[i]][index]))

if (length(parse_list[[1]][[1]])==0) { parse_list <- parse_list[-1] } # removes in first loop first (empty) component

}

parse_list_all <- c(parse_list_all, list(parse_list))

if (length(parse_list_all[[1]][[1]])==0) { parse_list_all <- parse_list_all[-1] } # removes in first loop first (empty) component

}

names(parse_list_all) <- SeqID # names sequence entries, e.g. by their IDs

parse_list_all

}

parse_list_all <- seq_parse_fct(seq_list, pos_matrix)

cat("Vectorized sequence fragments are managed in the list object 'parse_list_all'.", "\n", "Print first fragment set as sample:", "\n")

print(parse_list_all[[1]])

# Collapse sequence vectors into strings

parse_seq_list <- lapply(parse_list_all, unlist) # joins sequence vectors in each list

parse_seq_list <- lapply(parse_seq_list, paste, collapse="") # collapses vectorised sequences into strings

cat("Concatenated sequence fragments:", "\n")

parsed_result <- data.frame(SeqID, Parsed_Seq=unlist(parse_seq_list))

print(parsed_result)

## Wednesday, August 30, 2006

### parsing sequence by positional co-ordinates using R

Subscribe to:
Post Comments (Atom)

## No comments:

Post a Comment