我将您的数据加载为对象 `aa`。
# Working copy of `aa` that keeps only the sequences and their modifications.
mydata <- data.frame(seqs = aa$Sequence, mods = aa$modifications)

## Count the "K"s in each sequence
# as.character() because data.frame() may have stored the strings as factors.
spl_seqs <- strsplit(as.character(mydata$seqs), split = "")
# Positions of the "K"s within each sequence.
where_K <- lapply(spl_seqs, grep, pattern = "K")
# lengths() returns a plain integer vector, so No_Ks becomes an ordinary
# column instead of a list column (list columns cannot be sorted, tabulated
# or written out with write.csv()).
No_K <- lengths(where_K)
mydata$No_Ks <- No_K
##
我猜测“modifications”列中出现的所有大写字母,要么是修饰名称的首字母,要么是被修饰的残基“K”。我想不出其他办法来简化该列以便处理,所以我只保留不是“K”的大写字母:
# Give every upper-case letter its own name so that LETTERS can be indexed by
# a character vector: names that are not upper-case letters come back as NA.
# At top level this creates a named copy of LETTERS in the global environment.
names(LETTERS) <- LETTERS # DWin's idea in this http://stackoverflow.com/questions/4423460/is-there-a-function-to-find-all-lower-case-letters-in-a-character-vector
# One character vector per row, holding the single characters of that row's
# modification string.
spl_mods <- strsplit(as.character(mydata$mods), split = "") # split the characters in each modification row
简化“modifications”列,仅保留每个修饰名称的首字母:
# Simplified modifications, one character vector per row: every upper-case
# letter of the modification string except "K" and "M" (which are taken to be
# the modified residues rather than modification names).
# Reuses the already split `spl_mods` instead of re-splitting the whole column
# for every row, and also works when `mydata` has no rows.
mods_ls <- lapply(spl_mods, function(chars) {
  capitals <- chars[chars %in% LETTERS]
  capitals[!capitals %in% c("K", "M")]
})
# Collapse each row's letters into a single " ; "-separated label.
mydata$simplified_mods <- vapply(mods_ls, paste, character(1), collapse = " ; ")
到目前为止,我们得到了什么:
# Preview the first rows; head() does not pad the result with all-NA rows
# when `mydata` has fewer than 10 rows.
head(mydata, 10)
# seqs mods No_Ks simplified_mods
#1 AAAAGAAAVANQGKK [14] Acetyl (K)|[15] Acetyl (K) 2 A ; A
#2 AAAAGAAAVANQGKK [14] Acetyl (K)|[15] Acetyl (K) 2 A ; A
#3 AAFTKLDQVWGSE [5] Acetyl (K) 1 A
#4 AAIKFIKFINPKINDGE [4] Acetyl (K)|[7] Acetyl (K)|[12] Acetyl (K) 3 A ; A ; A
#5 AAIKFIKFINPKINDGE [4] Acetyl (K)|[7] Acetyl (K)|[12] Acetyl (K) 3 A ; A ; A
#6 AAIKFIKFINPKINDGE [7] Acetyl (K)|[12] Acetyl (K) 3 A ; A
#7 AAIKFIKFINPKINDGE [4] Acetyl (K)|[7] Acetyl (K) 3 A ; A
#8 AAIYKLLKSHFRNE [5] Biotin (K)|[8] Acetyl (K) 2 B ; A
#9 AAKKFEE [3] Acetyl (K)|[4] Acetyl (K) 2 A ; A
#10 AAKYFRE [3] Acetyl (K) 1 A
然后,您可以按“K”的数量和所需的特定修饰对数据取子集。例如:
# Selection criteria: the number of "K"s in the sequence and the simplified
# modification label (letters separated by [space];[space]).
how_many_K <- 2
what_mods <- "A ; A"
# One logical mask per criterion; which() keeps the rows where both hold.
k_matches <- mydata$No_Ks == how_many_K
mod_matches <- mydata$simplified_mods == what_mods
show_rows <- which(k_matches & mod_matches)
mydata[show_rows, ]
# seqs mods No_Ks simplified_mods
#1 AAAAGAAAVANQGKK [14] Acetyl (K)|[15] Acetyl (K) 2 A ; A
#2 AAAAGAAAVANQGKK [14] Acetyl (K)|[15] Acetyl (K) 2 A ; A
#9 AAKKFEE [3] Acetyl (K)|[4] Acetyl (K) 2 A ; A
#11 AANVKKTLVE [5] Acetyl (K)|[6] Acetyl (K) 2 A ; A
#14 AARDSKSPIILQTSNGGAAYFAGKGISNE [6] Acetyl (K)|[24] Acetyl (K) 2 A ; A
#20 AEKLKAE [3] Acetyl (K)|[5] Acetyl (K) 2 A ; A
#21
#....
编辑:以上所有步骤都可以封装进一个函数,例如 `fun`。`x` 是您的 data.frame(即以 `structure(...)` 形式上传的“for Henrik”数据)。`noK` 是您想要的“K”的数量。`mod` 是您想要的修饰,以 [space];[space] 分隔(例如 "B ; A ; O"):
#' Subset peptides by lysine count and, optionally, by their modifications
#'
#' Builds a working table from `x` holding the sequence (`seqs`), the raw
#' modification string (`mods`), the number of "K"s in the sequence (`No_Ks`),
#' the number of "K"s found in the modification string (`no_modK`) and a
#' simplified modification label (`simplified_mods`), then returns the rows
#' matching the requested criteria.
#'
#' The simplified label consists of every upper-case letter (A-Z) of the
#' modification string except "K" and "M", joined with " ; ". "K" and "M" are
#' dropped because they are taken to be the modified residues rather than
#' modification names.
#'
#' @param x Object with `Sequence` and `modifications` elements of equal
#'   length (typically a data.frame).
#' @param noK Number of "K"s the sequence must contain.
#' @param no_modK Optional number of "K"s that must appear in the modification
#'   string; `NULL` (default) disables this filter.
#' @param mod Optional simplified modification label, letters separated by
#'   " ; " (e.g. "B ; A"); `NULL` (default) disables this filter.
#' @return A data.frame with columns `seqs`, `mods`, `No_Ks` (integer),
#'   `no_modK` (integer) and `simplified_mods` (character), restricted to the
#'   matching rows. Zero rows are returned when nothing matches or `x` is empty.
fun <- function(x, noK, no_modK = NULL, mod = NULL) {
  mydata <- data.frame(seqs = x$Sequence, mods = x$modifications)

  # as.character() because data.frame() may have stored the strings as factors.
  # Both columns are split exactly once, up front.
  seq_chars <- strsplit(as.character(mydata$seqs), split = "")
  mod_chars <- strsplit(as.character(mydata$mods), split = "")

  # Plain integer column (not a list column) so it can be sorted, tabulated
  # and written to disk; NA sequences count as zero "K"s.
  mydata$No_Ks <- vapply(
    seq_chars,
    function(residues) sum(residues == "K", na.rm = TRUE),
    integer(1)
  )

  # Upper-case letters of each modification string, in order of appearance.
  capitals <- lapply(mod_chars, function(chars) chars[chars %in% LETTERS])

  # Every "K" among them stands for one modified lysine.
  mydata$no_modK <- vapply(
    capitals,
    function(caps) sum(caps == "K"),
    integer(1)
  )
  mydata$simplified_mods <- vapply(
    capitals,
    function(caps) paste(caps[!caps %in% c("K", "M")], collapse = " ; "),
    character(1)
  )

  # Build the row mask incrementally: the optional filters only narrow it.
  keep <- mydata$No_Ks == noK
  if (!is.null(no_modK)) {
    keep <- keep & mydata$no_modK == no_modK
  }
  if (!is.null(mod)) {
    keep <- keep & mydata$simplified_mods == mod
  }

  # which() drops NA comparisons instead of producing all-NA rows.
  mydata[which(keep), ]
}
例如:
# Usage examples. `aa` is the "for Henrik" data loaded in R
# (aa <- structure( ... )). The printed output is kept as comments so the
# examples can be sourced as they are.
fun(aa, noK = 3)
# seqs mods No_Ks no_modK simplified_mods
# 4 AAIKFIKFINPKINDGE [4] Acetyl (K)|[7] Acetyl (K)|[12] Acetyl (K) 3 3 A ; A ; A
# 5 AAIKFIKFINPKINDGE [4] Acetyl (K)|[7] Acetyl (K)|[12] Acetyl (K) 3 3 A ; A ; A
# 6 AAIKFIKFINPKINDGE [7] Acetyl (K)|[12] Acetyl (K) 3 2 A ; A
#...
fun(aa, noK = 3, no_modK = 2)
# seqs mods No_Ks no_modK simplified_mods
# 6 AAIKFIKFINPKINDGE [7] Acetyl (K)|[12] Acetyl (K) 3 2 A ; A
# 7 AAIKFIKFINPKINDGE [4] Acetyl (K)|[7] Acetyl (K) 3 2 A ; A
#...
fun(aa, noK = 2, mod = "A ; B")
# seqs mods No_Ks no_modK simplified_mods
# 200 ISAMVLTKMKE [8] Acetyl (K)|[10] Biotin (K) 2 2 A ; B
# 441 NLKPSKPSYYLDPE [3] Acetyl (K)|[6] Biotin (K) 2 2 A ; B
#...
fun(aa, noK = 2, no_modK = 1, mod = "A")
# seqs mods No_Ks no_modK simplified_mods
# 15 AARDSKSPIILQTSNGGAAYFAGKGISNE [24] Acetyl (K) 2 1 A
# 27 AKALVAQGVKFIAE [2] Acetyl (K) 2 1 A
#...
EDIT_1:更新了 `fun` 和示例。