我决定从 R 中的 Rosalind 实现这个问题。它适用于小样本数据集:
>Rosalind_56
ATTAGACCTG
>Rosalind_57
CCTGCCGGAA
>Rosalind_58
AGACCTGCCG
>Rosalind_59
GCCGGAATAC
但是,当我为真实数据集运行它时需要很长时间(50 个序列,长度约为 1000 个字符)。您能否看看我的代码并给出一些建议,我应该更换什么来提高速度?我只有五分钟的时间来发布答案......这是我写的一些解释。
library(Biostrings)
library(stringr)
fastaFile = readDNAStringSet("input.fasta")
seq_name = names(fastaFile)
sequence = paste(fastaFile)
df <- data.frame(seq_name, sequence) #read fasta and convert it to simple df
allsubstr <- function(x, n) unique(substring(x, 1:(nchar(x) - n + 1), n:nchar(x))) #helping function, creating all substring of given string
Shortest_substring_R <- function(df){
tmp_ss <- df[1,2] #first element will be modified to final result
tmp_df <- df #put everything to temporary df which will be also modified (number of elements)
while(nrow(tmp_df)>1){ #loop until we heave more than one sequence in our temp df
max_ov_all <- '' #it's better to clear it
max_ov <- ''
for(i in 2:nrow(tmp_df)){ #this loop finds the maximum overlap between result and ith sequence (result - shortest substring)
res <- unlist(sapply(1:nchar(tmp_df[i,2]), function(x) allsubstr(tmp_df[i,2],x))) #produce all substrings from ith sequence
ovs <- res[str_detect(tmp_ss, res)] #only substrings of ith sequence which overlaps with first (result) sequence
max_ov <- ovs[which.max(nchar(ovs))] #give me the longest common substring of first (result) and ith sequence
if(nchar(max_ov) > nchar(max_ov_all)){ #check which overlap is the longest (between first (result) and remaining sequences)
max_ov_all <- max_ov
ind <- i
}
}
if(str_detect(tmp_ss, tmp_df[ind,2])){ #now three cases, our substring (coming from ith sequence) is inside our result string
tmp_df <- tmp_df[-ind,] #so we do not have do anything, juest remove it from the df
}else{ #our first (result) sequence may start or end with longest overlap, so modify final result as needed
#and then of course remove ith sequence (giving longest overlap), because it has been added to aour result sequence
if(startsWith(tmp_ss, max_ov_all)){
tmp_ss <- paste(tmp_df[ind,2],sub(max_ov_all, "", tmp_ss), sep='')
tmp_df <- tmp_df[-ind,]
}
if(endsWith(tmp_ss, max_ov_all)){
tmp_ss <- paste(sub(max_ov_all, "", tmp_ss), tmp_df[ind,2], sep='')
tmp_df <- tmp_df[-ind,]
}
}
}
print(tmp_ss)
}
结果:
> Shortest_substring_R(df)
[1] "ATTAGACCTGCCGGAATAC"
我认为主要问题在于for()
循环,sapply()
以及非常有用的功能,例如str_detect()
您认为什么?
样本输入:
Input = ('seq_name sequence
1 Rosalind_56 ATTAGACCTG
2 Rosalind_57 CCTGCCGGAA
3 Rosalind_58 AGACCTGCCG
4 Rosalind_59 GCCGGAATAC')
small_df = as.data.frame(read.table(textConnection(Input), header = T, row.names = 1))
small_df