找到两个字符串长度的所有公共子序列的好方法是什么?k
例子:
s1
=AAGACC
s2
=AGATAACCAGGAGCTGC
长度为 5 的所有常见子序列:AAGAC
AAACC
AGACC
AAGCC
找到两个字符串长度的所有公共子序列的好方法是什么?k
例子:
s1
=AAGACC
s2
=AGATAACCAGGAGCTGC
长度为 5 的所有常见子序列:AAGAC
AAACC
AGACC
AAGCC
一种相对直接的方法是从 LCS 矩阵重构序列。这是一个 O(n^2 * k + x * n) 算法,其中x是输出的大小(即长度为k的公共子序列的数量)。它在 C++ 中,但它应该很容易转换为 C:
const int N = 100;
int lcs[N][N];
set<tuple<string,int,int,int>> vis;
string s1 = "AAGACC";
string s2 = "AGATAACCAGGAGCTGC";
void reconstruct(const string& res, int i, int j, int k) {
tuple<string,int,int,int> st(res, i, j, k);
if (vis.count(st))
return;
vis.insert(st);
if (lcs[i][j] < k) return;
if (i == 0 && j == 0 && k == 0) {
cout << res << endl;
return;
}
if (i > 0)
reconstruct(res, i-1, j, k);
if (j > 0)
reconstruct(res, i, j-1, k);
if (i>0 && j>0 && s1[i-1] == s2[j-1])
reconstruct(string(1,s1[i-1]) + res, i-1, j-1, k-1);
}
int main() {
lcs[0][0] = 0;
for (int i = 0; i <= s1.size(); ++i)
lcs[i][0] = 0;
for (int j = 0; j <= s1.size(); ++j)
lcs[0][j] = 0;
for (int i = 0; i <= s1.size(); ++i) {
for (int j = 0; j <= s2.size(); ++j) {
if (i > 0)
lcs[i][j] = max(lcs[i][j], lcs[i-1][j]);
if (j > 0)
lcs[i][j] = max(lcs[i][j], lcs[i][j-1]);
if (i > 0 && j > 0 && s1[i-1] == s2[j-1])
lcs[i][j] = max(lcs[i][j], lcs[i-1][j-1] + 1);
}
}
reconstruct("", s1.size(), s2.size(), 5);
}
还应该有一个 O(n * (k + x)) 的方法来解决这个问题,基于稍微不同的 DP 方法:让f(i, k)是最小索引j使得lcs(i, j) >= ķ。我们有复发
f(i, 0) = 0 for all i
f(i, k) = min{f(i-1, k),
minimum j > f(i-1, k-1) such that s2[j] = s1[i]}
我们还可以从矩阵f重构长度为k的序列。
从给定长度的所有子序列中创建一个trie ,然后检查每个长度序列是否在 trie 中。k
s1
s2
k
这是一个使用递归的算法版本,堆栈大小k
并包括两个优化来跳过已经看到的字符和跳过不存在的子序列。字符串不是唯一的(可能有重复),因此通过uniq
.
#include <stdio.h>
#include <string.h>
/* s1 is the full first string, s2 is the suffix of the second string
* (starting after the subsequence at depth r),
* pos is the positions of chars in s1, r is the recursion depth,
* and k is the length of subsequences that we are trying to match
*/
void recur(char *s1, char *s2, size_t pos[], size_t r, size_t k)
{
char seen[256] = {0}; /* have we seen a character in s1 before? */
size_t p0 = (r == 0) ? 0 : pos[r-1] + 1; /* start at 0 or pos[r-1]+1 */
size_t p1 = strlen(s1) - k + r; /* stop at end of string - k + r */
size_t p;
if (r == k) /* we made it, print the matching string */
{
for (p=0; p<k; p++)
putchar(s1[pos[p]]);
putchar('\n');
return;
}
for (p=p0; p<=p1; p++) /* at this pos[], loop through chars to end of string */
{
char c = s1[p]; /* get the next char in s1 */
if (seen[c])
continue; /* don't go any further if we've seen this char already */
seen[c] = 1;
pos[r] = p;
char *s = strchr(s2, c); /* is this char in s2 */
if (s != NULL)
recur(s1, s+1, pos, r+1, k); /* recursively proceed with next char */
}
}
int main()
{
char *s1 = "AAGACC";
char *s2 = "AGATAACCAGGAGCTGC";
size_t k = 5;
size_t pos[k];
if (strlen(s1) < k || strlen(s2) < k) /* make sure we have at least k chars in each string */
return 1; /* exit with error */
recur(s1, s2, pos, 0, k);
return 0;
}
输出是:
AAGAC
AAGCC
AAACC
AGACC