c++ - 如何在大字符串上应用最长公共子序列算法？

Question

如何在更大的字符串（600000 个字符）上应用最长公共子序列（LCS）算法？有没有办法用动态规划（DP）做到这一点？我已经为较短的字符串实现了它。

#include <algorithm>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>
using namespace std;

// Memo table: dp[x][y] caches lcs(x, y); -1 marks "not computed yet".
int dp[1005][1005];
// The two NUL-terminated input strings being compared.
char a[1005], b[1005];

// Returns the length of the longest common subsequence of the suffixes
// a[x..] and b[y..], using top-down recursion with memoisation in `dp`.
//
// The caller must fill `dp` with -1 (e.g. memset(dp, -1, sizeof(dp)))
// before the first call for a new pair of strings, and normally starts
// the computation with lcs(0, 0).
//
// x: start index into `a`, expected to satisfy 0 <= x <= strlen(a).
// y: start index into `b`, expected to satisfy 0 <= y <= strlen(b).
int lcs(int x,int y)
{
    // Test the terminator directly instead of calling strlen(): strlen()
    // rescans the whole string on every one of the recursive calls.
    if(a[x]=='\0'||b[y]=='\0')
        return 0;
    if(dp[x][y]!=-1)
        return dp[x][y];
    if(a[x]==b[y])
        dp[x][y]=1+lcs(x+1,y+1);                        // matching heads extend the LCS
    else
        dp[x][y]=std::max(lcs(x+1,y),lcs(x,y+1));       // drop one head or the other
    return dp[x][y];
}

// Reads one line from stdin into buf, storing at most cap - 1 characters
// plus the terminating NUL, and removes the trailing newline.
// Returns false when no more input is available.
static bool read_line(char *buf, std::size_t cap)
{
    if(!std::fgets(buf, static_cast<int>(cap), stdin))
        return false;
    char *newline = std::strchr(buf, '\n');
    if(newline)
    {
        *newline = '\0';
    }
    else
    {
        // The line did not fit (or input ended without a newline): discard
        // the remainder so it is not mistaken for the next string.
        int c;
        while((c = std::getchar()) != '\n' && c != EOF) {}
    }
    return true;
}

// Reads pairs of lines from stdin and prints the LCS length of each pair.
int main()
{
    // gets() cannot bound its writes and was removed from the language in
    // C++14; read_line() never writes past the end of the buffers.
    while(read_line(a, sizeof(a)) && read_line(b, sizeof(b)))
    {
        std::memset(dp,-1,sizeof(dp));   // reset the memo table for the new pair
        int ret=lcs(0,0);
        std::printf("%d\n",ret);
    }
}

score 3 · Accepted Answer

您应该看看这篇讨论各种设计和实现注意事项的文章。文章指出，您可以参考 Hirschberg 算法，该算法基于编辑距离（或 Levenshtein 距离）求出两个字符串之间的最佳对齐方式，并且可以减少所需的空间量。

在该文章的底部，您会找到“节省空间的 LCS”，它以一种混合/伪代码的形式给出，其中 m 是 A 的长度，n 是 B 的长度：

// Returns the length of the longest common subsequence of the
// NUL-terminated strings A and B, keeping only two rows of the DP table.
//
// Neither string is modified. Working memory is two arrays of
// strlen(B) + 1 ints.
int lcs_length(char *A, char *B) {
  const std::size_t m = std::strlen(A);
  const std::size_t n = std::strlen(B);

  // X is the row being computed (index i), Y is the previously computed
  // row (index i + 1). Slot n of both rows is the "B is exhausted" case
  // and stays 0; the initial all-zero Y is the "A is exhausted" row.
  std::vector<int> X(n + 1, 0);
  std::vector<int> Y(n + 1, 0);

  for (std::size_t i = m; i-- > 0; ) {
    for (std::size_t j = n; j-- > 0; ) {
      if (A[i] == B[j]) {
        X[j] = 1 + Y[j + 1];
      }
      else {
        X[j] = std::max(Y[j], X[j + 1]);
      }
    }

    // Exchange the two rows in O(1) instead of copying X into Y; the
    // stale contents left in X are overwritten before they are read.
    X.swap(Y);
  }

  // After the final swap the most recently computed row lives in Y.
  return Y[0];
}

如果您有兴趣，这里还有一篇关于大字母表上字符串的 LCS 的论文。请在第 3.2 节中查找算法 Approx2LCS。

score 2 · Accepted Answer

首先，使用自下而上的动态规划方法：

// #includes and using namespace std;

// Maximum supported string length (excluding the terminating NUL).
const int SIZE = 1000;
// dp[x][y] holds the LCS length of the suffixes a[x..] and b[y..].
int dp[SIZE + 1][SIZE + 1];
// The two NUL-terminated input strings.
char a[SIZE + 1], b[SIZE + 1];

// Fills the full DP table iteratively, from the ends of both strings back
// to their starts, and returns the LCS length of `a` and `b`.
int lcs_bottomUp(){
    const int lenA = strlen(a);
    const int lenB = strlen(b);

    // Base row: once `a` is exhausted nothing more can be matched.
    int baseCol = 0;
    while(baseCol <= lenB){
        dp[lenA][baseCol] = 0;
        ++baseCol;
    }

    for(int row = lenA; row-- > 0; ){
        // Base column: once `b` is exhausted nothing more can be matched.
        dp[row][lenB] = 0;

        for(int col = lenB; col-- > 0; ){
            if(a[row] == b[col]){
                dp[row][col] = 1 + dp[row + 1][col + 1];
            }else{
                const int skipA = dp[row + 1][col];
                const int skipB = dp[row][col + 1];
                dp[row][col] = (skipA < skipB) ? skipB : skipA;
            }
        }
    }

    return dp[0][0];
}

// Reads one line from stdin into buf, storing at most cap - 1 characters
// plus the terminating NUL, and removes the trailing newline.
// Returns false when no more input is available.
static bool read_line(char *buf, std::size_t cap){
    if(!std::fgets(buf, static_cast<int>(cap), stdin))
        return false;
    char *newline = std::strchr(buf, '\n');
    if(newline){
        *newline = '\0';
    }else{
        // The line did not fit (or input ended without a newline): discard
        // the remainder so it is not mistaken for the next string.
        int c;
        while((c = std::getchar()) != '\n' && c != EOF) {}
    }
    return true;
}

// Reads pairs of lines from stdin and prints the LCS length of each pair.
int main(){
    // gets() cannot bound its writes and was removed from the language in
    // C++14; read_line() never writes past the end of the buffers.
    while(read_line(a, sizeof(a)) && read_line(b, sizeof(b))){
        std::printf("%d\n", lcs_bottomUp());
    }
}

请注意，您只需要保留 2 行（或列），一个用于dp[x]，另一个用于dp[x + 1]：

// #includes and using namespace std;

// Maximum supported string length (excluding the terminating NUL).
const int SIZE = 1000;
int dp_x[SIZE + 1]; // dp[x]
int dp_xp1[SIZE + 1]; // dp[x + 1]
// The two NUL-terminated input strings.
char a[SIZE + 1], b[SIZE + 1];

// Returns the LCS length of `a` and `b` using only two rows of the DP
// table. On return dp_x holds row 0 of the table (entries 0..strlen(b)).
//
// The two rows exchange roles through pointers rather than being copied
// with memcpy: copying sizeof(dp_x) bytes on every outer iteration costs
// SIZE + 1 ints per row no matter how short the inputs are.
int lcs_bottomUp_2row(){
    const int strlenA = static_cast<int>(std::strlen(a));
    const int strlenB = static_cast<int>(std::strlen(b));

    // The roles are swapped once per character of `a`, so pick the starting
    // buffer by parity: that way the last row written is always dp_x.
    int *cur = (strlenA % 2 == 0) ? dp_x : dp_xp1;   // row x (being written)
    int *below = (cur == dp_x) ? dp_xp1 : dp_x;      // row x + 1 (read only)

    for(int y = 0; y <= strlenB; y++)
        cur[y] = 0; // row x == strlenA: `a` is exhausted

    for(int x = strlenA - 1; x >= 0; x--){
        // x has been decreased: the row just finished becomes dp[x + 1]
        int *tmp = cur;
        cur = below;
        below = tmp;

        cur[strlenB] = 0; // `b` is exhausted
        for(int y = strlenB - 1; y >= 0 ; y--)
            cur[y] = (a[x]==b[y]) ? 1 + below[y+1] :
                    std::max(below[y], cur[y+1]);
    }
    return dp_x[0]; // cur == dp_x here by construction (x == 0)
}

// Reads one line from stdin into buf, storing at most cap - 1 characters
// plus the terminating NUL, and removes the trailing newline.
// Returns false when no more input is available.
static bool read_line(char *buf, std::size_t cap){
    if(!std::fgets(buf, static_cast<int>(cap), stdin))
        return false;
    char *newline = std::strchr(buf, '\n');
    if(newline){
        *newline = '\0';
    }else{
        // The line did not fit (or input ended without a newline): discard
        // the remainder so it is not mistaken for the next string.
        int c;
        while((c = std::getchar()) != '\n' && c != EOF) {}
    }
    return true;
}

// Reads pairs of lines from stdin and prints the LCS length of each pair.
int main(){
    // gets() cannot bound its writes and was removed from the language in
    // C++14; read_line() never writes past the end of the buffers.
    while(read_line(a, sizeof(a)) && read_line(b, sizeof(b))){
        std::printf("%d\n", lcs_bottomUp_2row());
    }
}

现在可以安全地将 SIZE 改为 600000。

score 1 · Accepted Answer

正如 OP 所说，其他答案花费了太多时间，主要是因为每次外层迭代都会复制一整列（600000 个元素）。为了改进这一点，可以只在逻辑上交换两列（交换指向它们的指针），而不是物理地复制列的内容。因此：

// Returns the length of the longest common subsequence of `a` and `b`
// using two columns of b.size() + 1 ints instead of the full DP table.
//
// The columns live in std::vector (heap storage): variable-length arrays
// are not standard C++, and two stack arrays of ~600000 ints would
// overflow typical stack limits. Bounds come from the string sizes rather
// than from a '\0' test, so embedded NUL characters are compared like any
// other character.
int spaceEfficientLCS(std::string a, std::string b){
  const std::size_t n = a.size();
  const std::size_t m = b.size();

  // next: column for index i + 1 (already computed);
  // cur:  column for index i (being computed).
  // Slot m of both columns is the "b is exhausted" case and stays 0; the
  // initial all-zero `next` is the "a is exhausted" column.
  std::vector<int> next(m + 1, 0);
  std::vector<int> cur(m + 1, 0);

  // Compute costs from the ends of both strings back to their starts
  for (std::size_t i = n; i-- > 0; ){
    for (std::size_t j = m; j-- > 0; ){
      cur[j] = (a[i] == b[j]) ? next[j+1] + 1 :
               std::max(next[j], cur[j+1]);
    }

    // Switch the logical columns in O(1); no element is copied
    cur.swap(next);
  }

  // After the final swap the most recently computed column is `next`
  return next[0];
}

c++ - 如何在大字符串上应用最长公共子序列算法？

3 回答 3

Related

Reference