c++ - c ++中的Rabin-Karp算法

Question

我试图了解 Rabin-Karp 算法的实现。d 是输入字母表中的字符数，但如果我替换 0 或任何其他值而不是 20，它不会影响任何内容。为什么会这样？

    // Rabin-Karp algorithm in C++
#include <string.h>
#include <iostream>
using namespace std;
#define d 20

void rabinKarp(char pattern[], char text[], int q) {
    int m = strlen(pattern);
    int n = strlen(text);
    int i, j;
    int p = 0;
    int t = 0;
    int h = 1;

    for (i = 0; i < m - 1; i++)
        h = (h * d) % q;

    // Calculate hash value for pattern and text
    for (i = 0; i < m; i++) {
        p = (d * p + pattern[i]) % q;
        t = (d * t + text[i]) % q;
    }

    // Find the match
    for (i = 0; i <= n - m; i++) {
        if (p == t) {
            for (j = 0; j < m; j++) {
                if (text[i + j] != pattern[j])
                    break;
            }

            if (j == m)
                cout << "Pattern is found at position: " << i + 1 << endl;
        }

        if (i < n - m) {
            t = (d * (t - text[i] * h) + text[i + m]) % q;

            if (t < 0)
                t = (t + q);
        }
    }
}

int main() {

   // char text[] = "ABCCDXAEFGX";
    char text[] = "QWERTYUIOPASDFGHJKLXQWERTYUIOPASDFGHJKLX";
    char pattern[] = "KLXQW";
    int q = 13;
    rabinKarp(pattern, text, q);
}

score 0 · Accepted Answer

我相信简短的回答是越低d的哈希冲突就越多，但是无论如何你都要验证匹配，所以它不会影响任何事情。

更详细一点：

首先让我修改您的代码，使其具有更具表现力的变量：

// Rabin-Karp algorithm in C++
#include <string.h>
#include <iostream>
using namespace std;
#define HASH_BASE 0

void rabinKarp(char pattern[], char text[], int inputBase) {
    int patternLen = strlen(pattern);
    int textLen = strlen(text);
    int i, j; //predefined iterators
    int patternHash = 0;
    int textHash = 0;
    int patternLenOut = 1;

    for (i = 0; i < patternLen - 1; i++)
        patternLenOut = (patternLenOut * HASH_BASE) % inputBase; // hash of pattern len

    // Calculate hash value for pattern and text
    for (i = 0; i < patternLen; i++) {
        patternHash = (HASH_BASE * patternHash + pattern[i]) % inputBase;
        textHash = (HASH_BASE * textHash + text[i]) % inputBase;
    }

    // Find the match
    for (i = 0; i <= textLen - patternLen; i++) {
        if (patternHash == textHash) {
            for (j = 0; j < patternLen; j++) {
                if (text[i + j] != pattern[j])
                    break;
            }

            if (j == patternLen)
                cout << "Pattern is found at position: " << i + 1 << endl;
        }

        if (i < textLen - patternLen) {
            textHash = (HASH_BASE * (textHash - text[i] * patternLenOut) + text[i + patternLen]) % inputBase;
            
            if (textHash < 0)
                textHash = (textHash + inputBase);
        }
    }
}

int main() {

   // char text[] = "ABCCDXAEFGX";
    char text[] = "QWEEERTYUIOPASDFGHJKLXQWERTYUIOPASDFGHJKLX";
    char pattern[] = "EE";
    int q = 13;
    rabinKarp(pattern, text, q);
}

攻击它的最简单方法是将HASH_BASE（以前d）设置为零，然后看看我们可以在哪里简化。然后可以将 rabinKarp 函数简化为：

void rabinKarp(char pattern[], char text[], int inputBase) {
    int patternLen = strlen(pattern);
    int textLen = strlen(text);
    int i, j; //predefined iterators
    int patternHash = 0;
    int textHash = 0;
    int patternLenOut = 0;

    // Calculate hash value for pattern and text
    for (i = 0; i < patternLen; i++) {
        patternHash = (pattern[i]) % inputBase;
        textHash = (text[i]) % inputBase;
    }

    // Find the match
    for (i = 0; i <= textLen - patternLen; i++) {
        if (patternHash == textHash) {
            for (j = 0; j < patternLen; j++) {
                if (text[i + j] != pattern[j])
                    break;
            }

            if (j == patternLen)
                cout << "Pattern is found at position: " << i + 1 << endl;
        }

        if (i < textLen - patternLen) {
            textHash = (text[i + patternLen]) % inputBase;
            
            if (textHash < 0)
                textHash = (textHash + inputBase);
        }
    }
}

现在你会注意到所有的哈希变成了字母 mod 某个数字的总和（在你的情况下是 13，在我的情况下是 2）。这是一个糟糕的哈希值，这意味着许多事物的总和将是相同的数字。但是，在这部分代码中：

if (patternHash == textHash) {
    for (j = 0; j < patternLen; j++) {
        if (text[i + j] != pattern[j])
              break;
    }
    
    if (j == patternLen)
        cout << "Pattern is found at position: " << i + 1 << endl;
}

如果散列匹配，您可以逐个字母显式检查匹配。您的哈希函数越差，误报的频率就越高（这意味着您的函数运行时间越长）。还有更多细节，但我相信这可以直接回答您的问题。可能有趣的是记录误报并查看误报率如何增加d和q减少。

c++ - c ++中的Rabin-Karp算法

1 回答 1

Related

Reference