c++ - 我的 KMP 算法有问题吗？

Question

我正在做一个项目来比较 2 个字符串搜索算法的时间复杂度。由于某些问题，我丢失了以前的代码，因此不得不重写大部分代码。然而，这一次，我的 KMP 算法似乎运行得慢了很多，而且无论我输入什么，我都无法真正让它比我的 Boyer-Moore 运行得更快。这是我的代码：

#include <fstream>
#include <iostream>
#include <algorithm>
#include <vector>
#include <chrono>
using namespace std;

//set number of characters for lookup table
# define number_chars 256

//type alias for clock
typedef chrono::steady_clock clocktime;



void boyer_moore(string text, string pattern, int textlength, int patlength) {

    //start timer
    clocktime::time_point start = clocktime::now();

    //create vector list of all the locations the pattern was found
    vector<int> indexes;

    //create array for lookup table
    int lookup[number_chars];

    //initially fill every index of lookup table with -1
    for (int i = 0; i < number_chars; i++) {
        lookup[i] = -1;
    }
    //for every character in pattern, set lookup table at index = integer value for character to be the index of the character within the pattern
    for (int i = 0; i < patlength; i++) {
        lookup[(int)pattern[i]] = i;
    }
    //shift variable for keeping track of location in text
    int shift = 0;


    while (shift <= (textlength - patlength)) {
        int j = patlength - 1;

        //works through pattern from end to start checking if characters in text match, stops when j<0 or character in text doesn't match character in pattern
        while (j >= 0 && pattern[j] == text[shift + j]) {
            j--;
        }
        //if pattern found
        if (j < 0) {

            //add location pattern was found to location list
            indexes.push_back(shift);

            //if have not reached end of text
            if (shift + patlength < textlength) {

                //skip over pattern just found so next character in text lines up with last character of pattern
                shift += patlength - lookup[text[shift + patlength]];
            }

            //if pattern occurs at end of text add 1 to shift -> next while loop will fail
            else {
                shift += 1;
            }
        }
        //if pattern not found
        else {

            //checks if current character in text is an ASCII character -> if not it would be negative
            //(was having some issues previously with non-ASCII characters setting my array to out of bounds so added this condition)
            if (lookup[text[shift + j]] > -1) {

                //shifts pattern so unmtaching character in text lines up with last character in pattern
                shift += max(1, j - lookup[text[shift + j]]);
            }

            //if non-ASCII character, then it is not in pattern being searched anyway -> skip to next character
            else {
                shift += 1;
            }
        }
    }
    //end timer
    clocktime::time_point end = clocktime::now();

    //calculate time taken
    auto time_taken = chrono::duration_cast<chrono::milliseconds>(end - start).count();

    //display index locations of pattern
    cout << "Index locations of '" << pattern << "':" << endl;
    for (int in : indexes) {
        cout << in << ", ";
    }

    //display time taken
    cout << endl << endl << endl << "Boyer-Moore: time taken: " << time_taken << "ms" << endl << endl << endl;
}



void kmp(string text, string pattern, int textlength, int patlength) {

    //start time
    clocktime::time_point start = clocktime::now();

    //create vector list of all the locations the pattern was found
    vector<int> indexes;

    //creates prefix table 
    //had to use vector as compiler was complaining about patlength not being a constant value, even when I set it it const int
    vector<int> table(patlength);

    //keeps track of length of previous longest prefix suffix
    int k = 0;


    //filling in prefix table
    for (int i = 1; i < patlength; i++) {
        while (k > 0 && pattern[k] != pattern[i]) {
            //u
            k = table[k-1];
        }
        if (pattern[k] == pattern[i]) {

            //update longest suffix
            k = k + 1;
        }
        table[i] = k;
    }
    int textindex = 0;
    int patindex = 0;
    while (textindex < textlength) {
        if (pattern[patindex] == text[textindex]) {
            textindex++;
            patindex++;
        }
        if (patindex == patlength) {
            indexes.push_back(textindex-patlength);
            patindex = table[patindex - 1];
        }
        else if (textindex < textlength && pattern[patindex] != text[textindex]){
            if (patindex != 0) {
                patindex = table[patindex - 1];
            }
            else {
                textindex++;
            }
        }
    }

    //end timer
    clocktime::time_point end = clocktime::now();
    auto time_taken = chrono::duration_cast<chrono::milliseconds>(end - start).count();
    cout << "Index locations of '" << pattern << "':" << endl;
    for (int in : indexes) {
        cout << in << ", ";
    }
    cout << endl << endl << endl << "KMP: time taken: " << time_taken << "ms" << endl << endl << endl;
}



//load file method borrowed from lab exercise
string load_file() {
    std::string directory = "";
    string text;
    for (int i = 0; i < 6; i++) {
        const string & filename = "dna.txt";
        //https://www.kaggle.com/ashishsinhaiitr/lord-of-the-rings-text/version/1#01%20-%20The%20Fellowship%20Of%20The%20Ring.txt
        ifstream f(directory + filename, std::ios_base::binary);
        if (!f.good()) {
            directory = "../" + directory;
            continue;
        }

        f.seekg(0, std::ios_base::end);
        const size_t length = f.tellg();

        vector<char> buf(length);
        f.seekg(0);
        f.read(buf.data(), length);
        text.assign(buf.begin(), buf.end());

        return text;
    }
}



void main() {

    //loads text file
    string text = load_file();
    bool end = false;
    while (end == false) {

        //gets pattern from user
        cout << "pattern: ";
        string pattern;
        cin >> pattern;
        int n = text.size();
        int m = pattern.size();

        //call both string search algorithms
        boyer_moore(text, pattern, n, m);
        kmp(text, pattern, n, m);

        //ask user if they want to end the program
        cout << "Close program? (1 = yes, 0 = no): ";
        bool valid = false;

        //checks if user provided valid response -> asks for another if not
        while (valid == false) {
            int userend;
            cin >> userend;
            if ((int)userend == 1) {
                valid = true;
                end = true;
            }
            else if ((int)userend == 0) {
                valid = true;
            }
            else {
                "Please enter a valid number: ";
            }
        }
    }
}

例如，在我之前的结果中，对于单词“the”，Boyer-Moore 大约需要 260 毫秒，而 KMP 大约需要 184 毫秒。现在，Boyer-Moore 当然仍然需要大约 260 毫秒，但是 KMP 现在需要大约 330 毫秒。任何指向正确方向的指针将不胜感激。谢谢你。

c++ - 我的 KMP 算法有问题吗？

0 回答 0

Related

Reference