c++ - 有没有更好的方法来搜索文件中的字符串？

Question

我需要在（非文本）文件中搜索字节序列“9µ}Æ”（或“\x39\xb5\x7d\xc6”）。

在网上搜索了 5 个小时后，这是我能做的最好的。它有效，但我想知道是否有更好的方法：

// One-byte read buffer: every comparison below inspects a single byte.
char buffer;

// Stream position; refreshed from tellg() after every read below.
int pos=in.tellg();

// search file for string
// NOTE(review): eof() only becomes true after a read has already failed, so the
// loop body runs once more with a stale `buffer` at the end of the file.
while(!in.eof()){
    in.read(&buffer, 1);
    pos=in.tellg();
    // Each nesting level reads the next byte and compares it with the next byte
    // of the wanted sequence (0x39 0xb5 0x7d 0xc6).
    if(buffer=='9'){
        in.read(&buffer, 1);
        pos=in.tellg();
        // NOTE(review): 'µ' is a single byte only if this source file is saved in a
        // single-byte encoding such as Latin-1 -- confirm; '\xb5' would be unambiguous.
        if(buffer=='µ'){
            in.read(&buffer, 1);
            pos=in.tellg();
            if(buffer=='}'){
                in.read(&buffer, 1);
                pos=in.tellg();
                // NOTE(review): same encoding caveat as above; '\xc6' would be unambiguous.
                if(buffer=='Æ'){
                    cout << "found";
                }
            }
        }
    }

    // `pos` was taken right after the most recent read, so this seeks to where the
    // stream already is. A byte consumed by a failed inner comparison is therefore
    // not re-examined as a possible start of the sequence (e.g. "99µ}Æ").
    in.seekg((streampos) pos);

笔记：

我不能用getline()。它不是文本文件，因此可能没有很多换行符。
我尝试过使用多字符缓冲区，把缓冲区复制到 C++ 字符串，然后使用 string::find()。这不起作用，因为整个文件中有很多 '\0' 字符，缓冲区中的序列在复制到字符串时会在第一个 '\0' 处被截断。

score 5 · Accepted Answer

类似于 bames53 发布的内容；我使用向量作为缓冲区：

std::ifstream ifs("file.bin");

ifs.seekg(0, std::ios::end);
std::streamsize f_size = ifs.tellg();
ifs.seekg(0, std::ios::beg);

std::vector<unsigned char> buffer(f_size);
ifs.read(buffer.data(), f_size);

std::vector<unsigned char> seq = {0x39, 0xb5, 0x7d, 0xc6};

bool found = std::search(buffer.begin(), buffer.end(), seq.begin(), seq.end()) != buffer.end();

score 0 · Accepted Answer

因为您说由于字符串中的空终止符字符而无法搜索整个文件，所以这里有一个替代方法，它读取整个文件并使用递归来查找整个文件中第一次出现的字符串。

    #include <iostream>
    #include <fstream>
    #include <string>

    using namespace std;

    // Reads the entire file named `fileName` into a string, byte for byte.
    //
    // The file is opened in binary mode so that '\0' bytes and line-ending bytes
    // are preserved exactly. If the file cannot be opened, an error message is
    // written to cerr and an empty string is returned (constructing a std::string
    // from a null pointer, as `return NULL` would, is undefined behaviour).
    //
    // `fileName` is a const pointer so that string literals can be passed.
    std::string readFile (const char *fileName) {
      std::ifstream fi (fileName, std::ios::in | std::ios::binary);
      if (!fi) {
        std::cerr << "ERROR: Cannot open file" << std::endl;
        return std::string ();
      }
      // The extra parentheses keep this from being parsed as a function declaration.
      return std::string ((std::istreambuf_iterator<char>(fi)), std::istreambuf_iterator<char>());
    }

    // Returns true when the bytes needle[needle_pos .. needle_len) match the
    // haystack starting at index haystack_pos.
    //
    // Recurses one byte at a time. The haystack is taken by const reference so the
    // recursion does not copy the whole file contents at every step, and the
    // position is bounds-checked so a partial match at the end of the haystack
    // cannot read past it.
    bool findFirstOccurrenceOf_r (const std::string &haystack, const char *needle, int haystack_pos, int needle_pos, int needle_len) {
      if (needle_pos >= needle_len)
        return true;   // every needle byte has been matched
      if (haystack_pos < 0 || static_cast<std::string::size_type>(haystack_pos) >= haystack.length())
        return false;  // ran off the haystack before the needle was exhausted
      if (haystack[haystack_pos] != needle[needle_pos])
        return false;
      return findFirstOccurrenceOf_r (haystack, needle, haystack_pos+1, needle_pos+1, needle_len);
    }

    // Returns the index of the first occurrence of the `length` bytes at `needle`
    // inside `haystack`, or -1 if there is none (or if `length` is negative).
    // An empty needle (length == 0) matches at index 0.
    //
    // The needle is described by pointer + length rather than as a C string, so
    // it may contain '\0' bytes.
    int findFirstOccurrenceOf (const std::string &haystack, const char *needle, int length) {
      // Guard first: haystack.length() is unsigned, so subtracting a larger
      // `length` from it would wrap around to a huge value.
      if (length < 0 || static_cast<std::string::size_type>(length) > haystack.length())
        return -1;
      // Last index at which a full-length match can still start (inclusive), so
      // a match that ends exactly at the end of the haystack is found.
      const int last_start = static_cast<int>(haystack.length()) - length;
      for (int i = 0; i <= last_start; i++) {
        if (findFirstOccurrenceOf_r (haystack, needle, i, 0, length))
          return i;
      }
      return -1;
    }

    // Loads the file "input" and prints the offset of the first occurrence of
    // the byte sequence 39 B5 7D C6, or -1 when it is not present.
    int main () {
      // Character escapes instead of integer constants: 0xB5 and 0xC6 do not fit
      // in a signed char, so writing them as ints inside a braced initializer is
      // a narrowing conversion that compilers reject.
      char str_to_find[4] = {'\x39', '\xB5', '\x7D', '\xC6'};

      // Held in a modifiable array so it can be passed to a char* parameter;
      // a string literal does not convert to non-const char* in modern C++.
      char file_name[] = "input";
      string contents = readFile (file_name);

      int pos = findFirstOccurrenceOf (contents, str_to_find, 4);

      cout << pos << endl;
    }

如果文件不是太大，您最好的解决方案是将整个文件加载到内存中，这样您就不需要反复从磁盘读取。如果文件太大而无法一次加载，您可以每次只加载文件的一个块。但是，如果您确实分块加载，请务必检查块的边界：您要搜索的字符串可能恰好被块的边界从中间分开。

score 0 · Accepted Answer

该程序将整个文件加载到内存中，然后对其使用 std::search。

// Loads all of "file.dat" into memory and reports whether the byte sequence
// 39 b5 7d c6 occurs in it. Returns 1 if the file cannot be opened.
int main() {
    std::string filedata;
    {
        // Binary mode: text mode may translate line endings, which would alter
        // the bytes being searched.
        std::ifstream fin("file.dat", std::ios::in | std::ios::binary);
        if (!fin) {
            std::cerr << "cannot open file.dat\n";
            return 1;
        }
        // Streaming the underlying buffer copies the whole file, '\0' bytes included.
        std::stringstream ss;
        ss << fin.rdbuf();
        filedata = ss.str();
    }

    // Explicit length, so the key would stay intact even if it contained '\0'.
    const std::string key("\x39\xb5\x7d\xc6", 4);
    auto result = std::search(std::begin(filedata), std::end(filedata),
                              std::begin(key), std::end(key));
    if (std::end(filedata) != result) {
        std::cout << "found\n";
        // result is an iterator pointing at '\x39'
    }
}

score 0 · Accepted Answer

如果您不介意将整个文件加载到内存数组中（或使用 mmap() 让文件看起来像是在内存中），那么您可以直接在内存中搜索您的字节序列，这样做起来会容易一些：

// Works much like strstr(), except it looks for a binary sub-sequence rather than a string sub-sequence.
//
// lookIn / numLookInBytes   : the bytes to search and how many there are.
// lookFor / numLookForBytes : the bytes to find and how many there are.
//
// Returns a pointer to the first occurrence of lookFor inside lookIn, lookIn
// itself when numLookForBytes is 0, or a null pointer when there is no
// occurrence (or a length is negative).
//
// Every candidate start position is compared in full. A running match counter
// that is simply reset on a mismatch would skip occurrences that begin inside
// a failed partial match (for example "ab" in "aab").
const char * MemMem(const char * lookIn, int numLookInBytes, const char * lookFor, int numLookForBytes)
{
   if (numLookForBytes == 0) return lookIn;  // hmm, existential questions here
   if (numLookForBytes < 0 || numLookInBytes < numLookForBytes) return nullptr;

   // Last offset at which a full-length match can still start (inclusive).
   const int lastStart = numLookInBytes - numLookForBytes;
   for (int i = 0; i <= lastStart; i++)
   {
      if (memcmp(&lookIn[i], lookFor, numLookForBytes) == 0) return &lookIn[i];
   }
   return nullptr;
}

....然后您可以在内存数据数组上调用上述函数：

// MemMem returns a pointer to const bytes, so the result must be a const char*.
const char * ret = MemMem(theInMemoryArrayContainingFilesBytes, numBytesInFile, myShortSequence, 4);
// The pointer difference is a ptrdiff_t; convert it to int to match the %i
// conversion (the offset is below numBytesInFile, which is passed as an int).
if (ret != NULL) printf("Found it at offset %i\n", static_cast<int>(ret-theInMemoryArrayContainingFilesBytes));
            else printf("It's not there.\n");

score 0 · Accepted Answer

// The byte sequence to find. Character escapes are used instead of integer
// constants because 0xb5 and 0xc6 do not fit in a signed char, which makes
// them narrowing conversions inside a braced initializer.
const char delims[] = { '\x39', '\xb5', '\x7d', '\xc6' };
// Sliding window holding the most recent delim_size bytes of the stream.
char buffer[4];
const size_t delim_size = 4;
const size_t last_index = delim_size - 1;

// Pre-fill all but the last slot of the window.
for ( size_t i = 0; i < last_index; ++i )
{
  if ( ! ( is.get( buffer[i] ) ) )
    return false; // stream too short
}

// Each iteration reads one new byte into the last slot, compares the whole
// window, and on a mismatch shifts the window left by one byte.
while ( is.get(buffer[last_index]) )
{
  if ( memcmp( buffer, delims, delim_size ) == 0 )
    break; // match found
  memmove( buffer, buffer + 1, last_index );
}

由于您要查找的正好是 4 个字节，也可以把它们当作一个无符号整数来比较：

// The four target bytes packed into one integer, with the first byte of the
// sequence (0x39) in the low 8 bits and the last (0xc6) in bits 24..31.
unsigned int delim = 0xc67db539u;
static_assert( sizeof( unsigned int ) >= 4, "unsigned int must be able to hold 4 bytes" );

// Sliding window over the last 4 bytes read, packed the same way as delim.
// It is built with shifts rather than by writing through a char* alias, so the
// result does not depend on the byte order of the host.
unsigned int uibuffer = 0;
char next;

// Pre-load the first 3 bytes into bits 0..23.
for ( size_t i = 0; i < 3; ++i )
{
  if ( ! ( is.get( next ) ) )
    return false; // stream too short
  uibuffer |= static_cast<unsigned int>( static_cast<unsigned char>( next ) ) << ( 8 * i );
}

while ( is.get( next ) )
{
  // Bits 24 and above are zero here, so OR-ing completes the 4-byte window.
  uibuffer |= static_cast<unsigned int>( static_cast<unsigned char>( next ) ) << 24;
  if ( uibuffer == delim )
    break; // match found
  uibuffer >>= 8; // drop the oldest byte, making room for the next one
}

c++ - 有没有更好的方法来搜索文件中的字符串？

5 回答 5

Related

Reference