c - C（非 C++）中的词频统计

Question

给定一个字符串，由单个空格分隔的单词组成，按照它们在字符串中出现的次数降序打印出单词。

例如，“ab bc bc”的输入字符串将生成以下输出：

bc : 2
ab : 1

如果使用像地图这样的 C++ 数据结构，这个问题将很容易解决。但如果这个问题只能用普通的旧 C 来解决，那看起来要困难得多。

我应该在这里使用什么样的数据结构和算法？请尽可能详细。我在 DS 和 Algo 方面很弱。:-(

score 4 · Accepted Answer

您可以使用的一种数据结构是一个简单的二叉树，其中包含您可以使用 strcmp 比较的单词。（我现在将忽略案例问题）。

您需要确保树在生长时保持平衡。为此，请在维基百科或其他地方查找 AVL 树或 1-2 树或红黑树。

除了创建二叉树结构外，我不会提供太多细节，每个节点都有一个左右子节点，可以为空，对于叶节点，两个子节点都为空。为了使其更简单，请使用具有值和两个子节点的“侵入式”节点。就像是：

struct Node
{
  char * value;
  size_t frequency; 
  struct Node * left;
  struct Node * right;
};

显然是 C，你需要做所有的内存管理。

您将拥有一个沿树递归的函数，比较并酌情向左或向右移动。如果找到它只会提高频率。如果不是，您的函数应该能够确定插入节点的位置，然后是您的插入和重新平衡逻辑。当然，新节点将包含频率为 1 的单词。

最后，您将需要一种通过树递归打印结果的方法。在您的情况下，这可以是一个递归函数。

请注意，另一种数据结构将是某种哈希表。

如果您正在寻找最有效的解决方案并且手头有大量内存，那么您将使用一种数据结构，在遇到每个字母时分支它。所以“a”给你所有以a开头的单词，然后移到第二个字母“b”等。对于不了解数据结构的人来说，实现起来相当复杂，所以我建议你去用简单的二叉树。

请注意，在打印输出时，它不会按照频率的相反顺序排列，因此您必须先对结果进行排序。（在使用 map 的 C++ 中，您也不会按该顺序获取它们）。

score 2 · Accepted Answer

我会为此使用三叉树。以下由 Jon Bentley 和 Robert Sedgewick 介绍数据结构的文章有一个 C 语言示例。

http://www.cs.princeton.edu/~rs/strings/

score 1 · Accepted Answer

这是我如何做的一个示例。findWord() 中的搜索可以优化。也可以通过分配字块而不是一次分配一个字块来减少分配的数量。也可以为这种情况实现一个链表。它缺少内存释放。这应该有希望让你继续前进。

    #include <stdio.h>
    #include <assert.h>
    #include <stdlib.h>

    #define MAXWORDLEN 128

    const char* findWhitespace(const char* text)
    {
        while (*text && !isspace(*text))
            text++;
        return text;
    }

    const char* findNonWhitespace(const char* text)
    {
        while (*text && isspace(*text))
            text++;
        return text;
    }

    typedef struct tagWord
    {
        char word[MAXWORDLEN + 1];
        int count;
    } Word;

    typedef struct tagWordList
    {
        Word* words;
        int count;
    } WordList;

    WordList* createWordList(unsigned int count);

    void extendWordList(WordList* wordList, const int count)
    {
        Word* newWords = (Word*)malloc(sizeof(Word) * (wordList->count + count));
        if (wordList->words != NULL) {
            memcpy(newWords, wordList->words, sizeof(Word)* wordList->count);
            free(wordList->words);
        }
        for (int i = wordList->count; i < wordList->count + count; i++) {
            newWords[i].word[0] = '\0';
            newWords[i].count = 0;
        }
        wordList->words = newWords;
        wordList->count += count;
    }

    void addWord(WordList* wordList, const char* word)
    {
        assert(strlen(word) <= MAXWORDLEN);
        extendWordList(wordList, 1);
        Word* wordNode = &wordList->words[wordList->count - 1];
        strcpy(wordNode->word, word);
        wordNode->count++;  
    }

    Word* findWord(WordList* wordList, const char* word)
    {
        for(int i = 0; i < wordList->count; i++) {
            if (stricmp(word, wordList->words[i].word) == 0) {
                return &wordList->words[i];
            }
        }
        return NULL;
    }

    void updateWordList(WordList* wordList, const char* word)
    {
        Word* foundWord = findWord(wordList, word);
        if (foundWord == NULL) {
            addWord(wordList, word);
        } else {
            foundWord->count++;
        }
    }

    WordList* createWordList(unsigned int count)
    {
        WordList* wordList = (WordList*)malloc(sizeof(WordList));
        if (count > 0) {
            wordList->words = (Word*)malloc(sizeof(Word) * count);
            for(unsigned int i = 0; i < count; i++) {
                wordList->words[i].count = 0;
                wordList->words[i].word[0] = '\0';
            }
        }
        else {
            wordList->words = NULL;
        }
        wordList->count = count;    
        return wordList;
    }

    void printWords(WordList* wordList)
    {
        for (int i = 0; i < wordList->count; i++) {
            printf("%s: %d\n", wordList->words[i].word, wordList->words[i].count);
        }
    }

    int compareWord(const void* vword1, const void* vword2)
    {
        Word* word1 = (Word*)vword1;
        Word* word2 = (Word*)vword2;
        return strcmp(word1->word, word2->word);
    }

    void sortWordList(WordList* wordList)
    {
        qsort(wordList->words, wordList->count, sizeof(Word), compareWord);
    }

    void countWords(const char* text)
    {
        WordList   *wordList = createWordList(0);
        Word       *foundWord = NULL;
        const char *beg = findNonWhitespace(text);
        const char *end;
        char       word[MAXWORDLEN];

        while (beg && *beg) {
            end = findWhitespace(beg);
            if (*end) {
                assert(end - beg <= MAXWORDLEN);
                strncpy(word, beg, end - beg);
                word[end - beg] = '\0';
                updateWordList(wordList, word);
                beg = findNonWhitespace(end);
            }
            else {
                beg = NULL;
            }
        }

        sortWordList(wordList);
        printWords(wordList);
    }

int main(int argc, char* argv[])
{
    char* text = "abc 123 abc 456 def 789 \tyup this \r\ncan work yup 456 it can";
    countWords(text);
}

c - C（非 C++）中的词频统计

3 回答 3

Related

Reference