c - 将单词添加到 Trie 结构字典

Question

我正在尝试创建一个可以插入单词的 trie 结构，但该结构必须完全像这样：

typedef struct tNode_t {
    struct tNode_t **l;
    char *w;
  } tNode;

**l是一个指向 27 个指向 tNodes 的指针数组的指针，这就是我不明白的部分。

如果数组是指向 tNodes 的指针，我如何在其中插入单词？而且由于数组的大小是 27（26 个小写字母 az 和终止字符），您如何知道在哪里输入单词，具体取决于起始字母是什么？

score 2 · Accepted Answer

让我们假设这个词是“出租车”。

粗略地说，成员w指向一个字符。列表中的 26 个节点l用于字母“a”到“z”。

给定单词“cab”，第一个字母是“c”，因此您将查看根节点指针 for root->l[3]，其中包含指向所有以“c”开头的单词的指针；叫它tNode *letter = root->l[3];。然后你会寻找letter->l[1]以'ca'开头的单词；letter = letter->l[1];. 然后，您将查找letter->l[2]以“cab”开头的单词。在这一点上，您知道您已经到了要搜索的单词的末尾，并letter->w != 0告诉您该单词是有效的，并为您提供了该单词的文本。在树的下方可能还有其他词（用于“出租车”、“电缆”、“阴谋集团”等）。

你会被教导一些关于这个的东西，或者给出一些规范。可能还有其他方法可以填写详细信息。

我不清楚使用动态分配的数组是否比tNode *l[27];在结构中使用更好（对声明进行适当调整），但这是一个单独的讨论。我愉快地假设列表中的第 27 个节点没有更深层次的意义。当然，您可以在网上查找此类内容：维基百科往往是与计算机科学相关的非争议性问题的合理来源，但我尚未研究链接页面。

l不是数组；它是一个指向指针数组的指针，这让我感到困惑。

指针变量过着双重生活——杰基尔博士和海德先生——并且可以被视为指针或（随便）数组。

如果你写char *str,str不是字符数组；它是一个指向字符数组（的第一个字符）的指针。您可以使用str[i]访问字符串中的i^{第 th 个}字符。同理，char **argv不是字符串数组；它是一个指向字符串数组（第一个字符串）的指针。您可以使用^thargv[i]参数访问主程序。i

在这个 trie 结构中，l不是 ; 的数组tNode *。它是指向 trie 结构数组（的第一个元素）的指针。但是您可以使用trie->l[i]来访问指向的列表中的i^thtrie->l trie 结构。

工作代码

我之前没有尝试过，所以我根据您的数据结构将下面的代码放在一起。您需要调查它是否正式正确；下面的代码适用于给定的测试用例，并valgrind为代码提供了一个干净的健康状况（没有泄漏内存，没有内存滥用）。

该代码强制包含通常在头文件中的内容。就外界而言，trie 类型 ( tNode) 是不透明的。

#ifndef TRIE_H_INCLUDED
#define TRIE_H_INCLUDED

typedef struct tNode_t tNode;
extern void trie_add_word(tNode *trie, char const *word);
extern char const *trie_find_word(tNode *trie, char const *word);
extern void trie_print(tNode *trie);
extern tNode *trie_new(void);
extern void trie_free(tNode *trie);

#endif /* TRIE_H_INCLUDED */

/*#include "trie.h"*/
#include <assert.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>

struct tNode_t
{
    struct tNode_t **l;
    char            *w;
};

static void db_print(char const *fmt, ...);

tNode *trie_new(void)
{
    tNode *trie = malloc(sizeof(tNode));
    assert(trie != 0);      // Abysmal way to validate memory allocation
    trie->w = 0;
    trie->l = (tNode **)calloc(27, sizeof(tNode *));
    assert(trie->l != 0);   // Abysmal way to validate memory allocation
    return(trie);
}

void trie_free(tNode *trie)
{
    assert(trie != 0);
    assert(trie->l != 0);
    for (size_t i = 0; i < 27; i++)
    {
        if (trie->l[i] != 0)
            trie_free(trie->l[i]);
    }
    free(trie->l);
    free(trie->w);
    free(trie);
}

static void add_word_suffix(tNode *trie, char const *word, char const *suffix)
{
    int c;
    assert(trie != 0);
    assert(trie->l != 0);
    db_print("-->> %s: word [%s], suffix [%s]\n", __func__, word, suffix);
    while ((c = *suffix++) != '\0')
    {
        if (isalpha(c))
        {
            db_print("---- %s: letter %c (index %d)\n", __func__, c, c - 'a' + 1);
            c = tolower(c) - 'a' + 1;
            assert(trie->l != 0);
            if (trie->l[c] == 0)
                trie->l[c] = trie_new();
            db_print("---- %s: recurse: [%s]/[%s]\n", __func__, word, suffix);
            add_word_suffix(trie->l[c], word, suffix);
            db_print("<<-- %s\n", __func__);
            return;
        }
    }
    if (trie->w != 0)
    {
        db_print("---- %s: trie already contains word [%s] at [%s]\n", __func__, word, trie->w);
        return;
    }
    trie->w = strdup(word);
    db_print("<<-- %s: inserted word [%s]\n", __func__, trie->w);
}

void trie_add_word(tNode *trie, char const *word)
{
    add_word_suffix(trie, word, word);
}

static char const *find_word_suffix(tNode *trie, char const *word, char const *suffix)
{
    int c;
    db_print("-->> %s: word [%s] suffix[%s]\n", __func__, word, suffix);
    for ( ; (c = *suffix) != '\0'; suffix++)
    {
        if (isalpha(c))
        {
            db_print("---- %s: letter %c\n", __func__, c);
            c = tolower(c) - 'a' + 1;
            if (trie->l[c] == 0)
                return(0);
            char const *rv = find_word_suffix(trie->l[c], word, suffix+1);
            if (rv == 0)
            {
                db_print("<<-- %s: missing [%s]/[%s]\n", __func__, word, suffix);
                return 0;
            }
            db_print("<<-- %s: found [%s] for [%s]/[%s]\n", __func__, rv, word, suffix);
            return rv;
        }
    }
    if (trie->w == 0)
    {
        db_print("<<-- %s: missing [%s]/[%s]\n", __func__, word, suffix);
        return 0;
    }
    db_print("<<-- %s: found [%s] for [%s]/[%s]\n", __func__, trie->w, word, suffix);
    return(trie->w);
}

char const *trie_find_word(tNode *trie, char const *word)
{
    return find_word_suffix(trie, word, word);
}

void trie_print(tNode *trie)
{
    assert(trie != 0);
    assert(trie->l != 0);
    if (trie->w != 0)
        printf("%s\n", trie->w);
    for (size_t i = 0; i < 27; i++)
    {
        if (trie->l[i] != 0)
            trie_print(trie->l[i]);
    }
}

static int debug = 0;

static void db_print(char const *fmt, ...)
{
    if (debug > 0)
    {
        va_list args;
        va_start(args, fmt);
        vprintf(fmt, args);
        va_end(args);
    }
}

int main(int argc, char **argv)
{
    tNode *trie = trie_new();

    /* Set debugging - and use argv */
    if (argc > 1 && argv[argc] == 0)
        debug = 1;

    /* First test */
    char const *word = "cab";
    trie_add_word(trie, word);
    char const *leaf = trie_find_word(trie, word);
    printf("Leaf word = %s\n", leaf);
    trie_free(trie);

    /* Second, more comprehensive test */
    static char const *words[] =
    {
        "cabal",         "cabbie",
        "cab",           "centre",
        "cinema",        "cold",
        "culminate",     "culmination",
        "duck",          "cabs",
        "amniocentesis", "amniocentesis",
        "amniocentesis", "cam",
        "cab",           "cab",
        "zulu",          "alpha",
        "bravo",         "Charlie",
        "delta",         "echo",
        "foxtrot",       "golf",
        "hotel",         "India",
        "Juliet",        "kilo",
        "Lima",          "Mike",
        "November",      "Oscar",
        "Papa",          "Quebec",
        "Romeo",         "Sierra",
        "Tango",         "uMBRelLA",
        "Victor",        "Whisky",
        "X-ray",         "Yankee",
        "Zulu",          "Aquarius",
    };
    size_t num_words = sizeof(words) / sizeof(words[0]);
    size_t counter = 0;

    /* First time, add every other word; second time, every word */
    for (size_t mod = 2; mod > 0; mod--)
    {
        trie = trie_new();
        printf("\nTest %zu\n", ++counter);
        for (size_t i = 0; i < num_words; i++)
        {
            if (i % mod == 0)
                trie_add_word(trie, words[i]);
            char const *leaf = trie_find_word(trie, words[i]);
            if (leaf == 0)
                printf("Word [%s] is missing\n", words[i]);
            else
                printf("Word [%s] for [%s]\n", leaf, words[i]);
        }
        printf("\nTrie:\n");
        trie_print(trie);
        trie_free(trie);
    }

    return(0);
}

该代码assert()用于内存分配检查。这是方便但糟糕的风格。不要模仿。如果您愿意，您可以决定将trie_find_word()andtrie_add_word()功能组合成一个函数。您可以通过将参数（任何参数）传递给测试程序来查看相当详细的诊断信息。

该代码使用 27 （不在宏或枚举中），但如果您删除+ 1出现在几个要转换'a'为的位置1（它会将其转换为），则它仅适用于数组中的 26 个元素0。

c - 将单词添加到 Trie 结构字典

1 回答 1

工作代码

Related

Reference