8

我的程序的目标是通过一个字符串查看并能够取出对话问题和答案。

例如: ("do you like me?" ("yes" ("we're friends")) ("no" ("i hate you")) )

该程序会取出“你喜欢我吗?” 并会让您选择输入是或否。一旦你选择了相应的选项,它就会抛出“我们是朋友”或“我恨你”。

有没有关于如何做到这一点的库或任何解决方案?

4

2 回答 2

58

如果我错了,请纠正我,但 Lisp 解析器可以很好地完成这项工作。:P 说真的,这看起来像是带括号的字符串列表或其他带括号的表达式。一个简单的递归解析器就足够了,只需发明一个数据结构来创建为适合您需要的解析树。

编辑:该死,我终于成功了……嗯,我必须承认,在晚上 10 点到 12 点之间正确地启动一个非常简单的解析器也不是一件容易的事。

/*
 * lrparser.c
 * LR-parser
 * A recursive Lisp-subset parser
 * that has a misleading name (it's not an LALR, but a recursive descent one).
 *
 * Originally written to answer
 * http://stackoverflow.com/questions/15371008/string-processing-in-c/
 *
 * Made in some *really* bored hours by Árpád Goreity (H2CO3)
 * on 12-03-2013
 *
 * Language: C99 (not sure if POSIX)
 */

#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
#include <stdarg.h>
#include <stdbool.h>

// AST node type
enum {
    NODE_STRING,
    NODE_LIST
};

// Permitted tokens
enum {
    TOKEN_INVALID   = -1,
    TOKEN_LPAREN    =  0,
    TOKEN_RPAREN,
    TOKEN_STRING,
    TOKEN_END
};

// Useful for debugging and error reporting
static const char *toknames[] = {
    "Left paren",
    "Right paren",
    "String",
    "End"
};

// ...Or simply an AST node...
struct ParseTree {
    int type; // string or list
    char *string; // if any
    struct ParseTree **children;
    size_t n_children;
};

// Construct a node structure from a type and any necessary data
static struct ParseTree *node_new(int type, ...)
{
    va_list args;
    va_start(args, type);
    struct ParseTree *node = malloc(sizeof(*node));
    assert(node != NULL);

    node->type = type;
    if (type == NODE_STRING) {
        /* If the node is a string, fill it
         * (ownership transfer: the argument will be
         * free()'d by the node_free() function)
         */
        node->string = va_arg(args, char *);
    }

    node->children = NULL;
    node->n_children = 0;

    va_end(args);

    return node;
}

void node_free(struct ParseTree *tree)
{
    switch (tree->type) {
    case NODE_STRING:
        free(tree->string);
        break;
    case NODE_LIST:
        for (int i = 0; i < tree->n_children; i++) {
            node_free(tree->children[i]);
        }
        free(tree->children);
        break;
    default:
        fprintf(stderr, "Warning: unknown node type %d\n", tree->type);
        break;
    }

    free(tree);
}

// Sorry, the usual logarithmic storage expansion is omitted for clarity
void node_add(struct ParseTree *parent, struct ParseTree *child)
{
    assert(parent != NULL);
    assert(child != NULL);

    parent->n_children++;
    parent->children = realloc(parent->children, sizeof(parent->children[0]) * parent->n_children);
    // Lazy error checking: assert() instead of compare to NULL
    assert(parent->children != NULL);
    parent->children[parent->n_children - 1] = child;
}

// Just in order to break thread safety
static const char *s = NULL; // the string to be parsed
static char *curstr = NULL; // the contents of the string value of the current token
static int curtok; // the current token

// The tokenizer
static int lex()
{
    // Whitespace doesn't matter
    while (isspace(s[0])) {
        s++;
    }

    // end of string
    if (s[0] == 0) {
        return TOKEN_END;
    }

    // The followin four are obvious
    if (s[0] == '(') {
        s++;
        return curtok = TOKEN_LPAREN;
    }

    if (s[0] == ')') {
        s++;
        return curtok = TOKEN_RPAREN;
    }

    if (s[0] == '"') {
        const char *begin = s;
        while (*++s != '"')
            ;

        size_t sz = s - begin - 2 + 1;
        curstr = malloc(sz + 1);
        memcpy(curstr, begin + 1, sz);
        curstr[sz] = 0;

        // skip trailing quotation mark (")
        s++;
        return curtok = TOKEN_STRING;
    }

    return curtok = TOKEN_INVALID;
}

void expect(int tok)
{
    if (curtok != tok) {
        fprintf(stderr, "Error: expected token %s, got %s\n", toknames[tok], toknames[curtok]);
        abort();
    }

    lex();
}

// a. k. a. "parse()"
// Simple recursive (one-level...) descent (root == list) approach
static struct ParseTree *recurse_and_descend()
{
    expect(TOKEN_LPAREN);       

    struct ParseTree *node = node_new(NODE_LIST);

    struct ParseTree *child;
    while (curtok != TOKEN_RPAREN) {
        if (curtok == TOKEN_LPAREN) {
            child = recurse_and_descend();
        } else if (curtok == TOKEN_STRING) {
            child = node_new(NODE_STRING, curstr);
            lex();
        } else {
            fprintf(stderr, "Unexpected token '%s'\n", toknames[curtok]);
            // lazy programmer's safety system, let the kernel do the dirty work
            abort();
        }
        node_add(node, child);
    }

    expect(TOKEN_RPAREN);

    return node;
}

static struct ParseTree *parse(const char *str)
{
    s = str; // poor man's initialization
    lex(); // The first breath of fresh token makes the baby's heart beat
    return recurse_and_descend(); // Let's do the Harlem shake!
}

// petite helper function
static void dump_indent(int indent)
{
    for (int i = 0; i < indent; i++) {
        printf("\t");
    }
}

// Because 0x7f502a00 is not very meaningful for the human eye
static void dump_tree(struct ParseTree *tree, int indent)
{
    dump_indent(indent);

    switch (tree->type) {
    case NODE_STRING:
        printf("<String \"%s\">\n", tree->string);
        break;
    case NODE_LIST:
        printf("<List>\n");
        for (int i = 0; i < tree->n_children; i++) {
            dump_tree(tree->children[i], indent + 1);
        }
        break;
    default:
        printf("Unknown node\n");
        break;
    }
}

int main(int argc, char *argv[])
{
    struct ParseTree *tree = parse(argv[1]);
    dump_tree(tree, 0);
    node_free(tree);

    return 0;
}

用法:

h2co3-macbook:~ h2co3$ ./lrparser "(\"do you like me?\" (\"yes\" (\"we're friends\")) (\"no\" (\"i hate you\" \"me too\")) )"
<List>
    <String "do you like me?">
    <List>
        <String "yes">
        <List>
            <String "we're friends">
    <List>
        <String "no">
        <List>
            <String "i hate you">
            <String "me too">
于 2013-03-12T20:06:03.163 回答
7

如果您想要“有效的东西”但不强大,那么很多技巧都可以奏效。如果你真的想让它工作,你需要研究一下LALR(1) 解析器,然后决定这是否足够简单来滚动你自己的解析器(它是),或者你是否想使用像 YACC 这样的东西。

上下文无关语法似乎看起来像

QUESTION => '(' '"' TEXT '"' RESPONSES ')'
RESPONSES => null | RESPONSE RESPONSES
RESPONSE => '(' '"' TEXT '(' '"' TEXT '"' ')' ')'
TEXT => all characters except '(' '"' ')'

然后你分析一下上述语言的哪些组合会导致处理的变化。基本上 RESPONSES 可以还原为什么都没有或以 '(' 开头的东西,这意味着在处理的那个时刻,您可以通过查看前瞻(不是但解析的字符)是'('或')'。

模式内的解析非常简单。如果字符是固定的,只需检查它是否与预期匹配,并推进已解析元素的索引。如果字符不固定(如在文本中),则使用例程检查它是否有界,并且任何超出预期的内容都会使解析器处于错误状态。

于 2013-03-12T20:27:22.513 回答