spacy - 如何在 spaCy 上训练伪投影解析器？

Question

我正在尝试按照 https://raw.githubusercontent.com/explosion/spaCy/master/examples/training/train_intent_parser.py 中的示例代码训练自定义语义解析器。我的想法是得到一个非投影解析：当我传入文本 ROOT AAAA BBBB 12 21 时，12 成为 AAAA 的子节点，21 成为 BBBB 的子节点。为了测试这一点，我只用这一个例子进行训练，并用同一个例子进行测试，但它似乎不起作用，我得到的结果是：

[('ROOT', 'ROOT', 'ROOT'), ('AAAA', 'LETTERS', 'ROOT'), ('BBBB', 'LETTERS', 'ROOT'), ('12', 'NUMBERS', 'BBBB'), ('21', 'NUMBERS', 'BBBB')]

如您所见，这两个数字都依赖于 BBBB，而 12 应该依赖于 AAAA。

我用来训练和测试的代码是：

import plac
import random
import spacy
from spacy.util import minibatch, compounding

# Number of identical copies of the single toy sentence to train on.
samples = 1000

# Each entry is (text, annotations): "heads" holds, per token, the index of
# that token's head, and "deps" holds the dependency label of each token.
TRAIN_DATA = [
    (
        'ROOT AAAA BBBB 12 21',
        {
            'heads': [0, 0, 0, 1, 2],
            'deps': ['ROOT', 'LETTERS', 'LETTERS', 'NUMBERS', 'NUMBERS'],
        },
    )
    for _ in range(samples)
]

def test_model(nlp):
    """Run ``nlp`` on the toy sentence and print the predicted arcs.

    For every parsed document this prints the document text, followed by a
    list of ``(token text, dependency label, head text)`` tuples. Tokens
    whose dependency label is ``"-"`` are left out of the list.
    """
    sentences = ['ROOT AAAA BBBB 12 21']
    for parsed in nlp.pipe(sentences):
        print(parsed.text)
        arcs = []
        for token in parsed:
            if token.dep_ == "-":
                continue
            arcs.append((token.text, token.dep_, token.head.text))
        print(arcs)

# The German model is the default "just in case", since it supports
# pseudo-projective parsing
# (https://explosion.ai/blog/german-model#word-order).
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(model='de_core_news_sm', n_iter=15):
    """Load the model, set up the pipeline and train the parser."""
    # NOTE: the docstring text is kept as-is; plac may surface it as the
    # command-line help description.
    if model is None:
        nlp = spacy.blank("en")  # start from an empty English pipeline
        print("Created blank 'en' model")
    else:
        nlp = spacy.load(model)  # reuse an installed spaCy model
        print("Loaded model '%s'" % model)

    # Drop any parser that came with the model and put a freshly created one
    # at the front of the pipeline.
    if "parser" in nlp.pipe_names:
        nlp.remove_pipe("parser")
    parser = nlp.create_pipe("parser")
    nlp.add_pipe(parser, first=True)

    # Register every dependency label that occurs in the training data.
    for _text, annots in TRAIN_DATA:
        for label in annots.get("deps", []):
            parser.add_label(label)

    # Everything that is not the parser (or a transformer helper component)
    # is disabled, so only the parser gets updated.
    keep_enabled = ("parser", "trf_wordpiecer", "trf_tok2vec")
    frozen_pipes = [name for name in nlp.pipe_names if name not in keep_enabled]
    with nlp.disable_pipes(*frozen_pipes):
        optimizer = nlp.begin_training()
        for _ in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # Batch sizes come from spaCy's compounding schedule (4.0 -> 32.0).
            batch_sizes = compounding(4.0, 32.0, 1.001)
            for batch in minibatch(TRAIN_DATA, size=batch_sizes):
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # Check the trained parser on the sentence it was trained on.
    test_model(nlp)

# Script entry point: plac.call derives the command-line interface from
# main's signature and its @plac.annotations, then invokes main.
if __name__ == "__main__":
    plac.call(main)

那么，我做错了什么？

提前感谢您对此提供的任何帮助！

score 1 · Accepted Answer

问题在于，这个简单的训练示例脚本在初始化和训练模型时没有对训练实例进行投影化（projectivize）。解析算法本身只能处理投影解析，但如果解析器组件在其输出中发现了投影化标签，就会在后处理步骤中将它们反投影化。您不需要修改任何解析器设置（因此从德语模型开始并没有什么区别），只需以正确的格式提供投影化的输入。

初始的投影化由 train CLI 自动处理：它使用 GoldCorpus.train_docs() 为 nlp.update() 准备训练示例，并在创建 GoldParse 对象时设置 make_projective=True。一般来说，我建议切换到 train CLI（这也需要切换到内部的 JSON 训练格式，这无疑有点麻烦），因为 train CLI 设置了很多更好的默认值。

但是，只要您创建投影化的训练示例（使用 GoldParse(make_projective=True)），将所有投影化后的依存标签添加到解析器，并使用 Doc + 投影化 GoldParse 输入（而不是文本/注释输入）进行训练，玩具示例也可以正常工作：

# tested with spaCy v2.2.4
import random

import plac
import spacy
from spacy.gold import GoldParse
from spacy.util import minibatch, compounding

# How many times the single training instance is duplicated before training.
samples = 200

# One toy sentence. "12" (token 3) attaches to "AAAA" (token 1) and "21"
# (token 4) attaches to "BBBB" (token 2), so the two arcs cross and the
# tree is non-projective.
TRAIN_DATA = [
    (
        "ROOT AAAA BBBB 12 21",
        {
            "heads": [0, 0, 0, 1, 2],
            "deps": ["ROOT", "LETTERS", "LETTERS", "NUMBERS", "NUMBERS"],
        },
    ),
]

def test_model(nlp):
    """Parse the toy sentence, print its arcs and visualize the parse.

    For every parsed document this prints the document text and a list of
    ``(token text, dependency label, head text)`` tuples (tokens labelled
    ``"-"`` are skipped), then hands the document to displaCy's ``serve``.
    """
    sentences = ["ROOT AAAA BBBB 12 21"]
    for parsed in nlp.pipe(sentences):
        print(parsed.text)
        arcs = []
        for token in parsed:
            if token.dep_ == "-":
                continue
            arcs.append((token.text, token.dep_, token.head.text))
        print(arcs)
        # Show the dependency tree of this document with displaCy.
        spacy.displacy.serve(parsed)

@plac.annotations(
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(n_iter=10):
    """Load the model, set up the pipeline and train the parser."""
    # NOTE: the docstring text is kept as-is; plac may surface it as the
    # command-line help description.
    nlp = spacy.blank("xx")
    parser = nlp.create_pipe("parser")
    nlp.add_pipe(parser)

    # Build (Doc, GoldParse) pairs. make_projective=True makes the gold
    # parse carry projectivized annotations, and the labels registered with
    # the parser are the ones found on that projectivized gold parse.
    examples = []
    for raw_text, annots in TRAIN_DATA:
        doc = nlp.make_doc(raw_text)
        gold = GoldParse(doc, **annots, make_projective=True)
        for label in gold.labels:
            parser.add_label(label)
        examples.append((doc, gold))
    # Repeat the training instances `samples` times.
    examples *= samples

    # Everything that is not the parser (or a transformer helper component)
    # is disabled, so only the parser gets updated.
    keep_enabled = ("parser", "trf_wordpiecer", "trf_tok2vec")
    frozen_pipes = [name for name in nlp.pipe_names if name not in keep_enabled]
    with nlp.disable_pipes(*frozen_pipes):
        # NOTE(review): min_action_freq=1 presumably stops rarely seen parser
        # actions from being dropped on this tiny data set - confirm against
        # the spaCy v2 parser documentation.
        optimizer = nlp.begin_training(min_action_freq=1)
        for _ in range(n_iter):
            random.shuffle(examples)
            losses = {}
            # Batch sizes come from spaCy's compounding schedule (4.0 -> 32.0).
            batch_sizes = compounding(4.0, 32.0, 1.001)
            for batch in minibatch(examples, size=batch_sizes):
                docs, golds = zip(*batch)
                nlp.update(docs, golds, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # Check the trained parser on the sentence it was trained on.
    test_model(nlp)

# Script entry point: plac.call derives the command-line interface from
# main's signature and its @plac.annotations, then invokes main.
if __name__ == "__main__":
    plac.call(main)

spacy - 如何在 spaCy 上训练伪投影解析器？

1 回答 1

Related

Reference