2

我正在按照这篇教程 https://spacy.io/usage/training#quickstart 训练一个基于 distilbert 的自定义模型。所有依赖都已安装,数据已转换,配置文件也已准备就绪。

当我启动这个训练命令时:

 python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy

发生此错误:

2021-08-30 11:43:04.292025: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudart64_101.dll
ℹ Saving to output directory: output
ℹ Using CPU

=========================== Initializing pipeline ===========================
[2021-08-30 11:43:08,117] [INFO] Set up nlp object from config
Traceback (most recent call last):
  File "C:\Miniconda3\envs\tensorflow-2.1\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Miniconda3\envs\tensorflow-2.1\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Miniconda3\envs\tensorflow-2.1\lib\site-packages\spacy\__main__.py", line 4, in <module>
    setup_cli()
  File "C:\Miniconda3\envs\tensorflow-2.1\lib\site-packages\spacy\cli\_util.py", line 69, in setup_cli
    command(prog_name=COMMAND)
  File "C:\Miniconda3\envs\tensorflow-2.1\lib\site-packages\click\core.py", line 829, in __call__
    return self.main(*args, **kwargs)
  File "C:\Miniconda3\envs\tensorflow-2.1\lib\site-packages\click\core.py", line 782, in main
    rv = self.invoke(ctx)
  File "C:\Miniconda3\envs\tensorflow-2.1\lib\site-packages\click\core.py", line 1259, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "C:\Miniconda3\envs\tensorflow-2.1\lib\site-packages\click\core.py", line 1066, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "C:\Miniconda3\envs\tensorflow-2.1\lib\site-packages\click\core.py", line 610, in invoke
    return callback(*args, **kwargs)
  File "C:\Miniconda3\envs\tensorflow-2.1\lib\site-packages\typer\main.py", line 497, in wrapper
    return callback(**use_params)  # type: ignore
  File "C:\Miniconda3\envs\tensorflow-2.1\lib\site-packages\spacy\cli\train.py", line 60, in train_cli
    nlp = init_nlp(config, use_gpu=use_gpu)
  File "C:\Miniconda3\envs\tensorflow-2.1\lib\site-packages\spacy\training\initialize.py", line 59, in init_nlp
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
  File "C:\Miniconda3\envs\tensorflow-2.1\lib\site-packages\spacy\util.py", line 470, in resolve_dot_names
    if registry.is_promise(config[section]):
KeyError: 'train'

我使用的是 Python 3.6,已安装的 spaCy 相关包版本如下:

spacy                    3.1.2
spacy-alignments         0.8.3
spacy-legacy             3.0.8
spacy-transformers       1.0.5

为了完整起见,下面是 config.cfg 文件,以及用于将 imdb 数据转换为 spaCy 格式(train 和 dev 的 .spacy 文件)的 Python 代码:

[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
gpu_allocator = "pytorch"
seed = 0

[nlp]
lang = "it"
pipeline = ["transformer","textcat"]
batch_size = 128
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

[components]

[components.textcat]
factory = "textcat"
threshold = 0.5

[components.textcat.model]
@architectures = "spacy.TextCatBOW.v2"
exclusive_classes = true
ngram_size = 1
no_output_layer = false
nO = null

[components.transformer]
factory = "transformer"
max_batch_items = 4096
set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}

[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v1"
name = "distilbert-base-multilingual-cased"

[components.transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 96

[components.transformer.model.tokenizer_config]
use_fast = true

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[training]
accumulate_gradient = 3
dev_corpus = "dev.spacy"
train_corpus = "train.spacy"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
annotating_components = []
before_to_disk = null

[training.batcher]
@batchers = "spacy.batch_by_padded.v1"
discard_oversize = true
size = 2000
buffer = 256
get_length = null

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001

[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 0.00005

[training.score_weights]
cats_score = 1.0
cats_score_desc = null
cats_micro_p = null
cats_micro_r = null
cats_micro_f = null
cats_macro_p = null
cats_macro_r = null
cats_macro_f = null
cats_macro_auc = null
cats_f_per_type = null
cats_macro_auc_per_type = null

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]

代码:

import spacy
from tqdm.auto import tqdm
from ml_datasets import imdb
from spacy.tokens import DocBin

# Load the IMDB dataset via ml_datasets and unpack it into two splits.
train_data, valid_data = imdb()

# Load the pipeline that make_docs() uses to turn raw texts into Doc objects.
nlp = spacy.load('en_core_web_sm')


def make_docs(data):
    """Run the module-level ``nlp`` pipeline over *data* and label each Doc.

    *data* is passed to ``nlp.pipe(..., as_tuples=True)``, so it is consumed
    as ``(text, label)`` pairs; it must also support ``len()`` because its
    length is used as the progress-bar total.

    Returns a list of the processed Doc objects, each with
    ``cats['positive']`` set to the label that accompanied its text.
    """
    annotated = []
    pairs = nlp.pipe(data, as_tuples=True)
    progress = tqdm(pairs, total=len(data))
    for parsed, sentiment in progress:
        # NOTE(review): the label is stored as-is under the single
        # 'positive' category; this assumes it is already a numeric/boolean
        # score. Confirm against what imdb() actually yields.
        parsed.cats['positive'] = sentiment
        annotated.append(parsed)
    return annotated


# Convert the training split into Docs and write them to disk as a DocBin.
train_docs = make_docs(train_data)
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk('./train.spacy')

# Do the same for the validation split; `doc_bin` is rebound to a new DocBin.
valid_docs = make_docs(valid_data)
doc_bin = DocBin(docs=valid_docs)
doc_bin.to_disk('./dev.spacy')
4

1 回答 1

1

这部分配置是错误的。

[training]
accumulate_gradient = 3
dev_corpus = "dev.spacy"
train_corpus = "train.spacy"

这一点确实容易让人混淆:这里的 corpus 值并不是文件路径,而是配置(config)中对应值所在的位置。默认情况下它们分别是 corpora.train 和 corpora.dev;通常应保持默认值不变。详情请参阅文档。

发生此错误是因为 spaCy 会在配置中查找 [train] 块,但配置中并不存在这样的块。

如果把它们改回默认值(train_corpus = "corpora.train"、dev_corpus = "corpora.dev"),应该就可以正常运行了。

于 2021-09-01T06:19:10.623 回答