我正在关注本教程https://spacy.io/usage/training#quickstart以训练 distilbert 的自定义模型。一切都已安装,数据已转换,配置文件已准备就绪。
当我启动这个训练命令时:
python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy
发生此错误:
2021-08-30 11:43:04.292025: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudart64_101.dll
ℹ Saving to output directory: output
ℹ Using CPU
=========================== Initializing pipeline ===========================
[2021-08-30 11:43:08,117] [INFO] Set up nlp object from config
Traceback (most recent call last):
File "C:\Miniconda3\envs\tensorflow-2.1\lib\runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "C:\Miniconda3\envs\tensorflow-2.1\lib\runpy.py", line 85, in _run_code
exec(code, run_globals)
File "C:\Miniconda3\envs\tensorflow-2.1\lib\site-packages\spacy\__main__.py", line 4, in <module>
setup_cli()
File "C:\Miniconda3\envs\tensorflow-2.1\lib\site-packages\spacy\cli\_util.py", line 69, in setup_cli
command(prog_name=COMMAND)
File "C:\Miniconda3\envs\tensorflow-2.1\lib\site-packages\click\core.py", line 829, in __call__
return self.main(*args, **kwargs)
File "C:\Miniconda3\envs\tensorflow-2.1\lib\site-packages\click\core.py", line 782, in main
rv = self.invoke(ctx)
File "C:\Miniconda3\envs\tensorflow-2.1\lib\site-packages\click\core.py", line 1259, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "C:\Miniconda3\envs\tensorflow-2.1\lib\site-packages\click\core.py", line 1066, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "C:\Miniconda3\envs\tensorflow-2.1\lib\site-packages\click\core.py", line 610, in invoke
return callback(*args, **kwargs)
File "C:\Miniconda3\envs\tensorflow-2.1\lib\site-packages\typer\main.py", line 497, in wrapper
return callback(**use_params) # type: ignore
File "C:\Miniconda3\envs\tensorflow-2.1\lib\site-packages\spacy\cli\train.py", line 60, in train_cli
nlp = init_nlp(config, use_gpu=use_gpu)
File "C:\Miniconda3\envs\tensorflow-2.1\lib\site-packages\spacy\training\initialize.py", line 59, in init_nlp
train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
File "C:\Miniconda3\envs\tensorflow-2.1\lib\site-packages\spacy\util.py", line 470, in resolve_dot_names
if registry.is_promise(config[section]):
KeyError: 'train'
我在 python 3.6 上,这些是安装的 spacy 版本:
spacy 3.1.2
spacy-alignments 0.8.3
spacy-legacy 3.0.8
spacy-transformers 1.0.5
为了完整起见,这些是 config.cfg 文件和用于将 imdb 数据转换为 spacy 的 python 代码(train e dev .spacy 文件):
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null
[system]
gpu_allocator = "pytorch"
seed = 0
[nlp]
lang = "it"
pipeline = ["transformer","textcat"]
batch_size = 128
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
[components]
[components.textcat]
factory = "textcat"
threshold = 0.5
[components.textcat.model]
@architectures = "spacy.TextCatBOW.v2"
exclusive_classes = true
ngram_size = 1
no_output_layer = false
nO = null
[components.transformer]
factory = "transformer"
max_batch_items = 4096
set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v1"
name = "distilbert-base-multilingual-cased"
[components.transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 96
[components.transformer.model.tokenizer_config]
use_fast = true
[corpora]
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null
[training]
accumulate_gradient = 3
dev_corpus = "dev.spacy"
train_corpus = "train.spacy"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
annotating_components = []
before_to_disk = null
[training.batcher]
@batchers = "spacy.batch_by_padded.v1"
discard_oversize = true
size = 2000
buffer = 256
get_length = null
[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 0.00005
[training.score_weights]
cats_score = 1.0
cats_score_desc = null
cats_micro_p = null
cats_micro_r = null
cats_micro_f = null
cats_macro_p = null
cats_macro_r = null
cats_macro_f = null
cats_macro_auc = null
cats_f_per_type = null
cats_macro_auc_per_type = null
[pretraining]
[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null
[initialize.components]
[initialize.tokenizer]
编码:
import spacy
from tqdm.auto import tqdm
from ml_datasets import imdb
from spacy.tokens import DocBin
train_data, valid_data = imdb()
nlp = spacy.load('en_core_web_sm')
def make_docs(data):
docs = []
for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total=len(data)):
doc.cats['positive'] = label
docs.append(doc)
return docs
train_docs = make_docs(train_data)
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk('./train.spacy')
valid_docs = make_docs(valid_data)
doc_bin = DocBin(docs=valid_docs)
doc_bin.to_disk('./dev.spacy')