尝试将数据从 csv 转换为 DocBin 以训练具有以下块的 textcat_multilabel 组件的模型:
def convert_cat_annontation_from_sentinelle_db(output_path, nlp, input_path, cats_empty):
    """Convert category annotations from a Sentinelle DB CSV export into a spaCy DocBin.

    Takes a CSV file produced by the SQL query qry_export_rdp_annotations.sql,
    downloads each linked article, builds a Doc from its text, sets ``doc.cats``
    from the comma-separated category names, and serializes everything to disk.

    Parameters
    ----------
    output_path : str or Path
        Destination file for the serialized DocBin.
    nlp : spacy.Language
        Pipeline used only for tokenization via ``make_doc``.
    input_path : str or Path
        CSV export containing at least the columns ``link`` and
        ``GROUP_CONCAT(j.name)``.
    cats_empty : dict
        Template mapping every known category label to 0 — copied per row so
        absent labels are explicit negatives.
    """
    db = DocBin()  # collects the annotated Docs for serialization
    annotated_data = pd.read_csv(input_path)
    print(len(annotated_data))
    for idx, row in annotated_data.iterrows():
        # Fresh copy per row so positive labels never leak between articles.
        cats = dict(cats_empty)
        try:
            # Fetch the article full text from the link. Only the network /
            # parsing steps live in the try: a download failure is expected
            # and skippable, but an error in our own labeling logic must not
            # be silently swallowed.
            article = newspaper.Article(row['link'])
            article.download()
            article.parse()
        except Exception:
            # Best-effort: skip articles that cannot be fetched or parsed.
            continue
        doc = nlp.make_doc(article.text)  # tokenize only; no pipeline components run
        # Assign a positive score to each annotated category. spaCy expects
        # doc.cats values to be floats in [0.0, 1.0], not ints.
        for cat in row["GROUP_CONCAT(j.name)"].split(','):
            cats[cat.strip()] = 1.0
        doc.cats = cats
        db.add(doc)
    db.to_disk(output_path)  # save the DocBin object
来自 doc.cats 的示例打印输出:
{'Santé': 1, 'Économie': 0, 'Infrastructure': 0, 'Politique fédérale': 0, 'Politique provinciale': 1, 'Politique municipale': 0, 'Éducation': 0, 'Faits divers': 0, 'Culture': 0}
运行 spacy train CLI 命令时的完整错误消息:
ℹ Using CPU
=========================== Initializing pipeline ===========================
[2021-08-18 06:09:46,242] [INFO] Set up nlp object from config
[2021-08-18 06:09:46,259] [INFO] Pipeline: ['tok2vec', 'textcat_multilabel', 'ner', 'parser']
[2021-08-18 06:09:46,266] [INFO] Created vocabulary
[2021-08-18 06:09:50,649] [INFO] Added vectors: fr_core_news_lg
[2021-08-18 06:09:56,557] [INFO] Finished initializing nlp object
[2021-08-18 06:10:07,714] [INFO] Initialized pipeline components: ['tok2vec', 'textcat_multilabel', 'ner', 'parser']
✔ Initialized pipeline
============================= Training pipeline =============================
ℹ Pipeline: ['tok2vec', 'textcat_multilabel', 'ner', 'parser']
ℹ Initial learn rate: 0.001
E # LOSS TOK2VEC LOSS TEXTC... LOSS NER LOSS PARSER CATS_SCORE ENTS_F ENTS_P ENTS_R DEP_UAS DEP_LAS SENTS_F SCORE
--- ------ ------------ ------------- -------- ----------- ---------- ------ ------ ------ ------- ------- ------- ------
Traceback (most recent call last):
File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/spacy/training/loop.py", line 281, in evaluate
scores = nlp.evaluate(dev_corpus(nlp))
File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/spacy/language.py", line 1389, in evaluate
results = scorer.score(examples)
File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/spacy/scorer.py", line 135, in score
scores.update(component.score(examples, **self.cfg))
File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/spacy/pipeline/textcat_multilabel.py", line 179, in score
return Scorer.score_cats(
File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/spacy/scorer.py", line 465, in score_cats
auc_per_type[label].score_set(pred_score, gold_score)
KeyError: 'Politique fédérale'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/runpy.py", line 197, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/spacy/__main__.py", line 4, in <module>
setup_cli()
File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/spacy/cli/_util.py", line 69, in setup_cli
command(prog_name=COMMAND)
File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/click/core.py", line 829, in __call__
return self.main(*args, **kwargs)
File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/click/core.py", line 782, in main
rv = self.invoke(ctx)
File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/click/core.py", line 1259, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/click/core.py", line 1066, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/click/core.py", line 610, in invoke
return callback(*args, **kwargs)
File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/typer/main.py", line 497, in wrapper
return callback(**use_params) # type: ignore
File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/spacy/cli/train.py", line 59, in train_cli
train(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/spacy/training/loop.py", line 122, in train
raise e
File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/spacy/training/loop.py", line 105, in train
for batch, info, is_best_checkpoint in training_step_iterator:
File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/spacy/training/loop.py", line 226, in train_while_improving
score, other_scores = evaluate()
File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/spacy/training/loop.py", line 283, in evaluate
raise KeyError(Errors.E900.format(pipeline=nlp.pipe_names)) from e
KeyError: "[E900] Could not run the full pipeline for evaluation. If you specified frozen components, make sure they were already initialized and trained. Full pipeline: ['tok2vec', 'textcat_multilabel', 'ner', 'parser']"
运行 spacy debug data CLI 命令时的输出:
====================== Text Classification (Multilabel) ======================
ℹ Text Classification: 30 label(s)
⚠ Some model labels are not present in the train data. The model
performance may be degraded for these labels after training: 'v', 'F', 'm', 'f',
'É', 'l', 'c', 'q', 'o', ']', 'u', 'I', 'P', 'r', 'a', 'D', 'é', 'S', 't', ',',
'M', ' ', 's', ''', 'd', 'i', 'p', 'e', 'n', '['.
查看该输出中的标签列表,我很确定我的问题在于我格式化用于设置 doc.cats 的字典的方式,但我似乎找不到正确的格式化方法。我确定它在文档中的某个地方,但我似乎找不到它并且感觉有点傻......