目前我可以使用下面的配置文件训练语义角色标注(SRL)模型。此配置文件基于 AllenNLP 提供的配置文件,既适用于默认的 bert-base-uncased 模型,也适用于 GroNLP/bert-base-dutch-cased。
{
"dataset_reader": {
"type": "srl_custom",
"bert_model_name": "GroNLP/bert-base-dutch-cased"
},
"data_loader": {
"batch_sampler": {
"type": "bucket",
"batch_size": 32
}
},
"train_data_path": "./data/SRL/SONAR_1_SRL/MANUAL500/",
"validation_data_path": "./data/SRL/SONAR_1_SRL/MANUAL500/",
"model": {
"type": "srl_bert",
"embedding_dropout": 0.1,
"bert_model": "GroNLP/bert-base-dutch-cased"
},
"trainer": {
"optimizer": {
"type": "huggingface_adamw",
"lr": 5e-5,
"correct_bias": false,
"weight_decay": 0.01,
"parameter_groups": [
[
[
"bias",
"LayerNorm.bias",
"LayerNorm.weight",
"layer_norm.weight"
],
{
"weight_decay": 0.0
}
]
]
},
"learning_rate_scheduler": {
"type": "slanted_triangular"
},
"checkpointer": {
"keep_most_recent_by_count": 2
},
"grad_norm": 1.0,
"num_epochs": 3,
"validation_metric": "+f1-measure-overall"
}
}
由于 SRL 数据读取器仅支持 BertTokenizer 而不支持 RobertaTokenizer,因此直接将 bert_model_name 和 bert_model 参数的值从 GroNLP/bert-base-dutch-cased 换成 roberta-base 是无法开箱即用的。所以我将配置文件更改为以下内容:
{
"dataset_reader": {
"type": "srl_custom",
"token_indexers": {
"tokens": {
"type": "pretrained_transformer",
"model_name": "roberta-base"
}
}
},
"data_loader": {
"batch_sampler": {
"type": "bucket",
"batch_size": 32
}
},
"train_data_path": "./data/SRL/SONAR_1_SRL/MANUAL500/",
"validation_data_path": "./data/SRL/SONAR_1_SRL/MANUAL500/",
"model": {
"type": "srl_bert",
"embedding_dropout": 0.1,
"bert_model": "roberta-base"
},
"trainer": {
"optimizer": {
"type": "huggingface_adamw",
"lr": 5e-5,
"correct_bias": false,
"weight_decay": 0.01,
"parameter_groups": [
[
[
"bias",
"LayerNorm.bias",
"LayerNorm.weight",
"layer_norm.weight"
],
{
"weight_decay": 0.0
}
]
]
},
"learning_rate_scheduler": {
"type": "slanted_triangular"
},
"checkpointer": {
"keep_most_recent_by_count": 2
},
"grad_norm": 1.0,
"num_epochs": 15,
"validation_metric": "+f1-measure-overall"
}
}
但是,这仍然不起作用。我收到以下错误:
2022-02-22 16:19:34,122 - INFO - allennlp.training.gradient_descent_trainer - Training
0%| | 0/1546 [00:00<?, ?it/s]2022-02-22 16:19:34,142 - INFO - allennlp.data.samplers.bucket_batch_sampler - No sorting keys given; trying to guess a good one
2022-02-22 16:19:34,142 - INFO - allennlp.data.samplers.bucket_batch_sampler - Using ['tokens'] as the sorting keys
0%| | 0/1546 [00:00<?, ?it/s]
2022-02-22 16:19:34,526 - CRITICAL - root - Uncaught exception
Traceback (most recent call last):
File "C:\Program Files\Python39\lib\runpy.py", line 197, in _run_module_as_main
return _run_code(code, main_globals, None,
File "C:\Program Files\Python39\lib\runpy.py", line 87, in _run_code
exec(code, run_globals)
File "C:\Users\denbe\AppData\Roaming\Python\Python39\Scripts\allennlp.exe\__main__.py", line 7, in <module>
sys.exit(run())
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp\__main__.py", line 39, in run
main(prog="allennlp")
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp\commands\__init__.py", line 119, in main
args.func(args)
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp\commands\train.py", line 111, in train_model_from_args
train_model_from_file(
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp\commands\train.py", line 177, in train_model_from_file
return train_model(
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp\commands\train.py", line 258, in train_model
model = _train_worker(
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp\commands\train.py", line 508, in _train_worker
metrics = train_loop.run()
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp\commands\train.py", line 581, in run
return self.trainer.train()
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp\training\gradient_descent_trainer.py", line 771, in train
metrics, epoch = self._try_train()
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp\training\gradient_descent_trainer.py", line 793, in _try_train
train_metrics = self._train_epoch(epoch)
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp\training\gradient_descent_trainer.py", line 510, in _train_epoch
batch_outputs = self.batch_outputs(batch, for_training=True)
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp\training\gradient_descent_trainer.py", line 403, in batch_outputs
output_dict = self._pytorch_model(**batch)
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\torch\nn\modules\module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp_models\structured_prediction\models\srl_bert.py", line 141, in forward
bert_embeddings, _ = self.bert_model(
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\torch\nn\modules\module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\transformers\models\bert\modeling_bert.py", line 989, in forward
embedding_output = self.embeddings(
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\torch\nn\modules\module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\transformers\models\bert\modeling_bert.py", line 215, in forward
token_type_embeddings = self.token_type_embeddings(token_type_ids)
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\torch\nn\modules\module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\torch\nn\modules\sparse.py", line 156, in forward
return F.embedding(
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\torch\nn\functional.py", line 1916, in embedding
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
IndexError: index out of range in self
我不完全理解出了什么问题,也找不到任何有关如何更改配置文件以加载“自定义”BERT/RoBERTa 模型(此处未提及的模型)的文档。我正在运行默认的 allennlp train config.jsonnet 命令来开始训练。不过,运行 allennlp train config.jsonnet --dry-run 并不会产生错误。
提前致谢!蒂斯
编辑:我现在已经把“srl_bert”换成了继承自它的自定义“srl_roberta”类,以使用 RobertaModel。然而,这仍然会产生相同的错误。
编辑 2:我现在正在使用 Dirk Groeneveld 建议的 AutoTokenizer。要让 SrlReader 类支持基于 RoBERTa 的模型,似乎还需要更多改动,例如将 BERT 的 WordPiece 分词器替换为 RoBERTa 的 BPE 分词器。有没有一种简单的方法来改造 SrlReader 类,还是从头开始编写一个新的 RobertaSrlReader 更好?
我继承了 SrlReader 类并将这一行更改为以下内容:
self.bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
由于 RoBERTa 标记化与 BERT 不同,它会产生以下错误:
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp_models\structured_prediction\dataset_readers\srl.py", line 255, in text_to_instance
wordpieces, offsets, start_offsets = self._wordpiece_tokenize_input(
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp_models\structured_prediction\dataset_readers\srl.py", line 196, in _wordpiece_tokenize_input
word_pieces = self.bert_tokenizer.wordpiece_tokenizer.tokenize(token)
AttributeError: 'RobertaTokenizerFast' object has no attribute 'wordpiece_tokenizer'