I am trying to use the Hugging Face pretrained model DialoGPT ("GPT2dialog") as a sentence encoder, but the token indexer is confusing me. Specifically, a unit test of my dataset_reader with the pretrained indexer runs fine, but training the model with the train command raises this error:
File "/home/lee/anaconda3/envs/allennlp/lib/python3.6/site-packages/allennlp/common/lazy.py", line 54, in constructor_to_use
return constructor.from_params(Params({}), **kwargs) # type: ignore[union-attr]
File "/home/lee/anaconda3/envs/allennlp/lib/python3.6/site-packages/allennlp/common/from_params.py", line 604, in from_params
**extras,
File "/home/lee/anaconda3/envs/allennlp/lib/python3.6/site-packages/allennlp/common/from_params.py", line 634, in from_params
return constructor_to_call(**kwargs) # type: ignore
File "/home/lee/anaconda3/envs/allennlp/lib/python3.6/site-packages/allennlp/data/vocabulary.py", line 310, in from_instances
instance.count_vocab_items(namespace_token_counts)
File "/home/lee/anaconda3/envs/allennlp/lib/python3.6/site-packages/allennlp/data/instance.py", line 60, in count_vocab_items
field.count_vocab_items(counter)
File "/home/lee/anaconda3/envs/allennlp/lib/python3.6/site-packages/allennlp/data/fields/text_field.py", line 78, in count_vocab_items
for indexer in self.token_indexers.values():
AttributeError: 'PretrainedTransformerIndexer' object has no attribute 'values'
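From the traceback, text_field.py iterates over self.token_indexers.values(), so TextField apparently expects a Dict[str, TokenIndexer] rather than a bare indexer object. A minimal sketch of what I believe the field wants (the "tokens" key name is my own choice, not something from the error):

    from allennlp.data.fields import TextField
    from allennlp.data.token_indexers import PretrainedTransformerIndexer
    from allennlp.data.tokenizers import PretrainedTransformerTokenizer

    tokenizer = PretrainedTransformerTokenizer("microsoft/DialoGPT-small")
    # The second argument to TextField is a mapping from namespace name to
    # indexer, not a single indexer -- hence the .values() call that fails above.
    indexers = {"tokens": PretrainedTransformerIndexer("microsoft/DialoGPT-small")}
    field = TextField(tokenizer.tokenize("hello there"), indexers)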
Here is my dataset_reader code:
import json
import logging
from typing import Dict

from overrides import overrides
from allennlp.data import DatasetReader, Instance
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import PretrainedTransformerIndexer, TokenIndexer
from allennlp.data.tokenizers import Tokenizer, WhitespaceTokenizer

logger = logging.getLogger(__name__)


class MultiWozDatasetReader(DatasetReader):
    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 tokenindexer: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WhitespaceTokenizer()
        self._tokenindexer = PretrainedTransformerIndexer("microsoft/DialoGPT-small")

    @overrides
    def read(self, file_path: str):
        logger.warning("call read")
        with open(file_path, 'r') as data_file:
            dialogs = json.load(data_file)
        for dialog in dialogs:
            dialogue = dialog["dialogue"]
            for turn_num in range(len(dialogue)):
                dia_single_turn = dialogue[turn_num]
                sys_utt = dia_single_turn["system_transcript"]
                user_utt = dia_single_turn["transcript"]
                state_category = dia_single_turn["state_category"]
                span_info = dia_single_turn["span"]
                yield self.text_to_instance(sys_utt, user_utt, state_category, span_info)

    @overrides
    def text_to_instance(self, sys_utt, user_utt, state_category, span_info):
        tokenized_sys_utt = self._tokenizer.tokenize(sys_utt)
        tokenized_user_utt = self._tokenizer.tokenize(user_utt)
        tokenized_span_info = self._tokenizer.tokenize(span_info)
        tokenized_classifier_input = self._tokenizer.tokenize("[CLS] " + sys_utt + " [SEP] " + user_utt)
        sys_utt_field = TextField(tokenized_sys_utt, self._tokenindexer)
        user_utt_field = TextField(tokenized_user_utt, self._tokenindexer)
        classifier_field = TextField(tokenized_classifier_input, self._tokenindexer)
        span_field = TextField(tokenized_span_info, self._tokenindexer)
        fields = {"sys_utt": sys_utt_field, "user_utt": user_utt_field,
                  "classifier_input": classifier_field, "span": span_field}
        fields["label"] = LabelField(state_category)
        return Instance(fields)
I have searched online for a long time with no luck. Any help or ideas on how to achieve this would be appreciated.
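One untested idea I am considering: wrap the indexer in a dictionary inside __init__ (and actually honor the constructor argument), and pair it with the transformer's own tokenizer, since PretrainedTransformerIndexer presumably expects wordpiece tokens rather than whitespace tokens. A minimal sketch, assuming the rest of the reader stays the same:

    from allennlp.data.token_indexers import PretrainedTransformerIndexer, TokenIndexer
    from allennlp.data.tokenizers import PretrainedTransformerTokenizer, Tokenizer

    class MultiWozDatasetReader(DatasetReader):
        def __init__(self,
                     lazy: bool = False,
                     tokenizer: Tokenizer = None,
                     token_indexers: Dict[str, TokenIndexer] = None) -> None:
            super().__init__(lazy)
            # Use the transformer's own tokenizer so the indexer sees wordpieces.
            self._tokenizer = tokenizer or PretrainedTransformerTokenizer("microsoft/DialoGPT-small")
            # TextField wants a dict of indexers; "tokens" is an arbitrary key.
            self._tokenindexer = token_indexers or {
                "tokens": PretrainedTransformerIndexer("microsoft/DialoGPT-small")
            }

If that fixes the AttributeError, the manual "[CLS]"/"[SEP]" markers in text_to_instance may also need revisiting, since GPT-2-style tokenizers do not define those special tokens by default.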