按照此处的示例笔记本:
https://github.com/amaiya/ktrain/blob/master/examples/text/20newsgroup-distilbert.ipynb
在第 1 步(预处理数据)中,我遇到了下面列出的错误。当我在 Colab 笔记本中执行完全相同的操作时,它可以正常工作。我的机器上缺少什么?我用 BERT 可以正常运行,但换成 DistilBERT 就会出现这个问题。
trn, val, preproc = text.texts_from_array(x_train=x_train, y_train=y_train,
x_test=x_test, y_test=y_test,
class_names=class_names,
preprocess_mode='distilbert',
maxlen=350)
错误回溯如下:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-142-ff3842c91276> in <module>
3 class_names=class_names,
4 preprocess_mode='distilbert',
----> 5 maxlen=350)
/usr/local/lib/python3.7/site-packages/ktrain/text/data.py in texts_from_array(x_train, y_train, x_test, y_test, class_names, max_features, maxlen, val_pct, ngram_range, preprocess_mode, lang, random_state, verbose)
337 classes = class_names,
338 lang=lang, ngram_range=ngram_range)
--> 339 trn = preproc.preprocess_train(x_train, y_train, verbose=verbose)
340 val = preproc.preprocess_test(x_test, y_test, verbose=verbose)
341 return (trn, val, preproc)
/usr/local/lib/python3.7/site-packages/ktrain/text/preprocessor.py in preprocess_train(self, texts, y, mode, verbose)
766 pad_on_left=bool(self.name in ['xlnet']),
767 pad_token=self.tok.convert_tokens_to_ids([self.tok.pad_token][0]),
--> 768 pad_token_segment_id=4 if self.name in ['xlnet'] else 0)
769 self.set_multilabel(dataset, mode)
770 return dataset
/usr/local/lib/python3.7/site-packages/ktrain/text/preprocessor.py in hf_convert_examples(texts, y, tokenizer, max_length, pad_on_left, pad_token, pad_token_segment_id, mask_padding_with_zero)
280 pad_token=pad_token,
281 pad_token_segment_id=pad_token_segment_id,
--> 282 mask_padding_with_zero=mask_padding_with_zero)
283 features_list.append(features)
284 labels.append(y[idx] if y is not None else None)
/usr/local/lib/python3.7/site-packages/ktrain/text/preprocessor.py in hf_convert_example(text, tokenizer, max_length, pad_on_left, pad_token, pad_token_segment_id, mask_padding_with_zero)
206 max_length=max_length,
207 )
--> 208 input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
209
210 # The mask has 1 for real tokens and 0 for padding tokens. Only real
KeyError: 'token_type_ids'
有什么想法吗?