我在 Google Colab 上运行以下代码,使用 KeyBERT 提取关键字:
from keybert import KeyBERT

# Build a KeyBERT model backed by a sentence-transformers embedding model.
model = KeyBERT('distilbert-base-nli-mean-tokens')

# Extract single-word keyphrases (unigrams) without stop-word filtering.
extraction_params = {"keyphrase_ngram_range": (1, 1), "stop_words": None}
keywords = model.extract_keywords(doc, **extraction_params)

print(keywords)
但我得到一个 TypeError: 'DistilBertTokenizer' 对象不可调用。我查看了 Stack Overflow 上另一篇相关的帖子,猜想也许不应该直接调用 extract_keywords。有什么建议吗?
完整的错误日志如下:
TypeError Traceback (most recent call last)
<ipython-input-18-f06d098e147a> in <module>()
----> 1 keywords = model.extract_keywords(doc, keyphrase_ngram_range=(1, 1), stop_words =None)
2 print(keywords)
5 frames
/usr/local/lib/python3.7/dist-packages/keybert/model.py in extract_keywords(self, docs, candidates, keyphrase_ngram_range, stop_words, top_n, min_df, use_maxsum, use_mmr, diversity, nr_candidates, vectorizer)
112 diversity=diversity,
113 nr_candidates=nr_candidates,
--> 114 vectorizer=vectorizer)
115 elif isinstance(docs, list):
116 warnings.warn("Although extracting keywords for multiple documents is faster "
/usr/local/lib/python3.7/dist-packages/keybert/model.py in _extract_keywords_single_doc(self, doc, candidates, keyphrase_ngram_range, stop_words, top_n, use_maxsum, use_mmr, diversity, nr_candidates, vectorizer)
163
164 # Extract Embeddings
--> 165 doc_embedding = self.model.embed([doc])
166 candidate_embeddings = self.model.embed(candidates)
167
/usr/local/lib/python3.7/dist-packages/keybert/backend/_sentencetransformers.py in embed(self, documents, verbose)
51 that each have an embeddings size of `m`
52 """
---> 53 embeddings = self.embedding_model.encode(documents, show_progress_bar=verbose)
54 return embeddings
/usr/local/lib/python3.7/dist-packages/sentence_transformers/SentenceTransformer.py in encode(self, sentences, batch_size, show_progress_bar, output_value, convert_to_numpy, convert_to_tensor, device, normalize_embeddings)
154 for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar):
155 sentences_batch = sentences_sorted[start_index:start_index+batch_size]
--> 156 features = self.tokenize(sentences_batch)
157 features = batch_to_device(features, device)
158
/usr/local/lib/python3.7/dist-packages/sentence_transformers/SentenceTransformer.py in tokenize(self, texts)
307 Tokenizes the texts
308 """
--> 309 return self._first_module().tokenize(texts)
310
311 def get_sentence_features(self, *features):
/usr/local/lib/python3.7/dist-packages/sentence_transformers/models/Transformer.py in tokenize(self, texts)
98
99
--> 100 output.update(self.tokenizer(*to_tokenize, padding=True, truncation='longest_first', return_tensors="pt", max_length=self.max_seq_length))
101 return output
102
TypeError: 'DistilBertTokenizer' object is not callable
我尝试了不同的模型(BertTokenizer),但错误仍然存在。感谢任何建议。