I'm extracting keywords with a great package I recently discovered, 'WordWise'. Special thanks to Jake Tae, who developed it.
It runs well on plain text and returns good results, but getting it to work on a DataFrame hasn't gone as smoothly. I decided to build a function that loops over every text element in a given column.
It does work, but it takes quite a while. After 2 or 3 minutes of running, it gave me this error:
IndexError: list index out of range
Here is my code:
from wordwise import Extractor

def machine_learning_bert(text):
    result = []
    # A new Extractor (and the BERT model behind it) is built on every call
    extractor = Extractor()
    # generate() returns a list of keywords; stringify it and strip the brackets
    keywords = str(extractor.generate(text, 5)).strip('[]')
    for e in keywords.split(', '):
        n = e.strip("'")
        hashtags = '#' + n
        result.append(hashtags)
    return ", ".join(result)
machine_learning_bert('''Bitcoin is great, we can call it BTC or btc. Space X and Apple are new comers in Etherium as well, we can call it ETH or eth. Non-tengible Token or NFT are the new Bitcoin.''')
Output:
'#apple, #btc, #etherium, #bitcoin, #new comers'
As you can see, it works on this small text, but it takes up to 6 seconds to run. My DataFrame has about 15,000 rows (× 2 columns), and I can't imagine how long that would take.
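One thing I suspect makes it slow: my function builds a new Extractor() on every call, which presumably reloads the underlying BERT model each time. A variant I'm considering (untested on the full DataFrame) that reuses a single instance, and skips the str()/strip() round trip since generate() already returns a list of keywords:

from wordwise import Extractor

# Load the extractor (and its BERT model) once, not once per row
extractor = Extractor()

def machine_learning_bert(text):
    # generate() returns a list of keyword strings directly
    keywords = extractor.generate(text, 5)
    return ", ".join('#' + k for k in keywords)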
Here is my code when I apply it to the DF:
ds['keyword_title (Bert)'] = ds['title'].apply(lambda x: machine_learning_bert(x))
So far it only works on a sample df of 10 rows. When I try to increase the size above 100, it returns the error I mentioned:
IndexError: list index out of range
The main problem is that I can't use the package's module across my whole column. I'm not sure whether the error comes from my function or whether the package itself is simply incompatible with what I'm trying to do.
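To narrow down whether a specific row is the culprit, I could wrap the call in a small helper (calling it safe_bert here) so a failing text gets logged instead of crashing the whole apply(). A diagnostic sketch, not a fix:

def safe_bert(text):
    try:
        return machine_learning_bert(text)
    except IndexError:
        # Report the offending row's text and keep going
        print(f"IndexError on: {text!r}")
        return ""

ds['keyword_title (Bert)'] = ds['title'].apply(safe_bert)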
If anyone has any insight into this, thanks in advance!
For more details, see the full error message here:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-74-204289d94930> in <module>
----> 1 ds['keyword_title (Bert)'] = ds['title'].apply(lambda x: machine_learning_bert(x))
2 ds
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in apply(self, func, convert_dtype, args, **kwds)
4136 else:
4137 values = self.astype(object)._values
-> 4138 mapped = lib.map_infer(values, f, convert=convert_dtype)
4139
4140 if len(mapped) and isinstance(mapped[0], Series):
pandas\_libs\lib.pyx in pandas._libs.lib.map_infer()
<ipython-input-74-204289d94930> in <lambda>(x)
----> 1 ds['keyword_title (Bert)'] = ds['title'].apply(lambda x: machine_learning_bert(x))
2 ds
<ipython-input-73-77200bab4caa> in machine_learning_bert(text)
2 result = []
3 extractor = Extractor()
----> 4 keywords = str(extractor.generate(text, 5)).strip('[]')
5 for e in keywords.split(', '):
6 n = e.strip("'")
C:\ProgramData\Anaconda3\lib\site-packages\wordwise\core.py in generate(self, text, top_k)
34 candidates = self.get_candidates(text)
35 text_embedding = self.get_embedding(text)
---> 36 candidate_embeddings = self.get_embedding(candidates)
37 distances = cosine_similarity(text_embedding, candidate_embeddings)
38 keywords = [candidates[index] for index in distances.argsort()[0][-top_k:]]
C:\ProgramData\Anaconda3\lib\site-packages\torch\autograd\grad_mode.py in decorate_context(*args, **kwargs)
26 def decorate_context(*args, **kwargs):
27 with self.__class__():
---> 28 return func(*args, **kwargs)
29 return cast(F, decorate_context)
30
C:\ProgramData\Anaconda3\lib\site-packages\wordwise\core.py in get_embedding(self, source)
58 if isinstance(source, str):
59 source = [source]
---> 60 tokens = self.tokenizer(source, padding=True, return_tensors="pt")
61 outputs = self.model(**tokens, return_dict=True)
62 embedding = self.parse_outputs(outputs)
C:\ProgramData\Anaconda3\lib\site-packages\transformers\tokenization_utils_base.py in __call__(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
2401 )
2402 batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
-> 2403 return self.batch_encode_plus(
2404 batch_text_or_text_pairs=batch_text_or_text_pairs,
2405 add_special_tokens=add_special_tokens,
C:\ProgramData\Anaconda3\lib\site-packages\transformers\tokenization_utils_base.py in batch_encode_plus(self, batch_text_or_text_pairs, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
2586 )
2587
-> 2588 return self._batch_encode_plus(
2589 batch_text_or_text_pairs=batch_text_or_text_pairs,
2590 add_special_tokens=add_special_tokens,
C:\ProgramData\Anaconda3\lib\site-packages\transformers\tokenization_utils_fast.py in _batch_encode_plus(self, batch_text_or_text_pairs, add_special_tokens, padding_strategy, truncation_strategy, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose)
437 # we add an overflow_to_sample_mapping array (see below)
438 sanitized_tokens = {}
--> 439 for key in tokens_and_encodings[0][0].keys():
440 stack = [e for item, _ in tokens_and_encodings for e in item[key]]
441 sanitized_tokens[key] = stack
IndexError: list index out of range