I am trying to compute the TextRank score for each similarity matrix. A summarize function is defined to produce the summary, and it is called once for every list of sentences in result.
However, ranking the sentences with the PageRank algorithm raises an error. I tried to debug it by manually changing the max_iter value in the PageRank call, but the error stays the same.
get_score function
It is called inside the summarize function, and the error is raised inside this function.
def get_score(sim_mat):
    import networkx as nx
    # Build an undirected weighted graph from the similarity matrix
    nx_graph = nx.from_numpy_array(sim_mat)
    score = nx.pagerank(nx_graph, max_iter=500)
    return score
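A fallback I am considering (a sketch only; it relaxes the tolerance and, if power iteration still fails, switches to the dense eigenvector solver; it assumes a NetworkX 2.x install where nx.pagerank_numpy still exists, since that function was removed in NetworkX 3.0):

def get_score_fallback(sim_mat):
    import networkx as nx
    nx_graph = nx.from_numpy_array(sim_mat)
    try:
        # A looser tol often converges where the default 1e-6 does not
        return nx.pagerank(nx_graph, max_iter=1000, tol=1.0e-4)
    except nx.PowerIterationFailedConvergence:
        # Exact dense solver: no power iteration, so it cannot fail to
        # converge (assumes pagerank_numpy is available in this version)
        return nx.pagerank_numpy(nx_graph)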
summarize function
It takes the raw text and returns the summary.
def summarize(text):
    sentences = sent_tokenize(text)

    # Clean every sentence with the custom text_preprocessing class
    t_clean_sentences = []
    for i in range(len(sentences)):
        obj = text_preprocessing(sentences[i])
        j = obj.text_cleaner()
        t_clean_sentences.append(j)

    # Drop sentences the classifier labels as questions
    clean_sentences = []
    for i in range(len(t_clean_sentences)):
        a = gb.predict(vectorizer.transform([t_clean_sentences[i]]))
        if a[0] != 'whQuestion' and a[0] != 'ynQuestion':
            clean_sentences.append(t_clean_sentences[i])

    # Remove English stop words
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    stop_words = set(stopwords.words('english'))
    filtered_sentences = []
    for i in range(len(clean_sentences)):
        word_tokens = word_tokenize(clean_sentences[i])
        filtered_sentence = [w for w in word_tokens if w not in stop_words]
        filtered_sentences.append(" ".join(filtered_sentence))

    import numpy as np
    # Sentence vectors: average of the 100-dimensional word embeddings
    sentence_vectors = []
    for i in filtered_sentences:
        if len(i) != 0:
            v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()]) / (len(i.split()) + 0.001)
        else:
            v = np.zeros((100,))
        sentence_vectors.append(v)

    # Pairwise cosine similarity matrix (diagonal left at zero)
    from sklearn.metrics.pairwise import cosine_similarity
    sim_mat = np.zeros([len(clean_sentences), len(clean_sentences)])
    for i in range(len(clean_sentences)):
        for j in range(len(clean_sentences)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1, 100),
                                                  sentence_vectors[j].reshape(1, 100))[0, 0]

    # PageRank scores
    scores = get_score(sim_mat)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(clean_sentences)), reverse=True)

    # Generate the summary from the ranked sentences
    summary = []
    for i in range(len(ranked_sentences)):
        summary.append(ranked_sentences[i][1].capitalize())
    return summary
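A small helper could sanity-check sim_mat before ranking it (a sketch; inspect_sim_mat is just an illustrative name). All-zero rows correspond to sentences with no similarity edges (dangling nodes in the PageRank graph), and cosine similarity can be negative, while PageRank treats edge weights as transition probabilities and expects them to be non-negative:

import numpy as np

def inspect_sim_mat(sim_mat):
    # Rows that sum to zero are isolated sentences (dangling nodes)
    zero_rows = np.where(sim_mat.sum(axis=1) == 0)[0]
    print("isolated sentences:", zero_rows.tolist())
    # Negative weights violate PageRank's assumptions
    print("negative entries:", int((sim_mat < 0).sum()))
    print("value range: [%.4f, %.4f]" % (sim_mat.min(), sim_mat.max()))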
Function call
result has a size of 100. When I try it on only the first 50 sentence lists in result, it works fine. I then set up a loop that summarizes 50 sentence lists at a time and keeps going until the full size of result is reached, but it still shows the same error.
# text is the raw text from the TXT file
result = list(filter(lambda x: x != '', text.split(':')))
compiled = []
for r in result:
    compiled.append(summarize(r))
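The 50-at-a-time version I described looks roughly like this (a sketch of my batching; chunk_size is just an illustrative name, and summarize is the same function as above):

# Batched variant: process result in chunks of 50 sentence lists
chunk_size = 50
compiled = []
for start in range(0, len(result), chunk_size):
    for r in result[start:start + chunk_size]:
        compiled.append(summarize(r))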
Error
---------------------------------------------------------------------------
PowerIterationFailedConvergence Traceback (most recent call last)
<ipython-input-22-a04a4d4d0dfb> in <module>()
1 compiled = []
2 for r in range(len(result)):
----> 3 compiled.append(summarize(result[r]))
3 frames
<ipython-input-21-c7462482feb4> in summarize(text)
45
46 #pagerank scores
---> 47 scores = get_score(sim_mat)
48 ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(clean_sentences)), reverse=True)
49 # Specify number of sentences to form the summary
<ipython-input-10-798a017cf041> in get_score(sim_mat)
2 import networkx as nx
3 nx_graph = nx.from_numpy_array(sim_mat)
----> 4 score = nx.pagerank(nx_graph)
5 return score
<decorator-gen-431> in pagerank(G, alpha, personalization, max_iter, tol, nstart, weight, dangling)
/usr/local/lib/python3.6/dist-packages/networkx/utils/decorators.py in _not_implemented_for(not_implement_for_func, *args, **kwargs)
80 raise nx.NetworkXNotImplemented(msg)
81 else:
---> 82 return not_implement_for_func(*args, **kwargs)
83 return _not_implemented_for
84
/usr/local/lib/python3.6/dist-packages/networkx/algorithms/link_analysis/pagerank_alg.py in pagerank(G, alpha, personalization, max_iter, tol, nstart, weight, dangling)
156 if err < N * tol:
157 return x
--> 158 raise nx.PowerIterationFailedConvergence(max_iter)
159
160
PowerIterationFailedConvergence: (PowerIterationFailedConvergence(...), 'power iteration failed to converge within 100 iterations')
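For reference, here is a minimal standalone version of just the ranking step, to isolate whether convergence depends on the data (the sim_mat here is synthetic; a matrix saved from inside summarize could be swapped in):

import numpy as np
import networkx as nx

# Synthetic stand-in for the matrix built inside summarize:
# symmetric, zero diagonal, values in [0, 1]
np.random.seed(0)
n = 50
sim_mat = np.random.rand(n, n)
sim_mat = (sim_mat + sim_mat.T) / 2
np.fill_diagonal(sim_mat, 0.0)

nx_graph = nx.from_numpy_array(sim_mat)
try:
    scores = nx.pagerank(nx_graph, max_iter=500)
    print("converged; highest-ranked node:", max(scores, key=scores.get))
except nx.PowerIterationFailedConvergence as err:
    print("failed to converge:", err)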