I am trying to use the code from this link... see Example 6.

So here is the code:

import json
import nltk
import numpy

BLOG_DATA = "resources/ch05-webpages/feed.json"

N = 100  # Number of words to consider
CLUSTER_THRESHOLD = 5  # Distance between words to consider
TOP_SENTENCES = 5  # Number of sentences to return for a "top n" summary

# Approach taken from "The Automatic Creation of Literature Abstracts" by H.P. Luhn

def _score_sentences(sentences, important_words):
    scores = []
    sentence_idx = -1

for s in [nltk.tokenize.word_tokenize(s) for s in sentences]:

    sentence_idx += 1
    word_idx = []

    # For each word in the word list...
    for w in important_words:
        try:
            # Compute an index for where any important words occur in the sentence.

            word_idx.append(s.index(w))
        except ValueError, e: # w not in this particular sentence
            pass

    word_idx.sort()

    # It is possible that some sentences may not contain any important words at all.
    if len(word_idx)== 0: continue

    # Using the word index, compute clusters by using a max distance threshold
    # for any two consecutive words.

    clusters = []
    cluster = [word_idx[0]]
    i = 1
    while i < len(word_idx):
        if word_idx[i] - word_idx[i - 1] < CLUSTER_THRESHOLD:
            cluster.append(word_idx[i])
        else:
            clusters.append(cluster[:])
            cluster = [word_idx[i]]
        i += 1
    clusters.append(cluster)

    # Score each cluster. The max score for any given cluster is the score 
    # for the sentence.

    max_cluster_score = 0
    for c in clusters:
        significant_words_in_cluster = len(c)
        total_words_in_cluster = c[-1] - c[0] + 1
        score = 1.0 * significant_words_in_cluster \
            * significant_words_in_cluster / total_words_in_cluster

        if score > max_cluster_score:
            max_cluster_score = score

    scores.append((sentence_idx, score))

return scores

def summarize(txt):
   sentences = [s for s in nltk.tokenize.sent_tokenize(txt)]
   normalized_sentences = [s.lower() for s in sentences]

   words = [w.lower() for sentence in normalized_sentences for w in
         nltk.tokenize.word_tokenize(sentence)]

   fdist = nltk.FreqDist(words)

   top_n_words = [w[0] for w in fdist.items() 
        if w[0] not in nltk.corpus.stopwords.words('english')][:N]

   scored_sentences = _score_sentences(normalized_sentences, top_n_words)

 # Summarization Approach 1:
 # Filter out nonsignificant sentences by using the average score plus a
 # fraction of the std dev as a filter

avg = numpy.mean([s[1] for s in scored_sentences])
std = numpy.std([s[1] for s in scored_sentences])
mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
               if score > avg + 0.5 * std]

# Summarization Approach 2:
# Another approach would be to return only the top N ranked sentences

top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-TOP_SENTENCES:]
top_n_scored = sorted(top_n_scored, key=lambda s: s[0])

# Decorate the post object with summaries

return dict(top_n_summary=[sentences[idx] for (idx, score) in top_n_scored],
            mean_scored_summary=[sentences[idx] for (idx, score) in mean_scored])

blog_data = json.loads(open(BLOG_DATA).read())

for post in blog_data:

   post.update(summarize(post['content']))

   print post['title']
   print '=' * len(post['title'])
   print
   print 'Top N Summary'
   print '-------------'
   print ' '.join(post['top_n_summary'])
   print
   print 'Mean Scored Summary'
   print '-------------------'
   print ' '.join(post['mean_scored_summary'])
   print

But when I run it, it says:

Traceback (most recent call last):
  File "/home/jetonp/PycharmProjects/Summeriza/blogs_and_nlp__summarize.py", line 117, in <module>
    post.update(summarize(post['content']))
AttributeError: 'unicode' object has no attribute 'update'

Process finished with exit code 1

What is causing this error, and how do I fix it?

1 Answer

I figured it out. In the example you are working from, the summarize method returns a dictionary. Because of inconsistent indentation, your summarize method never returns anything: part of its body is indented with three spaces and part with none, so everything after the dedented lines sits at module level instead of inside the function. The standard indentation in Python is four spaces.
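As a minimal sketch of the failure mode (the function names here are made up for illustration, not taken from your post):

def returns_a_dict():
    return {'key': 'value'}    # consistently indented: part of the function

def returns_nothing():
    pass                       # the function body ends at the last indented line
result = {'key': 'value'}      # dedented: runs at module level, invisible to callers

print returns_a_dict()         # {'key': 'value'}
print returns_nothing()        # None

With consistent four-space indentation throughout, summarize should look like this: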

def summarize(txt):
    sentences = [s for s in nltk.tokenize.sent_tokenize(txt)]
    normalized_sentences = [s.lower() for s in sentences]

    words = [w.lower() for sentence in normalized_sentences for w in
          nltk.tokenize.word_tokenize(sentence)]

    fdist = nltk.FreqDist(words)

    top_n_words = [w[0] for w in fdist.items() 
        if w[0] not in nltk.corpus.stopwords.words('english')][:N]

    scored_sentences = _score_sentences(normalized_sentences, top_n_words)

    # Summarization Approach 1:
    # Filter out nonsignificant sentences by using the average score plus a
    # fraction of the std dev as a filter

    avg = numpy.mean([s[1] for s in scored_sentences])
    std = numpy.std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]

    # Summarization Approach 2:
    # Another approach would be to return only the top N ranked sentences

    top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-TOP_SENTENCES:]
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])

    # Decorate the post object with summaries

    return dict(top_n_summary=[sentences[idx] for (idx, score) in top_n_scored],
                mean_scored_summary=[sentences[idx] for (idx, score) in mean_scored])
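
Once the indentation is fixed (and assuming _score_sentences is indented consistently as well), a quick sanity check on a made-up snippet of text, rather than your feed data, shows that summarize returns a dictionary again, so post.update() has something to merge:

result = summarize("The first sentence mentions summarization. "
                   "The second sentence also mentions summarization and sentences. "
                   "The third sentence is filler.")
print type(result)             # <type 'dict'>
print sorted(result.keys())    # ['mean_scored_summary', 'top_n_summary']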
answered 2013-12-18T19:43:55.883