我正在试验 python NLTK 文本分类。这是我正在练习的代码示例:http ://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/
这是代码:
from nltk import bigrams
from nltk.probability import ELEProbDist, FreqDist
from nltk import NaiveBayesClassifier
from collections import defaultdict
train_samples = {}
with file ('data/positive.txt', 'rt') as f:
for line in f.readlines():
train_samples[line] = 'pos'
with file ('data/negative.txt', 'rt') as d:
for line in d.readlines():
train_samples[line] = 'neg'
f = open("data/test.txt", "r")
test_samples = f.readlines()
# Error in this code
# def bigramReturner(text):
# tweetString = text.lower()
# bigramFeatureVector = {}
# for item in bigrams(tweetString.split()):
# bigramFeatureVector.append(' '.join(item))
# return bigramFeatureVector
# Updated the code from the stack overflow comment
def bigramReturner (tweetString):
tweetString = tweetString.lower()
#comment the line since the function is not defined
#tweetString = removePunctuation (tweetString)
bigramFeatureVector = []
for item in nltk.unigrams(tweetString.split()):
bigramFeatureVector.append(' '.join(item))
return bigramFeatureVector
def get_labeled_features(samples):
word_freqs = {}
for text, label in train_samples.items():
tokens = text.split()
for token in tokens:
if token not in word_freqs:
word_freqs[token] = {'pos': 0, 'neg': 0}
word_freqs[token][label] += 1
return word_freqs
def get_label_probdist(labeled_features):
label_fd = FreqDist()
for item, counts in labeled_features.items():
for label in ['neg', 'pos']:
if counts[label] > 0:
label_fd.inc(label)
label_probdist = ELEProbDist(label_fd)
return label_probdist
def get_feature_probdist(labeled_features):
feature_freqdist = defaultdict(FreqDist)
feature_values = defaultdict(set)
num_samples = len(train_samples) / 2
for token, counts in labeled_features.items():
for label in ['neg', 'pos']:
feature_freqdist[label, token].inc(True, count=counts[label])
feature_freqdist[label, token].inc(None, num_samples - counts[label])
feature_values[token].add(None)
feature_values[token].add(True)
for item in feature_freqdist.items():
print item[0], item[1]
feature_probdist = {}
for ((label, fname), freqdist) in feature_freqdist.items():
probdist = ELEProbDist(freqdist, bins=len(feature_values[fname]))
feature_probdist[label, fname] = probdist
return feature_probdist
labeled_features = get_labeled_features(train_samples)
label_probdist = get_label_probdist(labeled_features)
feature_probdist = get_feature_probdist(labeled_features)
classifier = NaiveBayesClassifier(label_probdist, feature_probdist)
for sample in test_samples:
print "%s | %s" % (sample, classifier.classify(bigramReturner(sample)))
但是当我运行代码时,出现以下错误:
Traceback (most recent call last):
File "naive_bigram_1.py", line 87, in <module>
print "%s | %s" % (sample, classifier.classify(bigramReturner(sample)))
File "naive_bigram_1.py", line 30, in bigramReturner
tweetString = removePunctuation (tweetString)
NameError: global name 'removePunctuation' is not defined
我看到了类似的问题和其他错误,在这里我也用朴素贝叶斯分类器更新了 n-grams