1

我想(但仍然 Contino 不明白为什么会有差异)这段代码:

def categories(self):
    cur=self.con.execute('select category from cc');
    for d in cur:
        return d[0]

相当于另一个:

def categories(self):
    cur=self.con.execute('select category from cc');
    return [d[0] for d in cur]

但是,当我在代码中一个一个替换时,我在代码的其他位置出现错误:

  File "C:\Users\CG\Desktop\Google Drive\Sci&Tech\projects\naivebayes\main.py", line 226, in post
    spam_result = nb.classify(given_sentence)
  File "C:\Users\CG\Desktop\Google Drive\Sci&Tech\projects\naivebayes\main.py", line 204, in classify
    if cat==best: continue
UnboundLocalError: local variable 'best' referenced before assignment

为什么会这样?为什么这两段代码不等价?

完整代码:

# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-

import sqlite3

import USSSALoader

import random

from pysqlite2 import dbapi2 as sqlite

import re

import math

def getfeatures(doc):
  splitter=re.compile('\\W*')
  # Split the words by non-alpha characters
  words=[s.lower() for s in splitter.split(doc)
          if len(s)>2 and len(s)<20]
  # Return the unique set of words only
#  return dict([(w,1) for w in words]).iteritems()
  return dict([(w,1) for w in words])


class classifier:
  def __init__(self,getfeatures, filename=None):
    # Counts of feature/category combinations
    self.fc={}
    # Counts of documents in each category
    self.cc={}
    self.getfeatures=getfeatures

#  def setdb(self,dbfile):
    self.con=sqlite.connect('db_file')
#    self.con=sqlite3.connect(":memory:")
    self.con.execute('create table if not exists fc(feature,category,count)')
    self.con.execute('create table if not exists cc(category,count)')

  def incf(self,f,cat):
    count=self.fcount(f,cat)
    if count==0:
      self.con.execute("insert into fc values ('%s','%s',1)" % (f,cat))
    else:
      self.con.execute(
        "update fc set count=%d where feature='%s' and category='%s'"
        % (count+1,f,cat))

  def fcount(self,f,cat):
    res=self.con.execute(
      'select count from fc where feature="%s" and category="%s"'
      %(f,cat)).fetchone()
    if res==None: return 0
    else: return float(res[0])

  def incc(self,cat):
    count=self.catcount(cat)
    if count==0:
      self.con.execute("insert into cc values ('%s',1)" % (cat))
    else:
      self.con.execute("update cc set count=%d where category='%s'"
                       % (count+1,cat))

  def catcount(self,cat):
    res=self.con.execute('select count from cc where category="%s"'
                         %(cat)).fetchone()
    if res==None: return 0
    else: return float(res[0])

  def categories(self):
    cur=self.con.execute('select category from cc');
#    return [d[0] for d in cur]
    for d in cur:
 #       print "d =", d
  #      print "d[0] =", d[0]
        return d[0]

  def totalcount(self):
    res=self.con.execute('select sum(count) from cc').fetchone();
    if res==None: return 0
    print "res=self.con.execute('select * FROM cc').fetchall(); = ", self.con.execute('select * FROM cc').fetchall();
    print 'res sum(count) = ', res
    print 'res[0] = ', res[0]
    return res[0]

  def train(self,item,cat):
    features=self.getfeatures(item)
    # Increment the count for every feature with this category
    for f in features:
##    for f in features:
      self.incf(f,cat)
    # Increment the count for this category
    self.incc(cat)
    self.con.commit()

  def fprob(self,f,cat):
    if self.catcount(cat)==0: return 0
    # The total number of times this feature appeared in this
    # category divided by the total number of items in this category
    return self.fcount(f,cat)/self.catcount(cat)

  def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5):
    # Calculate current probability
    basicprob=prf(f,cat)
    # Count the number of times this feature has appeared in
    # all categories
    totals=sum([self.fcount(f,c) for c in self.categories()])
    # Calculate the weighted average
    bp=((weight*ap)+(totals*basicprob))/(weight+totals)
    return bp

class naivebayes(classifier):

  def __init__(self,getfeatures):
    classifier.__init__(self, getfeatures)
    self.thresholds={}

  def docprob(self,item,cat):
    features=self.getfeatures(item)
    # Multiply the probabilities of all the features together
    p=1
    for f in features: p*=self.weightedprob(f,cat,self.fprob)
    return p

  def prob(self,item,cat):
    catprob=self.catcount(cat)/self.totalcount()
    docprob=self.docprob(item,cat)
    return docprob*catprob

  def setthreshold(self,cat,t):
    self.thresholds[cat]=t

  def getthreshold(self,cat):
    if cat not in self.thresholds: return 1.0
    return self.thresholds[cat]

  def classify(self,item,default=None):
    probs={}
    # Find the category with the highest probability
    max=0.0
    for cat in self.categories():
      probs[cat]=self.prob(item,cat)
      if probs[cat]>max:
        max=probs[cat]
        best=cat

    # Make sure the probability exceeds threshold*next best
    for cat in probs:
      if cat==best: continue
      if probs[cat]*self.getthreshold(best)>probs[best]: return default
    return best

def sampletrain(cl):
  cl.train('Nobody owns the water.','good')
  cl.train('the quick rabbit jumps fences','good')
  cl.train('buy pharmaceuticals now','bad')
  cl.train('make quick money at the online casino','bad')
  cl.train('the quick brown fox jumps','good')


nb = naivebayes(getfeatures)

sampletrain(nb)


doc_test = "buy pharmaceuticals now or earn money at the online casino"

print ('\ndoc_test is classified as %s'%nb.classify(doc_test))
4

3 回答 3

4

一个函数只返回一次。

当你看到

for d in cur:
    return d[0]

循环在第一次迭代期间返回。

但是这个列表理解

return [d[0] for d in cur]

循环遍历每个项目cur以创建一个列表,然后返回结果。

于 2012-08-15T21:10:44.547 回答
1

Building on Steven Rumbalksi's answer, the following code:

dList = []
for d in cur:
    dList.append(d[0])
return dList

would be equivalent to:

return [d[0] for d in cur]

List comprehensions are really powerful like that, but they can be excessively dense ways of expressing the ideas, especially when you start nesting them, which leads to difficulty reading and debugging code.

于 2012-08-15T21:29:03.020 回答
1

我认为您可能会混淆生成器的常规功能,请更改

for d in cur:
    return d[0]

for d in cur:
    yield d[0]

返回一个可迭代的

于 2012-08-16T00:35:02.677 回答