python - Python：在对象列表中获取两个布尔属性频率的最有效方法？

Question

我有一个用户对象，有两个布尔属性，如下所示：

class User(object):
  """A user described by two boolean flags, ``a`` and ``b``."""

  def __init__(self, a, b):
    # Both attributes are always bools.
    self.a, self.b = a, b

我有一个这些对象的列表，称为user_list，并且我想获得有多少对象具有 a == True、a == False、b == True 和 b == False 的频率计数。

我最初的方法是使用 collections.Counter，但这需要在列表中循环两次：

# Build one frequency table per attribute. Each generator expression walks
# user_list in full, so the list is traversed twice.
a_count = collections.Counter(u.a for u in user_list)
b_count = collections.Counter(u.b for u in user_list)
# A Counter yields 0 for a key it never saw, so an absent value prints as 0.
print a_count[True], a_count[False], b_count[True], b_count[False]

我还考虑过只使用 4 个计数器，但这很丑陋，而且感觉不像 Python：

a_true_count = 0
a_false_count = 0
b_true_count = 0
b_false_count = 0
for u in user_list:
  if u.a:
    a_true_count += 1
  else:
    a_false_count += 1
  if u.b:
    b_true_count += 1
  else:
    a_false_count += 1
print a_true_count, a_false_count, b_true_count, b_false_count

有没有更有效的方法来做到这一点？输出可以是任何东西：4 个单独的变量、一个带有值的 dict、一个列表、元组等等，只要其中包含这 4 个值。

提前致谢！

score 3 · Accepted Answer

我认为使用 collections.Counter 是正确的思路，只需用单个 Counter 和单次循环，以更通用的方式来做：

from collections import Counter

user_list = [User(True, False), User(False, True), User(True, True), User(False, False)]
user_attr_count = Counter()

for user in user_list:
    user_attr_count['a_%s' % user.a] += 1
    user_attr_count['b_%s' % user.b] += 1

print user_attr_count
# Counter({'b_False': 2, 'a_True': 2, 'b_True': 2, 'a_False': 2})

score 2 · Accepted Answer

为什么不只用两个计数器，再用 user_list 的长度减去它们来得到另外两个值？

# The false tally for an attribute is whatever remains of the list length
# once the true tally has been taken away.
total_users = len(user_list)
a_false_count = total_users - a_true_count
b_false_count = total_users - b_true_count

像这样显式循环可能是时间上最有效的解决方案，但如果您正在寻找更简洁的代码方式，您可以尝试filter()：

# filter() keeps the users for which the predicate is true, so the predicate
# has to select users whose attribute is false for these to be false counts.
# len() on the result relies on filter() returning a list (Python 2).
a_false_count = len(filter(lambda x: not x.a, user_list))
b_false_count = len(filter(lambda x: not x.b, user_list))

score 1 · Accepted Answer

您可以使用位掩码：

def count(user_list, mask):
    """Tally users by their packed (a, b) flags after applying ``mask``.

    Each user's flags are packed into an integer with ``a`` in bit 1 and
    ``b`` in bit 0; the value is ANDed with ``mask`` and used as the key.

    Returns a Counter mapping each masked value to the number of users
    that produced it.
    """
    tally = Counter()
    for user in user_list:
        packed = (user.a << 1) | user.b
        tally[packed & mask] += 1
    return tally

# Masks for the value packed by count(): a occupies bit 1, b occupies bit 0.
a=0b10
b=0b01
aANDb=0b11
# With both bits kept, the keys 0..3 distinguish every (a, b) combination.
print count(user_list,aANDb)

score 1 · Accepted Answer

from collections import Counter

c = Counter()
for u in user_list:
    c['a'] += u.a
    c['b'] += u.b

print c['a'], len(user_list) - c['a'], c['b'], len(user_list) - c['b']

score 1 · Accepted Answer

这是一个与您第一次获得的解决方案接近的解决方案，只是它只迭代列表一次。它创建两个计数器，遍历列表，并为每个用户更新每个计数器。进行计数的实际步骤在这里：

# update() takes an iterable of items to count, hence the one-element lists.
for user in user_list:
    a_count.update([user.a])
    b_count.update([user.b])

它使用更新函数来更新每个计数器对象。您可以这样做，而不是像在第一个示例中那样使用生成器在一行中创建计数器。整个代码示例在这里：

import collections

class User(object):
    """A user described by two flags, ``a`` and ``b``."""

    def __init__(self, a, b):
        self.a, self.b = a, b

# One user for each of the four (a, b) combinations.
user_list = [
    User(True, False),
    User(False, True),
    User(True, True),
    User(False, False)
]

# Two empty counters, filled together in the single pass below.
a_count = collections.Counter()
b_count = collections.Counter()

for user in user_list:
    # update() expects an iterable of items, hence the one-element lists.
    a_count.update([user.a])
    b_count.update([user.b])


# A Counter yields 0 for a key it never saw, so an absent value prints as 0.
print a_count[True], a_count[False], b_count[True], b_count[False]

score 1 · Accepted Answer

我喜欢对这些东西使用zip和map：

from collections import Counter
# for test, import random:
import random

# define class
class User(object):
  """A user described by two boolean flags, ``a`` and ``b``."""

  def __init__(self, a, b):
    # Both attributes are always bools.
    self.a, self.b = a, b

# create an arbitrary set: 100 random integers in [0, 100]
draws = (random.randint(0, 100) for _ in xrange(100))
# a is True for even draws, b is True for multiples of three
users = [User(r % 2 == 0, r % 3 == 0) for r in draws]

# and... count
# zip(*pairs) transposes the (a, b) pairs into one tuple of a values and one
# tuple of b values; map() then builds a Counter from each of them.
aCounter, bCounter = map(Counter, zip(*((u.a, u.b) for u in users)))

更新：map(sum, zip(*tuples)) 在较小的样本量上比 for 循环略快，但对于较大的样本量，for 循环的扩展性要好得多。与其他方法不同，for 循环并不能从直接处理元组列表中获得太多性能提升，可能是因为它本来就已经接近最优了。

collections.Counter还是很慢。

import random
import itertools
import time
from collections import Counter 

# define class
class User(object):
  """A user described by two boolean flags, ``a`` and ``b``."""

  def __init__(self, a, b):
    # Both attributes are always bools.
    self.a, self.b = a, b

# create an arbitrary sample: 100 random integers in [0, 100];
# a is True for even draws, b is True for multiples of three
draws = (random.randint(0, 100) for _ in xrange(100))
users = [User(r % 2 == 0, r % 3 == 0) for r in draws]
# create a list of (a, b) tuples from the arbitrary sample
users2 = [(user.a, user.b) for user in users]

# useful function-timer decorator
def timer(times=1):
    """Return a decorator that times ``times`` consecutive calls of a function.

    Calling the decorated function runs the original ``times`` times with
    the given arguments, prints a one-line summary (name, run count, elapsed
    seconds, calls per second) and returns the result of the last run.
    When ``times`` is zero or negative the original is never called and
    ``None`` is returned.
    """
    from functools import wraps

    def outer(fn):
        @wraps(fn)  # keep fn's name and docstring on the wrapper
        def wrapper(*args, **kwargs):
            r = None  # stays None when the loop body never runs
            t0 = time.time()
            # repeat(None, times) yields exactly `times` items (none when
            # times <= 0) and exists on both Python 2 and Python 3.
            for _ in itertools.repeat(None, times):
                r = fn(*args, **kwargs)
            dt = time.time() - t0
            # A coarse clock can report zero elapsed time for very fast runs.
            rate = times / dt if dt else float('inf')
            # Single-argument print(...) emits the same text on Python 2 and 3.
            print('{} ran {} times in {} seconds with {:f} ops/sec'.format(fn.__name__, times, dt, rate))
            return r
        return wrapper
    return outer

# now create the timeable functions
# n is the repetition count handed to the timer decorator
n=10000
@timer(times=n)
def time_sum():
    """Count True a/b values by transposing the (a, b) pairs and summing each column."""
    return map(sum, zip(*((u.a, u.b) for u in users)))
@timer(times=n)
def time_counter():
    """Build one Counter per attribute from the transposed (a, b) pairs."""
    return map(Counter, zip(*((u.a, u.b) for u in users)))
@timer(times=n)
def time_for():
    """Count True a/b values with an explicit loop; returns the pair (a, b)."""
    a,b=0,0
    for u in users:
        # identity test: only the True singleton itself is counted
        if u.a is True:
            a += 1
        if u.b is True:
            b += 1
    return a,b
@timer(times=n)
def time_itermapzip():
    """Same computation as time_sum, using itertools.imap/izip and materialising with list()."""
    return list(itertools.imap(sum, itertools.izip(*((u.a, u.b) for u in users))))

@timer(times=n)
def time_sum2():
    """Sum each column of the prebuilt (a, b) tuple list users2."""
    return map(sum, zip(*users2))
@timer(times=n)
def time_counter2():
    """Build one Counter per column of the prebuilt (a, b) tuple list users2."""
    return map(Counter, zip(*users2))
@timer(times=n)
def time_for2():
    """Count True values with an explicit loop over the (a, b) tuples in users2."""
    a,b=0,0
    for _a,_b in users2:
        # identity test: only the True singleton itself is counted
        if _a is True:
            a += 1
        if _b is True:
            b += 1
    return a,b
@timer(times=n)
def time_itermapzip2():
    """Same computation as time_sum2, using itertools.imap/izip and materialising with list()."""
    return list(itertools.imap(sum, itertools.izip(*users2))) 

# Run every benchmark once; each call prints its own timing line.
# v is overwritten each time, so it ends up holding the last call's result.
v = time_sum()
v = time_counter()
v = time_for()
v = time_itermapzip()

v = time_sum2()
v= time_counter2()
v = time_for2()
v = time_itermapzip2() 

# time_sum ran 10000 times in 0.446894168854 seconds with 22376.662523 ops/sec
# time_counter ran 10000 times in 1.29836297035 seconds with 7702.006471 ops/sec
# time_for ran 10000 times in 0.267076015472 seconds with 37442.523554 ops/sec
# time_itermapzip ran 10000 times in 0.459508895874 seconds with 21762.364319 ops/sec
# time_sum2 ran 10000 times in 0.174293994904 seconds with 57374.323226 ops/sec
# time_counter2 ran 10000 times in 0.989939928055 seconds with  10101.623055 ops/sec
# time_for2 ran 10000 times in 0.183295965195 seconds with 54556.574605 ops/sec
# time_itermapzip2 ran 10000 times in 0.193426847458 seconds with 51699.131384 ops/sec

# v[0] and v[1] are the True counts for a and b; the False counts are the
# remainder of len(users).
print "True a's: {}\t False a's: {}\nTrue b's: {}\t False b's:{}".format(v[0], len(users)-v[0], v[1], len(users)-v[1]) 
# True a's: 53   False a's: 47
# True b's: 31   False b's:69
v
# [53, 31]

样本大小为 1000 的相同代码：

# time_sum ran 10000 times in 9.30428719521 seconds with 1074.773359 ops/sec
# time_counter ran 10000 times in 16.7009849548 seconds with 598.767080 ops/sec
# time_for ran 10000 times in 2.61371207237 seconds with 3825.976130 ops/sec
# time_itermapzip ran 10000 times in 9.40824103355 seconds with 1062.897939 ops/sec
# time_sum2 ran 10000 times in 5.70988488197 seconds with 1751.348794 ops/sec
# time_counter2 ran 10000 times in 13.4643371105 seconds with 742.702735 ops/sec
# time_for2 ran 10000 times in 2.49017906189 seconds with 4015.775473 ops/sec
# time_itermapzip2 ran 10000 times in 6.10926699638 seconds with 1636.857581 ops/sec

python - Python：在对象列表中获取两个布尔属性频率的最有效方法？

6 回答 6

Related

Reference