我们可以实现一个跟踪 top-k 值的类,因为我不相信标准库有这个内置的。这将与主字典对象(可能是 a )并行保持最新。Counter
您还可以将其用作主字典对象的子类的属性。
执行
class MostCommon(object):
"""Keep track the top-k key-value pairs.
Attributes:
top: Integer representing the top-k items to keep track of.
store: Dictionary of the top-k items.
min: The current minimum of any top-k item.
min_set: Set where keys are counts, and values are the set of
keys with that count.
"""
def __init__(self, top):
"""Create a new MostCommon object to track key-value paris.
Args:
top: Integer representing the top-k values to keep track of.
"""
self.top = top
self.store = dict()
self.min = None
self.min_set = defaultdict(set)
def _update_existing(self, key, value):
"""Update an item that is already one of the top-k values."""
# Currently handle values that are non-decreasing.
assert value > self.store[key]
self.min_set[self.store[key]].remove(key)
if self.store[key] == self.min: # Previously was the minimum.
if not self.min_set[self.store[key]]: # No more minimums.
del self.min_set[self.store[key]]
self.min_set[value].add(key)
self.min = min(self.min_set.keys())
self.min_set[value].add(key)
self.store[key] = value
def __contains__(self, key):
"""Boolean if the key is one of the top-k items."""
return key in self.store
def __setitem__(self, key, value):
"""Assign a value to a key.
The item won't be stored if it is less than the minimum (and
the store is already full). If the item is already in the store,
the value will be updated along with the `min` if necessary.
"""
# Store it if we aren't full yet.
if len(self.store) < self.top:
if key in self.store: # We already have this item.
self._update_existing(key, value)
else: # Brand new item.
self.store[key] = value
self.min_set[value].add(key)
if value < self.min or self.min is None:
self.min = value
else: # We're full. The value must be greater minimum to be added.
if value > self.min: # New item must be larger than current min.
if key in self.store: # We already have this item.
self._update_existing(key, value)
else: # Brand new item.
# Make room by removing one of the current minimums.
old = self.min_set[self.min].pop()
del self.store[old]
# Delete the set if there are no old minimums left.
if not self.min_set[self.min]:
del self.min_set[self.min]
# Add the new item.
self.min_set[value].add(key)
self.store[key] = value
self.min = min(self.min_set.keys())
def __repr__(self):
if len(self.store) < 10:
store = repr(self.store)
else:
length = len(self.store)
largest = max(self.store.itervalues())
store = '<len={length}, max={largest}>'.format(length=length,
largest=largest)
return ('{self.__class__.__name__}(top={self.top}, min={self.min}, '
'store={store})'.format(self=self, store=store))
示例用法
>>> common = MostCommon(2)
>>> common
MostCommon(top=2, min=None, store={})
>>> common['a'] = 1
>>> common
MostCommon(top=2, min=1, store={'a': 1})
>>> 'a' in common
True
>>> common['b'] = 2
>>> common
MostCommon(top=2, min=1, store={'a': 1, 'b': 2})
>>> common['c'] = 3
>>> common
MostCommon(top=2, min=2, store={'c': 3, 'b': 2})
>>> 'a' in common
False
>>> common['b'] = 4
>>> common
MostCommon(top=2, min=3, store={'c': 3, 'b': 4})
更新值后的访问确实是 O(1)
>>> counter = Counter()
>>> for x in permutations(xrange(10), 10):
counter[x] += 1
>>> common = MostCommon(1)
>>> for key, value in counter.iteritems():
common[key] = value
>>> common
MostCommon(top=1, min=1, store={(9, 7, 8, 0, 2, 6, 5, 4, 3, 1): 1})
>>> timeit('repr(common)', 'from __main__ import common', number=1)
1.3251570635475218e-05
访问是 O(1),但是当在 O(n) 操作的 set-item 调用期间最小值发生变化时,其中n
是最高值的数量。这仍然优于Counter
,在每次访问期间都是 O(n),其中n
是整个词汇表的大小!