这与@Jim Mischel 的回答基本相同,构建显式反向索引,但不构建显式 NxN 矩阵,并且是一个实现而不是整个系统的草图。
对于大量的 NUM_CUSTOMERS(比如 1,000,000),它的性能不是很好(3 次 similiarty_for 检查约 20 秒),但数据是随机的。据推测,实际客户购买数据中会有更多的相关性和更少的随机性。
首先,构建一些用于测试的合成数据:
import random
import operator
NUM_CUSTOMERS = 10000
NUM_ITEMS = 1000
MIN_PER_CUST = 2
MAX_PER_CUST = 10
NUM_INTEREST = 10
customers = ["customer_{}".format(num) for num in xrange(1, NUM_CUSTOMERS+1)]
items = ["item_{}".format(num) for num in xrange(1, NUM_ITEMS+1)]
customer_items = {
customer: random.sample(items, random.randint(MIN_PER_CUST, MAX_PER_CUST))
for customer in customers}
item_customers = {}
for customer, this_items in customer_items.iteritems():
for item in this_items:
item_customers.setdefault(item, [])
item_customers[item].append(customer)
然后定义几个函数:
def similarity(item1, item2):
item1_set = set(item_customers[item1])
item2_set = set(item_customers[item2])
num_common = len(item1_set.intersection(item2_set))
num_total = len(item1_set.union(item2_set))
return float(num_common) / num_total
def similarity_for(item):
to_check = {
itm for customer in item_customers[item]
for itm in customer_items[customer] }
to_check.discard(item)
rankings = {
item2: similarity(item, item2)
for item2 in to_check }
return sorted(rankings.iteritems(), key=operator.itemgetter(1), reverse=True)
并且,运行它:
for index, item in enumerate(sorted(random.sample(items, 3))):
rankings = similarity_for(item)
print "check {}; {} (of {}):".format(index, item, len(rankings))
for item_other, ranking in rankings[:NUM_INTEREST]:
print " {}: {}".format(item_other, ranking)
获得如下所示的结果:
% python /tmp/similarity.py
check 0; item_121 (of 309):
item_520: 0.0283018867925
item_361: 0.027027027027
item_536: 0.0265486725664
item_637: 0.0238095238095
item_515: 0.020202020202
item_750: 0.019801980198
item_960: 0.0192307692308
item_25: 0.0190476190476
item_548: 0.018691588785
item_841: 0.018691588785
check 1; item_714 (of 298):
item_162: 0.0285714285714
item_491: 0.0272727272727
item_617: 0.0265486725664
item_949: 0.0260869565217
item_788: 0.0192307692308
item_446: 0.0190476190476
item_558: 0.018691588785
item_606: 0.0181818181818
item_177: 0.0181818181818
item_577: 0.018018018018
check 2; item_991 (of 352):
item_758: 0.0298507462687
item_204: 0.025
item_85: 0.0247933884298
item_769: 0.024
item_501: 0.0232558139535
item_860: 0.0227272727273
item_408: 0.0225563909774
item_480: 0.0223880597015
item_73: 0.0220588235294
item_593: 0.021897810219