I took all of these answers and wrote a script to 1. validate each of the results (see the assertion below) and 2. see which one is fastest. The code and results are below:
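For reference, every method below computes the same quantity: the cosine similarity between two rows u and v of the input matrix, sim(u, v) = (u · v) / (||u|| ||v||), i.e. the dot product of the L2-normalized rows. Methods 2 and 3 normalize the rows first and then take dot products, while methods 4 and 5 take the raw dot products first and divide by the row magnitudes afterwards.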
# Imports
import numpy as np
import scipy.sparse as sp
from scipy.spatial.distance import squareform, pdist
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
# Create an adjacency matrix
np.random.seed(42)
A = np.random.randint(0, 2, (10000, 100)).astype(float).T
# Make it sparse
rows, cols = np.where(A)
data = np.ones(len(rows))
Asp = sp.csr_matrix((data, (rows, cols)), shape=(rows.max()+1, cols.max()+1))
print "Input data shape:", Asp.shape
# Define a function to calculate the cosine similarities a few different ways
def calc_sim(A, method=1):
    if method == 1:
        return 1 - squareform(pdist(A, metric='cosine'))
    if method == 2:
        Anorm = A / np.linalg.norm(A, axis=-1)[:, np.newaxis]
        return np.dot(Anorm, Anorm.T)
    if method == 3:
        Anorm = A / np.linalg.norm(A, axis=-1)[:, np.newaxis]
        return linear_kernel(Anorm)
    if method == 4:
        similarity = np.dot(A, A.T)
        # squared magnitude of preference vectors (number of occurrences)
        square_mag = np.diag(similarity)
        # inverse squared magnitude
        inv_square_mag = 1 / square_mag
        # if it doesn't occur, set its inverse magnitude to zero (instead of inf)
        inv_square_mag[np.isinf(inv_square_mag)] = 0
        # inverse of the magnitude
        inv_mag = np.sqrt(inv_square_mag)
        # cosine similarity (elementwise multiply by inverse magnitudes)
        cosine = similarity * inv_mag
        return cosine.T * inv_mag
    if method == 5:
        '''
        Just a version of method 4 that takes in sparse arrays
        '''
        similarity = A * A.T
        # squared magnitudes; the row sums equal the squared norms because the data are binary (0/1)
        square_mag = np.array(A.sum(axis=1))
        # inverse squared magnitude
        inv_square_mag = 1 / square_mag
        # if it doesn't occur, set its inverse magnitude to zero (instead of inf)
        inv_square_mag[np.isinf(inv_square_mag)] = 0
        # inverse of the magnitude
        inv_mag = np.sqrt(inv_square_mag).T
        # cosine similarity (elementwise multiply by inverse magnitudes)
        cosine = np.array(similarity.multiply(inv_mag).todense())
        return cosine * inv_mag.T
    if method == 6:
        return cosine_similarity(A)
# Assert that all results are consistent with the first model ("truth")
for m in range(1, 7):
    if m == 5:  # the sparse case
        np.testing.assert_allclose(calc_sim(A, method=1), calc_sim(Asp, method=m))
    else:
        np.testing.assert_allclose(calc_sim(A, method=1), calc_sim(A, method=m))
# Time them:
print "Method 1"
%timeit calc_sim(A, method=1)
print "Method 2"
%timeit calc_sim(A, method=2)
print "Method 3"
%timeit calc_sim(A, method=3)
print "Method 4"
%timeit calc_sim(A, method=4)
print "Method 5"
%timeit calc_sim(Asp, method=5)
print "Method 6"
%timeit calc_sim(A, method=6)
Results:
Input data shape: (100, 10000)
Method 1
10 loops, best of 3: 71.3 ms per loop
Method 2
100 loops, best of 3: 8.2 ms per loop
Method 3
100 loops, best of 3: 8.6 ms per loop
Method 4
100 loops, best of 3: 2.54 ms per loop
Method 5
10 loops, best of 3: 73.7 ms per loop
Method 6
10 loops, best of 3: 77.3 ms per loop
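Based on these numbers, the plain NumPy dot-product approach (method 4) is the fastest on this dense data by a wide margin, with the pre-normalized dot products (methods 2 and 3) next. If you want to reproduce the timings outside of IPython (where the %timeit magic isn't available), a minimal sketch using the standard-library timeit module could look like the following; it assumes calc_sim, A and Asp are already defined as in the script above:

import timeit

for name, call in [("Method 1", lambda: calc_sim(A, method=1)),
                   ("Method 4", lambda: calc_sim(A, method=4)),
                   ("Method 5", lambda: calc_sim(Asp, method=5))]:
    # best of 3 repeats, 10 calls each, reported per call in milliseconds
    best = min(timeit.repeat(call, number=10, repeat=3)) / 10
    print("%s: %.2f ms per loop" % (name, best * 1000))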