我正在学习并尝试使用 LFM(潜在因子模型)构建玩具推荐系统。所以我在这个页面中找到了一些关于矩阵分解的东西(http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/)
该页面内的代码可以完美运行。但在我的工作中,矩阵应该是稀疏的,因为初始化后很多元素仍然是空白的。所以我用字典重写了它,一切都搞砸了。
以下是网页中给出的代码:
import numpy
def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
Q = Q.T
for step in xrange(steps):
for i in xrange(len(R)):
for j in xrange(len(R[i])):
if R[i][j] > 0:
eij = R[i][j] - numpy.dot(P[i,:],Q[:,j])
for k in xrange(K):
P_temp = P[i][k]
Q_temp = Q[k][j]
P[i][k] = P_temp + alpha * (2 * eij * Q_temp - beta * P_temp)
Q[k][j] = Q_temp + alpha * (2 * eij * P_temp - beta * Q_temp)
e = 0
for i in xrange(len(R)):
for j in xrange(len(R[i])):
if R[i][j] > 0:
e = e + pow(R[i][j] - numpy.dot(P[i,:],Q[:,j]), 2)
for k in xrange(K):
e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))
if e < 0.001:
break
return P, Q.T
if __name__ == '__main__':
R = [
[5,3,0,1],
[4,0,0,1],
[1,1,0,5],
[1,0,0,4],
[0,1,5,4],
]
R = numpy.array(R)
N = len(R)
M = len(R[0])
K = 2
P = numpy.random.rand(N,K)
Q = numpy.random.rand(M,K)
nP, nQ = matrix_factorization(R, P, Q, K)
nR = numpy.dot(nP, nQ.T)
这段代码可以正常工作。所以我写了以下代码:
import random
def matrix_factorization(R, P, Q, K,steps=5000, alpha=0.0002, beta=0.02):
for step in xrange(steps):
print 'step',step
step += 1
for i in R.keys():
for j in R[i].keys():
eij = R[i][j] - sum([x * y for x in P[i] for y in Q[j]])
for k in xrange(K):
P_temp = P[i][k]
Q_temp = Q[j][k]
P[i][k] = P_temp + alpha * (2 * eij * Q_temp - beta * P_temp)
Q[k][j] = Q_temp + alpha * (2 * eij * P_temp - beta * Q_temp)
e = 0
for i in R.keys():
for j in R[i].keys():
e += pow(R[i][j] - sum([x * y for x in P[i] for y in Q[j]]), 2)
for k in xrange(K):
e += (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))
if e < 0.001:
break
return P,Q
if __name__ == '__main__':
R = {0:{0:5,1:3,3:1},
1:{0:4,3:1},
2:{0:1,1:1,3:5},
3:{0:1,3:4},
4:{1:1,2:5,3:4}
}
N = len(R.keys())
M = 4
K = 4
P = dict()
Q = dict()
for i in xrange(N):
P[i] = [random.random() for x in xrange(K)]
for j in xrange(M):
Q[j] = [random.random() for x in xrange(K)]
P,Q = matrix_factorization(R,P,Q,K)
Rij = dict()
这两个部分应该具有相同的功能并且结构是相同的。但是!我的代码返回的是:
OverflowError: (34, 'Result too large')
或计算 P 和 Q 后显示:
P
Out[5]:
{0: [nan, nan, nan, nan],
1: [nan, nan, nan, nan],
2: [nan, nan, nan, nan],
3: [nan, nan, nan, nan],
4: [nan, nan, nan, nan]}
Q
Out[6]:
{0: [nan, nan, nan, nan],
1: [nan, nan, nan, nan],
2: [nan, nan, nan, nan],
3: [nan, nan, nan, nan]}
我只是不知道为什么,非常可悲的事实是我已经使用这种方法完成了我的推荐系统。你能帮我找出发生这种情况的原因吗?非常感谢您的宝贵时间!