我编写了下面的代码来构建用户项目矩阵和相似度矩阵,对电影镜头数据集使用了惊喜。使用 100K 数据集很好,但我尝试为 10M 数据集编译它,它没有完成超过 4 小时。我相信原因是迭代。你能帮我解决这个问题吗?
import os
import numpy as np
import pandas as pd
from surprise import Dataset, Reader
from surprise import SVD
from scipy.spatial import distance
file_path = os.path.expanduser('movie_lens_10M.dat')
reader = Reader(line_format='user item rating timestamp', sep='::')
data = Dataset.load_from_file(file_path, reader=reader)
trainset = data.build_full_trainset()
# Build an algorithm, and train it.
algo = SVD()
algo.fit(trainset)
data = pd.read_csv('movie_lens_10M.dat',
names=['user_id', 'movie_id', 'rating', 'time'],
engine='python', delimiter='::')
rating_matrix = np.ndarray(shape=(np.max(data.movie_id.values), np.max(data.user_id.values)), dtype=np.uint8)
rating_matrix[data.movie_id.values - 1, data.user_id.values - 1] = data.rating.values
# build item user matrix, if 0 than predict
for i in range(np.max(data.movie_id.values)):
for j in range(np.max(data.user_id.values)):
if rating_matrix[i][j] == 0:
rating_matrix[i][j] = algo.predict(str(j + 1), str(i + 1))[3]
np.savetxt("rating_matrix.txt", rating_matrix, delimiter="\t")
# build similarity matrix
similarity_matrix = np.ndarray(shape=(np.max(data.user_id.values), np.max(data.user_id.values)), dtype=np.uint8)
for i in range(np.max(data.user_id.values)):
for j in range(i, np.max(data.user_id.values)):
similarity_matrix[i][j] = 1 - distance.correlation(rating_matrix[:, i], rating_matrix[:, j])
similarity_matrix[j][i] = similarity_matrix[i][j]
np.savetxt("similarity_matrix.txt", similarity_matrix, delimiter="\t")