0

我编写了下面的代码来构建用户项目矩阵和相似度矩阵,对电影镜头数据集使用了惊喜。使用 100K 数据集很好,但我尝试为 10M 数据集编译它,它没有完成超过 4 小时。我相信原因是迭代。你能帮我解决这个问题吗?

import os

import numpy as np
import pandas as pd
from surprise import Dataset, Reader
from surprise import SVD

from scipy.spatial import distance

file_path = os.path.expanduser('movie_lens_10M.dat')
reader = Reader(line_format='user item rating timestamp', sep='::')
data = Dataset.load_from_file(file_path, reader=reader)
trainset = data.build_full_trainset()

# Build an algorithm, and train it.
algo = SVD()
algo.fit(trainset)

data = pd.read_csv('movie_lens_10M.dat',
                   names=['user_id', 'movie_id', 'rating', 'time'],
                   engine='python', delimiter='::')

rating_matrix = np.ndarray(shape=(np.max(data.movie_id.values), np.max(data.user_id.values)), dtype=np.uint8)
rating_matrix[data.movie_id.values - 1, data.user_id.values - 1] = data.rating.values

# build item user matrix, if 0 than predict
for i in range(np.max(data.movie_id.values)):
    for j in range(np.max(data.user_id.values)):
        if rating_matrix[i][j] == 0:
            rating_matrix[i][j] = algo.predict(str(j + 1), str(i + 1))[3]

np.savetxt("rating_matrix.txt", rating_matrix, delimiter="\t")

# build similarity matrix
similarity_matrix = np.ndarray(shape=(np.max(data.user_id.values), np.max(data.user_id.values)), dtype=np.uint8)
for i in range(np.max(data.user_id.values)):
    for j in range(i, np.max(data.user_id.values)):
        similarity_matrix[i][j] = 1 - distance.correlation(rating_matrix[:, i], rating_matrix[:, j])
        similarity_matrix[j][i] = similarity_matrix[i][j]


np.savetxt("similarity_matrix.txt", similarity_matrix, delimiter="\t")
4

1 回答 1

0

n_factors 很重要。默认值 = 100。尝试一个小数字:

algo = SVD(n_factors = 10)
于 2020-12-07T22:52:05.193 回答