0

我目前正在研究用户项目协同过滤模型。

我有一组用户和他们购物过的地方,并尝试使用 R 构建推荐模型。

该项目有两个目标:a)向所有客户推荐新商店 b)提供统计数据以显示模型的准确性。

我有 2 年的数据。

为了回答 b),我将我的数据子集分配给在前 1.5 年和接下来的 6 个月内购买的客户。我在前 1.5 年的交易数据上创建了一个模型,然后将模型预测与 ACTUAL 6 个月的数据进行了比较。

通过执行上述操作,我确定我要使用 UBCF 和 nn=500,并且我能够达到大约 80% 的准确度。

但是,我现在不确定如何预测整个用户群。我正在考虑将整个数据集应用于我刚刚创建的模型,但存在偏差/将不准确,因为并非所有商店都在我创建的这个小模型中表示。


我读过人们做过不同事情的文章和教程。我见过他们输入整个数据集,并应用 [which] 子集,以便它以 80% 创建模型并使用剩余的 20% 进行测试。

我的问题是,如果我要使用这个过程,当模型只为 20% 的用户提供预测时,我将如何获得所有用户的推荐?

最好在整个数据集上创建模型吗?

子集数据

创建周期标志

#If in 1.5 years, then 1. If in following 6 months, then 0.
FV$Flag1<-ifelse(FV$Date<="2018-10-01",1,0) 
FV$Flag2<-ifelse(FV$Date>"2018-10-01",1,0) 

识别要在培训模型中使用的客户

#Create SCV
#FV
FV_SCV<-select(FV, Customer, Flag1, Flag2) %>% 
  group_by(Customer) %>%
  summarise_all(funs(sum)) #Sum all variables. 

#Determine which customers to use based on if they have purchased both in the first and second years
FV_SCV$Use<-ifelse(FV_SCV$Flag1>0 & FV_SCV$Flag2>0, 1,0)
培训模型的提取客户列表
#Training. Where customers have purchased both in the first & second year, but we only run the model on the first.
FV_Train<-FV_SCV %>%
  filter(Use==1 )

1 年内购买的客户和两年内购买的客户的子集,仅限第一年购物的客户

#FV_SCV$flag_sum<- FV_SCV$Flag1+FV_SCV$Flag2


培训模型中使用的客户的 SCV

#Join on the USE flag
FV_Train_Transactions<- FV %>% #Join on the page info
  left_join(select(FV_Train, Customer,  Use), by=c("Customer"="Customer"))

#Replace NA with 0
FV_Train_Transactions[is.na(FV_Train_Transactions)] <- 0

##Subset to only the users' transactions to be used in training
FV_Train_Transactions<-FV_Train_Transactions %>%
  filter(Use==1)

##Create date flag for train and test to use to create the model on the train and comparing the results with the output of the test df
FV_Train_Transactions_Compare<-FV_Train_Transactions %>%
  filter(Flag2>0)

##Create SCV for TRAIN 
FV_TRAIN_SCV<-FV_Train_Transactions %>%
  filter(Flag1>0) %>%
  group_by(Customer, Brand)%>%
  select(Customer, Brand) 

FV_TRAIN_SCV$Flag<-1

#Not sure why this hasn't selected unique rows, so remove duplicates.
FV_TRAIN_SCV<-distinct(FV_TRAIN_SCV)

##Create scv for TEST
FV_TEST_SCV<-FV_Train_Transactions_Compare %>%
  filter(Flag2>0) %>%
  select(Customer, Brand) %>%
  group_by(Customer, Brand)
FV_TEST_SCV$Flag<-1
#Not sure why this hasn't selected unique rows, so remove duplicates.
FV_TEST_SCV<-distinct(FV_TEST_SCV)

转置到列

install.packages("reshape")
install.packages("reshape2")
install.packages("tidytext")
library(reshape)
library(reshape2)
library(tidytext)
#Melt data for transposition
#Train
fv_train_md<-melt(FV_TRAIN_SCV, id=(c("Customer", "Brand")))
FV_TRAIN_SCV2<-dcast(fv_train_md, Customer~Brand, value="Flag",  fun.aggregate = mean)
#Test
fv_test_md<-melt(FV_TEST_SCV, id=(c("Customer" , "Brand")))
#Do the same for the overall transactions table
#Make FV_SCV a binary rating matrix
fv_overall<- FV[,c(1,3)] #The table name is case sensitive. Select only the customer and brand columns
fv_overall<- distinct(fv_overall) #Remove dups
fv_overall$Flag<-1

fv_overall_md<-melt(fv_overall, id=(c("Customer", "Brand")))
fv_overall_2<- dcast(fv_overall_md, Customer~Brand, value="Flag", fun.aggregate = mean)


#fv_test_123<-dcast(FV_TEST_SCV, Customer~Brand, value.var="Brand")

#colnames(fv_test_123)
#fv_test_12345<-which(fv_test_123==1, arr.ind=TRUE)
#fv_test_123<-colnames(fv_test_123)[fv_test_12345]
#fv_test_123
#fv_test_123_df<-as.data.frame((fv_test_123))

FV_TRAIN_SCV2<-dcast(fv_train_md, Customer~Brand, value="Value",  fun.aggregate = mean)
FV_TEST_SCV2<-dcast(fv_test_md, Customer~Brand, value="Value",  fun.aggregate = mean)

#Replace NaN with 0
is.nan.data.frame <- function(x)
do.call(cbind, lapply(x, is.nan))

FV_TRAIN_SCV2[is.nan(FV_TRAIN_SCV2)] <- 0
FV_TEST_SCV2[is.nan(FV_TEST_SCV2)] <- 0
fv_overall_2[is.nan(fv_overall_2)] <- 0
##
install.packages("recommenderlab")
library(recommenderlab)

row.names(FV_TRAIN_SCV2)<-FV_TRAIN_SCV2$Customer
FV_TRAIN_SCV2$Hawkers<-0
FV_TRAIN_SCV2$Pollini<-0
FV_TRAIN_SCV2$"Twin Set"<-0
FV_TRAIN_SCV2_matrix<-as.matrix(FV_TRAIN_SCV2[,2:ncol(FV_TRAIN_SCV2)])
FV_TRAIN_SCV2_binarymatrix<-as(FV_TRAIN_SCV2_matrix,"binaryRatingMatrix")

similarity_FV_train_trans_items<-similarity(FV_TRAIN_SCV2_binarymatrix, method="jaccard", which="items")

train_col<- data.frame(colnames(FV_TRAIN_SCV2))
#------------------------------------------------------------------------------------
row.names(fv_overall_2)<-fv_overall_2$Customer

#Convert NaN to 0
fv_overall_2[is.nan(fv_overall_2)]<-0
#fv_overall_matrix<- as.matrix(fv_overall_2[,2:(ncol(fv_overall_2)-3)])#Convert to matrix
fv_overall_matrix<- as.matrix(fv_overall_2[,2:ncol(fv_overall_2)])#Convert to matrix
#fv_overall_matrix<- as.matrix(fv_overall_matrix[,1:(ncol(fv_overall_2)-3)])
fv_matrix_binary<- as(fv_overall_matrix, "binaryRatingMatrix")  #Make a binary ratings matrix

FV_overall_similarity<-similarity(fv_matrix_binary, method="jaccard", which="items")
overall_col<- data.frame(colnames(fv_overall_2))
#---------------------------------------------------------------------------------------------------------


##
#Now, define multiple recommender algorithms to compare them all.

algorithms <- list(`user-based CF 50` = list(name = "UBCF",param = list(method = "Jaccard", nn = 50)),
                   `user-based CF 100` = list(name = "UBCF",param = list(method = "Jaccard", nn = 100)),
                   `user-based CF 200` = list(name = "UBCF",param = list(method = "Jaccard", nn = 200)),
                   `user-based CF 500` = list(name = "UBCF",param = list(method = "Jaccard", nn = 500)),
                   #
                   `item-based CF 3` = list(name = "IBCF",param = list(method = "Jaccard", k = 3)),
                   `item-based CF 5` = list(name = "IBCF",param = list(method = "Jaccard", k = 5)),
                   `item-based CF 10` = list(name = "IBCF",param = list(method = "Jaccard", k = 10)),
                   `item-based CF 50` = list(name = "IBCF",param = list(method = "Jaccard", k = 50))
                   )

scheme <- evaluationScheme(FV_TRAIN_SCV2_binarymatrix, method = "cross", k = 4,given = 1)
scheme <- evaluationScheme(fv_matrix_binary, method = "cross", k = 4,given = 1)
results <- evaluate(scheme, algorithms, n = c(1,2,3,4,5,6,7,8))
results <- evaluate(scheme, algorithms, n = c(1,2,3,4,5,6,7,8,50,100,200,500))#Evaluating with n=c(1,2,3.....) being the number of recommendations
#names(results) #Check that all results have run. 
#results

#Plot results to help determine which of the above models is best for further analysis
#plot(results, annotate = c(1, 3), legend = "right") #ROC Curve
#plot(results, "prec/rec", annotate = 3) #Precision/Recall Plot


这些图中的第一个(在 x 轴上带有 FPR)是 ROC 曲线。性能更好的模型是具有最高面积的曲线,因此在这些测试参数中,性能更好的模型是 nn=500 的 UBCF。或者,nn=50。

根据精度/召回图,nn 应设置为 500。

使用 UBCF 的模型 nn = 500

recc_model <- Recommender(data = FV_TRAIN_SCV2_binarymatrix, method = "UBCF", 
                          parameter = list(method = "Jaccard",
                                           nn=500))
model_details <- getModel(recc_model)
model_details




#Running on ENTIRE DATA
recc_model <- Recommender(data = fv_matrix_binary, method = "UBCF", 
                          parameter = list(method = "Jaccard",
                                           nn=500))
model_details <- getModel(recc_model)
model_details

install.packages("plyr")
library(plyr)
#Get the scores
recc_predicted <- predict(object = recc_model, newdata = FV_TRAIN_SCV2_binarymatrix, n = 198, type="ratings")
ibcf_scores<-as(recc_predicted, "list")
ibcf_list_scores<-ldply(ibcf_scores, rbind)
ibcf_list_scores[is.na(ibcf_list_scores)]<-0

#Get the list of brands
recc_predicted_list2<-predict(object=recc_model,newdata= FV_TRAIN_SCV2_binarymatrix,type="topNList", n=198)
recc_predicted_list2_a<- as(recc_predicted_list2, "list")
recc_predicted_list2_b<-ldply(recc_predicted_list2_a, rbind)
#recc_predicted_list2_b[is.na(recc_predicted_list2_b)]<-0

#------------------------------------------------------------
#On the overall model:

#Get the scores
recc_predicted <- predict(object = recc_model, newdata = fv_matrix_binary, n = 80, type="ratings")
ibcf_scores<-as(recc_predicted, "list")
ibcf_list_scores<-ldply(ibcf_scores, rbind)
ibcf_list_scores[is.na(ibcf_list_scores)]<-0

#Get the list of brands
recc_predicted_list2<-predict(object=recc_model,newdata= fv_matrix_binary,type="topNList", n=80)
recc_predicted_list2_a<- as(recc_predicted_list2, "list")
recc_predicted_list2_b<-ldply(recc_predicted_list2_a, rbind)
#recc_predicted_list2_b[is.na(recc_predicted_list2_b)]<-0

重塑 df 使所有评级都在一列中。然后使用它来创建一个唯一的表,然后进行计数,因为这总是会使 excel 崩溃。

install.packages("data.table")
library(data.table)
df.m1<- melt(ibcf_list_scores, id.vars=c(".id"),
             value.name="Rating")

df.m1.unique<- data.frame(df.m1)
df.m1.unique$variable<-NULL
df.m1.unique$.id<-NULL

#df.m1.unique<-distinct(df.m1.unique)
#df.m1.unique<- df.m1.unique[order(df.m1.unique$Rating),] #This comma means it is only ordering based on this one var.

#Using ave
df.m1.unique$count<- ave(df.m1.unique$Rating, df.m1.unique[,c("Rating")], FUN=length)
rownames(df.m1.unique) <- c() #Remove rownames
df.m1.unique<-distinct(df.m1.unique)
df.m1.unique<- df.m1.unique[order(-df.m1.unique$Rating),]#Sort by ascending rating

#Plot this
df.m1.unique.plot<- data.frame(df.m1.unique[2:(nrow(df.m1.unique)-1),])
#plot(x=df.m1.unique.plot$Rating, y=df.m1.unique.plot$count)

#Get the cumulative distribution
df.m1.unique.plot2<- df.m1.unique.plot %>%
  mutate(Percentage=cumsum(100*(count/sum(count))),
         cumsum=cumsum(count))


移除评分

#a) Remove values that are less than specific rating
#Using logical indexing with replacement
ibcf_list_scores_removal<- ibcf_list_scores

#Replace low values with 0
ibcf_list_scores_removal[,2:ncol(ibcf_list_scores_removal)][ibcf_list_scores_removal[,2:ncol(ibcf_list_scores_removal)] < 0.0217] <- 0

#To flag whether customer is recommended the brand, replace all values >0 with 1. Keep 0 as is.
ibcf_list_scores_removal_b<- ibcf_list_scores_removal #Call a new df
ibcf_list_scores_removal_b[,2:ncol(ibcf_list_scores_removal_b)][ibcf_list_scores_removal_b[,2:ncol(ibcf_list_scores_removal_b)] > 0] <- 1#Create the flag

所以基本上我想知道如何在我的整个数据集上创建模型?以及如何提取所有评级?

谢谢

4

0 回答 0