I am supporting a system that uses code generated by Rattle (the R data-mining GUI) to make predictions. Looking closely at the output, however, I found that the code does not appear to predict anything at all: it simply writes the input file back out as-is, even though I need scores for new data. Could someone with R experience please look at the code below and point out what might be wrong? The person who built this has left the company, I am completely new to R, and I would appreciate any help. A short sketch of how I believe the function is invoked follows the code. The code is as follows:
# Rattle is Copyright (c) 2006-2014 Togaware Pty Ltd.
#============================================================
# Rattle timestamp: 2015-08-03 08:32:05 x86_64-w64-mingw32
# Rattle version 3.4.1 user 'vikashs'
# Export this log textview to a file using the Export button or the Tools
# menu to save a log of all activity. This facilitates repeatability. Exporting
# to file 'myrf01.R', for example, allows us to type the command source('myrf01.R')
# in the R Console to repeat the process automatically.
# Generally, we may want to edit the file to suit our needs. We can also directly
# edit this current log textview to record additional information before exporting.
# Saving and loading projects also retains this log.
library(rattle)
library(pROC)
library(ggplot2)
library(randomForest)
library(ROCR)
# This log generally records the process of building a model. However, with very
# little effort the log can be used to score a new dataset. The logical variable
# 'building' is used to toggle between generating transformations, as when building
# a model, and simply using the transformations, as when scoring a dataset.
predictFraud <- function(inputVector) {
building <- TRUE
scoring <- ! building
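# For readers new to Rattle logs: the 'building'/'scoring' switch only matters when the
# log contains data transformations. A typical (hypothetical) use looks like the sketch
# below; this particular log defines the flags but never branches on them.
# if (building)
# {
#   # derive a transformation from the training data, e.g. impute or normalise a column
# } else
# {
#   # re-apply the recorded transformation to the new data being scored
# }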
# The colorspace package is used to generate the colours used in plots, if available.
library(colorspace)
# A pre-defined value is used to reset the random seed so that results are repeatable.
crv$seed <- 42
#============================================================
# Rattle timestamp: 2015-08-03 08:32:18 x86_64-w64-mingw32
# Load the data.
crs$dataset <- read.csv(java_file1, na.strings=c(".", "NA", "", "?"), strip.white=TRUE, encoding="UTF-8")
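# Note: 'java_file1' is not defined inside this function; it is assumed to be a CSV file
# path supplied in the calling environment (presumably set by the Java layer) before
# predictFraud() is invoked.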
#============================================================
# Rattle timestamp: 2015-08-03 08:32:30 x86_64-w64-mingw32
# Note the user selections.
# Build the training/validate/test datasets.
set.seed(crv$seed)
crs$nobs <- nrow(crs$dataset) # 1000 observations
crs$sample <- crs$train <- sample(nrow(crs$dataset), 0.7*crs$nobs) # 700 observations
crs$validate <- NULL
crs$test <- setdiff(setdiff(seq_len(nrow(crs$dataset)), crs$train), crs$validate) # 300 observations
# The following variable selections have been noted.
crs$input <- inputVector
crs$numeric <- c("TRANS_AMOUNT", "CUST_ACC_LIABLTY", "CUST_AGE", "CUST_SALARY")
crs$categoric <- c("ACTVTY_TYP_NAME", "TRNS_TYP_NAME", "TRNS_CTGRY_NAME", "TRNS_CRRNCY_NAME",
"CUST_ACC_DEF_STATUS", "CUST_ACC_RGSTN_TYPE", "CUST_ACC_OWNRSHP_TYPE", "CUST_ACC_BRNCH",
"CUST_GENDR", "CUST_MARITAL_STS", "CUST_EMPLYMNT_PRFLE", "CUST_TOT_FMLY_MMBRS",
"CUST_DEPNDNTS", "CUST_EXSTNG_LOAN", "CUST_EXSTNG_INSRNCE", "LOAN_TYPE",
"LOAN_STATUS", "LOAN_DFAULT", "LOAN_FRCLSRE_FLG", "LOAN_TRNSFR_FLG",
"INS_STATUS", "INS_DFAULT", "INS_SRRNDR_FLG")
crs$target <- colnames(crs$dataset)[ncol(crs$dataset)]
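# Note: the target is taken to be the last column of the dataset, while the model
# formulas below hard-code 'Response'; this only lines up if 'Response' really is the
# final column of the training file.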
crs$risk <- NULL
crs$ident <- "ID"
crs$ignore <- NULL
crs$weights <- NULL
#============================================================
# Rattle timestamp: 2015-08-03 08:32:36 x86_64-w64-mingw32
# The 'randomForest' package provides the 'randomForest' function.
require(randomForest, quietly=TRUE)
# Build the Random Forest model.
set.seed(crv$seed)
crs$rf <- randomForest(Response ~ .,
data=crs$dataset[crs$sample,c(crs$input, crs$target)],
ntree=500,
mtry=5,
importance=TRUE,
na.action=na.roughfix,
replace=FALSE)
# Rank the inputs by mean decrease in Gini (column 4 of the importance matrix for a
# two-class target) and save the ranking.
rn <- round(importance(crs$rf), 2)
itm <- length(crs$input)
GiniIndex <- sort(rn[, 4], decreasing=TRUE)[1:itm]
write.csv(data.frame(GiniIndex), "D:/VarImp.csv")
# The sorted ranking already carries the variable names in order, so take them directly.
col_for_prediction <- names(GiniIndex)
# Regression model
# Build a Regression model.
crs$glm <- glm(Response ~ .,
data=crs$dataset[crs$train, c(crs$input, crs$target)],
family=binomial(link="logit"))
# Generate a textual view of the Linear model.
print(summary(crs$glm))
cat(sprintf("Log likelihood: %.3f (%d df)\n",
logLik(crs$glm)[1],
attr(logLik(crs$glm), "df")))
cat(sprintf("Null/Residual deviance difference: %.3f (%d df)\n",
crs$glm$null.deviance-crs$glm$deviance,
crs$glm$df.null-crs$glm$df.residual))
cat(sprintf("Chi-square p-value: %.8f\n",
dchisq(crs$glm$null.deviance-crs$glm$deviance,
crs$glm$df.null-crs$glm$df.residual)))
cat(sprintf("Pseudo R-Square (optimistic): %.8f\n",
cor(crs$glm$y, crs$glm$fitted.values)))
cat('\n==== ANOVA ====\n\n')
print(anova(crs$glm, test="Chisq"))
cat("\n")
# Time taken: 0.56 secs
#============================================================
# Rattle timestamp: 2015-08-03 08:32:41 x86_64-w64-mingw32
# Evaluate model performance.
# Generate an Error Matrix for the Linear model.
# Obtain the response from the Linear model.
crs$pr <- as.vector(ifelse(predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)]) > 0.5, "Y", "N"))
# Generate the confusion matrix showing counts.
table(crs$dataset[crs$test, c(crs$input, crs$target)]$Response, crs$pr,
dnn=c("Actual", "Predicted"))
# Generate the confusion matrix showing proportions.
pcme <- function(actual, cl)
{
x <- table(actual, cl)
tbl <- cbind(round(x/length(actual), 2),
Error=round(c(x[1,2]/sum(x[1,]),
x[2,1]/sum(x[2,])), 2))
names(attr(tbl, "dimnames")) <- c("Actual", "Predicted")
return(tbl)
}
pcme(crs$dataset[crs$test, c(crs$input, crs$target)]$Response, crs$pr)
# Calculate the overall error percentage.
overall <- function(x)
{
if (nrow(x) == 2)
cat((x[1,2] + x[2,1]) / sum(x))
else
cat(1 - (x[1,rownames(x)]) / sum(x))
}
overall(table(crs$pr, crs$dataset[crs$test, c(crs$input, crs$target)]$Response,
dnn=c("Predicted", "Actual")))
# Calculate the averaged class error percentage.
avgerr <- function(x)
cat(mean(c(x[1,2], x[2,1]) / apply(x, 1, sum)))
avgerr(table(crs$pr, crs$dataset[crs$test, c(crs$input, crs$target)]$Response,
dnn=c("Predicted", "Actual")))
#============================================================
# Rattle timestamp: 2015-08-03 08:32:46 x86_64-w64-mingw32
# Evaluate model performance.
# Risk Chart: requires the ggplot2 package.
library(ggplot2)
# Generate a risk chart.
# Rattle provides evaluateRisk() and riskchart().
crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)])
crs$eval <- evaluateRisk(crs$pr, crs$dataset[crs$test, c(crs$input, crs$target)]$Response)
#dp3<-print(riskchart(crs$pr,
# crs$dataset[crs$test, c(crs$input, crs$target)]$Response,
# title="Performance Chart Linear Master_File_New_02_New_Ver01.csv [test] ", show.lift=TRUE, show.precision=TRUE))
#============================================================
# Rattle timestamp: 2015-08-03 08:32:53 x86_64-w64-mingw32
# Evaluate model performance.
# Lift Chart: requires the ROCR package.
library(ROCR)
# Obtain predictions for the glm model on Master_File_New_02_New_Ver01.csv [test].
crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)])
# Remove observations with missing target.
no.miss <- na.omit(crs$dataset[crs$test, c(crs$input, crs$target)]$Response)
miss.list <- attr(no.miss, "na.action")
attributes(no.miss) <- NULL
if (length(miss.list))
{
pred <- prediction(crs$pr[-miss.list], no.miss)
} else
{
pred <- prediction(crs$pr, no.miss)
}
# Convert rate of positive predictions to percentage.
per <- performance(pred, "lift", "rpp")
per@x.values[[1]] <- per@x.values[[1]]*100
# Plot the lift chart.
#plot(per, col="#CC0000FF", lty=1, xlab="Caseload (%)", add=FALSE)
# Generate a second lift curve (labelled [train] in the original Rattle log; note that the code below re-uses the [test] subset).
crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)])
# Also convert rate of positive predictions to percentage
per <- performance(prediction(crs$pr, crs$dataset[crs$test, c(crs$input, crs$target)]$Response),"lift", "rpp")
per@x.values[[1]] <- per@x.values[[1]]*100
# Now plot the lift.
# Bug in ROCR 1.0-3: plot does not obey the add command.
# Calling the function directly (.plot.performance) does work.
#.plot.performance(per, col="#00CCCCFF", lty=2, add=TRUE)
# Add a legend to the plot.
#legend("topright", c("Test","Train"), col=rainbow(2, 1, .8), lty=1:2, title="Linear", inset=c(0.05, 0.05))
# Add decorations to the plot.
#title(main="Lift Chart Master_File_New_02_New_Ver01.csv ",
# sub=paste("Rattle", format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#grid()
#============================================================
# Rattle timestamp: 2015-08-03 08:32:56 x86_64-w64-mingw32
# Evaluate model performance.
# ROC Curve: requires the ROCR package.
library(ROCR)
# ROC Curve: requires the ggplot2 package.
require(ggplot2, quietly=TRUE)
# Generate an ROC Curve for the glm model on Master_File_New_02_New_Ver01.csv [test].
crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)])
# Remove observations with missing target.
no.miss <- na.omit(crs$dataset[crs$test, c(crs$input, crs$target)]$Response)
miss.list <- attr(no.miss, "na.action")
attributes(no.miss) <- NULL
if (length(miss.list))
{
pred <- prediction(crs$pr[-miss.list], no.miss)
} else
{
pred <- prediction(crs$pr, no.miss)
}
pe <- performance(pred, "tpr", "fpr")
au <- performance(pred, "auc")@y.values[[1]]
pd <- data.frame(fpr=unlist(pe@x.values), tpr=unlist(pe@y.values))
p <- ggplot(pd, aes(x=fpr, y=tpr))
p <- p + geom_line(colour="red")
p <- p + xlab("False Positive Rate") + ylab("True Positive Rate")
p <- p + ggtitle("ROC Curve Linear Master_File_New_02_New_Ver01.csv [test] Response")
p <- p + theme(plot.title=element_text(size=10))
p <- p + geom_line(data=data.frame(), aes(x=c(0,1), y=c(0,1)), colour="grey")
p <- p + annotate("text", x=0.50, y=0.00, hjust=0, vjust=0, size=5,
label=paste("AUC =", round(au, 2)))
#print(p)
# Calculate the area under the curve for the plot.
# Remove observations with missing target.
no.miss <- na.omit(crs$dataset[crs$test, c(crs$input, crs$target)]$Response)
miss.list <- attr(no.miss, "na.action")
attributes(no.miss) <- NULL
if (length(miss.list))
{
pred <- prediction(crs$pr[-miss.list], no.miss)
} else
{
pred <- prediction(crs$pr, no.miss)
}
performance(pred, "auc")
#============================================================
# Rattle timestamp: 2015-08-03 08:32:59 x86_64-w64-mingw32
# Evaluate model performance.
# Precision/Recall Plot: requires the ROCR package
library(ROCR)
# Generate a Precision/Recall Plot for the glm model on Master_File_New_02_New_Ver01.csv [test].
crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)])
# Remove observations with missing target.
no.miss <- na.omit(crs$dataset[crs$test, c(crs$input, crs$target)]$Response)
miss.list <- attr(no.miss, "na.action")
attributes(no.miss) <- NULL
if (length(miss.list))
{
pred <- prediction(crs$pr[-miss.list], no.miss)
} else
{
pred <- prediction(crs$pr, no.miss)
}
#plot(performance(pred, "prec", "rec"), col="#CC0000FF", lty=1, add=FALSE)
dp2<-performance(pred, "prec", "rec")
# Generate a second Precision/Recall curve (labelled [train] in the original Rattle log; the code below again uses the [test] subset).
crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)])
# In ROCR (1.0-3) plot does not obey the add command.
# Calling the function directly (.plot.performance) does work.
#.plot.performance(performance(prediction(crs$pr, crs$dataset[crs$test, c(crs$input, crs$target)]$Response),"prec", "rec"), col="#00CCCCFF", lty=2, add=TRUE)
# Add a legend to the plot.
#legend("bottomleft", c("Test","Train"), col=rainbow(2, 1, .8), lty=1:2, title="glm", inset=c(0.05, 0.05))
# Add decorations to the plot.
#title(main="Precision/Recall Plot Master_File_New_02_New_Ver01.csv ",
# sub=paste("Rattle", format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#grid()
#============================================================
# Rattle timestamp: 2015-08-03 08:33:05 x86_64-w64-mingw32
# Evaluate model performance.
# Sensitivity/Specificity Plot: requires the ROCR package
library(ROCR)
# Generate Sensitivity/Specificity Plot for glm model on Master_File_New_02_New_Ver01.csv [test].
crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)])
# Remove observations with missing target.
no.miss <- na.omit(crs$dataset[crs$test, c(crs$input, crs$target)]$Response)
miss.list <- attr(no.miss, "na.action")
attributes(no.miss) <- NULL
if (length(miss.list))
{
pred <- prediction(crs$pr[-miss.list], no.miss)
} else
{
pred <- prediction(crs$pr, no.miss)
}
#plot(performance(pred, "sens", "spec"), col="#CC0000FF", lty=1, add=FALSE)
dp1<-performance(pred, "sens", "spec")
# Generate a second Sensitivity/Specificity curve (labelled as a [train] lift chart in the original Rattle log; the code below uses the [test] subset).
crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)])
# In ROCR (1.0-3) plot does not obey the add command.
# Calling the function directly (.plot.performance) does work.
#.plot.performance(performance(prediction(crs$pr, crs$dataset[crs$test, c(crs$input, crs$target)]$Response),"sens", "spec"), col="#00CCCCFF", lty=2, add=TRUE)
# Add a legend to the plot.
#legend("bottomleft", c("Test","Train"), col=rainbow(2, 1, .8), lty=1:2, title="glm", inset=c(0.05, 0.05))
# Add decorations to the plot.
#title(main="Sensitivity/Specificity (tpr/tnr) Master_File_New_02_New_Ver01.csv ",
# sub=paste("Rattle", format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#grid()
#============================================================
# Rattle timestamp: 2015-08-03 08:33:11 x86_64-w64-mingw32
# Score a dataset.
# Obtain the class decision (Y/N at a 0.5 probability cut-off) from the Linear model on Master_File_New_02_New_Ver01.csv [test].
#crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input)])
crs$pr <- as.vector(ifelse(predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)]) > 0.5, "Y", "N"))
sdata3<-crs$pr
# Extract the relevant variables from the dataset.
sdata <- subset(crs$dataset[crs$test,], select=c("ID", "Response"))
# Output the combined data.
write.csv(cbind(sdata, crs$pr), file="D:/R_Outputs/Logistic_Custom_Scores.csv", row.names=FALSE)
#============================================================
# Rattle timestamp: 2015-08-03 08:33:22 x86_64-w64-mingw32
# Score a dataset.
#============================================================
# Rattle timestamp: 2015-08-03 08:33:36 x86_64-w64-mingw32
# Score a dataset.
# Obtain probability scores for the Linear model on Master_File_New_02_New_Ver01.csv [test].
crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input)])
# Extract the relevant variables from the dataset.
sdata <- subset(crs$dataset[crs$test,], select=c("ID", "Response"))
# Output the combined data.
write.csv(cbind(sdata, crs$pr), file="D:/R_Outputs/Logistic_Custom_Prob.csv", row.names=FALSE)
Probability<-crs$pr
sdata1<-subset(crs$dataset[crs$test,],select=c("ID"))
sdata2<-subset(crs$dataset[crs$test,],select=c("Response"))
sdata1<-cbind.data.frame(sdata1,Probability,sdata2,sdata3)
#=============================================================
#Prediction on New Dataset
#data_for_prediction <- crs$dataset[-29]
data_for_prediction=read.csv(java_file2, na.strings=c(".", "NA", "", "?"), strip.white=TRUE, encoding="UTF-8")
result <- predict(crs$glm, type = "response", newdata=data_for_prediction)
# Store the complement of each score (1 - predicted probability) alongside the raw
# predicted probability, then write both out.
tempdata <- 1 - result
tempdata <- cbind.data.frame(tempdata, result)
write.csv(tempdata, "D:/Predictions.csv")
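# Suggested check (not part of the original log): if Predictions.csv appears to echo
# the input file, it is worth confirming at this point that 'java_file2' really points
# at the new data and that 'result' holds fitted probabilities rather than raw rows.
# The two lines below only print diagnostics and do not alter the output files.
cat(sprintf("Scored %d rows from %s\n", nrow(data_for_prediction), java_file2))
print(head(result))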
#dp<-c(per,dp1,dp2,pd,dp3,tempdata,sdata1,data.frame(GiniIndex),col_for_prediction)
dp<-c(per,dp1,dp2,pd,tempdata,sdata1,data.frame(GiniIndex),col_for_prediction)
return(dp)
}
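For reference, here is a minimal sketch of how I believe the function is meant to be driven. The file paths and the input vector below are placeholders I made up for illustration, not the production values:

java_file1 <- "C:/data/historical_transactions.csv"  # training data containing the Response column
java_file2 <- "C:/data/new_transactions.csv"         # new records to be scored
scores <- predictFraud(c("TRANS_AMOUNT", "CUST_ACC_LIABLTY", "CUST_AGE", "CUST_SALARY",
                         "ACTVTY_TYP_NAME", "TRNS_TYP_NAME"))
# The scored probabilities for the new data should then land in D:/Predictions.csv.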