
I am supporting a system that makes predictions using code generated by Rattle. However, when I looked closely at the predictions this Rattle-generated code produces, I found that it is not actually making any predictions at all; it simply writes the input file back out as-is. Even though I expect it to score new data, the output is just the input file unchanged. Could someone with more experience look at the attached code and tell me what might be wrong? The person who built this has left the company, and I am trying to work out how to fix it. I am completely new to R, so any help is appreciated. The code is below:

# Rattle is Copyright (c) 2006-2014 Togaware Pty Ltd.

#============================================================
# Rattle timestamp: 2015-08-03 08:32:05 x86_64-w64-mingw32 

# Rattle version 3.4.1 user 'vikashs'

# Export this log textview to a file using the Export button or the Tools 
# menu to save a log of all activity. This facilitates repeatability. Exporting 
# to file 'myrf01.R', for example, allows us to type the command
# source('myrf01.R') into the R Console to repeat the process automatically.
# Generally, we may want to edit the file to suit our needs. We can also directly 
# edit this current log textview to record additional information before exporting. 

# Saving and loading projects also retains this log.

library(rattle)
library(pROC)
library(ggplot2)
library(randomForest)
library(ROCR)

# This log generally records the process of building a model. However, with very 
# little effort the log can be used to score a new dataset. The logical variable 
# 'building' is used to toggle between generating transformations, as when building 
# a model, and simply using the transformations, as when scoring a dataset.

predictFraud <- function(inputVector) {

building <- TRUE
scoring  <- ! building
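
# Note: 'building' is hard-coded to TRUE and neither flag is referenced again
# below, so the build/score switch described above never takes effect. A
# minimal sketch of how that toggle is usually wired when a Rattle log is
# adapted for scoring (an assumption about the intended design, not part of
# the original script):
#
#   building <- FALSE      # set to FALSE when only scoring new data
#   scoring  <- ! building
#   if (building)
#   {
#     # fit crs$glm / crs$rf on the training split as below
#   } else
#   {
#     # skip model fitting and only call predict() on the new dataset
#   }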

# The colorspace package is used to generate the colours used in plots, if available.

library(colorspace)

# A pre-defined value is used to reset the random seed so that results are repeatable.

crv$seed <- 42 

#============================================================
# Rattle timestamp: 2015-08-03 08:32:18 x86_64-w64-mingw32 

# Load the data.

crs$dataset <- read.csv(java_file1, na.strings=c(".", "NA", "", "?"), strip.white=TRUE, encoding="UTF-8")
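
# 'java_file1' (and 'java_file2' used near the end) is not defined inside this
# function; it is assumed to be supplied by the calling environment, presumably
# the Java layer that invokes this script. For a standalone test run it would
# need to be assigned first, e.g. (hypothetical paths):
#
#   java_file1 <- "D:/R_Inputs/Master_File_New_02_New_Ver01.csv"
#   java_file2 <- "D:/R_Inputs/new_data_to_score.csv"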

#============================================================
# Rattle timestamp: 2015-08-03 08:32:30 x86_64-w64-mingw32 

# Note the user selections. 

# Build the training/validate/test datasets.

set.seed(crv$seed) 
crs$nobs <- nrow(crs$dataset) # 1000 observations 
crs$sample <- crs$train <- sample(nrow(crs$dataset), 0.7*crs$nobs) # 700 observations
crs$validate <- NULL
crs$test <- setdiff(setdiff(seq_len(nrow(crs$dataset)), crs$train), crs$validate) # 300 observations

# The following variable selections have been noted.

crs$input <- inputVector

crs$numeric <- c("TRANS_AMOUNT", "CUST_ACC_LIABLTY", "CUST_AGE", "CUST_SALARY")

crs$categoric <- c("ACTVTY_TYP_NAME", "TRNS_TYP_NAME", "TRNS_CTGRY_NAME", "TRNS_CRRNCY_NAME",
     "CUST_ACC_DEF_STATUS", "CUST_ACC_RGSTN_TYPE", "CUST_ACC_OWNRSHP_TYPE", "CUST_ACC_BRNCH",
     "CUST_GENDR", "CUST_MARITAL_STS", "CUST_EMPLYMNT_PRFLE", "CUST_TOT_FMLY_MMBRS",
     "CUST_DEPNDNTS", "CUST_EXSTNG_LOAN", "CUST_EXSTNG_INSRNCE", "LOAN_TYPE",
     "LOAN_STATUS", "LOAN_DFAULT", "LOAN_FRCLSRE_FLG", "LOAN_TRNSFR_FLG",
     "INS_STATUS", "INS_DFAULT", "INS_SRRNDR_FLG")

crs$target  <- colnames(crs$dataset)[ncol(crs$dataset)]
crs$risk    <- NULL
crs$ident   <- "ID"
crs$ignore  <- NULL
crs$weights <- NULL

#============================================================
# Rattle timestamp: 2015-08-03 08:32:36 x86_64-w64-mingw32 

# The 'randomForest' package provides the 'randomForest' function.

require(randomForest, quietly=TRUE)

# Build the Random Forest model.

set.seed(crv$seed)
crs$rf <- randomForest(Response ~ .,
      data=crs$dataset[crs$sample,c(crs$input, crs$target)], 
      ntree=500,
      mtry=5,
      importance=TRUE,
      na.action=na.roughfix,
      replace=FALSE)
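
# The formula above hard-codes the target name 'Response', while crs$target is
# read from the last column of the input file; a cheap guard (added here as a
# sketch, not part of the original Rattle log) makes any mismatch fail loudly:

stopifnot(identical(crs$target, "Response"))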

rn <- round(importance(crs$rf), 2)
itm=length(crs$input)
GiniIndex=sort(rn[,4],decreasing=T)[1:itm]
write.csv(data.frame(GiniIndex),"D:/VarImp.csv")

temp="";
for (i in 1:length(names(GiniIndex)))
  {
    #if(colnames(crs$dataset)==names(GiniIndex)[i])
    {
        temp[i]= names(GiniIndex)[i]
    }
  }
 # temp=sort(temp)
  col_for_prediction=temp

# Regression model 

# Build a Regression model.

crs$glm <- glm(Response ~ .,
    data=crs$dataset[crs$train, c(crs$input, crs$target)],
    family=binomial(link="logit"))

# Generate a textual view of the Linear model.

print(summary(crs$glm))
cat(sprintf("Log likelihood: %.3f (%d df)\n",
            logLik(crs$glm)[1],
            attr(logLik(crs$glm), "df")))
cat(sprintf("Null/Residual deviance difference: %.3f (%d df)\n",
            crs$glm$null.deviance-crs$glm$deviance,
            crs$glm$df.null-crs$glm$df.residual))
cat(sprintf("Chi-square p-value: %.8f\n",
            dchisq(crs$glm$null.deviance-crs$glm$deviance,
                   crs$glm$df.null-crs$glm$df.residual)))
cat(sprintf("Pseudo R-Square (optimistic): %.8f\n",
             cor(crs$glm$y, crs$glm$fitted.values)))
cat('\n==== ANOVA ====\n\n')
print(anova(crs$glm, test="Chisq"))
cat("\n")

# Time taken: 0.56 secs

#============================================================
# Rattle timestamp: 2015-08-03 08:32:41 x86_64-w64-mingw32 

# Evaluate model performance. 

# Generate an Error Matrix for the Linear model.

# Obtain the response from the Linear model.

crs$pr <- as.vector(ifelse(predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)]) > 0.5, "Y", "N"))

# Generate the confusion matrix showing counts.

table(crs$dataset[crs$test, c(crs$input, crs$target)]$Response, crs$pr,
        dnn=c("Actual", "Predicted"))

# Generate the confusion matrix showing proportions.

pcme <- function(actual, cl)
{
  x <- table(actual, cl)
  tbl <- cbind(round(x/length(actual), 2),
               Error=round(c(x[1,2]/sum(x[1,]),
                             x[2,1]/sum(x[2,])), 2))
  names(attr(tbl, "dimnames")) <- c("Actual", "Predicted")
  return(tbl)
};
pcme(crs$dataset[crs$test, c(crs$input, crs$target)]$Response, crs$pr)

# Calculate the overall error percentage.

overall <- function(x)
{
  if (nrow(x) == 2) 
    cat((x[1,2] + x[2,1]) / sum(x)) 
  else
    cat(1 - (x[1,rownames(x)]) / sum(x))
} 
overall(table(crs$pr, crs$dataset[crs$test, c(crs$input, crs$target)]$Response,  
        dnn=c("Predicted", "Actual")))

# Calculate the averaged class error percentage.

avgerr <- function(x) 
    cat(mean(c(x[1,2], x[2,1]) / apply(x, 1, sum))) 
avgerr(table(crs$pr, crs$dataset[crs$test, c(crs$input, crs$target)]$Response,  
        dnn=c("Predicted", "Actual")))

#============================================================
# Rattle timestamp: 2015-08-03 08:32:46 x86_64-w64-mingw32 

# Evaluate model performance. 

# Risk Chart: requires the ggplot2 package.

library(ggplot2)

# Generate a risk chart.

# Rattle provides evaluateRisk() and riskchart().

crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)])
crs$eval <- evaluateRisk(crs$pr, crs$dataset[crs$test, c(crs$input, crs$target)]$Response)
#dp3<-print(riskchart(crs$pr, 
#                crs$dataset[crs$test, c(crs$input, crs$target)]$Response, 
#                title="Performance Chart Linear Master_File_New_02_New_Ver01.csv [test] ", show.lift=TRUE, show.precision=TRUE))


#============================================================
# Rattle timestamp: 2015-08-03 08:32:53 x86_64-w64-mingw32 

# Evaluate model performance. 

# Lift Chart: requires the ROCR package.

library(ROCR)

# Obtain predictions for the glm model on Master_File_New_02_New_Ver01.csv [test].

crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)])

# Remove observations with missing target.

no.miss   <- na.omit(crs$dataset[crs$test, c(crs$input, crs$target)]$Response)
miss.list <- attr(no.miss, "na.action")
attributes(no.miss) <- NULL

if (length(miss.list))
{
  pred <- prediction(crs$pr[-miss.list], no.miss)
} else
{
  pred <- prediction(crs$pr, no.miss)
}

# Convert rate of positive predictions to percentage.

per <- performance(pred, "lift", "rpp")
per@x.values[[1]] <- per@x.values[[1]]*100

# Plot the lift chart.
#plot(per, col="#CC0000FF", lty=1, xlab="Caseload (%)", add=FALSE)

# Generate a Lift Chart for the glm model on Master_File_New_02_New_Ver01.csv [test].

crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)])

# Also convert rate of positive predictions to percentage

per <- performance(prediction(crs$pr, crs$dataset[crs$test, c(crs$input, crs$target)]$Response),"lift", "rpp")
per@x.values[[1]] <- per@x.values[[1]]*100

# Now plot the lift.

# Bug in ROCR 1.0-3: plot does not obey the add command.
# Calling the function directly (.plot.performance) does work.

#.plot.performance(per, col="#00CCCCFF", lty=2, add=TRUE)

# Add a legend to the plot.

#legend("topright", c("Test","Train"), col=rainbow(2, 1, .8), lty=1:2, title="Linear", inset=c(0.05, 0.05))

# Add decorations to the plot.

#title(main="Lift Chart  Master_File_New_02_New_Ver01.csv ",
#    sub=paste("Rattle", format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#grid()

#============================================================
# Rattle timestamp: 2015-08-03 08:32:56 x86_64-w64-mingw32 

# Evaluate model performance. 

# ROC Curve: requires the ROCR package.

library(ROCR)

# ROC Curve: requires the ggplot2 package.

require(ggplot2, quietly=TRUE)

# Generate an ROC Curve for the glm model on Master_File_New_02_New_Ver01.csv [test].

crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)])

# Remove observations with missing target.

no.miss   <- na.omit(crs$dataset[crs$test, c(crs$input, crs$target)]$Response)
miss.list <- attr(no.miss, "na.action")
attributes(no.miss) <- NULL

if (length(miss.list))
{
  pred <- prediction(crs$pr[-miss.list], no.miss)
} else
{
  pred <- prediction(crs$pr, no.miss)
}

pe <- performance(pred, "tpr", "fpr")
au <- performance(pred, "auc")@y.values[[1]]
pd <- data.frame(fpr=unlist(pe@x.values), tpr=unlist(pe@y.values))
p <- ggplot(pd, aes(x=fpr, y=tpr))
p <- p + geom_line(colour="red")
p <- p + xlab("False Positive Rate") + ylab("True Positive Rate")
p <- p + ggtitle("ROC Curve Linear Master_File_New_02_New_Ver01.csv [test] Response")
p <- p + theme(plot.title=element_text(size=10))
p <- p + geom_line(data=data.frame(), aes(x=c(0,1), y=c(0,1)), colour="grey")
p <- p + annotate("text", x=0.50, y=0.00, hjust=0, vjust=0, size=5,
                   label=paste("AUC =", round(au, 2)))
#print(p)

# Calculate the area under the curve for the plot.


# Remove observations with missing target.

no.miss   <- na.omit(crs$dataset[crs$test, c(crs$input, crs$target)]$Response)
miss.list <- attr(no.miss, "na.action")
attributes(no.miss) <- NULL

if (length(miss.list))
{
  pred <- prediction(crs$pr[-miss.list], no.miss)
} else
{
  pred <- prediction(crs$pr, no.miss)
}
performance(pred, "auc")

#============================================================
# Rattle timestamp: 2015-08-03 08:32:59 x86_64-w64-mingw32 

# Evaluate model performance. 

# Precision/Recall Plot: requires the ROCR package

library(ROCR)

# Generate a Precision/Recall Plot for the glm model on Master_File_New_02_New_Ver01.csv [test].

crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)])

# Remove observations with missing target.

no.miss   <- na.omit(crs$dataset[crs$test, c(crs$input, crs$target)]$Response)
miss.list <- attr(no.miss, "na.action")
attributes(no.miss) <- NULL

if (length(miss.list))
{
  pred <- prediction(crs$pr[-miss.list], no.miss)
} else
{
  pred <- prediction(crs$pr, no.miss)
}
#plot(performance(pred, "prec", "rec"), col="#CC0000FF", lty=1, add=FALSE)
dp2<-performance(pred, "prec", "rec")

# Generate a Precision/Recall Plot for the glm model on Master_File_New_02_New_Ver01.csv [test].

crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)])

# In ROCR (1.0-3) plot does not obey the add command.
# Calling the function directly (.plot.performance) does work.

#.plot.performance(performance(prediction(crs$pr, crs$dataset[crs$test, c(crs$input, crs$target)]$Response),"prec", "rec"), col="#00CCCCFF", lty=2, add=TRUE)


# Add a legend to the plot.

#legend("bottomleft", c("Test","Train"), col=rainbow(2, 1, .8), lty=1:2, title="glm", inset=c(0.05, 0.05))

# Add decorations to the plot.

#title(main="Precision/Recall Plot  Master_File_New_02_New_Ver01.csv ",
#    sub=paste("Rattle", format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#grid()

#============================================================
# Rattle timestamp: 2015-08-03 08:33:05 x86_64-w64-mingw32 

# Evaluate model performance. 

# Sensitivity/Specificity Plot: requires the ROCR package

library(ROCR)

# Generate Sensitivity/Specificity Plot for glm model on Master_File_New_02_New_Ver01.csv [test].

crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)])

# Remove observations with missing target.

no.miss   <- na.omit(crs$dataset[crs$test, c(crs$input, crs$target)]$Response)
miss.list <- attr(no.miss, "na.action")
attributes(no.miss) <- NULL

if (length(miss.list))
{
  pred <- prediction(crs$pr[-miss.list], no.miss)
} else
{
  pred <- prediction(crs$pr, no.miss)
}
#plot(performance(pred, "sens", "spec"), col="#CC0000FF", lty=1, add=FALSE)
dp1<-performance(pred, "sens", "spec")

# Generate a Sensitivity/Specificity Plot for the glm model on Master_File_New_02_New_Ver01.csv [test].

crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)])

#In ROCR (1.0-3) plot does not obey the add command.
# Calling the function directly (.plot.performance) does work.

#.plot.performance(performance(prediction(crs$pr, crs$dataset[crs$test, c(crs$input, crs$target)]$Response),"sens", "spec"), col="#00CCCCFF", lty=2, add=TRUE)


# Add a legend to the plot.

#legend("bottomleft", c("Test","Train"), col=rainbow(2, 1, .8), lty=1:2, title="glm", inset=c(0.05, 0.05))

# Add decorations to the plot.

#title(main="Sensitivity/Specificity (tpr/tnr)  Master_File_New_02_New_Ver01.csv ",
 #   sub=paste("Rattle", format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#grid()

#============================================================
# Rattle timestamp: 2015-08-03 08:33:11 x86_64-w64-mingw32 

# Score a dataset. 

# Obtain probability scores for the Linear model on Master_File_New_02_New_Ver01.csv [test].

#crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input)])
crs$pr <- as.vector(ifelse(predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)]) > 0.5, "Y", "N"))
sdata3<-crs$pr

# Extract the relevant variables from the dataset.

sdata <- subset(crs$dataset[crs$test,], select=c("ID", "Response"))

# Output the combined data.

write.csv(cbind(sdata, crs$pr), file="D:/R_Outputs/Logistic_Custom_Scores.csv", row.names=FALSE)
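
# Note that write.csv() does not create directories, so this call fails if
# 'D:/R_Outputs' does not exist on the machine running the script. A defensive
# variant (an added sketch, not part of the original script) would ensure the
# directory first:
#
#   if (!dir.exists("D:/R_Outputs")) dir.create("D:/R_Outputs", recursive=TRUE)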

#============================================================
# Rattle timestamp: 2015-08-03 08:33:22 x86_64-w64-mingw32 

# Score a dataset. 

#============================================================
# Rattle timestamp: 2015-08-03 08:33:36 x86_64-w64-mingw32 

# Score a dataset. 

# Obtain probability scores for the Linear model on Master_File_New_02_New_Ver01.csv [test].

crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input)])


# Extract the relevant variables from the dataset.

sdata <- subset(crs$dataset[crs$test,], select=c("ID", "Response"))

# Output the combined data.

write.csv(cbind(sdata, crs$pr), file="D:/R_Outputs/Logistic_Custom_Prob.csv", row.names=FALSE)

Probability<-crs$pr
sdata1<-subset(crs$dataset[crs$test,],select=c("ID"))
sdata2<-subset(crs$dataset[crs$test,],select=c("Response"))
sdata1<-cbind.data.frame(sdata1,Probability,sdata2,sdata3)

#=============================================================
#Prediction on New Dataset

#data_for_prediction <- crs$dataset[-29]
data_for_prediction=read.csv(java_file2, na.strings=c(".", "NA", "", "?"), strip.white=TRUE, encoding="UTF-8")
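
# Before predicting, it is worth confirming that the new file carries every
# model input with the expected names; a minimal check (added as a sketch,
# not part of the original script):

missing_cols <- setdiff(crs$input, colnames(data_for_prediction))
if (length(missing_cols) > 0)
  stop("New data is missing model inputs: ",
       paste(missing_cols, collapse=", "))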
result <- predict(crs$glm, type = "response", newdata=data_for_prediction)

# Probability of the non-event class (1 - the predicted probability), vectorised.
tempdata <- 1 - result

tempdata<-cbind.data.frame(tempdata,result)

write.csv(data.frame(tempdata),"D:/Predictions.csv")

#dp<-c(per,dp1,dp2,pd,dp3,tempdata,sdata1,data.frame(GiniIndex),col_for_prediction)
dp<-c(per,dp1,dp2,pd,tempdata,sdata1,data.frame(GiniIndex),col_for_prediction)
return(dp)

}