在使用 glm 函数进行逻辑回归后,我试图了解如何制作混淆矩阵。到目前为止,这是我的代码。我正在使用 caret 包和confusionMatrix 函数。
dput(head(wine_quality))
structure(list(fixed.acidity = c(7, 6.3, 8.1, 7.2, 7.2, 8.1),
volatile.acidity = c(0.27, 0.3, 0.28, 0.23, 0.23, 0.28),
citric.acid = c(0.36, 0.34, 0.4, 0.32, 0.32, 0.4), residual.sugar = c(20.7,
1.6, 6.9, 8.5, 8.5, 6.9), chlorides = c(0.045, 0.049, 0.05,
0.058, 0.058, 0.05), free.sulfur.dioxide = c(45, 14, 30,
47, 47, 30), total.sulfur.dioxide = c(170, 132, 97, 186,
186, 97), density = c(1.001, 0.994, 0.9951, 0.9956, 0.9956,
0.9951), pH = c(3, 3.3, 3.26, 3.19, 3.19, 3.26), sulphates = c(0.45,
0.49, 0.44, 0.4, 0.4, 0.44), alcohol = c(8.8, 9.5, 10.1,
9.9, 9.9, 10.1), quality = structure(c(4L, 4L, 4L, 4L, 4L,
4L), .Label = c("3", "4", "5", "6", "7", "8", "9", "white"
), class = "factor"), type = structure(c(3L, 3L, 3L, 3L,
3L, 3L), .Label = c("", "red", "white"), class = "factor"),
numeric_type = c(0, 0, 0, 0, 0, 0)), row.names = c(NA, 6L
), class = "data.frame")
library(tibble)
library(broom)
library(ggplot2)
library(caret)
any(is.na(wine_quality)) # this evaulates to FALSE
wine_model <- glm(type ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides + free.sulfur.dioxide + total.sulfur.dioxide + density + pH + sulphates + alcohol, wine_quality, family = "binomial")
# split data into test and train
smp_size <- floor(0.75 * nrow(wine_quality))
set.seed(123)
train_ind <- sample(seq_len(nrow(wine_quality)), size = smp_size)
train <- wine_quality[train_ind, ]
test <- wine_quality[-train_ind, ]
# make prediction on train data
pred <- predict(wine_model)
train$fixed.acidity <- as.numeric(train$fixed.acidity)
round(train$fixed.acidity)
train$fixed.acidity <- as.factor(train$fixed.acidity)
pred <- as.numeric(pred)
round(pred)
pred <- as.factor(pred)
confusionMatrix(pred, wine_quality$fixed.acidity)
在这最后一行代码之后,我收到此错误:
Error: `data` and `reference` should be factors with the same levels.
这个错误对我没有意义。我已经测试过 pred 的长度和 fixed.acidity 的长度都相同(6497),而且它们都是因子数据类型。
length(pred)
length(wine_quality$fixed.acidity)
class(pred)
class(train$fixed.acidity)
这个混淆矩阵不起作用有什么明显的原因吗?我正在尝试找到模型的命中率。我会很感激虚拟的解释,我真的不知道我在这里做什么。