0

将 recipe 传给 caret::train 时我遇到了这个问题。

问题出在 NA 插补(imputation)上,但我不知道如何解决。如果去掉交叉验证,一切正常。

提前致谢,

# Reproduction script from the question: split airquality, build a recipe,
# and pass the un-prepped recipe directly to caret::train() with repeated CV.
data(airquality)
set.seed(33) # for reproducibility
# 70% of the rows go to the training partition, the rest to the test partition.
air_split <- initial_split(airquality, prop = 0.7) 
air_train <- training(air_split)
air_test <- testing(air_split)

# Feature engineering - final recipe
# Ozone is the outcome; every other column is used as a predictor.
air_recipe <- recipe(Ozone ~ ., data = air_train) %>% 
  step_zv(all_predictors()) %>% 
  step_nzv(all_predictors()) %>% 
  # The selector is all_numeric(), not all_predictors(), so it is not
  # restricted to predictor columns.
  step_knnimpute(all_numeric(), neighbors = 6) %>% 
  step_log(Ozone, Wind) %>%
  step_other(Day, threshold = 0.01, other = "other") %>%
  step_dummy(all_nominal(), -all_outcomes())


# Validation
# 5-fold cross-validation repeated 5 times.
cv5 <- trainControl( method = "repeatedcv", 
                     number = 5,
                     repeats = 5, allowParallel = TRUE)

# Fit an lm model
# This is the call that raises the quantile.default error reported below;
# according to the question it runs fine when cross-validation is removed.
set.seed(12) 
lm_fit <- train(
  air_recipe,
  data = air_train, 
  method = "lm", 
  trControl = cv5, 
  metric = "RMSE")

错误信息

Error in quantile.default(y, probs = seq(0, 1, length = cut)) : missing values and NaN's not allowed if 'na.rm' is FALSE(即:如果 'na.rm' 为 FALSE,则不允许出现缺失值和 NaN)

R.version _
platform x86_64-apple-darwin15.6.0
arch x86_64
os darwin15.6.0
system x86_64, darwin15.6.0
status
major 3
minor 6.1
year 2019
month 07
day 05
svn rev 76782
language R
version.string R version 3.6.1 (2019-07-05)
nickname Action of the Toes

4

1 回答 1

1

看起来重采样是在应用 recipe 之前进行的。

所以你可以先对 recipe 执行 `prep` 和 `juice`,然后使用公式(formula)接口:

library(recipes)
library(caret)
library(rsample)

# Workaround 1: prep and juice the recipe first, then call caret::train()
# through the formula interface on the already-processed training data.
data(airquality)
set.seed(33) # for reproducibility
# 70% of the rows go to the training partition, the rest to the test partition.
air_split <- initial_split(airquality, prop = 0.7) 
air_train <- training(air_split)
air_test <- testing(air_split)

# Feature engineering - final recipe
# Same steps as in the question, plus a final step_naomit() on the outcome
# and the predictors.
air_recipe <- recipe(Ozone ~ ., data = air_train) %>% 
  step_zv(all_predictors()) %>% 
  step_nzv(all_predictors()) %>% 
  # The selector is all_numeric(), not all_predictors(), so it is not
  # restricted to predictor columns.
  step_knnimpute(all_numeric(), neighbors = 6) %>% 
  step_log(Ozone, Wind) %>%
  step_other(Day, threshold = 0.01, other = "other") %>%
  step_dummy(all_nominal(), -all_outcomes()) %>% 
  step_naomit(all_outcomes(),all_predictors())

# Prep recipe
# retain = TRUE keeps the processed training set so juice() can return it.
air_prep <- prep(air_recipe, retain = TRUE)

# Juice the prepared recipe 
# NOTE: this overwrites air_train with the processed data (Ozone and Wind are
# now on the log scale).
# NOTE(review): the recipe is estimated once on the whole training set before
# resampling, so every CV fold shares the same preprocessing - confirm this is
# acceptable for the reported resampling estimates.
air_train <- juice(air_prep)

# Validation
# 5-fold cross-validation repeated 5 times.
cv5 <- trainControl( method = "repeatedcv", 
                     number = 5,
                     repeats = 5, allowParallel = TRUE)


# Fit an lm model
# Formula interface on the juiced data instead of passing the recipe.
set.seed(12) 
lm_fit <- train(
  Ozone ~ .,
  data = air_train, 
  method = "lm", 
  trControl = cv5, 
  metric = "RMSE")

# Print the resampling summary (RMSE is on the scale of the logged outcome).
lm_fit
#> Linear Regression 
#> 
#> 108 samples
#>   5 predictor
#> 
#> No pre-processing
#> Resampling: Cross-Validated (5 fold, repeated 5 times) 
#> Summary of sample sizes: 86, 88, 86, 86, 86, 86, ... 
#> Resampling results:
#> 
#>   RMSE       Rsquared   MAE      
#>   0.5091496  0.6568485  0.3793589
#> 
#> Tuning parameter 'intercept' was held constant at a value of TRUE

或者,您可以使用 {parsnip} 和 {tune},让所有步骤都保持 tidymodels 的惯用写法:

library(recipes)
library(rsample)
library(parsnip)
library(tune)
library(yardstick)

# Workaround 2: keep the whole workflow in tidymodels by resampling with
# rsample::vfold_cv() and fitting with tune::fit_resamples().
data(airquality)
set.seed(33) # for reproducibility
# 70% of the rows go to the training partition, the rest to the test partition.
air_split <- initial_split(airquality, prop = 0.7) 
air_train <- training(air_split)
air_test <- testing(air_split)

# Same recipe as in the caret workaround; it is passed un-prepped to
# fit_resamples() below.
air_recipe <- recipe(Ozone ~ ., data = air_train) %>% 
  step_zv(all_predictors()) %>% 
  step_nzv(all_predictors()) %>% 
  # The selector is all_numeric(), not all_predictors(), so it is not
  # restricted to predictor columns.
  step_knnimpute(all_numeric(), neighbors = 6) %>% 
  step_log(Ozone, Wind) %>%
  step_other(Day, threshold = 0.01, other = "other") %>%
  step_dummy(all_nominal(), -all_outcomes()) %>% 
  step_naomit(all_outcomes(),all_predictors())

# 5-fold cross-validation repeated 5 times (25 resamples in total).
air_cv <- vfold_cv(air_train, v = 5, repeats = 5)

# Linear regression specification using the "lm" engine.
lm_mod <- linear_reg() %>% set_engine("lm")

# NOTE(review): the (recipe, model, resamples) argument order matches the tune
# release used for this reprex (dated 2020-04-05); newer releases may expect a
# different order - confirm against the installed version.
lm_fits <- fit_resamples(air_recipe, lm_mod, air_cv)

# RMSE is an error metric, hence maximize = FALSE.
# NOTE(review): `maximize` may no longer be accepted by newer tune releases -
# confirm.
show_best(lm_fits, metric = "rmse", maximize = FALSE)
#> # A tibble: 1 x 5
#>   .metric .estimator  mean     n std_err
#>   <chr>   <chr>      <dbl> <int>   <dbl>
#> 1 rmse    standard   0.526    25  0.0256

由 reprex 包(v0.3.0)创建于 2020-04-05

于 2020-04-05T14:09:46.277 回答