workflowsets
尝试使用 tidymodels 系列软件包运行(调优)我的工作流集(workflowsets)时,我遇到了以下警告和错误:
i No tuning parameters. `fit_resamples()` will be attempted
i 1 of 30 resampling: normalized_lm
✓ 1 of 30 resampling: normalized_lm (3s)
i 2 of 30 tuning: normalized_glm
Warning: All models failed. See the `.notes` column.
x 2 of 30 tuning: normalized_glm failed with: There were no valid metrics for the ANOVA model.
i No tuning parameters. `fit_resamples()` will be attempted
i 3 of 30 resampling: normalized_knn
Warning: All models failed. See the `.notes` column.
x 3 of 30 resampling: normalized_knn failed with preprocessor 1/1, model 1/1 (predictions): Error: Problem with `mutate()` column `.row`.ℹ `.row = orig_rows`.ℹ `.row` must be size 710 or 1, not 723.
i 4 of 30 tuning: normalized_svm
Warning: All models failed. See the `.notes` column.
x 4 of 30 tuning: normalized_svm failed with: There were no valid metrics for the ANOVA model.
i 5 of 30 tuning: normalized_RF
i Creating pre-processing data to finalize unknown parameter: mtry
Warning: All models failed. See the `.notes` column.
x 5 of 30 tuning: normalized_RF failed with: There were no valid metrics for the ANOVA model.
i 6 of 30 tuning: normalized_XGB
i Creating pre-processing data to finalize unknown parameter: mtry
✓ 6 of 30 tuning: normalized_XGB (40.8s)
i No tuning parameters. `fit_resamples()` will be attempted
i 7 of 30 resampling: rm_corr_lm
✓ 7 of 30 resampling: rm_corr_lm (2.5s)
i 8 of 30 tuning: rm_corr_glm
Warning: All models failed. See the `.notes` column.
x 8 of 30 tuning: rm_corr_glm failed with: There were no valid metrics for the ANOVA model.
i No tuning parameters. `fit_resamples()` will be attempted
i 9 of 30 resampling: rm_corr_knn
Warning: All models failed. See the `.notes` column.
x 9 of 30 resampling: rm_corr_knn failed with preprocessor 1/1, model 1/1 (predictions): Error: Problem with `mutate()` column `.row`.ℹ `.row = orig_rows`.ℹ `.row` must be size 710 or 1, not 723.
i 10 of 30 tuning: rm_corr_svm
Warning: All models failed. See the `.notes` column.
x 10 of 30 tuning: rm_corr_svm failed with: There were no valid metrics for the ANOVA model.
i 11 of 30 tuning: rm_corr_RF
i Creating pre-processing data to finalize unknown parameter: mtry
Warning: All models failed. See the `.notes` column.
[...]
代码:
第 1 步:获取数据
不幸的是,这里没有足够的空间贴出完整数据。不过我可以给出数据的结构,如下所示:
> str(ml.data.no.outliers.wf)
tibble [4,519 × 44] (S3: tbl_df/tbl/data.frame)
$ PrName : Factor w/ 25 levels "Parklands","Agroforestry Pruning-Organic Fertilizer",..: 1 1 2 2 2 3 3 3 2 2 ...
$ Out.SubInd : Factor w/ 42 levels "Biomass Yield",..: 1 1 1 1 1 1 1 1 4 4 ...
$ Product : Factor w/ 16 levels "Pearl Millet",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Site.Type : Factor w/ 3 levels "Farm","Station",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Tree : Factor w/ 105 levels "Faidherbia albida",..: 1 1 2 2 2 2 2 2 2 2 ...
$ Out.SubInd.Code : Factor w/ 42 levels "BiY","SOC","SN",..: 1 1 1 1 1 1 1 1 4 4 ...
$ Latitude : num [1:4519] 13.4 13.4 13.5 13.5 13.5 ...
$ Longitude : num [1:4519] 2.27 2.27 2.58 2.58 2.58 ...
$ Bio01_MT_Annu : num [1:4519] 27.3 27.3 19.4 19.4 19.4 ...
$ Bio02_MDR : num [1:4519] 11.2 11.2 12.1 12.1 12.1 ...
$ Bio03_Iso : num [1:4519] 64.3 64.3 39.8 39.8 39.8 ...
$ Bio04_TS : num [1:4519] 171 171 666 666 666 ...
$ Bio05_TWM : num [1:4519] 36.7 36.7 35.9 35.9 35.9 ...
$ Bio06_MinTCM : num [1:4519] 19.3 19.3 5.46 5.46 5.46 ...
$ Bio07_TAR : num [1:4519] 17.4 17.4 30.4 30.4 30.4 ...
$ Bio08_MT_WetQ : num [1:4519] 25.3 25.3 15.8 15.8 15.8 ...
$ Bio09_MT_DryQ : num [1:4519] 27.8 27.8 27.6 27.6 27.6 ...
$ Bio10_MT_WarQ : num [1:4519] 29.7 29.7 27.7 27.7 27.7 ...
$ Bio11_MT_ColQ : num [1:4519] 25.3 25.3 11.5 11.5 11.5 ...
$ Bio12_Pecip_Annu : num [1:4519] 1230 1230 195 195 195 ...
$ Bio13_Precip_WetM : num [1:4519] 248 248 26.3 26.3 26.3 ...
$ Bio14_Precip_DryM : num [1:4519] 1.78 1.78 1.31 1.31 1.31 ...
$ Bio15_Precip_S : num [1:4519] 90.7 90.7 48.5 48.5 48.5 ...
$ Bio16_Precip_WetQ : num [1:4519] 695.1 695.1 69.2 69.2 69.2 ...
$ Bio17_Precip_DryQ : num [1:4519] 13 13 12.2 12.2 12.2 ...
$ iSDA_Depth_to_bedrock: num [1:4519] 197 197 178 178 178 ...
$ iSDA_SAND_conc : num [1:4519] 72.1 72.1 83.5 83.5 83.5 ...
$ iSDA_CLAY_conc : num [1:4519] 13.59 13.59 8.07 8.07 8.07 ...
$ iSDA_SILT_conc : num [1:4519] 12.32 12.32 7.72 7.72 7.72 ...
$ iSDA_FE_Bulk_dens : num [1:4519] 140 140 144 144 144 ...
$ iSDA_log_C_tot : num [1:4519] 15.52 15.52 3.52 3.52 3.52 ...
$ iSDA_log_Ca : num [1:4519] 58.8 58.8 52.5 52.5 52.5 ...
$ iSDA_log_eCEC : num [1:4519] 18.7 18.7 14.4 14.4 14.4 ...
$ iSDA_log_Fe : num [1:4519] 27.68 27.68 8.88 8.88 8.88 ...
$ iSDA_log_K : num [1:4519] 40.2 40.2 35.8 35.8 35.8 ...
$ iSDA_log_Mg : num [1:4519] 45.4 45.4 38.6 38.6 38.6 ...
$ iSDA_log_N : num [1:4519] 25.5 25.5 12.5 12.5 12.5 ...
$ iSDA_log_SOC : num [1:4519] 10.94 10.94 7.43 7.43 7.43 ...
$ iSDA_log_P : num [1:4519] 14.48 14.48 4.91 4.91 4.91 ...
$ iSDA_log_S : num [1:4519] 8.2 8.2 6.68 6.68 6.68 6.68 6.68 6.68 6.68 6.68 ...
$ iSDA_pH : num [1:4519] 59.3 59.3 56 56 56 ...
$ ASTER_Altitude : num [1:4519] 202 202 228 228 228 ...
$ ASTER_Slope : num [1:4519] 0.49 0.49 0.69 0.69 0.69 0.69 0.69 0.69 0.69 0.69 ...
$ logRR : num [1:4519] 0.303 0.316 1.06 1.246 0.918 ...
第 2 步:拆分数据
# Reproducible 80/20 train/test partition, stratified on the outcome (logRR).
set.seed(456)
af.split.wf <- ml.data.no.outliers.wf %>%
  initial_split(prop = 0.80, strata = logRR)

# Pull the two partitions out of the split object.
af.test.wf <- testing(af.split.wf)
af.train.wf <- training(af.split.wf)
第 3 步:定义训练数据的重采样技术
# Resampling schemes for the training data (both use 5 folds).
set.seed(345)

# Ordinary V-fold cross-validation.
cv.fold.wf <- af.train.wf %>%
  vfold_cv(v = 5)

# Folds built from the Longitude/Latitude coordinates of each observation.
spatial.cv.fold.wf <- af.train.wf %>%
  spatial_clustering_cv(
    coords = c("Longitude", "Latitude"),
    v = 5
  )
第 4 步:定义模型指标
# Performance metrics and resampling/tuning control objects
multi.metric.wf <- metric_set(
  rmse,
  rsq,
  ccc,
  mae
)

# Original author's note on this control: save_pred = TRUE, save_workflow = TRUE.
model.control.wf <- control_stack_grid()

# Keep the out-of-sample predictions when resampling.
model.control.linear.wf <- control_resamples(save_pred = TRUE)
第 5 步:创建预处理配方
# Preprocessing recipes ----
#
# NOTE(review): recipe_normal, recipe_corr and recipe_remove contain no
# imputation step. The impute_* recipes below suggest the predictors hold
# missing values; if so, engines without NA handling will fail under those
# three recipes -- confirm, and consider imputing before normalising/filtering.

# Base recipe: logRR is the outcome, every other column starts as a predictor.
recipe_base <-
  recipe(formula = logRR ~ ., data = af.train.wf) %>%
  # Site.Type already gets the "predictor" role from the formula; kept explicit.
  update_role(Site.Type, new_role = "predictor") %>%
  # Keep these columns in the data but take them out of the predictor set.
  update_role(
    PrName,
    Out.SubInd,
    Out.SubInd.Code,
    Product,
    Latitude,
    Longitude,
    Tree,
    new_role = "sample ID"
  ) %>%
  # Disabled: assign previously unseen factor levels to a new value. If this is
  # re-enabled it has to stay ahead of step_dummy().
  # step_novel(Site.Type, -all_outcomes()) %>%
  # One-hot encode Site.Type; dummy column names use "_" as the separator.
  step_dummy(
    Site.Type,
    one_hot = TRUE,
    naming = partial(dummy_names, sep = "_")
  )

# Inspect the processed training data. bake(new_data = NULL) replaces the
# superseded juice().
recipe_base %>% prep() %>% bake(new_data = NULL) %>% glimpse()

# Centre and scale numeric predictors (mean zero, standard deviation one).
recipe_normal <- recipe_base %>%
  step_normalize(all_numeric_predictors())

recipe_normal %>% prep() %>% bake(new_data = NULL) %>% glimpse()

# Remove predictors with large absolute correlations with other predictors.
recipe_corr <- recipe_base %>%
  step_corr(all_numeric_predictors())
# Warnings observed by the original author when prepping this recipe:
#   Warning in cor(x, use = use, method = method) :
#     the standard deviation is zero
#   Warning: The correlation matrix has missing values. 3 columns were excluded
#   from the filter.

recipe_corr %>% prep() %>% bake(new_data = NULL) %>% glimpse()

# Remove highly sparse/unbalanced predictors, then single-valued predictors.
recipe_remove <- recipe_base %>%
  step_nzv(all_predictors()) %>%
  step_zv(all_predictors())

recipe_remove %>% prep() %>% bake(new_data = NULL) %>% glimpse()

# Replace missing numeric predictor values with the training-set mean.
recipe_impute_mean <- recipe_base %>%
  step_impute_mean(all_numeric_predictors())

recipe_impute_mean %>% prep() %>% bake(new_data = NULL) %>% glimpse()

# Replace missing predictor values using nearest neighbours; the number of
# neighbours is left as a tuning parameter.
recipe_impute_knn <- recipe_base %>%
  step_impute_knn(all_predictors(), neighbors = tune())

# Not prepped here: `neighbors` is still tune(), so the recipe cannot be
# prepped until the workflow set has tuned it.
# recipe_impute_knn %>% prep() %>% bake(new_data = NULL)
第 6 步:创建模型规范
# Model specifications (all in regression mode) ----

# Linear regression fitted with lm.
lm_model.wf <- linear_reg(mode = "regression") %>%
  set_engine("lm")

# Regularised linear regression via glmnet; penalty and mixture are tuned.
glm_model.wf <- linear_reg(mode = "regression", penalty = tune(), mixture = tune()) %>%
  set_engine("glmnet")

# Same specification as lm_model.wf, kept under its own name.
spline_model.wf <- linear_reg(mode = "regression") %>%
  set_engine("lm")

# Nearest-neighbour model via kknn; no parameters are marked for tuning.
knn_model.wf <- nearest_neighbor(mode = "regression") %>%
  set_engine("kknn")

# pca_model <- linear_reg() %>%
#   set_mode("regression") %>%
#   set_engine("lm")

# Polynomial-kernel SVM via kernlab; cost and margin are tuned.
svm_model.wf <- svm_poly(
  mode = "regression",
  cost = tune(),
  margin = tune()
) %>%
  set_engine("kernlab")

# Random forest via ranger with 200 trees; mtry and min_n are tuned.
rf_model.wf <- rand_forest(
  mode = "regression",
  mtry = tune(),
  min_n = tune(),
  trees = 200
) %>%
  set_engine("ranger")
# importance = "permutation")

# Boosted trees via xgboost with 200 trees.
xgb_model.wf <- boost_tree(
  mode = "regression",
  trees = 200,
  # model complexity
  tree_depth = tune(),
  min_n = tune(),
  loss_reduction = tune(),
  # randomness
  sample_size = tune(),
  mtry = tune(),
  # step size
  learn_rate = tune()
) %>%
  set_engine("xgboost")
# importance = "permutation")

# catboost_model.wf <- parsnip::boost_tree( # Remember to load library(treesnip)
#   mode = "regression",
#   trees = tune(),
#   min_n = tune(),
#   learn_rate = tune(),
#   tree_depth = tune()
# ) %>%
#   set_engine("catboost")
第 7 步:设置工作流集
# Workflow set: every preprocessing recipe is crossed with every model spec.
wflwset_setup <-
  workflowsets::workflow_set(
    preproc = list(
      normalized  = recipe_normal,
      rm_corr     = recipe_corr,
      rm_unbalan  = recipe_remove,
      impute_mean = recipe_impute_mean,
      impute_knn  = recipe_impute_knn
    ),
    models = list(
      lm     = lm_model.wf,
      glm    = glm_model.wf,
      spline = spline_model.wf,
      knn    = knn_model.wf,
      svm    = svm_model.wf,
      RF     = rf_model.wf,
      XGB    = xgb_model.wf
      # CatB = catboost_model.wf
    ),
    cross = TRUE
  )
第 8 步:运行工作流集
# Tune / resample every workflow in the set ----
set.seed(579)

# Drop any result object left over from an earlier run (presumably so a failed
# run cannot be mistaken for fresh output).
if (exists("wflwset_tune_results_cv_2")) rm("wflwset_tune_results_cv_2")

# Register a doParallel backend with its default settings.
doParallel::registerDoParallel()

# Apply tune_race_anova to each workflow over the 5-fold CV resamples, scoring
# with rmse, ccc and mae and using a grid of 15 candidates. Per the log,
# workflows without tuning parameters fall back to fit_resamples().
wflwset_tune_results_cv_2 <- wflwset_setup %>%
  workflowsets::workflow_map(
    fn = "tune_race_anova",
    resamples = cv.fold.wf,
    metrics = metric_set(rmse, ccc, mae),
    verbose = TRUE,
    grid = 15
  )

# Shut down the backend registered above. The previous parallelStop() belongs
# to the parallelMap package and does not stop a doParallel backend.
doParallel::stopImplicitCluster()