0

我有一段用随机森林进行变量选择的示例代码。其目的是选出最重要的变量组合,并建立 OOB(袋外)错误率最低的随机森林模型。有人能为我解释一下下面代码中 for 循环的部分吗?

 # Backward elimination of predictors for a random survival forest.
 #
 # Each repetition starts from the full candidate set, fits a forest, records
 # the last entry of its error-rate curve, drops the predictor with the lowest
 # variable importance (VIMP), and refits until a single predictor is left.
 # The variable set whose fit had the lowest recorded error rate is kept for
 # that repetition.
 #
 # Expects `risk_full`, `risk`, `rfsrc()`, `Surv()` and the tidyverse verbs used
 # below to be available in the calling environment.

 # NOTE(review): the original literal for "LnASCA IgA" was split across two
 # lines, which embedded a newline in the column name -- confirm the exact
 # column name against `risk_full`.
 clinical_variables <- c(
   "Age", "location", "smoke", "perianal_disease", "upper_tract",
   "LnASCA IgA", "LnASCA IgG", "LnANCA", "LnCbir", "LnOMPC",
   "CRP", "Albumin", "African American Race"
 )

 # Number of times the whole elimination procedure is repeated.
 n_repeats_progress_biomarker <- 50

 # Per-repetition results: selected variable names and their error rate.
 variable_selected_progress_biomarkers <-
   vector("list", n_repeats_progress_biomarker)
 error_rate_min_progress_biomarkers <- rep(NA, n_repeats_progress_biomarker)

 # Full candidate set: columns of `risk` plus the clinical variables, minus the
 # outcome/time columns and BDNF. It does not change between repetitions, so it
 # is computed once here.
 risk_progress_biomarker_variables_all <- risk_full %>%
   select(names(risk), clinical_variables) %>%
   select(-c(
     "STRICTURE", "TIM2STRICTURE", "PENETRATING", "TIM2PENETRATING",
     "BDNF", "LASTFOLLOWUPDAYSPROGRESS", "PROGRESSED"
   )) %>%
   names()

 # One elimination step per candidate (104 in the original script); derived
 # from the data instead of being hard-coded.
 n_variables_progress_biomarker <- length(risk_progress_biomarker_variables_all)

 for (j in seq_len(n_repeats_progress_biomarker)) {

   # Restart from the full candidate set on every repetition.
   risk_progress_biomarker_variables <- risk_progress_biomarker_variables_all

   # Element i holds the variables used at step i; the names give the number
   # of variables still in the model at that step (all of them, ..., 1).
   risk_progress_biomarker_variables_total <-
     vector("list", n_variables_progress_biomarker)
   names(risk_progress_biomarker_variables_total) <-
     rev(seq_len(n_variables_progress_biomarker))
   error_rate_tail_progress_biomarker <- rep(NA, n_variables_progress_biomarker)

   for (i in seq_len(n_variables_progress_biomarker)) {
     # NOTE(review): the same seed is set before every fit and the inputs do
     # not depend on `j`, so if rfsrc() draws its randomness from R's RNG all
     # repetitions will give identical results -- confirm whether the seed
     # should vary with `j`.
     set.seed(4182019)

     # Remember which variables went into this step's model.
     risk_progress_biomarker_variables_total[[i]] <-
       risk_progress_biomarker_variables

     # Fit the survival forest on the current variables; factor columns are
     # replaced by their integer codes first.
     rf_risk_progress_biomarker <- rfsrc(
       Surv(LASTFOLLOWUPDAYSPROGRESS, PROGRESSED) ~ .,
       data = risk_full %>%
         select(
           risk_progress_biomarker_variables,
           LASTFOLLOWUPDAYSPROGRESS,
           PROGRESSED
         ) %>%
         mutate_if(is.factor, as.numeric),
       ntree = 1000,
       importance = TRUE
     )

     # Last entry of the error-rate curve (presumably the OOB error with all
     # trees grown).
     error_rate_tail_progress_biomarker[i] <-
       tail(rf_risk_progress_biomarker$err.rate, n = 1)

     # Rank the variables by importance, most important first.
     rf_risk_progress_biomarker_importance <-
       rf_risk_progress_biomarker$importance %>%
       as.data.frame() %>%
       rownames_to_column() %>%
       as_tibble() %>%
       dplyr::rename(VIMP = ".") %>%
       arrange(desc(VIMP))

     # Keep all but the last (least important) row for the next step.
     risk_progress_biomarker_variables <-
       rf_risk_progress_biomarker_importance %>%
       head(nrow(rf_risk_progress_biomarker_importance) - 1) %>%
       pull(rowname)

     print(i)
   }

   tibble_error_rate_tail_progress_biomarker <- tibble(
     n = rev(seq_len(n_variables_progress_biomarker)),
     error_rate = error_rate_tail_progress_biomarker
   )

   # which.min() always yields at most one step; on ties it picks the earliest
   # step, i.e. the larger variable set. (top_n(-1) returned every tied row,
   # which broke the list lookup and the scalar assignment below.)
   best_step_progress_biomarker <- which.min(error_rate_tail_progress_biomarker)
   if (length(best_step_progress_biomarker) != 1L) {
     stop(
       "No non-missing error rate was recorded in repetition ", j,
       call. = FALSE
     )
   }
   n_min_progress_biomarker <-
     tibble_error_rate_tail_progress_biomarker$n[best_step_progress_biomarker]
   error_rate_min_progress_biomarker <-
     error_rate_tail_progress_biomarker[best_step_progress_biomarker]

   # Store the winning variable set (underscores stripped from the names) and
   # its error rate for this repetition.
   variable_selected_progress_biomarkers[[j]] <- str_replace_all(
     risk_progress_biomarker_variables_total[[best_step_progress_biomarker]],
     "_",
     ""
   )
   error_rate_min_progress_biomarkers[j] <- error_rate_min_progress_biomarker
   print(paste("Finish", j))
 }
4

0 回答 0