r - R：根据多重共线性检验的输出，移除/删除存在多重共线性的变量

Question

我有一个包含 32 个变量和 48 个观察值的数据集（观察值还会增加，因为我们正处于通过订阅收集数据的阶段）。由于数据集存在高度的多重共线性，我采用了 Farrar–Glauber 检验。该检验的单变量诊断指标（idiags）中有一列名为 Klein，取值为 0 或 1，表示对应变量是否存在多重共线性。现在我需要根据 Klein 的值从主数据集中删除相应的列。我卡在了这一步：如何根据数据集 imcdiag_idiagval 中的 Klein 值，从主数据集（model_df）中移除这些列。

下面是我的示例代码。其中 model_df0 是不含目标变量的数据集，model_df 是含目标变量的数据集。

library(mctest)

# Individual multicollinearity diagnostic measures (Farrar-Glauber F-test),
# used to locate which predictors are collinear.
# First argument: the predictors without the target variable;
# second argument: the target column taken from the full data set.
imcdiag_out <- imcdiag(model_df0, model_df[["TARGET"]])
summary(imcdiag_out)
#Output of summary function below,
        Length Class  Mode   
idiags   224   -none- numeric
x       1536   -none- numeric
y         48   -none- numeric
method     0   -none- NULL   
corr       1   -none- logical
call       3   -none- call   
pval      29   -none- numeric
R2         1   -none- numeric
all        1   -none- logical
alldiag  224   -none- logical

# Pull the individual diagnostics (`idiags`) out of the test result and turn
# them into a data frame so each measure (e.g. Klein) is addressable by name.
imcdiag_idiagval <- as.data.frame(imcdiag_out[["idiags"]])

> imcdiag_idiagval$Klein
 [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0

在 32 个变量中，Klein 值为 1 的变量存在多重共线性，因此需要将它们删除。

> dput(head(model_df))
structure(list(Mon = structure(c(2014.33333333333, 2014.41666666667, 
2014.5, 2014.58333333333, 2014.66666666667, 2014.75), class = "yearmon"), 
    Al_Prod_Qty = c(4516084.63333333, 4488436.58064516, 4529767.5, 
    4468792.16129032, 4515652.4516129, 4513669.13333333), US_Indus_Growth = c(2.72933333333333, 
    3.00806451612903, 3.25466666666667, 3.41161290322581, 4.03322580645161, 
    3.22333333333333), China_Elec_Rail = c(6.206, 7.30733333333333, 
    7.18741935483871, 5.84322580645161, 3.90166666666667, 5.65290322580645
    ), DS_Index = c(80.4, 79.81, 81.52, 82.78, 86.05, 87.02), 
    Al_Prod_Africa = c(152L, 143L, 144L, 148L, 142L, 144L), Al_Prod_NorthAmerica = c(392L, 
    372L, 386L, 386L, 372L, 379L), Al_Prod_SouthAmerica = c(135L, 
    117L, 117L, 116L, 111L, 120L), Al_Prod_AsiaEXChina = c(203L, 
    193L, 203L, 198L, 194L, 204L), Al_Prod_WestEurope = c(304L, 
    297L, 306L, 306L, 298L, 307L), Al_Prod_EastCentralEurope = c(319L, 
    309L, 319L, 319L, 310L, 321L), Al_Prod_Oceania = c(178L, 
    174L, 175L, 166L, 160L, 165L), Al_Prod_GCC = c(418L, 412L, 
    427L, 426L, 413L, 426L), Al_Prod_ChinaEstimated = c(12, 15, 
    18, 22, 23, 24), Al_Prod_Total = c(4441L, 4419L, 4507L, 4553L, 
    4502L, 4619L), Al_Prod_DailyAvg = c(143.3, 147.3, 145.4, 
    146.9, 150.1, 149), almna_prod_AfricaAsiaExChina = c(395L, 
    445L, 459L, 436L, 413L, 395L), almna_prod_NorthAmerica = c(440L, 
    444L, 479L, 476L, 441L, 455L), almna_prod_SouthAmerica = c(1137L, 
    1077L, 1047L, 1063L, 1068L, 1097L), almna_prod_WestEurope = c(326L, 
    363L, 334L, 386L, 351L, 323L), almna_prod_EastCentralEurope = c(352L, 
    322L, 330L, 321L, 325L, 325L), almna_prod_Oceania = c(1696L, 
    1632L, 1663L, 1631L, 1579L, 1693L), almna_prod_ChinaEstimated = c(4513L, 
    4448L, 4509L, 4461L, 4416L, 4754L), almna_prod_Total = c(8859L, 
    8731L, 8821L, 8774L, 8593L, 9042L), almna_prod_DailyAverage = c(285.77, 
    291.03, 284.55, 283.03, 286.43, 291.68), TARGET = c(1749.1, 
    1834.4, 1945.41, 2030.23, 1992.48, 1938.2), lme_al_3month_price = c(1790.98, 
    1868.83, 1967.28, 2038.1, 2023.48, 1956.39), lme_al_stock = c(5271645, 
    5116815, 4990208, 4887148, 4706865, 4519988), Coal_Central_Appalachia_Price = c(60.7, 
    60.05, 57.25, 56.35, 54.59, 56.3), Coal_Northern_Apppalachia_Price = c(65.5, 
    64.5, 63.7, 62.75, 63.3, 65.55), Coal_Illinois_Basin_Price = c(46, 
    45.45, 44, 44, 43.4, 44.7), Coal_Powder_River_Price = c(13, 
    12.55, 12.25, 11.55, 11.25, 11.55), Coal_Uinta_Basin_Price = c(36.75, 
    36.75, 36.45, 36.45, 37, 37.75), BrentCrudePrice = c(109.21, 
    111.03, 104.94, 101.12, 94.67, 84.17)), class = c("data.table", 
"data.frame"), row.names = c(NA, -6L), .internal.selfref = <pointer: 0x00000000025f1ef0>)

score 0 · Accepted Answer

下面是一个使用 mtcars 示例数据的例子，其中 1 表示应从数据框中删除对应的列。

data(mtcars)

# Numeric 0/1 flags, one per mtcars column in column order:
# 1 = drop the column, 0 = keep it. The vector is numeric, not logical.
drop <- c(rep(0, 4), rep(1, 4), rep(0, 3))

# Negating the numeric flags coerces them to logical (0 -> TRUE, 1 -> FALSE),
# which gives a keep-mask. List-style `[` then selects the 7 kept columns,
# leaving out drat, wt, qsec and vs.
keep_mask <- !drop
theSubset <- mtcars[keep_mask]
head(theSubset)

...和输出：

> head(theSubset)
                   mpg cyl disp  hp am gear carb
Mazda RX4         21.0   6  160 110  1    4    4
Mazda RX4 Wag     21.0   6  160 110  1    4    4
Datsun 710        22.8   4  108  93  1    4    1
Hornet 4 Drive    21.4   6  258 110  0    3    1
Hornet Sportabout 18.7   8  360 175  0    3    2
Valiant           18.1   6  225 105  0    3    1
>

score 0 · Accepted Answer

如果 1 表示变量不存在多重共线性，可以使用下面的代码：

# Raw data: simulate 48 observations of 32 predictors named var1 ... var32.
# Built directly as a matrix; the values are drawn in the same column-by-column
# order as calling rnorm(48) once per variable.
n_obs <- 48L
var_names <- paste0("var", 1:32)
dataset <- matrix(
  rnorm(n_obs * length(var_names)),
  nrow = n_obs,
  dimnames = list(NULL, var_names)
)

# One 0/1 flag per column of `dataset`, in column order.
imcdiag_idiagval.Klein <- c(
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  0, 0, 0, 0, 0, 0, 0, 0
)

# A flag vector of the wrong length would be silently recycled or index out
# of bounds, so fail early instead.
stopifnot(length(imcdiag_idiagval.Klein) == ncol(dataset))

# Final data: keep the columns flagged 1, i.e. this treats 1 as "not
# multicollinear". If 1 instead marks a collinear variable (as the question
# describes for Klein), select `imcdiag_idiagval.Klein == 0` here.
# drop = FALSE keeps a matrix even when only one column is selected.
final_dataset <- dataset[, imcdiag_idiagval.Klein == 1, drop = FALSE]

以下为编辑后补充的内容，改用您的数据集。

在您的数据中，您有 34 列：

names(data) 
 [1] "Mon"                             "Al_Prod_Qty"                    
  [3] "US_Indus_Growth"                 "China_Elec_Rail"                
  [5] "DS_Index"                        "Al_Prod_Africa"                 
  [7] "Al_Prod_NorthAmerica"            "Al_Prod_SouthAmerica"           
  [9] "Al_Prod_AsiaEXChina"             "Al_Prod_WestEurope"             
 [11] "Al_Prod_EastCentralEurope"       "Al_Prod_Oceania"                
 [13] "Al_Prod_GCC"                     "Al_Prod_ChinaEstimated"         
 [15] "Al_Prod_Total"                   "Al_Prod_DailyAvg"               
 [17] "almna_prod_AfricaAsiaExChina"    "almna_prod_NorthAmerica"        
 [19] "almna_prod_SouthAmerica"         "almna_prod_WestEurope"          
 [21] "almna_prod_EastCentralEurope"    "almna_prod_Oceania"             
 [23] "almna_prod_ChinaEstimated"       "almna_prod_Total"               
 [25] "almna_prod_DailyAverage"         "TARGET"                         
 [27] "lme_al_3month_price"             "lme_al_stock"                   
 [29] "Coal_Central_Appalachia_Price"   "Coal_Northern_Apppalachia_Price"
 [31] "Coal_Illinois_Basin_Price"       "Coal_Powder_River_Price"        
 [33] "Coal_Uinta_Basin_Price"          "BrentCrudePrice"

所以你需要一个长度为 33 的向量：

 imcdiag_idiagval.Klein<-sample(0:1,33,replace = T)

添加响应变量：

# Insert the keep-flag (1) for the response at TARGET's actual column position
# so that the flags line up with names(data). Appending it at the end would
# force the last column to be kept and leave TARGET to the other flags.
# Assumes the flags in imcdiag_idiagval.Klein follow the order of the
# non-response columns of `data`.
target_pos <- match("TARGET", names(data))
stopifnot(
  !is.na(target_pos),
  length(imcdiag_idiagval.Klein) == ncol(data) - 1L
)
fina_index <- append(imcdiag_idiagval.Klein, 1, after = target_pos - 1L)

最终数据：

# Keep only the columns whose flag is 1.
# The flag vector must have exactly one entry per column of `data`.
stopifnot(length(fina_index) == ncol(data))
# as.data.frame() makes selection by logical mask behave the same whether
# `data` is a plain data frame or a data.table (the question's dput shows class
# "data.table", where `[, j]` would evaluate the mask as an expression instead
# of selecting columns). drop = FALSE keeps a data frame even if only one
# column is flagged.
final_data <- as.data.frame(data)[, fina_index == 1, drop = FALSE]

对比检查：

dim(final_data)
[1]  6 16
sum(fina_index)
[1] 16
sum(fina_index)

r - R：根据多重共线性检验的输出，移除/删除存在多重共线性的变量

2 回答 2

编辑帖子以使用您的数据集。

Related

Reference