r - 使用 dplyr 汇总多个列的最小值和最大值范围

Question

我正在尝试根据现有变量自动生成几个新变量。我想使用它们各自的最小和最大变量来获取“a”、“b”和“c”中的每一个的值范围。我正在模拟的数据来自聚合到区域统计数据的卫星传感器，这意味着每一行都是一个多边形特征。

这是一个要使用的玩具数据框：

# Toy data: 100 rows, one min/max column pair per variable (a, b, c).
# Each *.max is drawn from an interval that starts where the matching
# *.min interval ends.
dat <- data.frame(
  a.min = runif(n = 100, min = 0, max = 100),
  b.min = runif(n = 100, min = 0, max = 10),
  c.min = runif(n = 100, min = 0, max = 0.5),
  a.max = runif(n = 100, min = 100, max = 200),
  b.max = runif(n = 100, min = 10, max = 20),
  c.max = runif(n = 100, min = 0.5, max = 1)
)

这是执行此操作的手动方式：

# Manual approach: one new column per variable, max column minus min column.
dat[["a.range"]] <- dat[["a.max"]] - dat[["a.min"]]
dat[["b.range"]] <- dat[["b.max"]] - dat[["b.min"]]
dat[["c.range"]] <- dat[["c.max"]] - dat[["c.min"]]

# Show the first rows, now including the three *.range columns.
head(dat)

如何使用 dplyr 以自动化方式完成此任务？我知道我的数据中会有 NA 值。

到目前为止，我有：

# Keep only the columns whose names contain "min" or "max".
# The pipe already supplies `dat` as the first argument of select(); naming it
# again would make select() treat the data frame as a column selection.
dat %>% select(matches("min|max"))

我试图定义一个范围函数：

# Width of an interval: upper bound `y` minus lower bound `x`.
# Vectorised over both arguments; an NA in either input gives NA.
rng <- function(x, y) {
  y - x
}

我不知道 select 之后该怎么继续。我想我需要使用“mutate”或“across”？

干杯和感谢！

score 3 · Accepted Answer

我们可以先用 pivot_longer 把数据转换为长格式，然后计算范围（最大值减最小值）：

library(dplyr)
library(tidyr)
# Split each column name at the dot: the part before it (a, b, c) becomes a
# value column via the ".value" sentinel and the part after it (min / max)
# goes into `grp`. Then subtract the "min" rows from the "max" rows and
# attach the resulting *.range columns to the original data.
dat1 <- bind_cols(
  dat,
  dat %>%
    pivot_longer(
      cols = everything(),
      names_to = c(".value", "grp"),
      names_sep = "\\."
    ) %>%
    summarise(
      across(
        a:c,
        ~ .x[grp == "max"] - .x[grp == "min"],
        .names = "{.col}.range"
      )
    )
)

-输出

# Preview the first six rows of the combined result.
head(dat1, n = 6L)
#     a.min    b.min     c.min    a.max    b.max     c.max   a.range   b.range   c.range
#1 27.646339 4.055958 0.1095838 179.7785 14.82492 0.5455450 152.13219 10.768966 0.4359612
#2 77.459085 9.549793 0.4220214 187.4912 12.64510 0.5871106 110.03215  3.095303 0.1650892
#3 79.308797 8.449052 0.2786377 137.7695 15.64397 0.9327440  58.46075  7.194922 0.6541063
#4  8.430773 2.060054 0.3746367 125.9992 17.76314 0.9935886 117.56838 15.703083 0.6189519
#5 89.627414 5.498631 0.3217548 112.5346 17.39814 0.8001432  22.90720 11.899511 0.4783885
#6 74.553222 9.621933 0.4568924 156.3704 18.85852 0.7971354  81.81716  9.236589 0.3402430

score 2 · Accepted Answer

这可以只用 dplyr 完成

如果您想在保留原有列的同时添加新列，可以使用 mutate；下面先用 transmute，它只返回新生成的列

# Fix the RNG state so the toy data is reproducible.
set.seed(1)

# 100 rows, one min/max column pair per variable (a, b, c).
dat <- data.frame(
  a.min = runif(n = 100, min = 0, max = 100),
  b.min = runif(n = 100, min = 0, max = 10),
  c.min = runif(n = 100, min = 0, max = 0.5),
  a.max = runif(n = 100, min = 100, max = 200),
  b.max = runif(n = 100, min = 10, max = 20),
  c.max = runif(n = 100, min = 0.5, max = 1)
)



library(dplyr)

# Return only the new range columns: for every column ending in "max",
# subtract the matching "min" column. The outputs are named "<column>.range"
# (e.g. "a.max.range").
dat %>%
  transmute(
    across(
      ends_with("max"),
      # Anchor the substitution to the end of the name so only the "max"
      # suffix is swapped; an unanchored gsub() would also rewrite any "max"
      # appearing earlier in a column name.
      ~ . - get(sub("max$", "min", cur_column())),
      .names = '{.col}.range'
    )
  )
#>     a.max.range b.max.range c.max.range
#> 1     140.82036  10.0415368   0.6433344
#> 2      72.27340   8.3187269   0.7348150
#> 3      91.97428  16.8411799   0.5706304
#> 4      55.33441   9.0516443   0.6971961
#> 5     117.35346  13.1020379   0.6455329
#> 6     109.27095  15.1048262   0.7254760
#> 7      23.16754  12.4098472   0.4197071
#> 8     115.26374  13.0289951   0.8601976
#> 9      43.93326   0.8707504   0.7501387
#> 10    133.86635  13.4154775   0.4073330
#> 11     93.51698  10.1757853   0.4563699
#> 12    101.67531   6.2561323   0.6834871
#> 13    115.43289  13.9090815   0.1224529
#> 14    133.58103  13.6143533   0.3899003
#> 15     49.73707  15.5764745   0.4489774
#> 16     99.73024  14.6274746   0.4395670
#> 17     36.54954   7.7908846   0.4982439
#> 18     36.19781  12.0486821   0.2943119
#> 19    158.91736  12.4872790   0.1907941
#> 20     84.72690  11.8269226   0.6489648
#> 21     72.99130   4.4287902   0.7833635
#> 22    110.03471  10.1913907   0.7717606
#> 23     75.40158  11.7866144   0.5131911
#> 24    187.05223   9.6972425   0.3091013
#> 25    158.78617   5.8966645   0.2320550
#> 26    156.74343   9.5186809   0.7086089
#> 27    179.89148   5.7414122   0.7733848
#> 28    139.97942  17.2502681   0.8561180
#> 29     39.81873  16.0972592   0.7696968
#> 30    142.18025  12.8372130   0.5699562
#> 31    150.42315  13.6084027   0.6296052
#> 32     69.40397   9.9539625   0.4489674
#> 33     90.58098  13.4321886   0.4809792
#> 34    162.59139   8.9506943   0.3231189
#> 35     24.97784   1.7643494   0.3635593
#> 36     69.52301  12.0359528   0.1767952
#> 37     64.83526  11.7874100   0.3404482
#> 38    104.87705   5.2612129   0.3909888
#> 39     85.84943   4.9707435   0.3720141
#> 40    155.88877  14.1287602   0.5845266
#> 41    116.85535   3.5874035   0.5457002
#> 42     52.93918   6.9245058   0.6886609
#> 43     75.91977  12.6198181   0.3714896
#> 44     83.12676  13.5158301   0.5930698
#> 45    114.64445   5.3493943   0.3910495
#> 46     47.99375   9.7052778   0.4522083
#> 47    144.59197  11.9143685   0.7836076
#> 48     69.45700  11.9580705   0.6232934
#> 49     63.68757  10.1873592   0.5509458
#> 50    103.26737   2.0548773   0.3948805
#> 51    100.85295  13.4967207   0.5544158
#> 52     20.25930   8.8535057   0.3196302
#> 53    134.64491  10.4149506   0.2701788
#> 54    117.35244   7.1711213   0.6513636
#> 55    191.03390   5.5374946   0.6428320
#> 56    118.34178  14.7696171   0.6368492
#> 57    153.16104  10.4853131   0.4231595
#> 58     56.36050  19.1182602   0.2755695
#> 59    122.44537  13.2603647   0.7241478
#> 60    106.51005   8.5225040   0.4473165
#> 61     19.62250   8.7731860   0.5653734
#> 62    103.96746   5.8030382   0.4078416
#> 63    137.83508   5.5569751   0.5319219
#> 64     94.44552   6.6147425   0.4719155
#> 65     93.61647   3.8031070   0.6024145
#> 66    157.87155  15.2528954   0.7655847
#> 67     59.26088  16.5202480   0.5885608
#> 68     93.64681   8.2759799   0.4447008
#> 69    161.45776   7.5246207   0.3301701
#> 70     58.86411  13.5264139   0.8940299
#> 71    109.78582   8.3048106   0.5480910
#> 72     72.27364  11.1453218   0.6546549
#> 73    158.17997   8.5640846   0.4351467
#> 74     89.66915  10.0578865   0.4625397
#> 75     74.54625   7.3722673   0.7645326
#> 76     52.80176   3.4467085   0.6665480
#> 77     46.91813  12.1931480   0.7052888
#> 78    147.48180   6.9489775   0.6135230
#> 79     39.98738   8.9256461   0.6826381
#> 80     53.27007   7.8884606   0.4832842
#> 81     99.50539  14.6658313   0.6602186
#> 82     85.17492   9.6414111   0.7108137
#> 83    125.61679   9.6300615   0.3831801
#> 84    165.32019  14.4347833   0.3311072
#> 85     47.50740   7.1136165   0.7742447
#> 86    103.81193  13.1305719   0.6383449
#> 87    108.57149   4.3167687   0.3777025
#> 88    170.99798   2.7733797   0.6866086
#> 89     86.80192   9.3385324   0.9150526
#> 90    182.00076   5.8646475   0.7437171
#> 91     90.76935   3.1099736   0.6635475
#> 92    108.46926  15.5023161   0.5557700
#> 93    128.29417  11.9207560   0.3982395
#> 94     63.07664   2.3702449   0.6624636
#> 95     37.59363  11.1587006   0.6616531
#> 96     55.09932   8.6900225   0.4835542
#> 97    120.45466  13.2367024   0.6903333
#> 98     90.16883   6.7650279   0.6880808
#> 99     54.07032  15.2828207   0.6647165
#> 100    54.29124   2.7231295   0.2606158

使用mutate

set.seed(42)
# Rebuild the toy data after reseeding: without this step `dat` would still
# hold the values generated under the earlier seed and the printed output
# below (which starts with a.min = 91.48060) would not be reproducible.
dat <- data.frame(a.min = runif(100, 0, 100),
                  b.min = runif(100, 0, 10),
                  c.min = runif(100, 0, 0.5),
                  a.max = runif(100, 100, 200),
                  b.max = runif(100, 10, 20),
                  c.max = runif(100, 0.5, 1))

# Keep the original columns and append "<column>.range" for every column
# ending in "max" (max column minus the matching min column).
dat %>%
  mutate(
    across(
      ends_with("max"),
      # Anchor the substitution so only the trailing "max" is replaced.
      ~ . - get(sub("max$", "min", cur_column())),
      .names = '{.col}.range'
    )
  ) %>%
  head()
#>      a.min    b.min      c.min    a.max    b.max     c.max a.max.range
#> 1 91.48060 6.262453 0.44255884 148.3768 10.22700 0.5682526    56.89621
#> 2 93.70754 2.171577 0.25855553 144.4570 15.13240 0.5885682    50.74941
#> 3 28.61395 2.165673 0.42596549 106.0386 16.30726 0.7597802    77.42461
#> 4 83.04476 3.889450 0.22139813 132.7506 14.18772 0.9055604    49.70584
#> 5 64.17455 9.424557 0.07894005 187.8429 18.79266 0.5576810   123.66835
#> 6 51.90959 9.626080 0.22116232 193.0605 11.07987 0.9467109   141.15089
#>   b.max.range c.max.range
#> 1    3.964547   0.1256938
#> 2   12.960818   0.3300127
#> 3   14.141588   0.3338147
#> 4   10.298266   0.6841623
#> 5    9.368103   0.4787410
#> 6    1.453791   0.7255486

由 reprex 包（v2.0.0）于 2021-04-29 创建

score 2 · Accepted Answer

akrun 出色答案的一个次要替代方案：

# set.seed(42)
# dat <- ...
head(dat, n = 6L)
#      a.min    b.min      c.min    a.max    b.max     c.max
# 1 91.48060 6.262453 0.44255884 148.3768 10.22700 0.5682526
# 2 93.70754 2.171577 0.25855553 144.4570 15.13240 0.5885682
# 3 28.61395 2.165673 0.42596549 106.0386 16.30726 0.7597802
# 4 83.04476 3.889450 0.22139813 132.7506 14.18772 0.9055604
# 5 64.17455 9.424557 0.07894005 187.8429 18.79266 0.5576810
# 6 51.90959 9.626080 0.22116232 193.0605 11.07987 0.9467109

library(dplyr)
library(tidyr)  # pivot_longer() and pivot_wider() come from tidyr, not dplyr
head(dat) %>%
  # Tag every row so its values can be reassembled on one line later.
  mutate(rn = row_number()) %>%
  # "a.min" -> ltr = "a" with the value stored in a column named "min"
  # (likewise "max"), giving one row per original row and letter.
  pivot_longer(
    -rn,
    names_pattern = "(.)\\.(.*)",
    names_to = c("ltr", ".value")
  ) %>%
  mutate(range = max - min) %>%
  # Spread back to wide form with names such as "a.min", "a.max", "a.range".
  pivot_wider(
    names_glue = "{ltr}.{.value}",
    names_from = "ltr",
    values_from = min:range
  )
# # A tibble: 6 x 10
#      rn a.min b.min  c.min a.max b.max c.max a.range b.range c.range
#   <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <dbl>   <dbl>   <dbl>
# 1     1  91.5  6.26 0.443   148.  10.2 0.568    56.9    3.96   0.126
# 2     2  93.7  2.17 0.259   144.  15.1 0.589    50.7   13.0    0.330
# 3     3  28.6  2.17 0.426   106.  16.3 0.760    77.4   14.1    0.334
# 4     4  83.0  3.89 0.221   133.  14.2 0.906    49.7   10.3    0.684
# 5     5  64.2  9.42 0.0789  188.  18.8 0.558   124.     9.37   0.479
# 6     6  51.9  9.63 0.221   193.  11.1 0.947   141.     1.45   0.726

这种做法的效率可能略低：它需要先转为长格式、再转回宽格式，重塑操作更多；而 akrun 的答案只重塑一次，然后把结果列直接绑定到原始数据上。对于这份样本数据，我相信 akrun 的方法运行得更快。

score 1 · Accepted Answer

您也可以使用以下解决方案。glue 和 get 函数的组合在这种情况下非常有用：先用 glue 函数构造变量的名称，然后用 get 取出该变量的值。

library(dplyr)
library(purrr)
library(glue)

# Each list name is an output column; each value is the prefix shared by the
# corresponding "<prefix>.max" / "<prefix>.min" input columns. glue() builds
# the column name and get() looks it up. The result stays a rowwise tibble.
rowwise(dat) %>%
  mutate(
    map_dfc(
      list(a.range = "a", b.range = "b", c.range = "c"),
      ~ get(glue("{.x}.max")) - get(glue("{.x}.min"))
    )
  )


# A tibble: 100 x 9
# Rowwise: 
   a.min  b.min  c.min a.max b.max c.max a.range b.range c.range
   <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl>   <dbl>   <dbl>   <dbl>
 1  91.5 6.26   0.443   148.  10.2 0.568    56.9    3.96   0.126
 2  93.7 2.17   0.259   144.  15.1 0.589    50.7   13.0    0.330
 3  28.6 2.17   0.426   106.  16.3 0.760    77.4   14.1    0.334
 4  83.0 3.89   0.221   133.  14.2 0.906    49.7   10.3    0.684
 5  64.2 9.42   0.0789  188.  18.8 0.558   124.     9.37   0.479
 6  51.9 9.63   0.221   193.  11.1 0.947   141.     1.45   0.726
 7  73.7 7.40   0.484   139.  19.8 0.788    65.6   12.4    0.304
 8  13.5 7.33   0.242   116.  12.6 0.573   102.     5.32   0.331
 9  65.7 5.36   0.126   132.  10.8 0.951    66.3    5.49   0.825
10  70.5 0.0227 0.130   131.  13.9 0.627    60.2   13.8    0.497
# ... with 90 more rows

数据

# Seed 42 reproduces the values printed in the answer above.
set.seed(42)

# Columns are generated in this order: a.min, b.min, c.min, a.max, b.max, c.max.
dat <- data.frame(
  a.min = runif(n = 100, min = 0, max = 100),
  b.min = runif(n = 100, min = 0, max = 10),
  c.min = runif(n = 100, min = 0, max = 0.5),
  a.max = runif(n = 100, min = 100, max = 200),
  b.max = runif(n = 100, min = 10, max = 20),
  c.max = runif(n = 100, min = 0.5, max = 1)
)

score 1 · Accepted Answer

如果列名的前缀部分（例如“a”、“b”、“c”）是已知的，则这可能是一种更易于阅读且效率更高的替代方法。

library(tidyverse)

# Prefixes shared by each "<prefix>.min" / "<prefix>.max" column pair.
base_names <- c("a", "b", "c")

# Compute the range columns inside local() so the helper objects stay out of
# the calling environment; only `result` is assigned.
result <- local({
  upper <- select(dat, str_c(base_names, ".max"))
  lower <- select(dat, str_c(base_names, ".min"))
  # Both selections list the prefixes in the same order, so the data-frame
  # subtraction pairs each max column with its own min column.
  spans <- setNames(upper - lower, str_c(base_names, ".range"))
  bind_cols(dat, spans)
})

r - 使用 dplyr 汇总多个列的最小值和最大值范围

5 回答 5

Related

Reference