1

我的数据看起来基本上是这样的:

# Reproducible example: five ids; each row holds a value in exactly one
# Volume* column and NA in all the others.
id <- c(1:5)
VolumeA <- c(12, NA, NA, NA, NA)
VolumeB <- c(NA, 34, NA, NA, NA)
VolumeC <- c(NA, NA, 56, NA, NA)
VolumeD <- c(NA, NA, NA, 78, NA)
VolumeE <- c(NA, NA, NA, NA, 90)

# tibble() comes from the tibble package (attached by library(tidyverse)).
df_now <- tibble(id, VolumeA, VolumeB, VolumeC, VolumeD, VolumeE)
# Print the input data.
df_now

# A tibble: 5 x 6
     id VolumeA VolumeB VolumeC VolumeD VolumeE
  <int>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1     1      12      NA      NA      NA      NA
2     2      NA      34      NA      NA      NA
3     3      NA      NA      56      NA      NA
4     4      NA      NA      NA      78      NA
5     5      NA      NA      NA      NA      90

在 IRL(真实)数据集中,还有很多 Volume[label] 列,但在每一行中,我只需要其中之一:值最大的那一列。所以我想创建一个保存该最大值的新变量:

# Desired result, built by hand: the single non-NA value of each row,
# appended to df_now as a new Volume column.
Volume <- c(12, 34, 56, 78, 90)
df_desired <- cbind(df_now, Volume)
df_desired

  id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
1  1      12      NA      NA      NA      NA     12
2  2      NA      34      NA      NA      NA     34
3  3      NA      NA      56      NA      NA     56
4  4      NA      NA      NA      78      NA     78
5  5      NA      NA      NA      NA      90     90

在查看了 dplyr 文档后,我尝试了这个......

library(tidyverse)
# Attempt from the question that does NOT produce the desired single column:
# across() applies max() to each selected column rather than across each row.
# NOTE(review): contains("Volume") is the selector used here; the answers use
# starts_with('Volume') instead.
df_try <- df_now %>%
  mutate(Volume = across(contains("Volume"), max, na.rm = TRUE))

...但是得到的是一个 tibble 数据框,而不是单独的一列。有人可以告诉我如何正确执行此操作吗?

(请假设,由于我的 IRL 数据中一些过于复杂、无法在此处解释的原因,我不能简单地对数据使用 gather 和 spread。我想使用条件 mutate。)

4

2 回答 2

3

由于您有“更多 Volume[label] 列”,任何逐行处理(rowwise)或逐列单独处理(使用 reduce 或 Reduce)的解决方案都将比必要的慢得多。

# Row-wise maximum in one vectorised call: the selected Volume* columns are
# passed to pmax() as separate arguments, together with na.rm = TRUE.
df_now %>%
  mutate(Volume = do.call(pmax, c(select(., starts_with('Volume')), na.rm = TRUE)))
# # A tibble: 5 x 7
#      id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
#   <int>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
# 1     1      12      NA      NA      NA      NA     12
# 2     2      NA      34      NA      NA      NA     34
# 3     3      NA      NA      56      NA      NA     56
# 4     4      NA      NA      NA      78      NA     78
# 5     5      NA      NA      NA      NA      90     90

相对改善的证明:

  1. 使用 Reduce 或 purrr::reduce,或任何会逐列迭代的方法(更准确地说,若有 nc 列,则会迭代 nc-1 次):

    # Thin wrapper around pmax() that emits a "mypmax" message on every call,
    # so the number of calls an approach makes is visible in the console.
    mypmax <- function(...) {
      message("mypmax")
      pmax(...)
    }
    # reduce() folds the selected columns through mypmax, so it is called once
    # per additional column: four "mypmax" messages for five Volume* columns.
    df_now %>%
      mutate(Volume = reduce(select(., starts_with('Volume')), mypmax, na.rm = TRUE))
    # mypmax
    # mypmax
    # mypmax
    # mypmax
    # # A tibble: 5 x 7
    #      id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
    #   <int>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
    # 1     1      12      NA      NA      NA      NA     12
    # 2     2      NA      34      NA      NA      NA     34
    # 3     3      NA      NA      56      NA      NA     56
    # 4     4      NA      NA      NA      78      NA     78
    # 5     5      NA      NA      NA      NA      90     90
    
  2. 任何 rowwise 的方法都会对每一行执行一次,很可能更糟(假设数据中的行数多于列数):

    # Thin wrapper around max() that emits a "mymax" message on every call,
    # so the number of calls an approach makes is visible in the console.
    mymax <- function(...) {
      message("mymax")
      max(...)
    }
    # Under rowwise, mymax runs once per row: five "mymax" messages for the
    # five rows of df_now.
    df_now %>%
      rowwise %>%
      mutate(Volume = mymax(c_across(starts_with('Volume')), na.rm = TRUE))
    # mymax
    # mymax
    # mymax
    # mymax
    # mymax
    # # A tibble: 5 x 7
    # # Rowwise: 
    #      id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
    #   <int>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
    # 1     1      12      NA      NA      NA      NA     12
    # 2     2      NA      34      NA      NA      NA     34
    # 3     3      NA      NA      56      NA      NA     56
    # 4     4      NA      NA      NA      78      NA     78
    # 5     5      NA      NA      NA      NA      90     90
    
  3. 对所有列、所有行执行一次:

    # Message-emitting pmax() wrapper: prints "mypmax" each time it is invoked,
    # then returns the parallel maximum of its arguments.
    mypmax <- function(...) {
      message("mypmax")
      pmax(...)
    }
    # do.call() hands all selected columns to mypmax at once: a single call
    # (one "mypmax" message) covers every row and every Volume* column.
    df_now %>%
      mutate(Volume = do.call(mypmax, c(select(., starts_with('Volume')), na.rm = TRUE)))
    # mypmax
    # # A tibble: 5 x 7
    #      id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
    #   <int>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
    # 1     1      12      NA      NA      NA      NA     12
    # 2     2      NA      34      NA      NA      NA     34
    # 3     3      NA      NA      56      NA      NA     56
    # 4     4      NA      NA      NA      78      NA     78
    # 5     5      NA      NA      NA      NA      90     90
    

在这种规模下,基准测试的差异很小,但对于更大的数据,差异会更加明显:

# Time the three approaches on df_now:
#   red = reduce() over the columns with pmax
#   row = rowwise + max(c_across(...))
#   sgl = a single do.call(pmax, ...) over all columns
microbenchmark::microbenchmark(
  red = df_now %>% mutate(Volume = reduce(select(., starts_with('Volume')), pmax, na.rm = TRUE)),
  row = df_now %>% rowwise %>% mutate(Volume = max(c_across(starts_with('Volume')), na.rm = TRUE)),
  sgl = df_now %>% mutate(Volume = do.call(pmax, c(select(., starts_with('Volume')), na.rm = TRUE)))
)
# Unit: milliseconds
#  expr    min      lq     mean  median      uq     max neval
#   red 4.9736 5.36240 7.240561 5.68010 6.19915 70.7482   100
#   row 4.5813 5.02020 6.082047 5.34460 5.70345 63.1166   100
#   sgl 3.8270 4.18605 5.803043 4.43215 4.76030 65.7217   100
于 2020-08-05T16:54:09.103 回答
2

我们可以使用 pmax(本回答最先在此处发布了使用 pmax 的解决方案)。请注意,使用 do.call 带来的相对改进非常小。

library(dplyr)
library(purrr)
# Fold the Volume* columns through pmax() with na.rm = TRUE to get the
# per-row maximum as a new Volume column.
df_now %>%
    mutate(Volume = reduce(select(., starts_with('Volume')), pmax, na.rm = TRUE))
# A tibble: 5 x 7
#     id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
#  <int>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
#1     1      12      NA      NA      NA      NA     12
#2     2      NA      34      NA      NA      NA     34
#3     3      NA      NA      56      NA      NA     56
#4     4      NA      NA      NA      78      NA     78
#5     5      NA      NA      NA      NA      90     90

或使用 c_across 和 max(仅使用 tidyverse 方法)

# Per-row alternative: gather each row's Volume* values with c_across() and
# take their max() with na.rm = TRUE.
df_now %>%
   rowwise %>%
   mutate(Volume = max(c_across(starts_with('Volume')), na.rm = TRUE))
# A tibble: 5 x 7
# Rowwise: 
#     id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
#  <int>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
#1     1      12      NA      NA      NA      NA     12
#2     2      NA      34      NA      NA      NA     34
#3     3      NA      NA      56      NA      NA     56
#4     4      NA      NA      NA      78      NA     78
#5     5      NA      NA      NA      NA      90     90

基准

# Timing of the reduce() + pmax approach.
system.time({df_now %>% mutate(Volume = reduce(select(., starts_with('Volume')), pmax, na.rm = TRUE))})
#   user  system elapsed 
#  0.023   0.006   0.029 

# Timing of the rowwise + max(c_across()) approach.
system.time({df_now %>% rowwise %>% mutate(Volume = max(c_across(starts_with('Volume')), na.rm = TRUE))})
#   user  system elapsed 
#  0.012   0.002   0.015 

# Timing of the single do.call(pmax, ...) approach.
system.time({df_now %>% mutate(Volume = do.call(pmax, c(select(., starts_with('Volume')), na.rm = TRUE)))})
#   user  system elapsed 
#  0.011   0.001   0.011 

注意:时间上没有太大差异

于 2020-08-05T16:37:40.813 回答