2

我正在尝试按行计算某些列的 mean() 和 sum()。以下代码将生成数据集:

library(tidyverse)

# Example data: five participants, three categorical answers each.
test_data <- tibble(
  part_id = 1:5,
  a_1 = c("a", "b", "c", "d", "a"),
  a_2 = c("b", NA, "b", "a", "d"),
  a_3 = c("b", "b", "d", "d", "a")
)

# Score a_1 and a_2 into new a_1_scored / a_2_scored columns:
# "a" or "b" -> 1, "c" -> 0, "d" -> -100; NA has no matching case and stays NA.
test_data <- mutate_at(
  test_data,
  vars(a_1, a_2),
  .funs = list(scored = ~ case_when(
    . %in% c("a", "b") ~ 1,
    . == "c" ~ 0,
    . == "d" ~ -100
  ))
)

如果我尝试使用 rowSums() 或 rowMeans(),我会得到正确答案:

library(tidyverse)

# rowSums()/rowMeans() take the whole block of scored columns and reduce
# across columns, producing one value per row.
test_data <- test_data %>%
  mutate(
    a_total = rowSums(select(., contains("scored")), na.rm = TRUE),
    a_mean = rowMeans(select(., contains("scored")), na.rm = TRUE)
  )

但是,如果尝试使用 rowwise() 后跟 sum() 或 mean(),它不起作用:

library(tidyverse)

# Failing attempt kept as-is: per-row sum()/mean() after rowwise().
# Reported result: sum() returns the total over all rows, and mean() returns
# NA with an "argument is not numeric or logical" warning for every row.
# NOTE(review): `.` is the magrittr placeholder for the whole piped-in data
# frame, not the current row, so rowwise() has no effect on
# dplyr::select(., ...) -- this is presumably the cause of both symptoms.
test_data <- test_data %>%
  rowwise() %>%
  mutate(a_total = base::sum(dplyr::select(., contains("scored")), na.rm = TRUE),
         a_mean = base::mean(dplyr::select(., contains("scored")), na.rm = TRUE)) %>%
  ungroup()

对于 sum(),它给出的是所有行的总和,实际上忽略了 rowwise();对于 mean(),所有结果都是 NA,并且每一行都会收到以下警告:

Warning messages:
1: In mean.default(dplyr::select(., contains("scored")), na.rm = TRUE) :
  argument is not numeric or logical: returning NA

我还尝试通过包含 c() 函数进行此修改,就像您要列出每一列一样。这导致了以下错误:

library(tidyverse)

# Failing attempt kept as-is: wrapping the selected columns in c() before
# sum()/mean(). Reported result: sum() stops with
# "invalid 'type' (list) of argument".
# NOTE(review): dplyr::select() returns a data frame and c() on a data frame
# yields a list, which sum() cannot add up -- presumably the cause of the error.
test_data <- test_data %>%
  rowwise() %>%
  mutate(a_total = base::sum(c(dplyr::select(., contains("scored"))), na.rm = TRUE),
         a_mean = base::mean(c(dplyr::select(., contains("scored"))), na.rm = TRUE)) %>%
  ungroup()

Error in base::sum(c(dplyr::select(., contains("scored"))), na.rm = TRUE) : 
  invalid 'type' (list) of argument

如何使用 rowwise() 完成这项工作?为什么它的行为与典型的和 rowSums() 或 rowMeans() 如此不同?

我很感激任何见解!

4

2 回答 2

1

问题在于 rowwise() 按行进行分组,而 sum()、mean() 等函数作用于向量(vector)。在这里,它们实际上被应用到了一个单行的 data.frame 上。用 unlist() 包装后,该 data.frame 就会被转换为向量。

library(dplyr)

# Per-row total and mean of the "*_scored" columns using rowwise().
#
# Inside mutate(), `.` is the magrittr placeholder for the whole piped-in data
# frame, so select(., ...) ignores rowwise() and every row would receive the
# grand total / grand mean. pick() (dplyr >= 1.1.0) instead returns the
# selected columns of the current row as a one-row data frame, which unlist()
# flattens into a numeric vector that sum() and mean() can consume.
# On dplyr 1.0.x, c_across(contains("scored")) is the equivalent helper.
test_data <- test_data %>%
  rowwise() %>%
  mutate(
    a_total = sum(unlist(pick(contains("scored"))), na.rm = TRUE),
    a_mean = mean(unlist(pick(contains("scored"))), na.rm = TRUE)
  ) %>%
  ungroup()

或使用pmap

library(purrr)

# Alternative without rowwise(): pmap_dbl() walks the scored columns in
# parallel and hands one row's values to the function on each call.
test_data %>%
  mutate(
    a_total = pmap_dbl(
      select(., contains("scored")),
      function(...) sum(c(...), na.rm = TRUE)
    ),
    a_mean = pmap_dbl(
      select(., contains("scored")),
      function(...) mean(c(...), na.rm = TRUE)
    )
  )
于 2020-04-05T17:38:14.633 回答
0

如果您想继续使用 rowwise() 方法,这里是另一种做法:用 {rlang} 来捕获需要求和与求平均的变量:

library(dplyr)

# Example data: five participants, three categorical answers each.
test_data <- tibble(
  part_id = 1:5,
  a_1 = c("a", "b", "c", "d", "a"),
  a_2 = c("b", NA, "b", "a", "d"),
  a_3 = c("b", "b", "d", "d", "a")
)

# Score a_1 and a_2 into new a_1_scored / a_2_scored columns:
# "a" or "b" -> 1, "c" -> 0, "d" -> -100; NA has no matching case and stays NA.
test_data <- mutate_at(
  test_data,
  vars(a_1, a_2),
  .funs = list(scored = ~ case_when(
    . %in% c("a", "b") ~ 1,
    . == "c" ~ 0,
    . == "d" ~ -100
  ))
)

# Names of the scored columns to aggregate
vars <- names(select(test_data, contains("scored")))

# Splice the names in as symbols so that, under rowwise(), each one refers to
# the current row's value of that column
test_data %>%
  rowwise() %>%
  mutate(
    a_sum = sum(c(!!!rlang::syms(vars)), na.rm = TRUE),
    a_mean = mean(c(!!!rlang::syms(vars)), na.rm = TRUE)
  ) %>%
  ungroup()
#> # A tibble: 5 x 8
#>   part_id a_1   a_2   a_3   a_1_scored a_2_scored a_sum a_mean
#>     <int> <chr> <chr> <chr>      <dbl>      <dbl> <dbl>  <dbl>
#> 1       1 a     b     b              1          1     2    1  
#> 2       2 b     <NA>  b              1         NA     1    1  
#> 3       3 c     b     d              0          1     1    0.5
#> 4       4 d     a     d           -100          1   -99  -49.5
#> 5       5 a     d     a              1       -100   -99  -49.5

reprex 包于 2020-04-05 创建(v0.3.0)

于 2020-04-05T17:58:45.087 回答