2

我正在编写一个函数,用 dplyr 和 tidyr 从计数表计算优势比(odds ratio),这需要用到非标准求值(NSE)。显然,这是我第一次接触 NSE。

例如,使用数据框“foo”:

# A tibble: 4 x 3
   strata   group     select
    <chr>   <chr>      <chr>
1 Manager A_Group     Chosen
2  Worker A_Group     Chosen
3 Manager B_Group Not_Chosen
4  Worker B_Group     Chosen
5 ...

我首先做计数: foo2 <- foo %>% count(strata, group, select)

# A tibble: 8 x 4
   strata   group     select     n
    <chr>   <chr>      <chr> <int>
1 Manager A_Group     Chosen     1
2 Manager A_Group Not_Chosen     9
3 Manager B_Group     Chosen     1
4 Manager B_Group Not_Chosen     3
5 ...

接下来,我使用 tidyr 的 unite 和 spread 将其转换为宽格式,新列以 group 列和 select 列的值拼接命名:

# Combine group and select into one key column, then spread the counts to
# wide form (one column per group/select combination, missing cells as 0).
foo2 %>%
  unite(cat, c(group, select)) %>%
  spread(cat, n, fill = 0)

# A tibble: 2 x 5
strata A_Group_Chosen A_Group_Not_Chosen B_Group_Chosen B_Group_Not_Chosen
*   <chr>          <dbl>              <dbl>          <dbl>              <dbl>
1 Manager              1                  9              1                  3
2  Worker              1                 11              1                  3

最后,我计算一个新列 OR:

 ... %>% mutate(OR = (A_Group_Chosen * B_Group_Not_Chosen) /
  (A_Group_Not_Chosen * B_Group_Chosen))

要将此代码放入函数中,我使用 enquo 和 !! 处理原始列;但要计算新列 OR,我需要用到新创建的列(其名称由 group 列和 select 列的值拼接而成)。问题是:在 OR 的计算中,如何“取消引用”这些列名?

我当前的草稿在 unite/spread 之后保存中间结果,将列名放入一个向量,并使用 $`!!`() 运算符。这感觉很笨拙。有更好的方法吗?

我的函数:

# Build a wide table of counts per stratum and add an odds-ratio column `OR`.
#
# Arguments:
#   dat      - data frame passed to count().
#   strat    - unquoted column; first counting variable, kept as its own
#              column in the wide table.
#   grp      - unquoted column; counted, then united with `decision`.
#   decision - unquoted column; counted, then united with `grp`.
#
# The last expression is the print(n = Inf) call, so the wide table with `OR`
# is printed as a side effect and the function's value is whatever print()
# returns.
OR_tab <- function(dat, strat, grp, decision ){
  # Capture the column arguments as quosures so they can be unquoted with !!.
  strat <- enquo(strat)
  grp <- enquo(grp)
  decision <- enquo(decision)



  # Count every strat/grp/decision combination, merge grp and decision into a
  # single `cat` key, and spread to one count column per key (absent
  # combinations become 0 because of fill = 0).
  tab <- dat %>% count(!!strat, !!grp, !!decision) %>% unite(cat, c(!!grp, !!decision)) %>% 
    spread(cat, n, fill = 0)
  # Names of columns 2-5 of the wide table.
  # NOTE(review): assumes one leading strata column followed by exactly four
  # grp/decision count columns -- confirm for inputs with other level counts.
  nm <- names(tab)[2:5]
  # OR = (column 2 * column 5) / (column 3 * column 4).
  # NOTE(review): tab$`!!`(nm[i]) is intended to unquote a column name after
  # `$`; this relies on how tidy evaluation treats `!!` here -- confirm.
  tab %>% mutate(OR = (tab$`!!`(nm[1]) * tab$`!!`(nm[4])) / (tab$`!!`(nm[2]) * (tab$`!!`(nm[3])))) %>% 
    print(n = Inf)
}

OR_tab(foo, strata, group, select)

我的原始数据框 'foo' :

> dput(foo2)
structure(list(strata = c("Manager", "Worker", "Manager", "Manager", 
"Worker", "Manager", "Manager", "Manager", "Worker", "Worker", 
"Worker", "Worker", "Worker", "Worker", "Manager", "Worker", 
"Worker", "Manager", "Manager", "Manager", "Worker", "Worker", 
"Manager", "Manager", "Manager", "Manager", "Worker", "Worker", 
"Worker", "Worker"), group = c("A_Group", "A_Group", "A_Group", 
"A_Group", "B_Group", "A_Group", "B_Group", "A_Group", "A_Group", 
"A_Group", "A_Group", "A_Group", "B_Group", "B_Group", "A_Group", 
"A_Group", "A_Group", "A_Group", "A_Group", "B_Group", "A_Group", 
"A_Group", "B_Group", "B_Group", "A_Group", "A_Group", "B_Group", 
"A_Group", "A_Group", "A_Group"), select = c("Chosen", "Chosen", 
"Not_Chosen", "Not_Chosen", "Not_Chosen", "Not_Chosen", "Not_Chosen", 
"Not_Chosen", "Not_Chosen", "Not_Chosen", "Not_Chosen", "Not_Chosen", 
"Not_Chosen", "Not_Chosen", "Not_Chosen", "Not_Chosen", "Not_Chosen", 
"Not_Chosen", "Not_Chosen", "Not_Chosen", "Not_Chosen", "Not_Chosen", 
"Not_Chosen", "Chosen", "Not_Chosen", "Not_Chosen", "Chosen", 
"Not_Chosen", "Not_Chosen", "Not_Chosen")), .Names = c("strata", 
"group", "select"), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-30L))
4

1 回答 1

0

您可以在 spread 之前先对长格式数据计算优势比,从而避免取消引用(如 @MrFlick 所建议的那样):

library(tidyverse)

# Odds ratio per stratum, computed on the long count table before reshaping,
# so the dynamically named wide columns never have to be referenced.
#
# Arguments:
#   dat      - data frame passed to count().
#   strat    - unquoted stratum column; one output row per value.
#   grp      - unquoted grouping column.
#   decision - unquoted outcome column.
#
# Returns an ungrouped table with the stratum column, `OR`, and one count
# column per grp/decision combination (named "<grp>_<decision>" by unite()).
#
# Within each stratum, `OR` is (n[1] * n[4]) / (n[2] * n[3]), where n[1..4]
# are the first four count rows in count()'s output order. Which pair of
# levels ends up in the numerator therefore depends on the ordering of the
# levels of `grp` and `decision`. A stratum with fewer than four count rows
# yields NA for `OR`.
OR_tab2 <- function(dat, strat, grp, decision) {

  strat <- enquo(strat)
  grp <- enquo(grp)
  decision <- enquo(decision)

  dat %>%
    count(!!strat, !!grp, !!decision) %>%
    group_by(!!strat) %>%
    # count() returns integer counts; multiply as doubles so large counts do
    # not overflow integer arithmetic (which would give NA with a warning).
    mutate(
      OR = (as.numeric(n[1]) * as.numeric(n[4])) /
        (as.numeric(n[2]) * as.numeric(n[3]))
    ) %>%
    # Drop the grouping so callers do not inherit a grouped table.
    ungroup() %>%
    unite(cat, c(!!grp, !!decision)) %>%
    spread(cat, n)

}

OR_tab2(foo2, strata, group, select)
  strata     OR A_Group_Chosen A_Group_Not_Chosen B_Group_Chosen B_Group_Not_Chosen
  <chr>   <dbl>          <int>              <int>          <int>              <int>
1 Manager 0.333              1                  9              1                  3
2 Worker  0.273              1                 11              1                  3

与您的原始代码一样,这适用于 group 和 select 参数各自只有两个级别的任何数据框,但哪一对级别出现在分子、哪一对出现在分母,取决于每列中级别的顺序。例如,请注意,对于下面重新编码的数据框 df2,其优势比与原始数据框 df(即上文的 foo2)的优势比互为倒数:

# Recode both factors under new column names; the new labels sort in a
# different relative order ("Chocolate" < "Vanilla" vs. "Chosen" <
# "Not_Chosen"), which is what flips the odds ratio below.
df2 <- foo2 %>% 
  mutate(experimental_groups = recode(group, 
                        "A_Group"="Control",
                        "B_Group"="Treatment"),
         flavor = recode(select, 
                         "Chosen"="Vanilla",
                         "Not_Chosen"="Chocolate"))

OR_tab2(df2, strata, experimental_groups, flavor)
  strata     OR Control_Chocolate Control_Vanilla Treatment_Chocolate Treatment_Vanilla
  <chr>   <dbl>             <int>           <int>               <int>             <int>
1 Manager  3.00                 9               1                   3                 1
2 Worker   3.67                11               1                   3                 1
OR_tab(df2, strata, experimental_groups, flavor)
  strata  Control_Chocolate Control_Vanilla Treatment_Chocolate Treatment_Vanilla    OR
  <chr>               <dbl>           <dbl>               <dbl>             <dbl> <dbl>
1 Manager                9.              1.                  3.                1.  3.00
2 Worker                11.              1.                  3.                1.  3.67
于 2018-04-01T14:34:55.217 回答