r - 识别包含指标的组

Question

我想识别出包含某个指标的组。在下面的示例中，我想找出包含 county == 'other' 的 district。如果某个 district 中存在 county == 'other'，那么我希望该 district 中的每一行都有一个值为 1 的指示变量，否则为 0。以下是使用 split、lapply 和 any 进行的几次尝试，但均无效。也许我可以提取所有 county == 'other' 的行，为该子集定义一个指标，然后将该子集与原始数据集合并回来，但我一直认为必须有一种更简单的方法。谢谢你的任何建议。

# Example data: nine counties spread over three districts of a single state.
# Districts C and WC each contain a county named 'other'; EC does not.
df.1 <- data.frame(
  state    = rep("AA", 9L),
  district = rep(c("EC", "C", "WC"), each = 3L),
  county   = c("AB", "BC", "DC", "FG", "other", "HC", "RT", "TT", "other"),
  apples   = c(100L, 10L, 150L, 200L, 20L, 250L, 300L, 30L, 350L),
  stringsAsFactors = FALSE
)

# Target output: the sample data plus an `indicator` column that is 1 on every
# row of a district containing a county named 'other', and 0 elsewhere.
desired.result <- data.frame(
  state     = rep("AA", 9L),
  district  = rep(c("EC", "C", "WC"), each = 3L),
  county    = c("AB", "BC", "DC", "FG", "other", "HC", "RT", "TT", "other"),
  apples    = c(100L, 10L, 150L, 200L, 20L, 250L, 300L, 30L, 350L),
  indicator = rep(c(0L, 1L, 1L), each = 3L),
  stringsAsFactors = FALSE
)

# various attempts that do not work
#
# Each call applies a function to the per-district pieces produced by split().
# lapply() yields a list keyed by district, not a column aligned with the rows
# of df.1, so even a correct test would still need to be spread back over rows.

# any() is given the raw county strings and its result is compared to
# 'county'; the trailing `<- 1` then turns that comparison into an assignment
# target, which is not a meaningful thing to assign to.
with(df.1, lapply(split(county, district), function(x) {any(x)=='county' <- 1} ))
# any(x) is evaluated on the county strings first and only afterwards compared
# to 'other'; the comparison belongs inside any(), i.e. any(x == 'other').
with(df.1, lapply(split(county, district), function(x) {ifelse(any(x)=='other', 1, 0)} ))
# Same misplaced parenthesis as the previous line, without the ifelse() wrapper.
with(df.1, lapply(split(county, district), function(x) {any(x)=='other'} ))
# Same again, but splitting the whole data frame and reading x$county.
with(df.1, lapply(split(df.1  , district), function(x) {any(x$county)=='other'} ))
# Compares county by county and never reduces the result to one flag per district.
with(df.1, lapply(split(county, district), function(x) {x=='other'} ))

编辑

这是我上面提到的子集/合并方法：

# Subset/merge approach: build a lookup of the state/district pairs that
# contain at least one county == 'other', then join that flag back onto df.1.

# unique() keeps one lookup row per pair, so a district with several 'other'
# counties does not multiply the rows of df.1 in the join below. Columns are
# picked by name rather than by position.
df.indicator <- unique(df.1[df.1$county == "other", c("state", "district")])

# rep() rather than a bare 1 so this also works when no county is 'other'
# (a zero-row lookup cannot take a length-one replacement column).
df.indicator$indicator <- rep(1, nrow(df.indicator))

# Left join: every row of df.1 is kept; pairs missing from the lookup get NA.
df.merged <- merge(df.1, df.indicator, by = c("state", "district"), all.x = TRUE)

# Districts without an 'other' county should read 0, not NA.
df.merged$indicator[is.na(df.merged$indicator)] <- 0
df.merged

我更喜欢使用基础 R。

score 2 · Accepted Answer

library(data.table)

# Work on a data.table built from the sample data frame.
dt <- data.table(df.1)

# Within each district, any() collapses the county comparison to a single
# logical; as.numeric() turns it into 1/0 and `:=` stores it in a new
# `indicator` column on every row of that district.
dt[, indicator := as.numeric(any(county == "other")), by = district]

dt
#   state district county apples indicator
#1:    AA       EC     AB    100         0
#2:    AA       EC     BC     10         0
#3:    AA       EC     DC    150         0
#4:    AA        C     FG    200         1
#5:    AA        C  other     20         1
#6:    AA        C     HC    250         1
#7:    AA       WC     RT    300         1
#8:    AA       WC     TT     30         1
#9:    AA       WC  other    350         1

这是一个基础 R（base R）解决方案——它更慢也更丑陋，但如果 OP 就喜欢这样，那好吧 :)

# ave() hands each district's counties to FUN and writes the single 1/0 answer
# back over that district's rows. The vector being filled is the character
# county column, hence the as.numeric() around the result.
df.1$indicator <- as.numeric(
  with(df.1, ave(county, district, FUN = function(v) 1 * any(v == "other")))
)

或者

# Per district, take the max of the TRUE/FALSE "is this county 'other'?" test:
# it is 1 as soon as one row matches, and ave() repeats it over the district.
df.1$indicator <- ave(df.1$county == "other", df.1$district, FUN = max)

或者

# any() gives one logical per district, ave() repeats it over the district's
# rows, and as.integer() converts TRUE/FALSE to 1/0.
df.1$indicator <- as.integer(
  ave(df.1$county == "other", df.1$district, FUN = any)
)

score 0 · Accepted Answer

当试图用我的实际数据实现上述两个答案时，我意识到我必须考虑一个新变量 df.1$year，并且只有在满足更复杂的条件时指标变量才应该为 1：在同一个 district 和 year 内，存在满足 df.1$county == 'other' & is.na(df.1$apples) 的行。以下是修改后的数据集 df.1，以及为实现这些新条件而修改的 lapply 语句。我还没能让 ave 在这些新条件下工作，但我确实借用了 eddi 的一些代码。

这个修改后的场景似乎与我原来的问题过于密切相关，不值得另外发布一个新问题。

# Flag every district/year combination that has at least one row in which
# county is 'other' and apples is missing (NA).

df.1 <- data.frame(
  state    = rep("AA", 18L),
  district = rep(rep(c("EC", "C", "WC"), each = 3L), times = 2L),
  county   = c("A", "B", "C", "G", "other", "I", "R", "S", "other",
               "A", "D", "E", "H", "I", "J", "R", "other", "T"),
  year     = rep(c(1980L, 1999L), each = 9L),
  apples   = c(100L, 10L, 150L, 200L, 20L, 250L, 300L, NA, 350L,
               1100L, NA, 1150L, 1200L, 120L, 1250L, 1300L, NA, 1350L),
  stringsAsFactors = FALSE
)

# One grouping key per district/year pair, e.g. "WC1999".
df.1$my.grouping <- paste0(df.1$district, df.1$year)

# The group and the one-value flag share no column names, so merge() pairs
# every row of the group with the flag; the flag ends up in a column named `y`.
z <- lapply(
  split(df.1, df.1$my.grouping),
  function(grp) {
    flagged <- grp$county == "other" & is.na(grp$apples)
    merge(grp, as.numeric(any(flagged)), all = TRUE)
  }
)

# Stack the per-group pieces and reset the row names to 1..n.
df.2 <- do.call(rbind, z)
rownames(df.2) <- NULL
df.2

   state district county year apples my.grouping y
1     AA        C      G 1980    200       C1980 0
2     AA        C  other 1980     20       C1980 0
3     AA        C      I 1980    250       C1980 0
4     AA        C      H 1999   1200       C1999 0
5     AA        C      I 1999    120       C1999 0
6     AA        C      J 1999   1250       C1999 0
7     AA       EC      A 1980    100      EC1980 0
8     AA       EC      B 1980     10      EC1980 0
9     AA       EC      C 1980    150      EC1980 0
10    AA       EC      A 1999   1100      EC1999 0
11    AA       EC      D 1999     NA      EC1999 0
12    AA       EC      E 1999   1150      EC1999 0
13    AA       WC      R 1980    300      WC1980 0
14    AA       WC      S 1980     NA      WC1980 0
15    AA       WC  other 1980    350      WC1980 0
16    AA       WC      R 1999   1300      WC1999 1
17    AA       WC  other 1999     NA      WC1999 1
18    AA       WC      T 1999   1350      WC1999 1

score 0 · Accepted Answer

这是迄今为止我使用 apply 系列函数所能想到的最好的方法：

# Sample data: nine counties in three districts; C and WC contain 'other'.
df.1 <- data.frame(
  state    = rep("AA", 9L),
  district = rep(c("EC", "C", "WC"), each = 3L),
  county   = c("AB", "BC", "DC", "FG", "other", "HC", "RT", "TT", "other"),
  apples   = c(100L, 10L, 150L, 200L, 20L, 250L, 300L, 30L, 350L),
  stringsAsFactors = FALSE
)

# Split by district, then pair every row of each piece with a 1/0 flag saying
# whether 'other' appears among that district's counties. The piece and the
# flag share no column names, so merge() attaches the flag as a column `y`.
z <- lapply(
  split(df.1, df.1$district),
  function(piece) {
    merge(piece, as.numeric("other" %in% piece$county), all = TRUE)
  }
)
z

# Stack the per-district pieces and reset the row names to 1..n.
df.2 <- do.call(rbind, z)
rownames(df.2) <- NULL
df.2

结果如下：

  state district county apples y
1    AA        C     FG    200 1
2    AA        C  other     20 1
3    AA        C     HC    250 1
4    AA       EC     AB    100 0
5    AA       EC     BC     10 0
6    AA       EC     DC    150 0
7    AA       WC     RT    300 1
8    AA       WC     TT     30 1
9    AA       WC  other    350 1

r - 识别包含指标的组

3 回答 3

Related

Reference