0

假设这个数据集:

household_id person_id age_group  
1            1         5  
1            2         3  
1            3         2  
2            1         3  
2            2         5
2            3         1
2            4         1

我想创建一个新字段,指示家庭是否包括任何 age_group=1 的人,如下所示:

household_id person_id age_group age_group1  
1            1         5         0  
1            2         3         0
1            3         2         0
2            1         3         1
2            2         5         1
2            3         1         1
2            4         1         1

我感谢您的帮助!

4

5 回答 5

3

一个plyr解决方案:

# Load plyr with library() so that a missing package fails immediately;
# require() would only return FALSE and let the ddply() call fail later.
library(plyr)

# Example data: one row per person, identified by household and person id.
df <- data.frame(
  household_id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L),
  person_id = c(1L, 2L, 3L, 1L, 2L, 3L, 4L),
  age_group = c(5L, 3L, 2L, 3L, 5L, 1L, 1L)
)

# For each household, give every row 1 if any member has age_group == 1,
# otherwise 0 (`0 +` converts the logical from any() to a number).
ddply(df, .(household_id), transform, age_group1 = 0 + any(age_group == 1))

#   household_id person_id age_group age_group1
# 1            1         1         5          0
# 2            1         2         3          0
# 3            1         3         2          0
# 4            2         1         3          1
# 5            2         2         5          1
# 6            2         3         1          1
# 7            2         4         1          1

编辑: data.table替代:

# library() errors right away if data.table is not installed, whereas
# require() only returns FALSE.
library(data.table)

# Convert the data frame `df` to a data.table keyed by household_id.
dt <- data.table(df, key = "household_id")

# Add age_group1 by reference: 1 for every row of a household in which any
# member has age_group == 1, otherwise 0.
dt[, age_group1 := 0 + any(age_group == 1), by = household_id]
于 2013-02-09T19:51:09.630 回答
3
# ave() applies FUN to age_group within each household_id and returns one
# value per row; `1 %in% x` is TRUE when the household contains a 1.
# The printed result below shows the indicator as 0/1.
ave(t$age_group, t$household_id, FUN=function(x) 1 %in% x)
[1] 0 0 0 1 1 1 1

# Store the same indicator as a new column of the data frame `t`, then print it.
> t$age_group1 <- with(t, ave(age_group, household_id, FUN=function(x) 1 %in% x))
> t
  household_id person_id age_group age_group1
1            1         1         5          0
2            1         2         3          0
3            1         3         2          0
4            2         1         3          1
5            2         2         5          1
6            2         3         1          1
7            2         4         1          1
于 2013-02-09T19:51:15.643 回答
2

读取您的数据后

# Parse the whitespace-separated example table; the first line holds the
# column names. `header = TRUE` is written out in full because `head` relies
# on partial argument matching and `T` is an ordinary variable that can be
# reassigned.
dat <- read.table(text = "household_id person_id age_group  
1            1         5  
1            2         3  
1            3         2  
2            1         3  
2            2         5
2            3         1
2            4         1", header = TRUE)

使用 transform 配合 ave(类似于 @Matthew 的解决方案),但语法(syntax)更简洁

 # Add age_group1: within each household_id, every row gets 1 when any
 # member has age_group == 1 and 0 otherwise.
 transform(
   dat,
   age_group1 = ave(age_group, household_id, FUN = function(g) any(g == 1))
 )

  household_id person_id age_group age_group1
1            1         1         5          0
2            1         2         3          0
3            1         3         2          0
4            2         1         3          1
5            2         2         5          1
6            2         3         1          1
7            2         4         1          1
于 2013-02-09T19:49:20.410 回答
1

对于这类问题我更喜欢用 SQL,因为很多人已经熟悉它,它可以跨语言使用(SAS 有 proc sql;),而且非常直观 :)

# Expects the person-level data to already be in a data frame named `x`.

# sqldf runs the SQL query below against the data frame `x`
library(sqldf)

# Household-level table: one row per household_id with a 0/1 flag that is 1
# when at least one member of the household has age_group == 1
households <- sqldf(
  'select household_id , max( age_group == 1 ) as age_group1 from x group by household_id'
)

# Attach the flag to every person row; merge() joins on the column the two
# tables share (household_id)
x <- merge(x, households)

# Print the merged result
x
于 2013-02-09T20:43:08.147 回答
1

这是另一个不涉及安装任何软件包的选项;)

# Read the example table into `x`. `header = TRUE` is written out in full
# instead of relying on partial argument matching of `head`.
x <- read.table(text = "household_id person_id age_group  
1            1         5  
1            2         3  
1            3         2  
2            1         3  
2            2         5
2            3         1
2            4         1", header = TRUE)

# Per household, take the maximum of the logical `age_group == 1`:
# max() turns it into 1 when any member matches and 0 when none do.
hhold <- aggregate(age_group == 1 ~ household_id, FUN = max, data = x)

# The aggregated column is named after the expression; give it a usable name.
names(hhold)[2] <- "age_group1"

# Join the household-level flag back onto the person-level rows; merge()
# matches on the shared household_id column.
x <- merge(x, hhold)

# Print the result
x
于 2013-02-09T23:27:17.777 回答