因此,对于我的分析,我需要在我的样本中创建 4 个子样本:
我想比较选民的投票率
1) 家庭中有 0 个其他选民的选民
2) 家庭中有 1 个其他选民的选民
3) 家庭中有 2 个其他选民的选民
4) 家庭中有 3 个以上其他选民的选民
我有7个变量:
1)家庭规模(vn437)
2)HH中第二人的年龄(vn438a)
3)HH中第三人的年龄(vn438b)
等等,直到HH中第六人的年龄
在我的逻辑中,我需要为每个组创建 4 个子样本:
第一组(HH 中的 0 个其他选民)将是满足以下条件的观察:
d$vn437 == 1;
d$vn437 == 2 & d$vn438a < 18;
d$vn437 == 3 & d$vn438a < 18 & d$vn438b < 18
d$vn437 == 4 & d$vn438a < 18 & d$vn438b < 18 & d$vn438c < 18;
依此类推,直到我完成 'd$vn438e < 18'
我是 R 的超级菜鸟,我不知道该怎么做。
我将如何创建这些组?我真的很绝望,我一直在寻找几个小时无济于事!
正如 Richard Telford 所建议的,以下是 `dput(head(d))`
命令的输出:
structure(list(dat = c(20091026, 20091025, 20091025, 20091026,
20091025, 20091025), vn1 = c(1, 2, 1, 1, 1, 1), vn542 = c(27,
22, 25, 23, 24, 22), vn217 = c(4, 3, 2, 4, 3, 3), n111 = c(1,
1, 1, 2, 1, 1), vn437 = c(2, 2, 2, 2, 2, 2), vn438a = c(28, 24,
24, 24, 23, 25), vn438b = c(1000, 1000, 1000, 1000, 1000, 1000
), vn438c = c(1000, 1000, 1000, 1000, 1000, 1000), vn438d = c(1000,
1000, 1000, 1000, 1000, 1000), vn438e = c(1000, 1000, 1000, 1000,
1000, 1000), vn5 = c(4, 4, 4, 4, 4, 4), vn9a = c(5, 5, 5, 5,
5, 5), vn75 = c(1, 1, 3, 2, 1, 3), vn79 = c(2, 2, 2, 2, 2, 2)), .Names = c("dat",
"vn1", "vn542", "vn217", "n111", "vn437", "vn438a", "vn438b",
"vn438c", "vn438d", "vn438e", "vn5", "vn9a", "vn75", "vn79"), row.names = c(2174L,
2175L, 2177L, 2178L, 2180L, 2181L), class = "data.frame")
其中 vn438b = 1000 这样的取值
表示缺失值(NA);但如果我删除这些行,我会丢失其他观察结果,所以我没有清理“HH 中第 N 个人的年龄”这些变量。
这也是我希望我的结果最终看起来的样子
编辑
设法自己解决了。对于任何有兴趣的人,这是我的代码:
# Replace the coded column names in positions 2-15 with readable ones.
# The renaming is positional, so the names are listed in column order.
colnames(d)[2:15] <- c(
  "sex", "age", "polint", "turnout", "HHsize",
  "HHage2", "HHage3", "HHage4", "HHage5", "HHage6",
  "marital", "education", "income", "religion"
)
####################################################################
## subset: respondents with no other voter in the household       ##
####################################################################
# A respondent has no other voter in the household when every other household
# member (persons 2..HHsize) is under 18. The missing-age code 1000 is never
# < 18, so a member with an unknown age is not treated as a minor.
other_age_cols <- c("HHage2", "HHage3", "HHage4", "HHage5", "HHage6")

# Build one piece per household size and stack them in size order. All sizes
# from 1 to 6 are covered; a size with no matching observations simply
# contributes zero rows, so nothing has to be dropped by hand.
zero_parts <- lapply(1:6, function(size) {
  keep <- d$HHsize == size
  # Persons 2..size must all be minors (no columns to check when size is 1).
  for (age_col in other_age_cols[seq_len(size - 1)]) {
    keep <- keep & d[[age_col]] < 18
  }
  # which() skips rows whose condition is NA; plain logical indexing would
  # return an all-NA row for each of them.
  d[which(keep), ]
})
zeroHHM <- do.call(rbind, zero_parts)

# removing intermediate variables now
rm(other_age_cols, zero_parts)

# creating two subsets: youth voters (aged 25 or under) and non-youth voters
# (aged 26 or over)
Youth0 <- zeroHHM[which(zeroHHM$age < 26), ]
Old0 <- zeroHHM[which(zeroHHM$age > 25), ]
##################################################
## subset: exactly 1 other voter in HH          ##
##################################################
# Person 2 must be of voting age (18 or over, and not the missing-age code
# 1000, hence the < 900 guard); every further member (persons 3..HHsize) must
# be under 18. The guard is applied for household size 2 as well, so an
# unknown age is never counted as a voter.
# NOTE(review): only person 2 is checked for voting age, so a household whose
# single adult is listed as person 3 or later is not captured - confirm that
# household members are listed oldest first.
minor_age_cols <- c("HHage3", "HHage4", "HHage5", "HHage6")

# One piece per household size (2 to 6), stacked in size order.
one_parts <- lapply(2:6, function(size) {
  keep <- d$HHsize == size & d$HHage2 > 17 & d$HHage2 < 900
  # Persons 3..size must all be minors (no columns to check when size is 2).
  for (age_col in minor_age_cols[seq_len(size - 2)]) {
    keep <- keep & d[[age_col]] < 18
  }
  # which() skips rows whose condition is NA instead of returning NA rows.
  d[which(keep), ]
})
oneHHM <- do.call(rbind, one_parts)

# removing intermediate variables now
rm(minor_age_cols, one_parts)

# creating two subsets: youth voters (aged 25 or under) and non-youth voters
# (aged 26 or over). The age mask is taken from oneHHM itself, so it lines up
# with the rows being selected.
Youth1 <- oneHHM[which(oneHHM$age < 26), ]
Old1 <- oneHHM[which(oneHHM$age > 25), ]
###################################################
## subset: exactly 2 other voters in HH          ##
###################################################
# Persons 2 and 3 must be of voting age (18 or over, and not the missing-age
# code 1000, hence the < 900 guard); every further member (persons 4..HHsize)
# must be under 18.
# NOTE(review): only persons 2 and 3 are checked for voting age, so households
# whose two adults are listed in other positions are not captured - confirm
# that household members are listed oldest first.
minor_age_cols <- c("HHage4", "HHage5", "HHage6")

# One piece per household size (3 to 6), stacked in size order.
two_parts <- lapply(3:6, function(size) {
  keep <- d$HHsize == size &
    d$HHage2 > 17 & d$HHage2 < 900 &
    d$HHage3 > 17 & d$HHage3 < 900
  # Persons 4..size must all be minors (no columns to check when size is 3).
  for (age_col in minor_age_cols[seq_len(size - 3)]) {
    keep <- keep & d[[age_col]] < 18
  }
  # which() skips rows whose condition is NA instead of returning NA rows.
  d[which(keep), ]
})
twoHHM <- do.call(rbind, two_parts)

# removing intermediate variables
rm(minor_age_cols, two_parts)

# creating two subsets: youth voters (aged 25 or under) and non-youth voters
# (aged 26 or over). The age mask is taken from twoHHM itself, so it lines up
# with the rows being selected.
Youth2 <- twoHHM[which(twoHHM$age < 26), ]
Old2 <- twoHHM[which(twoHHM$age > 25), ]
####################################################
## subset: 3 or more other voters in HH           ##
####################################################
# Persons 2, 3 and 4 must all be of voting age (18 or over, and not the
# missing-age code 1000, hence the < 900 guard). Persons 5 and 6 are not
# checked, because the group is "three or more".
# NOTE(review): households whose three adults include person 5 or 6 but not
# all of persons 2-4 are not captured - confirm that household members are
# listed oldest first.

# One piece per household size (4 to 6), stacked in size order.
three_parts <- lapply(4:6, function(size) {
  keep <- d$HHsize == size &
    d$HHage2 > 17 & d$HHage2 < 900 &
    d$HHage3 > 17 & d$HHage3 < 900 &
    d$HHage4 > 17 & d$HHage4 < 900
  # which() skips rows whose condition is NA instead of returning NA rows.
  d[which(keep), ]
})
threeHHM <- do.call(rbind, three_parts)

# removing intermediate variables
rm(three_parts)

# creating two subsets: youth voters (aged 25 or under) and non-youth voters
# (aged 26 or over). The age mask is taken from threeHHM itself, so it lines
# up with the rows being selected.
Youth3 <- threeHHM[which(threeHHM$age < 26), ]
Old3 <- threeHHM[which(threeHHM$age > 25), ]
# Give the four household groups their final names (HHM0 to HHM3), dropping
# each working name as soon as its data has been copied over.
HHM0 <- zeroHHM
rm(zeroHHM)

HHM1 <- oneHHM
rm(oneHHM)

HHM2 <- twoHHM
rm(twoHHM)

HHM3 <- threeHHM
rm(threeHHM)