我只是一个新的 R 用户,但这是我的解决方案:
加载示例数据(基于 PSID)。数据是不平衡的面板数据:1977 年至 1983 年间 98 个个人观察,15 个组,有性别识别(未使用)
df <- structure(list(id = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 5L, 5L, 5L, 5L, 5L,5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L,10L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 13L, 13L,13L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 15L, 15L, 15L, 15L, 15L,15L, 15L), year = c(1978L, 1979L, 1980L, 1981L, 1982L, 1983L,1977L, 1978L, 1979L, 1980L, 1981L, 1982L, 1983L, 1977L, 1978L,1979L, 1980L, 1981L, 1982L, 1983L, 1979L, 1977L, 1978L, 1979L,1980L, 1981L, 1982L, 1983L, 1977L, 1978L, 1979L, 1980L, 1981L,1982L, 1983L, 1977L, 1978L, 1979L, 1980L, 1981L, 1982L, 1983L,1977L, 1978L, 1979L, 1980L, 1981L, 1982L, 1983L, 1977L, 1978L,1979L, 1980L, 1981L, 1982L, 1983L, 1977L, 1978L, 1979L, 1980L,1981L, 1982L, 1983L, 1977L, 1978L, 1979L, 1980L, 1981L, 1982L,1983L, 1977L, 1978L, 1979L, 1980L, 1981L, 1982L, 1983L, 1977L,1978L, 1979L, 1980L, 1981L, 1982L, 1983L, 1977L, 1978L, 1979L,1980L, 1981L, 1982L, 1983L, 1977L, 1978L, 1979L, 1980L, 1981L,1982L, 1983L), gender = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("id", "year","gender"), row.names = c(NA, 98L), class = "data.frame")
创建每个组 id 有 1 个观察值的数据框(在此示例中,有 15 个不同的组)
sample <- select(df, id) %>% group_by(id) %>% sample_n(1)
从 15 个随机观察中创建 5 个样本
sample <- ungroup(sample) %>% sample_n(5) %>% mutate(id=row_number())
将 m:1 旧数据框与样本数据框合并
df_new <- merge(x = df, y = sample, by = "id", all.y = TRUE)