0

首先,很抱歉提出这个问题。我知道一定有办法做到这一点,但我已经为此苦苦挣扎了好几天,现在完全没有头绪了。

从 2008 年到 2020 年,我每年都有对个人不同变量的观测数据。我有关于家庭(25 个变量)、收入(15 个变量)和学校教育(22 个变量)的数据。目前我已经“清理”了每个数据集,使同一类别在各个年份的数据集都具有相同的列名。作为背景,下面是我的 R 环境目前的样子。

在此处输入图像描述

问题是,我想把所有个人和所有年份的数据放进同一个大的数据框里。我知道我应该/可以使用 inner_join 或 merge 函数(先按“HouseholdMember”排序),也可以使用 gather 函数,但我实在不清楚应该按什么顺序执行这些操作,以及应该从哪里开始。我已经尝试了很多方法,但由于数据框的数量太多,很难跟踪自己在做什么。我还按照某种方法的建议,为每个类别创建了包含各年份数据的列表,但是没有成功……

我想最终得到一个类似于此的数据框:

个人 年份 变量1 变量2
1 2008 值 值
1 2009 值 值
1 2010 值 值
2 2008 值 值
2 2009 值 值
2 2010 值 值

我真的希望有人可以帮助我或告诉我第一步应该做什么...如果我合并数据框,我认为R不知道哪些值对应于哪一年...

    > head(fam08)
# A tibble: 6 x 25
  HouseholdMember RandomChild YearBirthRandom  Gender   Age FatherBirth FatherAlive MotherBirth MotherAlive Divorce SeeFather SeeMother
            <dbl>   <dbl+lbl>           <dbl> <dbl+l> <dbl>   <dbl+lbl>   <dbl+lbl>   <dbl+lbl>   <dbl+lbl> <dbl+l> <dbl+lbl> <dbl+lbl>
1          800033 16 [not ap…              NA 1 [mal…    16        1952     1 [yes]        1961     1 [yes] 1 [yes]  7 [ever…  7 [ever…
2          800042 16 [not ap…              NA 2 [fem…    32        1946     1 [yes]        1948     1 [yes] 2 [no]   4 [at l…  4 [at l…
3          800045 16 [not ap…              NA 1 [mal…    65        1913     2 [no]         1915     2 [no]  2 [no]  NA        NA       
4          800057 16 [not ap…              NA 1 [mal…    33        1939     1 [yes]        1945     1 [yes] 1 [yes]  4 [at l…  4 [at l…
5          800076 16 [not ap…              NA 2 [fem…    22        1955     1 [yes]        1955     1 [yes] 1 [yes]  5 [at l…  3 [a fe…
6          800119 16 [not ap…              NA 2 [fem…    57        1908     2 [no]         1918     2 [no]  2 [no]  NA        NA       
# … with 13 more variables: Married <dbl+lbl>, Child <dbl+lbl>, NumChild <dbl>, SchoolCH1 <dbl+lbl>, SchoolCH2 <dbl+lbl>,
#   SchoolCH3 <dbl+lbl>, SchoolCH4 <dbl+lbl>, BirthCH1 <dbl>, BirthCH2 <dbl>, BirthCH3 <dbl>, BirthCH4 <dbl>, FamSatisfaction <dbl+lbl>,
#   Year <dbl>



> head(fam09)
# A tibble: 6 x 25
  HouseholdMember RandomChild YearBirthRandom  Gender   Age FatherBirth FatherAlive MotherBirth MotherAlive Divorce SeeFather SeeMother
            <dbl>   <dbl+lbl>           <dbl> <dbl+l> <dbl>   <dbl+lbl>   <dbl+lbl>   <dbl+lbl>   <dbl+lbl> <dbl+l> <dbl+lbl> <dbl+lbl>
1          800033 16 [not ap…              NA 1 [mal…    17        1952     1 [yes]        1961     1 [yes]      NA  5 [at l…  7 [ever…
2          800042 16 [not ap…              NA 2 [fem…    33        1946     1 [yes]        1948     1 [yes]      NA  4 [at l…  4 [at l…
3          800057 16 [not ap…              NA 1 [mal…    34        1939     1 [yes]        1945     1 [yes]      NA  3 [a fe…  3 [a fe…
4          800076 16 [not ap…              NA 2 [fem…    23        1955     1 [yes]        1955     1 [yes]      NA  5 [at l…  3 [a fe…
5          800119 16 [not ap…              NA 2 [fem…    58          NA    NA                NA    NA            NA NA        NA       
6          800125 16 [not ap…              NA 2 [fem…    50          NA    NA              1928     1 [yes]      NA NA         1 [neve…
# … with 13 more variables: Married <dbl+lbl>, Child <dbl+lbl>, NumChild <dbl>, SchoolCH1 <dbl+lbl>, SchoolCH2 <dbl+lbl>,
#   SchoolCH3 <dbl+lbl>, SchoolCH4 <dbl+lbl>, BirthCH1 <dbl>, BirthCH2 <dbl>, BirthCH3 <dbl>, BirthCH4 <dbl>, FamSatisfaction <dbl+lbl>,
#   Year <dbl>




dput(head(fam09,10))
structure(list(HouseholdMember = c(800033, 800042, 800057, 800076, 
800119, 800125, 800170, 800186, 800201, 800204), RandomChild = structure(c(16, 
16, 16, 16, 16, 16, 3, 16, 16, 16), label = "Randomly chosen child", labels = c(`child 1` = 1, 
`child 2` = 2, `child 3` = 3, `child 4` = 4, `child 5` = 5, `child 6` = 6, 
`child 7` = 7, `child 8` = 8, `child 9` = 9, `child 10` = 10, 
`child 11` = 11, `child 12` = 12, `child 13` = 13, `child 14` = 14, 
`child 15` = 15, `not applicable` = 16), class = "haven_labelled"), 
    YearBirthRandom = c(NA, NA, NA, NA, NA, NA, 1999, NA, NA, 
    NA), Gender = structure(c(1, 2, 1, 2, 2, 2, 2, 2, 1, 1), label = "Gender respondent", labels = c(male = 1, 
    female = 2), class = "haven_labelled"), Age = c(17, 33, 34, 
    23, 58, 50, 50, 69, 35, 67), FatherBirth = structure(c(1952, 
    1946, 1939, 1955, NA, NA, 1926, NA, 1948, NA), label = "What is the year of birth of your father?", labels = c(`I don't know` = 99999), class = "haven_labelled"), 
    FatherAlive = structure(c(1, 1, 1, 1, NA, NA, 1, NA, 1, NA
    ), label = "Is your father still alive?", labels = c(yes = 1, 
    no = 2, `I don't know` = 99), class = "haven_labelled"), 
    MotherBirth = structure(c(1961, 1948, 1945, 1955, NA, 1928, 
    1931, NA, 1950, NA), label = "What is the year of birth of your mother?", labels = c(`I don't know` = 99999), class = "haven_labelled"), 
    MotherAlive = structure(c(1, 1, 1, 1, NA, 1, 1, NA, 1, NA
    ), label = "Is your mother still alive?", labels = c(yes = 1, 
    no = 2, `I don't know` = 99), class = "haven_labelled"), 
    Divorce = structure(c(NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
    ), label = "Did your own parents ever divorce?", labels = c(yes = 1, 
    no = 2, `my parents never had a relationship` = 3, `I don't know` = 99
    ), class = "haven_labelled"), SeeFather = structure(c(5, 
    4, 3, 5, NA, NA, 6, NA, 3, NA), label = "How often did you see your father over the past 12 months?", labels = c(never = 1, 
    once = 2, `a few times` = 3, `at least every month` = 4, 
    `at least every week` = 5, `a few times per week` = 6, `every day` = 7
    ), class = "haven_labelled"), SeeMother = structure(c(7, 
    4, 3, 3, NA, 1, 6, NA, 3, NA), label = "How often did you see your mother over the past 12 months?", labels = c(never = 1, 
    once = 2, `a few times` = 3, `at least every month` = 4, 
    `at least every week` = 5, `a few times per week` = 6, `every day` = 7
    ), class = "haven_labelled"), Married = structure(c(NA, 1, 
    2, 2, 1, 2, 1, 1, 1, 1), label = "Are you married to this partner?", labels = c(yes = 1, 
    no = 2), class = "haven_labelled"), Child = structure(c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_), label = "Have you had any children?", labels = c(yes = 1, 
    no = 2), class = "haven_labelled"), NumChild = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_), SchoolCH1 = structure(c(NA, 
    NA, NA, NA, NA, NA, 4, NA, NA, NA), label = "What school does child 1 (born in the years 1991 through 2004) attend?", labels = c(`primary school` = 1, 
    `school for special primary education` = 2, `secondary school` = 3, 
    other = 4), class = "haven_labelled"), SchoolCH2 = structure(c(NA, 
    NA, NA, NA, NA, NA, 3, NA, NA, NA), label = "What school does child 2 (born in the years 1991 through 2004) attend?", labels = c(`primary school` = 1, 
    `school for special primary education` = 2, `secondary school` = 3, 
    other = 4), class = "haven_labelled"), SchoolCH3 = structure(c(NA, 
    NA, NA, NA, NA, NA, 1, NA, NA, NA), label = "What school does child 3 (born in the years 1991 through 2004) attend?", labels = c(`primary school` = 1, 
    `school for special primary education` = 2, `secondary school` = 3, 
    other = 4), class = "haven_labelled"), SchoolCH4 = structure(c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_), label = "What school does child 4 (born in the years 1991 through 2004) attend?", labels = c(`primary school` = 1, 
    `school for special primary education` = 2, `secondary school` = 3, 
    other = 4), class = "haven_labelled"), BirthCH1 = c(NA, 2005, 
    2007, NA, 1983, NA, 1991, 1964, NA, 1974), BirthCH2 = c(NA, 
    2007, NA, NA, 1985, NA, 1994, 1966, NA, 1976), BirthCH3 = c(NA, 
    NA, NA, NA, NA, NA, 1999, 1970, NA, NA), BirthCH4 = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_), FamSatisfaction = structure(c(NA, 
    8, 9, NA, 8, NA, 8, NA, NA, NA), label = "How satisfied are you with your family life?", labels = c(`entirely dissatisfied` = 0, 
    `entirely satisfied` = 10, `I don’t know` = 999), class = "haven_labelled"), 
    Year = c(2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 
    2009, 2009)), row.names = c(NA, -10L), class = c("tbl_df", 
"tbl", "data.frame"))
4

1 回答 1

1

我相信您可以按照以下方式做一些事情:

# Stack the yearly tibbles of each category into one long tibble
# (one row per HouseholdMember and Year).
fam <- bind_rows(fam_list)
inc <- bind_rows(inc_list)
ws <- bind_rows(ws_list)

# Attach the income and schooling columns to the family data. Rows are
# matched on HouseholdMember and Year, and every row of `fam` is kept.
result <- left_join(
  left_join(fam, inc, by = c("HouseholdMember", "Year")),
  ws,
  by = c("HouseholdMember", "Year")
)

输出:

   HouseholdMember  Year fam_v1 fam_v2 fam_v3  inc_v1  inc_v2 inc_v3   ws_v1 ws_v2  ws_v3
             <dbl> <dbl>  <dbl>  <dbl>  <dbl>   <dbl>   <dbl>  <dbl>   <dbl> <dbl>  <dbl>
 1            8001  2008  0.609 -0.253 -1.30   0.0147  0.719  -0.765  0.120  0.974 -0.764
 2            8002  2008  0.395  1.73  -0.503  0.119  -3.33   -0.798  0.325  0.664  1.65 
 3            8003  2008  0.562  0.157  0.243 -1.18   -0.260   0.105  1.09   0.855  1.19 
 4            8004  2008  1.32   0.737 -1.18   0.725  -1.82    0.356  0.362  2.04   1.76 
 5            8005  2008 -0.497 -0.444 -0.632 -0.534   1.63    0.984  1.29   0.614  0.576
 6            8006  2008 -1.70  -0.989 -1.32   0.868   0.0979  0.468 -0.0146 1.11   0.957
 7            8007  2008 -2.19  -0.419  1.69   1.34   -0.404  -1.43  -0.156  0.648 -0.186
 8            8008  2008  1.48   0.350 -0.595  0.785  -0.609   1.28  -1.01   1.04   0.845
 9            8009  2008 -0.315 -0.530  0.419  0.390  -0.0951 -0.755  0.135  0.696 -1.97 
10            8010  2008 -0.882  1.38   2.06  -0.0757  1.53   -0.494 -1.03   1.14   1.87 

笔记:

我通过创建 tibble 列表为这个示例制作了数据;我相信 fam_list、inc_list 和 ws_list 类似于您图像中的列表对象。它们都是由数据框/tibble 组成的列表。然后我使用 bind_rows 将这些结构相同的 tibble 按行绑定在一起,这样我就得到了三个大的 tibble。

然后我使用两次 left_join,把 inc 和 ws 依次连接到 fam 上。

输入数据:

library(tidyverse)
# Build example input data: one list per category (fam, inc, ws), each holding
# one tibble per survey year. Values are random draws from rnorm(), so call
# set.seed() first if reproducible numbers are needed.

# Create a named list of yearly tibbles for one category.
#   prefix: category prefix used for the value columns and the list names
#   years:  survey years; one tibble is created per year
#   n:      number of household members per year
# Returns a list of tibbles with columns HouseholdMember, Year and
# <prefix>_v1 .. <prefix>_v3, with list names "<prefix>_<year>".
make_yearly_list <- function(prefix, years = 2000 + 8:20, n = 100) {
  yearly <- lapply(years, function(year) {
    one_year <- tibble(
      HouseholdMember = 8000 + seq_len(n),
      Year = year,
      v1 = rnorm(n),
      v2 = rnorm(n),
      v3 = rnorm(n)
    )
    # Prefix the value columns so the categories stay distinguishable
    # after they are joined into one table.
    names(one_year)[3:5] <- paste0(prefix, "_v", 1:3)
    one_year
  })
  # Paste the full year so 2008/2009 become "fam_2008"/"fam_2009"; pasting
  # "20" in front of 8 and 9 would give "fam_208"/"fam_209".
  names(yearly) <- paste0(prefix, "_", years)
  yearly
}

fam_list <- make_yearly_list("fam")
inc_list <- make_yearly_list("inc")
ws_list <- make_yearly_list("ws")

输入

于 2022-03-03T14:12:18.510 回答