2

我需要将(某种)宽格式数据集转换为长格式数据集。

该数据集报告了各级官员任期开始结束的年份。
我想对每个级别的每年在职官员进行虚拟化(参见:预期分贝)。

笔记:

  1. 可以在同一级别多次选出官员。The begin and end years of the first time the official is elected are reported in columns start_lv1_1and stop_lv1_1, respectively. 第二次分别在 和 列start_lv1_2stop_lv1_2
  2. 如果一个官员的任期从 2000 年开始到 2005 年结束,我想将值 1 分配给直到2005 年的年份(即 2000、2001、2002、2003、2004 -而不是 2005);
  3. 任务可以重叠。

首先十分感谢。

toy <- data.frame(
  id = c("A","B","C"),
  start_lv1_1 = c(2000,2000,2005),
  stop_lv1_1 = c(2005,2005,2010),
  start_lv1_2 = c(NA,2010,2015),
  stop_lv1_2 = c(NA,2015,2020),
  start_lv2_1 = c(NA,NA,2008),
  stop_lv2_1 = c(NA,NA,2018))

> toy
  id start_lv1_1 stop_lv1_1 start_lv1_2 stop_lv1_2 start_lv2_1 stop_lv2_1
1  A        2000       2005          NA         NA          NA         NA
2  B        2000       2005        2010       2015          NA         NA
3  C        2005       2010        2015       2020        2008       2018

预期结果


expected <- data.frame(
  id = c(rep("A",21),rep("B",21),rep("C",21)),
  year = rep(2000:2020,3),
  lv1 = c(1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
          1,1,1,1,1,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,
          0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,1,1,1,1,1,0),
  lv2 = c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
          0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
          0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,0,0,0))

   id year lv1 lv2
1   A 2000   1   0
2   A 2001   1   0
3   A 2002   1   0
4   A 2003   1   0
5   A 2004   1   0
6   A 2005   0   0
7   A 2006   0   0
8   A 2007   0   0
9   A 2008   0   0
10  A 2009   0   0
11  A 2010   0   0
12  A 2011   0   0
13  A 2012   0   0
14  A 2013   0   0
15  A 2014   0   0
16  A 2015   0   0
17  A 2016   0   0
18  A 2017   0   0
19  A 2018   0   0
20  A 2019   0   0
21  A 2020   0   0
22  B 2000   1   0
23  B 2001   1   0
24  B 2002   1   0
25  B 2003   1   0
26  B 2004   1   0
27  B 2005   0   0
28  B 2006   0   0
29  B 2007   0   0
30  B 2008   0   0
31  B 2009   0   0
32  B 2010   1   0
33  B 2011   1   0
34  B 2012   1   0
35  B 2013   1   0
36  B 2014   1   0
37  B 2015   0   0
38  B 2016   0   0
39  B 2017   0   0
40  B 2018   0   0
41  B 2019   0   0
42  B 2020   0   0
43  C 2000   0   0
44  C 2001   0   0
45  C 2002   0   0
46  C 2003   0   0
47  C 2004   0   0
48  C 2005   1   0
49  C 2006   1   0
50  C 2007   1   0
51  C 2008   1   1
52  C 2009   1   1
53  C 2010   0   1
54  C 2011   0   1
55  C 2012   0   1
56  C 2013   0   1
57  C 2014   0   1
58  C 2015   1   1
59  C 2016   1   1
60  C 2017   1   1
61  C 2018   1   0
62  C 2019   1   0
63  C 2020   0   0

4

1 回答 1

4

tidyverse方式 :

library(tidyverse)

toy %>%
  pivot_longer(cols = -id, 
               names_to = c('.value', 'col'), 
               names_pattern = '(\\w+)_(lv\\d+)', 
               values_drop_na = TRUE) %>%
  mutate(year = map2(start, stop - 1, seq)) %>%
  unnest(year) %>%
  dplyr::select(-start, -stop) %>%
  pivot_wider(names_from = col, values_from = col, 
              values_fn = length, values_fill = 0) %>%
  complete(id, year = seq(min(year), max(year) + 1), 
           fill = list(lv1 = 0, lv2 = 0))

#   id     year   lv1   lv2
#   <chr> <int> <dbl> <dbl>
# 1 A      2000     1     0
# 2 A      2001     1     0
# 3 A      2002     1     0
# 4 A      2003     1     0
# 5 A      2004     1     0
# 6 A      2005     0     0
# 7 A      2006     0     0
# 8 A      2007     0     0
# 9 A      2008     0     0
#10 A      2009     0     0
# … with 53 more rows

创建startstop作为不同的列以长格式获取数据。创建 和 之间的年份序列startstop - 1获取宽格式数据和complete序列。

于 2021-05-10T07:44:46.660 回答