2

我正在尝试使用 dplyr 将行集复制到列中。以下是我的数据框。

df <- data.frame(
    hid=c(1,1,1,1,2,2,2,2,2,3,3,3,3),
    mid=c(1,2,3,4,1,2,3,4,5,1,2,3,4),
    tmid=c("010","01010","010","01020",
           "010","0120","010","010","020",
           "010","01010","010","01020"),
    thid=c("010","02020","010","02020",
           "000","0120","010","010","010",
           "010","02020","010","02020"),
    )

它以以下格式打印:

> df
   hid mid  tmid  thid
1    1   1   010   010
2    1   2 01010 02020
3    1   3   010   010
4    1   4 01020 02020
5    2   1   010   000
6    2   2  0120  0120
7    2   3   010   010
8    2   4   010   010
9    2   5   020   010
10   3   1   010   010
11   3   2 01010 02020
12   3   3   010   010
13   3   4 01020 02020

我想要的输出如下所示:

     hid   mid  tmid   thid  tmid1  tmid2  tmid3  tmid4  tmid5  thid1  thid2  thid3  thid4  thid5
 * <dbl> <dbl> <fctr> <fctr> <fctr> <fctr> <fctr> <fctr> <fctr> <fctr> <fctr> <fctr> <fctr> <fctr> 
 1     1     1   010    010    010  01010    010  01020      0    010  02020    010  02020      0
 2     1     2 01010  02020    010  01010    010  01020      0    010  02020    010  02020      0
 3     1     3   010    010    010  01010    010  01020      0    010  02020    010  02020      0
 4     1     4 01020  02020    010  01010    010  01020      0    010  02020    010  02020      0
 5     2     1   010    000    010  0120     010    010    020    000   0120    010    010    010
 6     2     2  0120   0120    010  0120     010    010    020    000   0120    010    010    010
 7     2     3   010    010    010  0120     010    010    020    000   0120    010    010    010
 8     2     4   010    010    010  0120     010    010    020    000   0120    010    010    010
 9     2     5   020    010    010  0120     010    010    020    000   0120    010    010    010
10     3     1   010    010    010  01010    010  01020      0    010  02020    010   02020     0
11     3     2 01010  02020    010  01010    010  01020      0    010  02020    010   02020     0
12     3     3   010    010    010  01010    010  01020      0    010  02020    010   02020     0
13     3     4 01020  02020    010  01010    010  01020      0    010  02020    010   02020     0
  • 转换thidtmid
  • thid_x和中的后缀tmid_xmid;定义 但是,最大数量mid是不可扩展的(在实际的大型数据集中它从 1 扩展到可能 8)
  • thid_x和的相同值tmid_xhid
  • 如果值不存在,则应填充0

这种操作的思路如下图所示。 ![在此处输入图像描述

我目前正在尝试使用spread,但它返回特定的midand thidor对tmid。我需要<NA>s用一个值填充剩余部分,该值保留在按 . 分组的输出中hid

> df %>% mutate(id1=str_c("tmid",mid)) %>% group_by(hid) %>% spread(key=id1,value=tmid)
# A tibble: 13 x 8
# Groups:   hid [3]
     hid   mid   thid  tmid1  tmid2  tmid3  tmid4  tmid5
 * <dbl> <dbl> <fctr> <fctr> <fctr> <fctr> <fctr> <fctr>
 1     1     1    010    010   <NA>   <NA>   <NA>   <NA>
 2     1     2  02020   <NA>  01010   <NA>   <NA>   <NA>
 3     1     3    010   <NA>   <NA>    010   <NA>   <NA>
 4     1     4  02020   <NA>   <NA>   <NA>  01020   <NA>
 5     2     1    000    010   <NA>   <NA>   <NA>   <NA>
 6     2     2   0120   <NA>   0120   <NA>   <NA>   <NA>
 7     2     3    010   <NA>   <NA>    010   <NA>   <NA>
 8     2     4    010   <NA>   <NA>   <NA>    010   <NA>
 9     2     5    010   <NA>   <NA>   <NA>   <NA>    020
10     3     1    010    010   <NA>   <NA>   <NA>   <NA>
11     3     2  02020   <NA>  01010   <NA>   <NA>   <NA>
12     3     3    010   <NA>   <NA>    010   <NA>   <NA>
13     3     4  02020   <NA>   <NA>   <NA>  01020   <NA>

有什么建议么?

4

1 回答 1

3

我们可以gather然后做一个spread

library(tidyverse)
df1 %>% 
  select(-tdid, -tiid) %>% 
  gather(key, val, tmid:thid) %>% 
  unite(keyn, key, mid, sep="")  %>%
  spread(keyn, val, fill = '0') %>% 
  right_join(df1) %>%
  select(names(df1), everything(), -tdid, -tiid)
# A tibble: 13 x 14
#     hid   mid tmid  thid  thid1 thid2 thid3 thid4 thid5 tmid1 tmid2 tmid3
#   <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# 1     1     1 010   010   010   02020 010   02020 0     010   01010 010  
# 2     1     2 01010 02020 010   02020 010   02020 0     010   01010 010  
# 3     1     3 010   010   010   02020 010   02020 0     010   01010 010  
# 4     1     4 01020 02020 010   02020 010   02020 0     010   01010 010  
# 5     2     1 010   000   000   0120  010   010   010   010   0120  010  
# 6     2     2 0120  0120  000   0120  010   010   010   010   0120  010  
# 7     2     3 010   010   000   0120  010   010   010   010   0120  010  
# 8     2     4 010   010   000   0120  010   010   010   010   0120  010  
# 9     2     5 020   010   000   0120  010   010   010   010   0120  010  
#10     3     1 010   010   010   02020 010   02020 0     010   01010 010  
#11     3     2 01010 02020 010   02020 010   02020 0     010   01010 010  
#12     3     3 010   010   010   02020 010   02020 0     010   01010 010  
#13     3     4 01020 02020 010   02020 010   02020 0     010   01010 010  
# ... with 2 more variables: tmid4 <chr>, tmid5 <chr>

数据

df1 <- structure(list(hid = c(1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3), 
    mid = c(1, 2, 3, 4, 1, 2, 3, 4, 5, 1, 2, 3, 4), tmid = c("010", 
    "01010", "010", "01020", "010", "0120", "010", "010", "020", 
    "010", "01010", "010", "01020"), thid = c("010", "02020", 
    "010", "02020", "000", "0120", "010", "010", "010", "010", 
    "02020", "010", "02020"), tdid = c("000", "01010", "010", 
    "02020", "000", "0100", "010", "010", "010", "000", "01010", 
    "010", "02020"), tiid = c("010", "02020", "010", "01020", 
    "020", "0220", "020", "020", "020", "010", "02020", "010", 
    "01020")), .Names = c("hid", "mid", "tmid", "thid", "tdid", 
"tiid"), row.names = c(NA, -13L), class = c("tbl_df", "tbl", 
"data.frame"))
于 2018-05-27T06:07:53.037 回答