0

我有一个用户 ID 和登录日期的数据库导出。

# Sample of the export in dput() form: one row per user. Days.Played holds
# that user's login dates as a single comma-separated string ("" when the
# user has none).
structure(list(User.Id = c(2542573L, 2571394L, 2770912L, 2683246L, 
2832110L, 2773277L),  Days.Played = c("", "2020-01-15,2020-01-16,2020-01-21,2020-01-22", 
"2020-06-29", "2020-04-19,2020-04-24,2020-04-29", "2020-09-04", 
"2020-06-23"), row.names = c(NA, 
6L), class = "data.frame")
|---------------------|------------------|
|        id           |    logged_in     |
|---------------------|------------------| 
|         a           |     2019-11-21,  |
|                     |      2019-11-22, |
|                     |       2019-11-23,|
|                     |       2019-11-24,|
|                     |       2019-11-25 |
|---------------------|------------------|
|         b           |                  |
|---------------------|------------------|
|         c           | 2019-11-21,      |
|                     |   2019-11-22,    |
|---------------------|------------------|

我想用“,”拆分日期列,使每个日期各占一列。

我希望结果如下所示:列名为 login.[a:zz],列数与数据库中最长的那条字符串所包含的日期数相同。这可能会达到 1000 列甚至更多。


|---------------------|------------------|------------------|
|        id           |    logged_in.a   |    logged_in.b   |
|---------------------|------------------|------------------|
|         a           |     2019-11-21,  |     2019-11-22   |
|                     |                  |                  |
|                     |                  |                  |
|                     |                  |                  |
|                     |                  |                  |
|---------------------|------------------|------------------|
|         b           |                  |                  |
|---------------------|------------------|------------------|
|         c           | 2019-11-21,      |                  |
|                     |                  |    2019-11-22,   |
|---------------------|------------------|------------------|

然后我计划将数据集收集(gather)成长格式。我使用的代码如下,但必须预先定义列名,而我的问题是不知道会有多少列。

# Attempt from the question: split Days.Played on "," into named columns.
# NOTE(review): colsplit() is presumably reshape2::colsplit, which attaching
# tidyr alone does not provide -- confirm that reshape2 is loaded.
require(tidyr)

# `names` has to list every output column up front; this is the limitation
# the question is about, since the number of dates per user is not known.
test %>% transform(.,Days.Played=colsplit(Days.Played, pattern=",", names=c('a','b')))

有谁知道如何解决这个问题或有任何建议?

4

3 回答 3

0

你也可以试试:

library(tidyverse)

# Example data: one row per id; logged_in holds comma-separated login dates.
# Id "b" has no logins and id "c" ends with a trailing comma.
df <- data.frame(
  id = c("a", "b", "c"),
  logged_in = c(
    "2019-11-21,2019-11-22,2019-11-23,2019-11-24,2019-11-25",
    "",
    "2019-11-21,2019-11-22,"
  ),
  stringsAsFactors = FALSE
)

# Spread the dates into one column per login without having to know the
# number of columns in advance.
newdf <- df %>%
  # Long format first: one (id, name, value) row per original cell.
  pivot_longer(-c(id)) %>%
  # One row per individual date.
  separate_rows(value, sep = ",") %>%
  group_by(id) %>%
  # Number the dates within each id; the numbers become the column suffixes.
  mutate(Var = paste0("logged.in.", row_number())) %>%
  select(-name) %>%
  # Back to wide format; ids with fewer dates are padded with "".
  pivot_wider(names_from = Var, values_from = value, values_fill = "")

输出:

# A tibble: 3 x 6
# Groups:   id [3]
  id    logged.in.1  logged.in.2  logged.in.3  logged.in.4  logged.in.5 
  <chr> <chr>        <chr>        <chr>        <chr>        <chr>       
1 a     "2019-11-21" "2019-11-22" "2019-11-23" "2019-11-24" "2019-11-25"
2 b     ""           ""           ""           ""           ""          
3 c     "2019-11-21" "2019-11-22" ""           ""           ""          
于 2020-12-01T16:05:47.897 回答
0

在 base R 中,我们可以使用 strsplit 配合 stack:

# Split each user's comma-separated date string into a character vector,
# label every vector with its User.Id, and stack them into long format.
# stack() returns the columns (values, ind); they are selected as
# (ind, values) so the id comes first, then the original column names are
# reused. Users whose Days.Played is "" contribute no rows.
out <- setNames(
  stack(
    setNames(strsplit(df$Days.Played, ","), df$User.Id)
  )[c("ind", "values")],
  names(df)
)

输出:

out
#   User.Id Days.Played
#1  2571394  2020-01-15
#2  2571394  2020-01-16
#3  2571394  2020-01-21
#4  2571394  2020-01-22
#5  2770912  2020-06-29
#6  2683246  2020-04-19
#7  2683246  2020-04-24
#8  2683246  2020-04-29
#9  2832110  2020-09-04
#10 2773277  2020-06-23

数据

# Sample input: one row per user. Days.Played is a single comma-separated
# string of login dates, or "" when the user has none.
df <- structure(
  list(
    User.Id = c(2542573L, 2571394L, 2770912L, 2683246L, 2832110L, 2773277L),
    Days.Played = c(
      "",
      "2020-01-15,2020-01-16,2020-01-21,2020-01-22",
      "2020-06-29",
      "2020-04-19,2020-04-24,2020-04-29",
      "2020-09-04",
      "2020-06-23"
    )
  ),
  row.names = c(NA, 6L),
  class = "data.frame"
)
于 2020-12-01T19:06:08.300 回答
0

我认为这就是你要找的:

library(tidyr)
# Expand Days.Played so that every comma-separated date gets its own row;
# a user whose Days.Played is "" is kept as a single row holding "".
separate_rows(df, Days.Played, sep = ",")
#> # A tibble: 11 x 2
#>    User.Id Days.Played 
#>      <int> <chr>       
#>  1 2542573 ""          
#>  2 2571394 "2020-01-15"
#>  3 2571394 "2020-01-16"
#>  4 2571394 "2020-01-21"
#>  5 2571394 "2020-01-22"
#>  6 2770912 "2020-06-29"
#>  7 2683246 "2020-04-19"
#>  8 2683246 "2020-04-24"
#>  9 2683246 "2020-04-29"
#> 10 2832110 "2020-09-04"
#> 11 2773277 "2020-06-23"

其中 df 为:

# Input used above: one row per user, with Days.Played stored as one
# comma-separated string of login dates ("" for a user with no logins).
df <- structure(
  list(
    User.Id = c(2542573L, 2571394L, 2770912L, 2683246L, 2832110L, 2773277L),
    Days.Played = c(
      "",
      "2020-01-15,2020-01-16,2020-01-21,2020-01-22",
      "2020-06-29",
      "2020-04-19,2020-04-24,2020-04-29",
      "2020-09-04",
      "2020-06-23"
    )
  ),
  row.names = c(NA, 6L),
  class = "data.frame"
)
于 2020-12-01T16:01:21.750 回答