1

我在使用 data.table 时遇到了一个特定的合并问题,想寻求帮助。

这是示例数据:

library(data.table)


# Create example dataset
DT_A = data.table(
  Store = "A",
  Date = as.Date(sprintf("10-%02d-%02d", c(22:25, 26:28), rep(1:2, 4:3)),
    '%m-%d-%y')
)

DT_B = data.table(
  Store = "B",
  Date = as.Date(sprintf("10-%02d-%02d", c(22:25, 26:28), rep(1:2, 4:3)),
                 '%m-%d-%y')
)

DT <- rbindlist(list(DT_A, DT_B))


DT
    Store       Date
 1:     A 2001-10-22
 2:     A 2001-10-23
 3:     A 2001-10-24
 4:     A 2001-10-25
 5:     A 2002-10-26
 6:     A 2002-10-27
 7:     A 2002-10-28
 8:     B 2001-10-22
 9:     B 2001-10-23
10:     B 2001-10-24
11:     B 2001-10-25
12:     B 2002-10-26
13:     B 2002-10-27
14:     B 2002-10-28

因此,DT 包含商店 A 和 B 在多个日期的观测记录。

另外,我还有一个数据集 manager_DT,其中包含每位经理任期的开始和结束日期:


manager_DT <- data.table(Manager = c("John", "David", "Steve"),
                         Store = c("A", "A","B"),
                         min_date = c(as.Date("2001-10-22"),
                                      as.Date("2001-10-26"),
                                      as.Date("2001-10-22")),
                         max_date = c(as.Date("2001-10-27"),
                                      as.Date("2001-10-28"),
                                      as.Date("2002-10-28")))

 manager_DT

   Manager Store   min_date   max_date
1:    John     A 2001-10-22 2001-10-27
2:   David     A 2001-10-26 2001-10-28
3:   Steve     B 2001-10-22 2002-10-28

在任一给定时间,一家商店可能有不止一位经理。在这里,John 和 David 在商店 A 的任期有重叠(具体是 2001 年 10 月 26 日和 2001 年 10 月 27 日),而 Steve 是商店 B 的唯一经理。

我想用 data.table 的方法合并 manager_DT 和 DT,使所需的输出如下:

DT
    Store       Date Manager
 1:     A 2001-10-22    John
 2:     A 2001-10-23    John
 3:     A 2001-10-24    John
 4:     A 2001-10-25    John
 5:     A 2002-10-26    John
 6:     A 2002-10-26   David
 7:     A 2002-10-27    John
 8:     A 2002-10-27   David
 9:     A 2002-10-28   David
10:     B 2001-10-22   Steve
11:     B 2001-10-23   Steve
12:     B 2001-10-24   Steve
13:     B 2001-10-25   Steve
14:     B 2002-10-26   Steve
15:     B 2002-10-27   Steve
16:     B 2002-10-28   Steve

请注意,输出中只有一个 Manager 列;只要某个日期有多位经理任期重叠,该日期的行就会重复出现(这里有两个日期重复:2001-10-26 和 2001-10-27,这两天 John 和 David 都是商店 A 的经理)。

我的目标是让每条观测在 Date × Store × Manager 层面上是唯一的。

谢谢!

4

2 回答 2

1

如果您有兴趣创建合并的数据框,解决方案可以是:

library(tidyverse)
library(lubridate)
library(data.table)

manager_DT <- structure(list(Manager = c("John", "David", "Steve"), Store = c("A", 
"A", "B"), min_date = c("2001-10-22", "2001-10-26", "2001-10-22"
), max_date = c("2001-10-27", "2001-10-28", "2002-10-28")), row.names = c(NA, 
-3L), class = "data.frame")

n <- nrow(manager_DT)

map_dfr(1:n, ~ data.table(
  Store=manager_DT$Store[.x],
  Date=seq(ymd(manager_DT$min_date[.x]),ymd(manager_DT$max_date[.x]),by="days"),
  Manager = manager_DT$Manager[.x]
  ))
于 2021-10-18T19:40:05.240 回答
0

一个可能的解决方案:

library(dplyr)

DT <- structure(list(Store = c("A", "A", "A", "A", "A", "A", "A", "A", 
"A", "B", "B", "B", "B", "B", "B", "B"), Date = c("2001-10-22", 
"2001-10-23", "2001-10-24", "2001-10-25", "2002-10-26", "2002-10-26", 
"2002-10-27", "2002-10-27", "2002-10-28", "2001-10-22", "2001-10-23", 
"2001-10-24", "2001-10-25", "2002-10-26", "2002-10-27", "2002-10-28"
), Manager = c("John", "John", "John", "John", "John", "David", 
"John", "David", "David", "Steve", "Steve", "Steve", "Steve", 
"Steve", "Steve", "Steve")), row.names = c(NA, -16L), class = "data.frame")

DT %>%
  group_by(Store,Date) %>% 
  mutate(Manager = paste(Manager, collapse = " and "))
于 2021-10-18T18:56:22.950 回答