r - 根据另一个表中的日期范围在一个表中创建虚拟变量

Question

我有两张桌子。 table1看起来像这样

  date       hour     data
2010-05-01     3        5
2010-05-02     7        7
2010-05-02     10       8
2010-07-03     18       3
2011-12-09     22       1
2012-05-01     3        0

这被存储为一个data.tablewith key set ondate和hour。我有另一张桌子，看起来像这样。这是我的outages桌子。

 resource        date_out                date_back
   joey       2010-04-30 4:00:00      2010-05-02 8:30:00
   billy      2009-04-20 7:00:00      2009-02-02 5:30:00
   bob        2011-11-15 12:20:00     2010-12-09 23:00:00
   joey       2012-04-28 1:00:00      2012-05-02 17:00:00

我想将列添加到table1这些列是表中的资源的位置outages。我希望这些列中的值在没有中断时为 0，在有中断时为 1。

这个例子的结果应该是。

  date       hour     data     joey      billy      bob
2010-05-01     3        5       1          0         0        
2010-05-02     7        7       1          0         0 
2010-05-02     10       8       0          0         0 
2010-07-03     18       3       0          0         0 
2011-12-09     22       1       0          0         1
2012-05-01     3        0       1          0         0

实际上，我table1有大约 2500 行，而我的outages表有 19000 行。我能想到的唯一方法是循环遍历表的每一行，然后在正确的位置outages插入 1 。table1我的代码依赖table1于有序，因此至少它不必为outages. 但是，我的数据需要 4 个多小时。

for (out in 1:length(outages$resource)) {
  a<-as.character(outages[out]$resource)
  #if column doesn't exist then create it
  if (a %in% colnames(table1)==FALSE) {
    table1$new<-0
    setnames(table1, "new", a)
    }
  midpoint<-round(length(table1$date)/2,0)
  if (table1$date[midpoint]+table1$hour[midpoint]*60*60>=outages[out]$due_out && table1$date[midpoint]+table1$hour[midpoint]*60*60<=outages    [out]$due_back)
  {
    while(table1$date[midpoint]+table1$hour[midpoint]*60*60>=outages[out]$due_out && midpoint>=1 && midpoint<=length(table1$date)) {
      table1[midpoint,a:=1,with=FALSE]
      midpoint<-midpoint-1
    }
    midpoint<-round(length(table1$date)/2,0)
    while(table1$date[midpoint]+table1$hour[midpoint]*60*60<=outages[out]$due_back && midpoint>=1 && midpoint<=length(table1$date)) {
      table1[midpoint,a:=1,with=FALSE]
      midpoint<-midpoint+1
    }
  } else {
    if (table1$date[midpoint]+table1$hour[midpoint]*60*60>outages[out]$due_back) {
      while(table1$date[midpoint]+table1$hour[midpoint]*60*60>outages[out]$due_back && midpoint>=1 && midpoint<=length(table1$date)) {
        midpoint<-midpoint-1
      }
      while(table1$date[midpoint]+table1$hour[midpoint]*60*60>=outages[out]$due_out && midpoint>=1 && midpoint<=length(table1$date)) {
        table1[midpoint,a:=1,with=FALSE]
        midpoint<-midpoint-1
      }
    } 
    midpoint<-round(length(table1$date)/2,0)
    if (table1$date[midpoint]+table1$hour[midpoint]*60*60<outages[out]$due_out) {
      while(table1$date[midpoint]+table1$hour[midpoint]*60*60<outages[out]$due_out && midpoint>=1 && midpoint<=length(table1$date)) {
        midpoint<-midpoint+1
      }
      while(table1$date[midpoint]+table1$hour[midpoint]*60*60<=outages[out]$due_back && midpoint>=1 && midpoint<=length(table1$date)) {
        table1[midpoint,a:=1,with=FALSE]
        midpoint<-midpoint+1
 }
 }
 }
if (sum(table1[,a,with=FALSE])==0) {
  table1[,a:=NULL,with=FALSE]
}
}

引用每个人最喜欢的商业广告台词“必须有更好的方法”。

score 2 · Accepted Answer

这是实现您想要的东西的一种方法。这假设您table1的时间精度为 1 小时。尽管可以将其修改为任意精度，但对于较大的时间间隔，它会表现得更好，因为它构建了date_out-date_back范围内可能时间的完整序列。请注意，我使用了与 OP 略有不同的表格来说明重叠间隔并纠正 OP 中的一些错误。

table1 = data.table(date = c("2010-05-01", "2010-05-02", "2010-05-02", "2010-07-03", "2011-12-09", "2012-05-01"), hour = c(3,7,10,18,22,3), data = c(5,7,8,3,1,0))
outages = data.table(resource = c("joey", "bob", "billy", "bob", "joey"), date_out = c("2010-04-30 4:00:00", "2010-04-30 4:00:00", "2009-04-20 7:00:00", "2011-11-15 12:20:00", "2012-04-28 1:00:00"), date_back=c("2010-05-02 8:30:00", "2010-05-02 8:30:00", "2009-06-02 5:30:00", "2011-12-09 23:00:00", "2012-05-02 17:00:00"))

# round up date_out and round down date_back
# and create a sequence in-between spaced by 1 hour
outages[, list(datetime = seq(as.POSIXct(round(as.POSIXct(date_out) + 30*60-1, "hours")),
                              as.POSIXct(round(as.POSIXct(date_back) - 30*60, "hours")),
                              60*60)),
          by = list(resource, date_out)] -> outages.expanded
setkey(outages.expanded, datetime)

# merge with the original table, then run "table" to get the frequencies/occurences
# and cbind back with the original table
cbind(table1, unclass(table(
                outages.expanded[table1[, list(datetime=as.POSIXct(paste0(date, " ", hour, ":00:00")))],
                                 resource])))

#         date hour data bob joey
#1: 2010-05-01    3    5   1    1
#2: 2010-05-02    7    7   1    1
#3: 2010-05-02   10    8   0    0
#4: 2010-07-03   18    3   0    0
#5: 2011-12-09   22    1   1    0
#6: 2012-05-01    3    0   0    1

r - 根据另一个表中的日期范围在一个表中创建虚拟变量

1 回答 1

Related

Reference