-1

我在数据表中有一个纵向数据集,类似于下面的简化示例:

> head(data)
   Country     ID        Date         Value
1:   AT        AT6306    2012-11-01   16.2
2:   AT        AT6306    2012-11-02   12.2
3:   AT        AT6306    2012-11-03   11.3
4:   AT        AT6306    2012-11-04   14.2
5:   AT        AT6306    2012-11-05   17.3
6:   AT        AT6306    2012-11-06   12.5

> tail(data)
   Country     ID        Date         Value
1:   SE        SE0935    2014-06-25   16.2
2:   SE        SE0935    2014-06-26   12.2
3:   SE        SE0935    2014-06-27   11.3
4:   SE        SE0935    2014-06-28   14.2
5:   SE        SE0935    2014-06-29   17.3
6:   SE        SE0935    2014-06-30   12.5

ID是面板变量,它是完全唯一的,没有国家之间的重叠。日期范围,仅查看唯一值,范围从2012-10-232014-09-30。显然,每个 的范围Date并不相同ID。此外,可能存在缺失值。为了有一个平衡的面板,我想填补我的数据集的空白。

根据@akron 的建议,在这里调整答案,我执行以下操作:

data2 <- data[, CJ(ID=unique(ID), Date=unique(Date))]
setkey(data2, ID, Date)

data.new <- merge(data, data2, by=c("ID", "Date"), all.y = TRUE)
setkey(data.new, ID, Date)

使用该选项all.y = TRUE,R 会为 中的每个缺失日期添加行data。但是,现在除了ID和之外的所有字段Date都是空白的,如果该行之前在 中不存在data。也就是说,我的数据看起来像这样

> head(data.new)
   Country     ID        Date         Value
1:   NA        AT6306    2012-10-23   NA
2:   NA        AT6306    2012-10-24   NA
3:   NA        AT6306    2012-10-25   NA
4:   NA        AT6306    2012-10-26   NA
5:   NA        AT6306    2012-10-27   NA
6:   NA        AT6306    2012-10-28   NA    

我确实希望Value成为 NA,因为它丢失了。但是,由于Country给定的 不会更改ID,因此我希望填写该字段。

4

1 回答 1

1
library(data.table)
DT <- data.table(dat)
setkey(DT, Date, Country, ID)
res <- DT[CJ(seq(min(Date), max(Date), by='1 day'), 
                        unique(Country), unique(ID))]

 head(res)
#    Country   ID       Date Value
#1:      AT  935 2012-11-01    NA
#2:      AT 6306 2012-11-01  16.2
#3:      SE  935 2012-11-01    NA
#4:      SE 6306 2012-11-01    NA
#5:      AT  935 2012-11-02    NA
#6:      AT 6306 2012-11-02  12.2

更新

您可以做的一种选择是

DT <- data.table(dat)
DT[,CountryID:= paste(Country,ID)]
setkey(DT, Date, CountryID)
DT1 <- DT[CJ(unique(Date), unique(CountryID))][,
      c('Country', 'ID'):=  list(gsub("[ 0-9]", "", CountryID),
               gsub("[^ 0-9]", "", CountryID)),][,-5L]


head(DT1,3)
#     Country    ID       Date Value
#1:      AT  6306 2012-11-01  16.2
#2:      SE   935 2012-11-01    NA
#3:      AT  6306 2012-11-02  12.2

nrow(DT1)
#[1] 24

数据

dat <- structure(list(Country = c("AT", "AT", "AT", "AT", "AT", "AT", 
"SE", "SE", "SE", "SE", "SE", "SE"), ID = c(6306L, 6306L, 6306L, 
6306L, 6306L, 6306L, 935L, 935L, 935L, 935L, 935L, 935L), Date = structure(c(15645, 
15646, 15647, 15648, 15649, 15650, 15669, 15670, 15671, 15672, 
15673, 15674), class = "Date"), Value = c(16.2, 12.2, 11.3, 14.2, 
17.3, 12.5, 16.2, 12.2, 11.3, 14.2, 17.3, 12.5)), .Names = c("Country", 
"ID", "Date", "Value"), row.names = c("1:", "2:", "3:", "4:", 
"5:", "6:", "1:1", "2:1", "3:1", "4:1", "5:1", "6:1"), class = "data.frame")
于 2014-11-03T15:36:31.237 回答