我将测量三种可用解决方案(Daniel.Krizian、Blue.Magister、eddi)的时间。
为此,我创建了更大的基准数据——两张大型信号表 X 和 Y。
基准数据:X 和 Y 表格
# ---- Benchmark dimensions ----
nobs <- 5000 # number of observations for each instrument
nopps <- nobs * 3 # opportunities to trade in the time window studied
ninstr <- 200 # number of instruments

# ---- Instrument universe ----
# Random three-letter tickers. Letters are drawn with replacement, so two
# tickers can collide: the author picked seed 2 because seed 1 generated
# the instrument "MPM" twice.
set.seed(2)
universe <- replicate(
  ninstr,
  paste(sample(LETTERS, 3, replace = TRUE), collapse = "")
)
# Fail early on a collision: a duplicated ticker would own 2 * nobs rows of
# `frame`, which no longer matches the per-instrument vectors of length nobs
# that gen.sig.tbl() assigns.
if (anyDuplicated(universe) > 0L) {
  stop("duplicate instrument names generated; choose a different seed",
       call. = FALSE)
}

# ---- Trading calendar and skeleton table ----
# nopps consecutive calendar days ending on 2013-11-26 (newest first).
window <- as.Date("2013-11-26") - seq_len(nopps) + 1
# Cross join: nobs rows per instrument. Date is only a 1..nobs placeholder
# here; gen.sig.tbl() overwrites it with dates sampled from `window`.
frame <- CJ(Instrument = universe, Date = seq_len(nobs))
# Generate one random signal table.
#
# Reads the globals `frame`, `window` and `nobs`. For every instrument it
# draws `nobs` distinct dates from `window` and attaches a strictly
# alternating TRUE/FALSE signal whose first value is chosen at random.
#
# Side effect: the global `frame` is used as scratch space and is modified by
# reference (Date is overwritten, Signal is added/overwritten, the key is set
# to Instrument, Date).
#
# Returns a copy of `frame`, so tables from successive calls do not share
# memory with each other or with `frame`.
gen.sig.tbl <- function() {
  frame[, Date := as.IDate(sample(window, size = nobs, replace = FALSE)),
        by = "Instrument"]
  setkey(frame, Instrument, Date)

  # Alternating logical vector of length n with a random first element.
  rnd.sig.sparse <- function(n) {
    first <- sample(c(FALSE, TRUE), 1)
    # length.out (instead of times = n / 2) also yields exactly n values
    # when n is odd; for even n the result is unchanged.
    rep(c(first, !first), length.out = n)
  }

  frame[, Signal := rnd.sig.sparse(nobs), by = "Instrument"]
  copy(frame)
}
# Build the two benchmark tables. Each call is preceded by its own seed so
# the tables are reproducible; the seeds differ so that X and Y are drawn
# independently (see the printed tables below).
set.seed(1)
X <- gen.sig.tbl()
set.seed(2)
Y <- gen.sig.tbl()
X
Instrument Date Signal
1: AAS 1972-11-02 FALSE
2: AAS 1972-11-04 TRUE
3: AAS 1972-11-07 FALSE
4: AAS 1972-11-08 TRUE
5: AAS 1972-11-10 FALSE
---
999996: ZVH 2013-11-14 FALSE
999997: ZVH 2013-11-15 TRUE
999998: ZVH 2013-11-18 FALSE
999999: ZVH 2013-11-25 TRUE
1000000: ZVH 2013-11-26 FALSE
Y
Instrument Date Signal
1: AAS 1972-11-13 TRUE
2: AAS 1972-11-17 FALSE
3: AAS 1972-11-20 TRUE
4: AAS 1972-11-21 FALSE
5: AAS 1972-11-23 TRUE
---
999996: ZVH 2013-11-16 TRUE
999997: ZVH 2013-11-19 FALSE
999998: ZVH 2013-11-23 TRUE
999999: ZVH 2013-11-24 FALSE
1000000: ZVH 2013-11-25 TRUE
三种解决方案:
# Solution 1: full outer merge followed by last-observation-carried-forward.
#
# Reads the globals X and Y (columns Instrument, Date, Signal). Uses
# na.locf(), presumably zoo::na.locf -- the library() call is not visible in
# this section.
#
# Returns a data.table with one row per (Instrument, Date) present in X or Y
# and the columns Signal.x, Signal.y (each signal carried forward within an
# instrument) and Signal.z (logical AND of the two, NA while either signal is
# still unknown).
Daniel.Krizian <- function() {
  Z <- merge(X, Y, all = TRUE)
  # na.rm = FALSE keeps leading NAs so every column retains its full length.
  Z[, c("Signal.x", "Signal.y") := list(
    na.locf(Signal.x, na.rm = FALSE),
    na.locf(Signal.y, na.rm = FALSE)
  ), by = Instrument]
  Z[, Signal.z := Signal.x & Signal.y]
  # (FALSE & NA) is FALSE, whereas NA is the desired result: blank those rows.
  Z[is.na(Signal.x) | is.na(Signal.y), Signal.z := NA]
  Z
}
# Solution 2: build the union of all keys, then two rolling joins.
#
# Reads the globals X and Y, which must be keyed data.tables.
#
# Side effect: the Signal columns of the GLOBAL tables are renamed by
# reference to Signal.X / Signal.Y and stay renamed after the call; the
# caller has to rename them back before running code that expects "Signal".
#
# Returns a data.table with the key columns plus Signal.X, Signal.Y and
# Signal.Z (their logical product, NA when either input is NA).
Blue.Magister <- function() {
  keys <- unique(rbind(X[, key(X), with = FALSE], Y[, key(Y), with = FALSE]))
  # This setkey is mostly cosmetic: it determines whether the final output
  # is sorted or not.
  setkeyv(keys, names(keys))

  # Rename to minimise confusion after the joins. Guarded so that calling
  # the function again on already-renamed tables does not error.
  if ("Signal" %in% names(X)) {
    setnames(X, "Signal", "Signal.X")
  }
  if ("Signal" %in% names(Y)) {
    setnames(Y, "Signal", "Signal.Y")
  }

  # Two rolling joins, followed by the definition of the new column.
  Z <- X[Y[keys, roll = TRUE], roll = TRUE]
  Z[, Signal.Z := as.logical(Signal.X * Signal.Y)]
  unique(Z)
}
# Solution 3: two mirrored rolling joins, stacked and de-duplicated.
#
# Assumes the globals X and Y are both keyed by Instrument, Date and have the
# columns Instrument, Date, Signal. Neither table is modified: setnames() is
# only applied to the join results.
#
# Returns a data.table keyed by Instrument, Date with the columns Signal.x,
# Signal.y and Signal.z (their logical product, NA when either input is NA).
eddi <- function() {
  out_cols <- c("Instrument", "Date")

  # Rows at Y's dates, with X's signal rolled forward onto them.
  at_y_dates <- setnames(X[Y, roll = TRUE],
                         c(out_cols, "Signal.x", "Signal.y"))
  # Rows at X's dates, with Y's signal rolled forward onto them.
  at_x_dates <- setnames(Y[X, roll = TRUE],
                         c(out_cols, "Signal.y", "Signal.x"))

  # use.names = TRUE lines the two signal columns up by name, because the
  # second join returns them in the opposite order.
  stacked <- rbind(at_y_dates, at_x_dates, use.names = TRUE)
  setkey(stacked, Instrument, Date)

  Z <- unique(stacked)
  Z[, Signal.z := as.logical(Signal.x * Signal.y)]
  Z
}
基准测试:
# Single-run timings of the three solutions; the two lines following each
# call are the pasted console output of system.time().
# NOTE: Blue.Magister() renames the Signal columns of the globals X and Y in
# place, whereas Daniel.Krizian() refers to Signal.x / Signal.y (presumably
# the suffixed names merge() gives two same-named "Signal" columns). So
# Daniel.Krizian() has to run before Blue.Magister(), or after the column
# names have been reset.
system.time(Z.DK <- Daniel.Krizian())
user system elapsed
2.70 0.07 3.01
system.time(Z.eddi <- eddi())
user system elapsed
1.14 0.03 1.84
system.time(Z.BM <- Blue.Magister())
user system elapsed
3.35 0.14 3.52
# Undo the in-place column renames that Blue.Magister() applied to X and Y.
setnames(X,"Signal.X", "Signal") # reset original data back after Blue.Magister() call
setnames(Y,"Signal.Y", "Signal") # reset original data back after Blue.Magister() call
# Blue.Magister() uses upper-case suffixes; rename them to the lower-case
# ones used by the other two results so identical() can compare the tables.
setnames(Z.BM
, c("Signal.X", "Signal.Y", "Signal.Z")
, c("Signal.x", "Signal.y", "Signal.z"))
# Check that all three solutions produce the same table (the line after each
# call is the pasted console output).
identical(Z.DK, Z.BM)
TRUE
identical(Z.DK, Z.eddi)
TRUE