我想在不使用 data.table 的情况下比较我的方法,发现它快 45 倍(并且可能更容易理解)。
首先,我从接受的答案中对 data.table 解决方案进行计时:
rm(list=ls())
library(readxl)
library(data.table)
############## Using data.table method() ######################
data <- setDT(read_excel("Book2.xlsx"))[!is.na(PatId)]
data[ , (names(data)) := lapply(.SD, as.integer)]
provs <- data[ , sort(unique(SeenByProv))]
nprov <- length(provs)
markov <- matrix(nrow = nprov, ncol = nprov, dimnames = list(provs, provs))
system.time( ## Timing the main loop
for (pr in provs){
markov[as.character(pr), ] <-
data[ , {nxt <- SeenByProv[which(SeenByProv == pr) + 1L]
.(prov = provs, count =
sapply(provs, function(pr2) sum(nxt == pr2, na.rm = TRUE)))}, by = PatId
][, sum(count), by = prov]$V1
}
)
# user system elapsed
# 3.128 0.000 3.135
table(markov)
#markov
# 0 1 2 3 4 5 6 7 8 9 10 11 13 22 140
#3003 308 89 34 14 11 6 4 1 3 4 1 1 1 1
接下来仅使用基本 R 调用:
############## Using all base R calls method() ###################
tm_matrix<-matrix(0, nrow = nprov, ncol = nprov, dimnames = list(provs, provs))
d<-read_excel("Book2.xlsx")
d<-d[!is.na(d$PatId),] # Note: Data is already ordered by PatId, DaysOfStudy
baseR<-function(tm_matrix){
d1<-cbind(d[-nrow(d),-3],d[-1,-3]); # Form the transitions and drop the DaysofStudy
colnames(d1)<-c("SeenByProv","PatId","NextProv","PatId2");
d1<-d1[d1$PatId==d1$PatId2,]; # Drop those transition between different patients
d1$SeenByProv<-as.character(d1$SeenByProv); # transform to strings to use as rownames
d1$NextProv <-as.character(d1$NextProv); # and column names
for (i in 1:nrow(d1)){ # Fill in the transition matrix
tm_matrix[d1$SeenByProv[i],d1$NextProv[i]]<-tm_matrix[d1$SeenByProv[i],d1$NextProv[i]]+1
};
return(tm_matrix)
}
system.time(tm_matrix<-baseR(tm_matrix))
# user system elapsed
# 0.072 0.000 0.072
table(tm_matrix)
#tm_matrix
# 0 1 2 3 4 5 6 7 8 9 10 11 13 22 140
#3003 308 89 34 14 11 6 4 1 3 4 1 1 1 1
all.equal(markov,tm_matrix)
#[1] TRUE
我的 base-R 方法快 3.135/0.072 = 43.54