我认为制表或 C++ 版本是要走的路,但这里有一些使用 rbenchmark 的代码,这是一个很好的查看时间的包(我也添加了一些较慢的功能测试):
######################
### ---Clean Up--- ###
######################
rm(list = ls())
gc()
######################
### ---Packages--- ###
#####################
require(parallel)
require(data.table)
require(rbenchmark)
require(inline)
#######################
### ---Functions--- ###
#######################
# Competitor functions by Breyal
Breyal.using_datatable <- function(x) {DT <- data.table(x = x, weight = 1, key = "x"); DT[, length(weight), by = x]}
Breyal.using_lapply_1c_eq <- function(x = sort(x)) { lapply(unique(x), function(u) sum(x == u)) } # 1 core
Breyal.using_mclapply_8c_eq <- function(x = sort(x)) { mclapply(unique(x), function(u) sum(x == u), mc.cores = 8L) } # 8 cores
# Competitor functions by tennenrishin
tennenrishin.using_table <- function(x) as.data.frame(table(x))
tennenrishin.using_hist <- function(x) hist(x,breaks=min(x):(max(x)+1),plot=FALSE,right=FALSE)
tennenrishin.using_sort <- function(x) {cdf<-cbind(sort(x),seq_along(x)); cdf<-cdf[!duplicated(cdf[,1]),2]; c(cdf[-1],length(x)+1)-cdf}
tennenrishin.using_ecdf <- function(x) {i<-min(x):max(x); cdf<-ecdf(x)(i)*length(x); cdf-c(0,cdf[-length(i)])}
tennenrishin.using_counting_loop <- function(x) {h<-rep(0,max(x)+1);for(i in seq_along(x)){h[x[i]]<-h[x[i]]+1}; h}
# Competitor function by Ulrich
Ulrich.using_tabulate <- function(x) {
adj <- 1L - min(x)
y <- setNames(tabulate(x+adj), sort(unique(x)))
return(y)
}
# I couldn't get the Joe's C++ version to work (my laptop won't install inline) butI suspect that would be the fastest solution
##################
### ---Data--- ###
##################
# Set seed so results are reproducable
set.seed(21)
# Data vector
x <- floor(runif(1000000)*1000)
#####################
### ---Timings--- ###
#####################
# Benchmarks using Ubuntu 13.04 x64 with 8GB RAM and i7-2600K CPU @ 3.40GHz
benchmark(replications = 5,
tennenrishin.using_table(x),
tennenrishin.using_hist(x),
tennenrishin.using_sort(x),
tennenrishin.using_ecdf(x),
tennenrishin.using_counting_loop(x),
Ulrich.using_tabulate(x),
Breyal.using_datatable(x),
Breyal.using_lapply_1c_eq(x),
Breyal.using_mclapply_8c_eq(x),
order = "relative")
这导致以下时间
test replications elapsed relative user.self sys.self user.child sys.child
6 Ulrich.using_tabulate(x) 5 0.176 1.000 0.176 0.000 0.00 0.000
2 tennenrishin.using_hist(x) 5 0.468 2.659 0.468 0.000 0.00 0.000
3 tennenrishin.using_sort(x) 5 0.687 3.903 0.688 0.000 0.00 0.000
4 tennenrishin.using_ecdf(x) 5 0.749 4.256 0.748 0.000 0.00 0.000
7 Breyal.using_datatable(x) 5 2.960 16.818 2.960 0.000 0.00 0.000
1 tennenrishin.using_table(x) 5 4.651 26.426 4.596 0.052 0.00 0.000
9 Breyal.using_mclapply_8c_eq(x) 5 10.817 61.460 0.140 1.196 54.62 7.112
5 tennenrishin.using_counting_loop(x) 5 10.922 62.057 10.912 0.000 0.00 0.000
8 Breyal.using_lapply_1c_eq(x) 5 36.807 209.131 36.768 0.000 0.00 0.000