更新:
Efraimidis 和 Spirakis算法的Rcpp
实现(感谢 @Hemmo、@Dinrem、@krlmlr 和@rtlgrmpf):
library(inline)
library(Rcpp)
src <-
'
int num = as<int>(size), x = as<int>(n);
Rcpp::NumericVector vx = Rcpp::clone<Rcpp::NumericVector>(x);
Rcpp::NumericVector pr = Rcpp::clone<Rcpp::NumericVector>(prob);
Rcpp::NumericVector rnd = rexp(x) / pr;
for(int i= 0; i<vx.size(); ++i) vx[i] = i;
std::partial_sort(vx.begin(), vx.begin() + num, vx.end(), Comp(rnd));
vx = vx[seq(0, num - 1)] + 1;
return vx;
'
incl <-
'
struct Comp{
Comp(const Rcpp::NumericVector& v ) : _v(v) {}
bool operator ()(int a, int b) { return _v[a] < _v[b]; }
const Rcpp::NumericVector& _v;
};
'
funFast <- cxxfunction(signature(n = "Numeric", size = "integer", prob = "numeric"),
src, plugin = "Rcpp", include = incl)
# See the bottom of the answer for comparison
p <- c(995/1000, rep(1/1000, 5))
n <- 100000
system.time(print(table(replicate(funFast(6, 3, p), n = n)) / n))
1 2 3 4 5 6
1.00000 0.39996 0.39969 0.39973 0.40180 0.39882
user system elapsed
3.93 0.00 3.96
# In case of:
# Rcpp::IntegerVector vx = Rcpp::clone<Rcpp::IntegerVector>(x);
# i.e. instead of NumericVector
1 2 3 4 5 6
1.00000 0.40150 0.39888 0.39925 0.40057 0.39980
user system elapsed
1.93 0.00 2.03
旧版:
让我们尝试一些可能的方法:
带替换的简单拒绝抽样。这是一个比sample.int.rej
@krlmlr 提供的更简单的功能,即样本大小始终等于n
. 正如我们将看到的,假设权重均匀分布,它仍然非常快,但在另一种情况下非常慢。
fastSampleReject <- function(all, n, w){
out <- numeric(0)
while(length(out) < n)
out <- unique(c(out, sample(all, n, replace = TRUE, prob = w)))
out[1:n]
}
Wong 和 Easton (1980) 的算法。这是此Python 版本的实现。它很稳定,我可能会遗漏一些东西,但与其他功能相比它要慢得多。
fastSample1980 <- function(all, n, w){
tws <- w
for(i in (length(tws) - 1):0)
tws[1 + i] <- sum(tws[1 + i], tws[1 + 2 * i + 1],
tws[1 + 2 * i + 2], na.rm = TRUE)
out <- numeric(n)
for(i in 1:n){
gas <- tws[1] * runif(1)
k <- 0
while(gas > w[1 + k]){
gas <- gas - w[1 + k]
k <- 2 * k + 1
if(gas > tws[1 + k]){
gas <- gas - tws[1 + k]
k <- k + 1
}
}
wgh <- w[1 + k]
out[i] <- all[1 + k]
w[1 + k] <- 0
while(1 + k >= 1){
tws[1 + k] <- tws[1 + k] - wgh
k <- floor((k - 1) / 2)
}
}
out
}
Wong 和 Easton 对算法的 Rcpp 实现。可能它可以进一步优化,因为这是我的第一个可用Rcpp
功能,但无论如何它运行良好。
library(inline)
library(Rcpp)
src <-
'
Rcpp::NumericVector weights = Rcpp::clone<Rcpp::NumericVector>(w);
Rcpp::NumericVector tws = Rcpp::clone<Rcpp::NumericVector>(w);
Rcpp::NumericVector x = Rcpp::NumericVector(all);
int k, num = as<int>(n);
Rcpp::NumericVector out(num);
double gas, wgh;
if((weights.size() - 1) % 2 == 0){
tws[((weights.size()-1)/2)] += tws[weights.size()-1] + tws[weights.size()-2];
}
else
{
tws[floor((weights.size() - 1)/2)] += tws[weights.size() - 1];
}
for (int i = (floor((weights.size() - 1)/2) - 1); i >= 0; i--){
tws[i] += (tws[2 * i + 1]) + (tws[2 * i + 2]);
}
for(int i = 0; i < num; i++){
gas = as<double>(runif(1)) * tws[0];
k = 0;
while(gas > weights[k]){
gas -= weights[k];
k = 2 * k + 1;
if(gas > tws[k]){
gas -= tws[k];
k += 1;
}
}
wgh = weights[k];
out[i] = x[k];
weights[k] = 0;
while(k > 0){
tws[k] -= wgh;
k = floor((k - 1) / 2);
}
tws[0] -= wgh;
}
return out;
'
fun <- cxxfunction(signature(all = "numeric", n = "integer", w = "numeric"),
src, plugin = "Rcpp")
现在一些结果:
times1 <- ldply(
1:6,
function(i) {
n <- 1024 * (2 ** i)
p <- runif(2 * n) # Uniform distribution
p <- p/sum(p)
data.frame(
n=n,
user=c(system.time(sample.int.test(n, p), gcFirst=T)['user.self'],
system.time(weighted_Random_Sample(1:(2*n), p, n), gcFirst=T)['user.self'],
system.time(fun(1:(2*n), n, p), gcFirst=T)['user.self'],
system.time(sample.int.rej(2*n, n, p), gcFirst=T)['user.self'],
system.time(fastSampleReject(1:(2*n), n, p), gcFirst=T)['user.self'],
system.time(fastSample1980(1:(2*n), n, p), gcFirst=T)['user.self']),
id=c("Base", "Reservoir", "Rcpp", "Rejection", "Rejection simple", "1980"))
},
.progress='text'
)
times2 <- ldply(
1:6,
function(i) {
n <- 1024 * (2 ** i)
p <- runif(2 * n - 1)
p <- p/sum(p)
p <- c(0.999, 0.001 * p) # Special case
data.frame(
n=n,
user=c(system.time(sample.int.test(n, p), gcFirst=T)['user.self'],
system.time(weighted_Random_Sample(1:(2*n), p, n), gcFirst=T)['user.self'],
system.time(fun(1:(2*n), n, p), gcFirst=T)['user.self'],
system.time(sample.int.rej(2*n, n, p), gcFirst=T)['user.self'],
system.time(fastSampleReject(1:(2*n), n, p), gcFirst=T)['user.self'],
system.time(fastSample1980(1:(2*n), n, p), gcFirst=T)['user.self']),
id=c("Base", "Reservoir", "Rcpp", "Rejection", "Rejection simple", "1980"))
},
.progress='text'
)
arrange(times1, id)
n user id
1 2048 0.53 1980
2 4096 0.94 1980
3 8192 2.00 1980
4 16384 4.32 1980
5 32768 9.10 1980
6 65536 21.32 1980
7 2048 0.02 Base
8 4096 0.05 Base
9 8192 0.18 Base
10 16384 0.75 Base
11 32768 2.99 Base
12 65536 12.23 Base
13 2048 0.00 Rcpp
14 4096 0.01 Rcpp
15 8192 0.03 Rcpp
16 16384 0.07 Rcpp
17 32768 0.14 Rcpp
18 65536 0.31 Rcpp
19 2048 0.00 Rejection
20 4096 0.00 Rejection
21 8192 0.00 Rejection
22 16384 0.02 Rejection
23 32768 0.02 Rejection
24 65536 0.03 Rejection
25 2048 0.00 Rejection simple
26 4096 0.01 Rejection simple
27 8192 0.00 Rejection simple
28 16384 0.01 Rejection simple
29 32768 0.00 Rejection simple
30 65536 0.05 Rejection simple
31 2048 0.00 Reservoir
32 4096 0.00 Reservoir
33 8192 0.00 Reservoir
34 16384 0.02 Reservoir
35 32768 0.03 Reservoir
36 65536 0.05 Reservoir
arrange(times2, id)
n user id
1 2048 0.43 1980
2 4096 0.93 1980
3 8192 2.00 1980
4 16384 4.36 1980
5 32768 9.08 1980
6 65536 19.34 1980
7 2048 0.01 Base
8 4096 0.04 Base
9 8192 0.18 Base
10 16384 0.75 Base
11 32768 3.11 Base
12 65536 12.04 Base
13 2048 0.01 Rcpp
14 4096 0.02 Rcpp
15 8192 0.03 Rcpp
16 16384 0.08 Rcpp
17 32768 0.15 Rcpp
18 65536 0.33 Rcpp
19 2048 0.00 Rejection
20 4096 0.00 Rejection
21 8192 0.02 Rejection
22 16384 0.02 Rejection
23 32768 0.05 Rejection
24 65536 0.08 Rejection
25 2048 1.43 Rejection simple
26 4096 2.87 Rejection simple
27 8192 6.17 Rejection simple
28 16384 13.68 Rejection simple
29 32768 29.74 Rejection simple
30 65536 73.32 Rejection simple
31 2048 0.00 Reservoir
32 4096 0.00 Reservoir
33 8192 0.02 Reservoir
34 16384 0.02 Reservoir
35 32768 0.02 Reservoir
36 65536 0.04 Reservoir
显然我们可以拒绝函数1980
,因为它比Base
这两种情况都慢。Rejection simple
当第二种情况下只有一个概率为 0.999 时,也会遇到麻烦。
所以剩下Rejection
, Rcpp
, Reservoir
. 最后一步是检查值本身是否正确。为了确定它们,我们将使用它们sample
作为基准(也是为了消除p
由于没有替换的抽样而不必一致的概率的混淆)。
p <- c(995/1000, rep(1/1000, 5))
n <- 100000
system.time(print(table(replicate(sample(1:6, 3, repl = FALSE, prob = p), n = n))/n))
1 2 3 4 5 6
1.00000 0.39992 0.39886 0.40088 0.39711 0.40323 # Benchmark
user system elapsed
1.90 0.00 2.03
system.time(print(table(replicate(sample.int.rej(2*3, 3, p), n = n))/n))
1 2 3 4 5 6
1.00000 0.40007 0.40099 0.39962 0.40153 0.39779
user system elapsed
76.02 0.03 77.49 # Slow
system.time(print(table(replicate(weighted_Random_Sample(1:6, p, 3), n = n))/n))
1 2 3 4 5 6
1.00000 0.49535 0.41484 0.36432 0.36338 0.36211 # Incorrect
user system elapsed
3.64 0.01 3.67
system.time(print(table(replicate(fun(1:6, 3, p), n = n))/n))
1 2 3 4 5 6
1.00000 0.39876 0.40031 0.40219 0.40039 0.39835
user system elapsed
4.41 0.02 4.47
注意这里的几件事。由于某种原因weighted_Random_Sample
返回不正确的值(我根本没有研究过,但假设均匀分布它可以正常工作)。sample.int.rej
重复采样很慢。
总之,Rcpp
在重复采样的情况下,这似乎是最佳选择,而在sample.int.rej
其他情况下速度更快,也更易于使用。