findInterval
比in.interval
长 x 快。
library(microbenchmark)
set.seed(123L)
x <- runif(1e6, 1, 10)
in.interval <- function(x, lo, hi) (x > lo & x < hi)
microbenchmark(
findInterval(x, c(3, 5)) == 1L,
in.interval(x, 3, 5),
times=100)
和
Unit: milliseconds
expr min lq median uq max
1 findInterval(x, c(3, 5)) == 1L 23.40665 25.13308 25.17272 25.25361 27.04032
2 in.interval(x, 3, 5) 42.91647 45.51040 45.60424 45.75144 46.38389
== 1L
如果不需要,则更快,如果要找到的“间隔”大于 1,则很有用
> system.time(findInterval(x, 0:10))
user system elapsed
3.644 0.112 3.763
如果速度至关重要,那么这个 C 实现虽然不能容忍整数而不是数字参数,但速度很快
library(inline)
in.interval_c <- cfunction(c(x="numeric", lo="numeric", hi="numeric"),
' int len = Rf_length(x);
double lower = REAL(lo)[0], upper = REAL(hi)[0],
*xp = REAL(x);
SEXP out = PROTECT(NEW_LOGICAL(len));
int *outp = LOGICAL(out);
for (int i = 0; i < len; ++i)
outp[i] = (xp[i] - lower) * (xp[i] - upper) <= 0;
UNPROTECT(1);
return out;')
其他答案中提出的一些解决方案的时间安排是
microbenchmark(
findInterval(x, c(3, 5)) == 1L,
in.interval.abs(x, 3, 5),
in.interval(x, 3, 5),
in.interval_c(x, 3, 5),
!is.na(.bincode(x, c(3, 5))),
times=100)
和
Unit: milliseconds
expr min lq median uq
1 findInterval(x, c(3, 5)) == 1L 23.419117 23.495943 23.556524 23.670907
2 in.interval.abs(x, 3, 5) 12.018486 12.056290 12.093279 12.161213
3 in.interval_c(x, 3, 5) 1.619649 1.641119 1.651007 1.679531
4 in.interval(x, 3, 5) 42.946318 43.050058 43.171480 43.407930
5 !is.na(.bincode(x, c(3, 5))) 15.421340 15.468946 15.520298 15.600758
max
1 26.360845
2 13.178126
3 2.785939
4 46.187129
5 18.558425
在 bin.cpp 文件中重新讨论速度问题
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
SEXP bin1(SEXP x, SEXP lo, SEXP hi)
{
const int len = Rf_length(x);
const double lower = REAL(lo)[0], upper = REAL(hi)[0];
SEXP out = PROTECT(Rf_allocVector(LGLSXP, len));
double *xp = REAL(x);
int *outp = LOGICAL(out);
for (int i = 0; i < len; ++i)
outp[i] = (xp[i] - lower) * (xp[i] - upper) <= 0;
UNPROTECT(1);
return out;
}
// [[Rcpp::export]]
LogicalVector bin2(NumericVector x, NumericVector lo, NumericVector hi)
{
NumericVector xx(x);
double lower = as<double>(lo);
double upper = as<double>(hi);
LogicalVector out(x);
for( int i=0; i < out.size(); i++ )
out[i] = ( (xx[i]-lower) * (xx[i]-upper) ) <= 0;
return out;
}
// [[Rcpp::export]]
LogicalVector bin3(NumericVector x, const double lower, const double upper)
{
const int len = x.size();
LogicalVector out(len);
for (int i=0; i < len; i++)
out[i] = ( (x[i]-lower) * (x[i]-upper) ) <= 0;
return out;
}
有时间
> library(Rcpp)
> sourceCpp("bin.cpp")
> microbenchmark(bin1(x, 3, 5), bin2(x, 3, 5), bin3(x, 3, 5),
+ in.interval_c(x, 3, 5), times=1000)
Unit: milliseconds
expr min lq median uq max
1 bin1(x, 3, 5) 1.546703 2.668171 2.785255 2.839225 144.9574
2 bin2(x, 3, 5) 12.547456 13.583808 13.674477 13.792773 155.6594
3 bin3(x, 3, 5) 2.238139 3.318293 3.357271 3.540876 144.1249
4 in.interval_c(x, 3, 5) 1.545139 2.654809 2.767784 2.822722 143.7500
大约相等的部分加速来自使用常量len
而不是out.size()
循环边界,并且分配逻辑向量而不初始化它(LogicalVector(len)
因为它将在循环中初始化)。