r - 更快版本的组合

Question

有什么方法可以加快combn命令从向量中获取 2 个元素的所有唯一组合？

通常会这样设置：

# Get latest version of data.table
library(devtools)
install_github("Rdatatable/data.table",  build_vignettes = FALSE)  
library(data.table)

# Toy data
d <- data.table(id=as.character(paste0("A", 10001:15000))) 

# Transform data 
system.time({
d.1 <- as.data.table(t(combn(d$id, 2)))
})

但是，combn它比使用 data.table 计算所有可能的组合慢 10 倍（23 秒对我的计算机上的 3 秒）。

system.time({
d.2 <- d[, list(neighbor=d$id[-which(d$id==id)]), by=c("id")]
})

处理非常大的向量，我正在寻找一种仅通过计算唯一组合来节省内存的方法（例如combn）来节省内存的方法，但使用 data.table 的速度（参见第二个代码片段）。

我很感激任何帮助。

score 22 · Accepted Answer

你可以combnPrim使用gRbase

source("http://bioconductor.org/biocLite.R")
biocLite("gRbase") # will install dependent packages automatically.
system.time({
 d.1 <- as.data.table(t(combn(d$id, 2)))
 })
#   user  system elapsed 
# 27.322   0.585  27.674 

system.time({
d.2 <- as.data.table(t(combnPrim(d$id,2)))
 })
#   user  system elapsed 
#  2.317   0.110   2.425 

identical(d.1[order(V1, V2),], d.2[order(V1,V2),])
#[1] TRUE

score 21 · Accepted Answer

这是一种使用data.tablefunction的方法foverlaps()，结果也很快！

require(data.table) ## 1.9.4+
d[, `:=`(id1 = 1L, id2 = .I)] ## add interval columns for overlaps
setkey(d, id1, id2)

system.time(olaps <- foverlaps(d, d, type="within", which=TRUE)[xid != yid])
#  0.603   0.062   0.717

请注意，foverlaps() 不会计算所有排列。需要该子集xid != yid来消除自我重叠。通过实现ignoreSelf参数可以更有效地在内部处理子集 - 类似于IRanges::findOverlaps.

现在只需使用获得的 id 执行子集即可：

system.time(ans <- setDT(list(d$id[olaps$xid], d$id[olaps$yid])))
#   0.576   0.047   0.662

总而言之，~1.4 秒。

优点是即使您的 data.tabled有超过 1 列您必须获取组合并使用相同数量的内存（因为我们返回索引），您也可以执行相同的方式。在这种情况下，您只需执行以下操作：

cbind(d[olaps$xid, ..your_cols], d[olaps$yid, ..your_cols])

但仅限于更换combn(., 2L). 不超过2L。

score 13 · Accepted Answer

如果没有基准，标题中包含“快速”一词的任何变体的帖子都是不完整的。在我们发布任何基准之前，我想提一下，自从发布了这个问题以来，已经发布了两个高度优化的包arrangements和RcppAlgos（我是作者）用于生成组合的R. 请注意，自版本以来2.3.0，RcppAlgos我们可以利用多个线程来提高效率。

为了让您了解它们的速度combn，gRbase::combnPrim这里是一个基本基准：

## We test generating just over 3 million combinations
choose(25, 10)
[1] 3268760

microbenchmark(arrngmnt = arrangements::combinations(25, 10),
               combn = combn(25, 10),
               gRBase = gRbase::combnPrim(25, 10),
               serAlgos = RcppAlgos::comboGeneral(25, 10),
               parAlgos = RcppAlgos::comboGeneral(25, 10, nThreads = 4),
               unit = "relative", times = 20)
Unit: relative
    expr        min         lq       mean     median         uq        max neval
arrngmnt   2.979378   3.072319   1.898390   3.756307   2.139258  0.4842967    20
   combn 226.470755 230.410716 118.157110 232.905393 125.718512 17.7778585    20
  gRBase  34.219914  34.209820  18.789954  34.218320  19.934485  3.6455493    20
serAlgos   2.836651   3.078791   2.458645   3.703929   2.231475  1.1652445    20
parAlgos   1.000000   1.000000   1.000000   1.000000   1.000000  1.0000000    20

现在，我们对针对生成组合选择 2 并生成data.table对象的非常具体的情况发布的其他函数进行基准测试。

功能如下：

funAkraf <- function(d) {
    a <- comb2.int(length(d$id))      ## comb2.int from the answer given by @akraf
    setDT(list(V1 = d$id[a[,1]], V2 = d$id[a[,2]]))
}

funAnirban <- function(d) {
    indices <- combi2inds(d$id)
    ans2 <- setDT(list(d$id[indices$xid], d$id[indices$yid]))
    ans2
}

funArun <- function(d) {
    d[, `:=`(id1 = 1L, id2 = .I)] ## add interval columns for overlaps
    setkey(d, id1, id2)
    olaps <- foverlaps(d, d, type="within", which=TRUE)[xid != yid]
    ans <- setDT(list(d$id[olaps$xid], d$id[olaps$yid]))
    ans
}

funArrangements <- function(d) {
  a <- arrangements::combinations(x = d$id, k = 2)
  setDT(list(a[, 1], a[, 2]))
}

funGRbase <- function(d) {
  a <- gRbase::combnPrim(d$id,2)
  setDT(list(a[1, ], a[2, ]))
}

funOPCombn <- function(d) {
  a <- combn(d$id, 2)
  setDT(list(a[1, ], a[2, ]))
}

funRcppAlgos <- function(d) {
  a <- RcppAlgos::comboGeneral(d$id, 2, nThreads = 4)
  setDT(list(a[, 1], a[, 2]))
}

以 OP 数据为基准

以下是 OP 给出的示例的基准：

d <- data.table(id=as.character(paste0("A", 10001:15000))) 

microbenchmark(funAkraf(d),
               funAnirban(d),
               funArrangements(d),
               funArun(d),
               funGRbase(d),
               funOPCombn(d),
               funRcppAlgos(d),
               times = 10, unit = "relative")
    Unit: relative
              expr       min        lq      mean    median        uq       max neval
       funAkraf(d)  3.220550  2.971264  2.815023  2.665616  2.344018  3.383673    10
     funAnirban(d)  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000    10
funArrangements(d)  1.464730  1.689231  1.834650  1.960233  1.932361  1.693305    10
        funArun(d)  3.256889  2.908075  2.634831  2.729180  2.432277  2.193849    10
      funGRbase(d)  3.513847  3.340637  3.327845  3.196399  3.291480  3.129362    10
     funOPCombn(d) 30.310469 26.255374 21.656376 22.386270 18.527904 15.626261    10
   funRcppAlgos(d)  1.676808  1.956696  1.943773  2.085968  1.949133  1.804180    10

我们看到@AnirbanMukherjee 提供的函数对于这个任务来说是最快的，其次是RcppAlgos/ arrangements。对于这个任务，nThreads没有任何效果，因为传递的向量是 a character，它不是线程安全的。如果我们改为转换id为一个因子怎么办？

带因子的基准（即分类变量）

dFac <- d
dFac$id <- as.factor(dFac$id)

library(microbenchmark)
microbenchmark(funAkraf(dFac),
               funAnirban(dFac),
               funArrangements(dFac),
               funArun(dFac),
               funGRbase(dFac),
               funOPCombn(dFac),
               funRcppAlgos(dFac),
               times = 10, unit = "relative")
Unit: relative
                 expr        min         lq      mean   median        uq       max   neval
       funAkraf(dFac)  10.898202  10.949896  7.589814 10.01369  8.050005  5.557014      10
     funAnirban(dFac)   3.104212   3.337344  2.317024  3.00254  2.471887  1.530978      10
funArrangements(dFac)   2.054116   2.058768  1.858268  1.94507  2.797956  1.691875      10
        funArun(dFac)  10.646680  12.905119  7.703085 11.50311  8.410893  3.802155      10
      funGRbase(dFac)  16.523356  21.609917 12.991400 19.73776 13.599870  6.498135      10
     funOPCombn(dFac) 108.301876 108.753085 64.338478 95.56197 65.494335 28.183104      10
   funRcppAlgos(dFac)   1.000000   1.000000  1.000000  1.00000  1.000000  1.000000      10

现在，我们看到这比任何其他解决方案都快RcppAlgos。2x特别是，该RcppAlgos解决方案3x比 Anirban 之前提供的最快解决方案要快。应该注意的是，这种效率的提高是可能的，因为factor变量确实integers在引擎盖下，还有一些额外的attributes.

确认平等

它们也都给出相同的结果。唯一需要注意的是，该gRbase解决方案不支持因子。也就是说，如果你传递 a factor，它将被转换为character。因此，如果您要通过，所有解决方案都将给出相同的结果，但解决方案dFac除外gRbase：

identical(funAkraf(d), funOPCombn(d))
#[1] TRUE
identical(funAkraf(d), funArrangements(d))
#[1] TRUE
identical(funRcppAlgos(d), funArrangements(d))
#[1] TRUE
identical(funRcppAlgos(d), funAnirban(d))
#[1] TRUE
identical(funRcppAlgos(d), funArun(d))
#[1] TRUE

## different order... we must sort
identical(funRcppAlgos(d), funGRbase(d))
[1] FALSE
d1 <- funGRbase(d)
d2 <- funRcppAlgos(d)

## now it's the same
identical(d1[order(V1, V2),], d2[order(V1,V2),])
#[1] TRUE

感谢@Frank 指出如何在data.tables不经历创建新data.tables然后安排它们的痛苦的情况下比较两者：

fsetequal(funRcppAlgos(d), funGRbase(d))
[1] TRUE

score 12 · Accepted Answer

这是使用 Rcpp 的解决方案。

library(Rcpp)
library(data.table)
cppFunction('
Rcpp::DataFrame combi2(Rcpp::CharacterVector inputVector){
    int len = inputVector.size();
    int retLen = len * (len-1) / 2;
    Rcpp::CharacterVector outputVector1(retLen);
    Rcpp::CharacterVector outputVector2(retLen);
    int start = 0;
    for (int i = 0; i < len; ++i){
        for (int j = i+1; j < len; ++j){
            outputVector1(start) = inputVector(i);
            outputVector2(start) = inputVector(j);
            ++start;
            }
        }
    return(Rcpp::DataFrame::create(Rcpp::Named("id") = outputVector1,
                              Rcpp::Named("neighbor") = outputVector2));
};
')

# Toy data
d <- data.table(id=as.character(paste0("A", 10001:15000))) 

system.time({
    d.2 <- d[, list(neighbor=d$id[-which(d$id==id)]), by=c("id")]
    })
#  1.908   0.397   2.389

system.time({
    d[, `:=`(id1 = 1L, id2 = .I)] ## add interval columns for overlaps
    setkey(d, id1, id2)
    olaps <- foverlaps(d, d, type="within", which=TRUE)[xid != yid]
    ans <- setDT(list(d$id[olaps$xid], d$id[olaps$yid]))
    })
#  0.653   0.038   0.705

system.time(ans2 <- combi2(d$id))
#  1.377   0.108   1.495

使用 Rcpp 函数获取索引，然后形成 data.table，效果更好。

cppFunction('
Rcpp::DataFrame combi2inds(const Rcpp::CharacterVector inputVector){
const int len = inputVector.size();
const int retLen = len * (len-1) / 2;
Rcpp::IntegerVector outputVector1(retLen);
Rcpp::IntegerVector outputVector2(retLen);
int indexSkip;
for (int i = 0; i < len; ++i){
    indexSkip = len * i - ((i+1) * i)/2;
    for (int j = 0; j < len-1-i; ++j){
        outputVector1(indexSkip+j) = i+1;
        outputVector2(indexSkip+j) = i+j+1+1;
        }
    }
return(Rcpp::DataFrame::create(Rcpp::Named("xid") = outputVector1,
                          Rcpp::Named("yid") = outputVector2));
};
')

system.time({
        indices <- combi2inds(d$id)
        ans2 <- setDT(list(d$id[indices$xid], d$id[indices$yid]))
        })      
#  0.389   0.027   0.425

score 4 · Accepted Answer

如果您不想使用其他依赖项，这里有两个 base-R 解决方案：

comb2.int使用rep和其他序列生成函数来生成所需的输出。
comb2.mat创建一个矩阵，用于upper.tri()获取上三角形并which(..., arr.ind = TRUE)获取列和行索引 => 所有组合。

可能性一：`comb2.int`

comb2.int <- function(n, rep = FALSE){
  if(!rep){
    # e.g. n=3 => (1,2), (1,3), (2,3)
    x <- rep(1:n,(n:1)-1)
    i <- seq_along(x)+1
    o <- c(0,cumsum((n-2):1))
    y <- i-o[x]
  }else{
    # e.g. n=3 => (1,1), (1,2), (1,3), (2,2), (2,3), (3,3)
    x <- rep(1:n,n:1)
    i <- seq_along(x)
    o <- c(0,cumsum(n:2))
    y <- i-o[x]+x-1
  }
  return(cbind(x,y))
}

可能性2：`comb2.mat`

comb2.mat <- function(n, rep = FALSE){
  # Use which(..., arr.ind = TRUE) to get coordinates.
  m <- matrix(FALSE, nrow = n, ncol = n)
  idxs <- which(upper.tri(m, diag = rep), arr.ind = TRUE)
  return(idxs)
}

这些函数给出与以下相同的结果`combn(.)`：

for(i in 2:8){
  # --- comb2.int ------------------
  stopifnot(comb2.int(i) == t(combn(i,2)))
  # => Equal

  # --- comb2.mat ------------------
  m <- comb2.mat(i)
  colnames(m) <- NULL   # difference 1: colnames
  m <- m[order(m[,1]),] # difference 2: output order
  stopifnot(m == t(combn(i,2)))
  # => Equal up to above differences
}

但是我的向量中还有其他元素而不是顺序整数！

使用返回值作为索引：

v <- LETTERS[1:5]                                     
c <- comb2.int(length(v))                             
cbind(v[c[,1]], v[c[,2]])                             
#>       [,1] [,2]
#>  [1,] "A"  "B" 
#>  [2,] "A"  "C" 
#>  [3,] "A"  "D" 
#>  [4,] "A"  "E" 
#>  [5,] "B"  "C" 
#>  [6,] "B"  "D" 
#>  [7,] "B"  "E" 
#>  [8,] "C"  "D" 
#>  [9,] "C"  "E" 
#> [10,] "D"  "E"

基准：

时间（combn）=〜5x时间（comb2.mat）=〜80x时间（comb2.int）：

library(microbenchmark)

n <- 800
microbenchmark({
  comb2.int(n)
},{
  comb2.mat(n)
},{
  t(combn(n, 2))
})
#>   Unit: milliseconds
#>                    expr        min         lq       mean     median        uq       max neval
#>    {     comb2.int(n) }   4.394051   4.731737   6.350406   5.334463   7.22677  14.68808   100
#>    {     comb2.mat(n) }  20.131455  22.901534  31.648521  24.411782  26.95821 297.70684   100
#>  {     t(combn(n, 2)) } 363.687284 374.826268 391.038755 380.012274 389.59960 532.30305   100

r - 更快版本的组合

5 回答 5

以 OP 数据为基准

带因子的基准（即分类变量）

确认平等

可能性一：comb2.int

可能性2：comb2.mat

这些函数给出与以下相同的结果combn(.)：

但是我的向量中还有其他元素而不是顺序整数！

基准：

Related

Reference

可能性一：`comb2.int`

可能性2：`comb2.mat`

这些函数给出与以下相同的结果`combn(.)`：