6

I am in need of efficiency for finding the indexes (not the logical vector) between two vectors. I can do this with:

which(c("a", "q", "f", "c", "z") %in% letters[1:10])

In the same way it is better to find the position of the maximum number with which.max:

which(c(1:8, 10, 9) %in% max(c(1:8, 10, 9)))
which.max(c(1:8, 10, 9))

I am wondering if I have the most efficient way of finding the position of matching terms in the 2 vectors.

EDIT: Per the questions/comments below. I am operating on a list of vectors. The problem involves operating on sentences that have been broken into a bag of words as seen below. The list may contain 10000-20000 or more character vectors. Then based on that index I will grab 4 words before and 4 words after the index and calculate a score.

x <- list(c('I', 'like', 'chocolate', 'cake'), c('chocolate', 'cake', 'is', 'good'))
y <- rep(x, 5000)

lapply(y, function(x) {
    which(x %in% c("chocolate", "good"))
})
4

2 回答 2

4

这是使用相对更快的方法data.table

require(data.table)
vv <- vapply(y, length, 0L)
DT <- data.table(y = unlist(y), id = rep(seq_along(y), vv), pos = sequence(vv))
setkey(DT, y)
# OLD CODE which will not take care of no-match entries (commented)
# DT[J(c("chocolate", "good")), list(list(pos)), by=id]$V1

setkey(DT[J(c("chocolate", "good"))], id)[J(seq_along(vv)), list(list(pos))]$V1

想法:

首先,我们将您的列表取消列出到DT名为的列中y。此外,我们还创建了另外两个名为id和的列posid告诉列表中的索引并pos告诉其中的位置id。然后,通过在 上创建一个关键列id,我们可以进行快速子集化。通过这个子集,我们将获得pos每个 的相应值idpos在我们为列表中的每个收集所有id内容然后只输出列表列(V1)之前,我们通过id在第一次子集和子集的所有可能值id(如这将导致NA不存在的条目。


lapply使用您帖子中的代码进行基准测试:

x <- list(c('I', 'like', 'chocolate', 'cake'), c('chocolate', 'cake', 'is', 'good'))
y <- rep(x, 5000)

require(data.table)
arun <- function() {
    vv <- vapply(y, length, 0L)
    DT <- data.table(y = unlist(y), id = rep(seq_along(y), vv), pos = sequence(vv))
    setkey(DT, y)
    setkey(DT[J(c("chocolate", "good"))], id)[J(seq_along(vv)), list(list(pos))]$V1
}

tyler <- function() {
    lapply(y, function(x) {
        which(x %in% c("chocolate", "good"))
    })
}

require(microbenchmark)
microbenchmark(a1 <- arun(), a2 <- tyler(), times=50)

Unit: milliseconds
          expr       min        lq    median        uq       max neval
  a1 <- arun()  30.71514  31.92836  33.19569  39.31539  88.56282    50
 a2 <- tyler() 626.67841 669.71151 726.78236 785.86444 955.55803    50

> identical(a1, a2)
# [1] TRUE
于 2013-08-11T18:19:20.057 回答
2

C++ 的答案是比较单个字符更快,但我认为使用字符串向量引入了足够的开销,现在它更慢了:

char1 <- c("a", "q", "f", "c", "z")
char2 <- letters[1:10]

library(inline)
cpp_whichin_src <- '
Rcpp::CharacterVector xa(a);
Rcpp::CharacterVector xb(b);
int n_xa = xa.size();
int n_xb = xb.size();

NumericVector res(n_xa);

std::vector<std::string> sa = Rcpp::as< std::vector<std::string> >(xa);
std::vector<std::string> sb = Rcpp::as< std::vector<std::string> >(xb);

for(int i=0; i < n_xa; i++) {
  for(int j=0; j<n_xb; j++) {
    if( sa[i] == sb[j] ) res[i] = i+1;
  }
}
return res;
'
cpp_whichin <- cxxfunction(signature(a="character",b="character"), cpp_whichin_src, plugin="Rcpp")

which.in_cpp <- function(char1, char2) {
  idx <- cpp_whichin(char1,char2)
  idx[idx!=0]
}

which.in_naive <- function(char1, char2) {
  which(char1 %in% char2)
}

which.in_CW <- function(char1, char2) {
  unlist(sapply(char2,function(x) which(x==char1)))
}

which.in_cpp(char1,char2)
which.in_naive(char1,char2)
which.in_CW(char1,char2)

** 基准 **

library(microbenchmark)
microbenchmark(
  which.in_cpp(char1,char2),
  which.in_naive(char1,char2),
  which.in_CW(char1,char2)
)

set.seed(1)
cmb <- apply(combn(letters,2), 2, paste,collapse="")
char1 <- sample( cmb, 100 )
char2 <- sample( cmb, 100 )

Unit: microseconds
                          expr     min      lq   median       uq      max
1   which.in_cpp(char1, char2) 114.890 120.023 126.6930 135.5630  537.011
2    which.in_CW(char1, char2) 697.505 725.826 766.4385 813.8615 8032.168
3 which.in_naive(char1, char2)  17.391  20.289  22.4545  25.4230   76.826

# Same as above, but with 3 letter combos and 1000 sampled

Unit: microseconds
                          expr       min        lq     median        uq       max
1   which.in_cpp(char1, char2)  8505.830  8715.598  8863.3130  8997.478  9796.288
2    which.in_CW(char1, char2) 23430.493 27987.393 28871.2340 30032.450 31926.546
3 which.in_naive(char1, char2)   129.904   135.736   158.1905   180.260  3821.785
于 2013-08-11T15:49:57.003 回答