r - 比较数据框中的行的更快方法

Question

考虑下面的数据框。我想将每一行与下面的行进行比较，然后取大于 3 个值的行。

我写了下面的代码，但是如果你有一个很大的数据框，它会很慢。

我怎样才能更快地做到这一点？

data <- as.data.frame(matrix(c(10,11,10,13,9,10,11,10,14,9,10,10,8,12,9,10,11,10,13,9,13,13,10,13,9), nrow=5, byrow=T))
rownames(data)<-c("sample_1","sample_2","sample_3","sample_4","sample_5")

>data
          V1 V2 V3 V4 V5
sample_1  10 11 10 13  9
sample_2  10 11 10 14  9
sample_3  10 10  8 12  9
sample_4  10 11 10 13  9
sample_5  13 13 10 13  9

output <- data.frame(sample = NA, duplicate = NA, matches = NA)
dfrow <- 1
for(i in 1:nrow(data)) {
    sample <- data[i, ]
    for(j in (i+1):nrow(data)) if(i+1 <= nrow(data)) {
    matches <- 0
        for(V in 1:ncol(data)) {
            if(data[j,V] == sample[,V]) {       
                matches <- matches + 1
            }
        }
        if(matches > 3) {
            duplicate <- data[j, ]
            pair <- cbind(rownames(sample), rownames(duplicate), matches)
            output[dfrow, ] <- pair
            dfrow <- dfrow + 1
        }
    }
}

>output
   sample    duplicate    matches
1 sample_1   sample_2     4
2 sample_1   sample_4     5
3 sample_2   sample_4     4

score 9 · Accepted Answer

这是一个 Rcpp 解决方案。然而，如果结果矩阵变得太大（即，有太多的命中），这将引发错误。我运行循环两次，首先是获得结果矩阵的必要大小，然后填充它。可能有更好的可能性。此外，显然，这仅适用于整数。如果您的矩阵是数字的，则必须处理浮点精度。

library(Rcpp)
library(inline)

#C++ code:
body <- '
const IntegerMatrix        M(as<IntegerMatrix>(MM));
const int                  m=M.ncol(), n=M.nrow();
long                        count1;
int                         count2;
count1 = 0;
for (int i=0; i<(n-1); i++)
{
   for (int j=(i+1); j<n; j++)
   {
     count2 = 0;
     for (int k=0; k<m; k++) {
        if (M(i,k)==M(j,k)) count2++;
     }
     if (count2>3) count1++;
   } 
}
IntegerMatrix              R(count1,3);
count1 = 0;
for (int i=0; i<(n-1); i++)
{
   for (int j=(i+1); j<n; j++)
   {
     count2 = 0;
     for (int k=0; k<m; k++) {
        if (M(i,k)==M(j,k)) count2++;
     }
     if (count2>3) {
        count1++;
        R(count1-1,0) = i+1;
        R(count1-1,1) = j+1;
        R(count1-1,2) = count2;
     }
   } 
}
return  wrap(R);
'

fun <- cxxfunction(signature(MM = "matrix"), 
                     body,plugin="Rcpp")

#with your data
fun(as.matrix(data))
#      [,1] [,2] [,3]
# [1,]    1    2    4
# [2,]    1    4    5
# [3,]    2    4    4

#Benchmarks
set.seed(42)
mat1 <- matrix(sample(1:10,250*26,TRUE),ncol=26)
mat2 <- matrix(sample(1:10,2500*26,TRUE),ncol=26)
mat3 <- matrix(sample(1:10,10000*26,TRUE),ncol=26)
mat4 <- matrix(sample(1:10,25000*26,TRUE),ncol=26)
library(microbenchmark)
microbenchmark(
  fun(mat1),
  fun(mat2),
  fun(mat3),
  fun(mat4),
  times=3
  )
# Unit: milliseconds
#      expr          min           lq       median           uq          max neval
# fun(mat1)     2.675568     2.689586     2.703603     2.732487     2.761371     3
# fun(mat2)   272.600480   274.680815   276.761151   276.796217   276.831282     3
# fun(mat3)  4623.875203  4643.634249  4663.393296  4708.067638  4752.741979     3
# fun(mat4) 29041.878164 29047.151348 29052.424532 29235.839275 29419.254017     3

score 3 · Accepted Answer

编辑：不确定我昨晚在减去行时在想什么，考虑到我可以直接测试是否相等。从下面的代码中删除了不必要的步骤。

这是一种可能有点聪明或考虑不周的方法……但希望是前者。这个想法是，您可以通过从数据帧的其余部分中减去该行然后查看等于零的元素数来执行一些矢量化操作，而不是逐行进行一系列比较。这是该方法的简单实现：

> library(data.table)
> data <- as.data.frame(matrix(c(10,11,10,13,9,10,11,10,14,9,10,10,8,12,9,10,11,10,13,9,13,13,10,13,9), nrow=5, byrow=T))
> rownames(data)<-c("sample_1","sample_2","sample_3","sample_4","sample_5")
> 
> findMatch <- function(i,n){
+   tmp <- colSums(t(data[-(1:i),]) == unlist(data[i,]))
+   tmp <- tmp[tmp > n]
+   if(length(tmp) > 0) return(data.table(sample=rownames(data)[i],duplicate=names(tmp),match=tmp))
+   return(NULL)
+ }
> 
> system.time(tab <- rbindlist(lapply(1:(nrow(data)-1),findMatch,n=3)))
   user  system elapsed 
  0.003   0.000   0.003 
> tab
     sample duplicate match
1: sample_1  sample_2     4
2: sample_1  sample_4     5
3: sample_2  sample_4     4

编辑：这是使用矩阵并预转置数据的版本 2，因此您只需执行一次。它应该通过大量数据更好地扩展到您的示例。

library(data.table)
data <- matrix(round(runif(26*250000,0,25)),ncol=26)
tdata <- t(data)

findMatch <- function(i,n){
    tmp <- colSums(tdata[,-(1:i)] == data[i,])
    j <- which(tmp > n)
    if(length(tmp) > 0) return(data.table(sample=i,duplicate=j+1,match=tmp[j]))
    return(NULL)
}

tab <- rbindlist(lapply(1:(nrow(data)-1),findMatch,n=3))

我在我的机器上跑了一会儿，在 15 分钟内完成了前 1500 次迭代，一个完整的 250,000 x 26 矩阵，需要 600 Mb 内存。由于以前的迭代不会影响未来的迭代，因此您当然可以将其分块并在需要时单独运行。

score 2 · Accepted Answer

这不是一个完整的答案，只是想到的快速锻炼是使用矩阵而不是data.frame（那些非常慢）。矩阵在 R 中非常快，通过在其中完成至少一些操作，然后在向量中附加列名，将显着提高速度。

只是一个快速演示：

data <- matrix(c(10,11,10,13,9,10,11,10,14,9,10,10,8,12,9,10,11,10,13,9,13,13,10,13,9), nrow=5, byrow=T)rownames(data)<-c("sample_1","sample_2","sample_3","sample_4","sample_5")
mu<-c("sample_1","sample_2","sample_3","sample_4","sample_5")

t=proc.time()
tab <- data.frame(sample = NA, duplicate = NA, matches = NA)
dfrow <- 1
for(i in 1:nrow(data)) {
    sample <- data[i, ]
    for(j in (i+1):nrow(data)) if(i+1 <= nrow(data)) {
    matches <- 0
        for(V in 1:ncol(data)) {
            if(data[j,V] == sample[V]) {       
                matches <- matches + 1
            }
        }
        if(matches > 3) {
            duplicate <- data[j, ]
            pair <- cbind(mu[i], mu[j], matches)
            tab[dfrow, ] <- pair
            dfrow <- dfrow + 1
        }
    }
}
proc.time()-t

平均而言，在我的机器上，产量

   user  system elapsed 
   0.00    0.06    0.06

在你的情况下，我得到

 user  system elapsed 
   0.02    0.06    0.08

我不确定是否有比矩阵更快的东西。您也可以使用并行化，但 for 循环C++代码内联经常使用（包Rcpp）。

score 2 · Accepted Answer

library(data.table)

#creating the data
dt <- data.table(read.table(textConnection(
"Sample          V1 V2 V3 V4 V5
sample_1  10 11 10 13  9
sample_2  10 11 10 14  9
sample_3  10 10  8 12  9
sample_4  10 11 10 13  9
sample_5  13 13 10 13  9"), header= TRUE))

# some constants which will be used frequently
nr = nrow(dt)
nc = ncol(dt)-1

#list into which we will insert the no. of matches for each sample 
#for example's sake, i still suggest you write output to a file possibly
totalmatches <- vector(mode = "list", length = (nr-1))

#looping over each sample
for ( i in 1:(nr-1))
{
   # all combinations of i with i+1 to nr
   samplematch <- cbind(dt[i],dt[(i+1):nr])

   # renaming the comparison sample columns
   setnames(samplematch,append(colnames(dt),paste0(colnames(dt),"2")))

   #calculating number of matches
   samplematch[,noofmatches := 0]
   for (j in 1:nc)
   {
      samplematch[,noofmatches := noofmatches+1*(get(paste0("V",j)) == get(paste0("V",j,"2")))]
   }

   # removing individual value columns and matches < 3
   samplematch <- samplematch[noofmatches >= 3,list(Sample,Sample2,noofmatches)]

   # adding to the list
   totalmatches[[i]] <- samplematch
}

输出 -

rbindlist(totalmatches)
     Sample  Sample2 noofmatches
1: sample_1 sample_2           4
2: sample_1 sample_4           5
3: sample_1 sample_5           3
4: sample_2 sample_4           4
5: sample_4 sample_5           3

不过，矩阵的性能似乎更好，这种方法计时 -

   user  system elapsed 
   0.17    0.01    0.19

score 0 · Accepted Answer

评论中所说的一切都非常有效；特别是，我也不一定认为 R 是最好的地方。也就是说，这对我来说比你在更大的数据集上提出的要快得多（约 9.7 秒与两分钟后未完成）：

data <- matrix(sample(1:30, 10000, replace=TRUE), ncol=5)
#Pre-prepare
x <- 1
#Loop
for(i in seq(nrow(data)-2)){
  #Find the number of matches on that row
  sums <- apply(data[seq(from=-1,to=-i),], 1, function(x) sum(x==data[i,]))
  #Find how many are greater than/equal to 3
  matches <- which(sums >= 3)
  #Prepare output
  output[seq(from=x, length.out=length(matches)),1] <- rep(i, length(matches))
  output[seq(from=x, length.out=length(matches)),2] <- matches
  output[seq(from=x, length.out=length(matches)),3] <- sums[matches]
  #Alter the counter of how many we've made...
  x <- x + length(matches)
}
#Cleanup output
output <- output[!is.na(output[,1]),]})

...我相当确定我的怪异x变量和分配output可以改进/变成一个apply-type 问题，但已经晚了，我累了！祝你好运！

score 0 · Accepted Answer

好吧，我试了一下，下面的代码运行速度比原来的快了大约 3 倍。

f <- function(ind, mydf){
    res <- NULL
    matches <- colSums(t(mydf[-(1:ind),])==mydf[ind,])
    Ndups <- sum(matches > 3)
    if(Ndups > 0){
        res <- data.frame(sample=rep(ind,Ndups),duplicate=which(matches > 3), 
                      matches= matches[matches > 3],stringsAsFactors = F)
        rownames(res) <- NULL
        return(as.matrix(res))
    }
    return(res)
}


f(1,mydf=as.matrix(data))
f(2,mydf=as.matrix(data))
system.time( 
for(i in 1:1000){
    tab <- NULL
    for(j in 1:(dim(data)[1]-1))
        tab <- rbind(tab,f(j,mydf=as.matrix(data)))
}
)/1000
tab

score 0 · Accepted Answer

假设数据集中的所有条目都具有相同的模式（数字），请将其转换为矩阵。通过转置，您可以利用==向量化的方式。

data <- as.matrix(data)
data <- t(data)

output <- lapply(seq_len(ncol(data) - 1), function(x) {
    tmp <- data[,x] == data[, (x+1):ncol(data)]
    n_matches <- {
        if (x == ncol(data) - 1) {
            setNames(sum(tmp),colnames(data)[ncol(data)])
        } else {
            colSums(tmp)
        }
    }
    good_matches <- n_matches[n_matches >= 3]
})

最大的问题是如何输出结果。就目前而言，我将您的数据列在一个列表中。我认为这是存储数据的内存最少的方式。

[[1]]
sample_2 sample_4 sample_5 
       4        5        3 

[[2]]
sample_4 
       4 

[[3]]
named numeric(0)

[[4]]
sample_5 
       3

如果您想要一个数据框输出，那么您需要在lapply. 也许在函数的最后一行添加：

return(data.frame(
    sample = colnames(data)[x], 
    duplicate = names(good_matches), 
    noofmatches = good_matches,
    stringsAsFactors = FALSE))

然后使用：

newoutput <- do.call(rbind, output)
## or, using plyr
# require(plyr)
# newoutput <- rbind.fill(output)

r - 比较数据框中的行的更快方法

7 回答 7

Related

Reference