r - R中列表中相交向量的并集

Question

我有一个向量列表如下。

data <- list(v1=c("a", "b", "c"), v2=c("g", "h", "k"), 
             v3=c("c", "d"), v4=c("n", "a"), v5=c("h", "i"))

我正在努力实现以下目标：

检查是否有任何向量相互交叉
如果找到相交向量，则获取它们的并集

所以想要的输出是

out <- list(v1=c("a", "b", "c", "d", "n"), v2=c("g", "h", "k", "i"))

我可以得到一组相交集的并集，如下所示。

 Reduce(union, list(data[[1]], data[[3]], data[[4]]))
 Reduce(union, list(data[[2]], data[[5]])

如何首先识别相交向量？有没有办法将列表划分为相交向量组的列表？

＃更新

这是使用 data.table 的尝试。得到想要的结果。但是对于本示例数据集中的大型列表仍然很慢。

datasets. 
data <- sapply(data, function(x) paste(x, collapse=", "))
data <- as.data.frame(data, stringsAsFactors = F)

repeat {
  M <- nrow(data)
  data <- data.table( data , key = "data" )
  data <- data[ , list(dataelement = unique(unlist(strsplit(data , ", " )))), by = list(data)]
  data <- data.table(data , key = "dataelement" )
  data <- data[, list(data = paste0(sort(unique(unlist(strsplit(data, split=", ")))), collapse=", ")), by = "dataelement"]
  data$dataelement <- NULL
  data <- unique(data)
  N <- nrow(data)
  if (M == N)
    break
}

data <- strsplit(as.character(data$data) , "," )

score 24 · Accepted Answer

这有点像图形问题，所以我喜欢为此使用igraph库，使用您的示例数据，您可以

library(igraph)
#build edgelist
el <- do.call("rbind",lapply(data, embed, 2))
#make a graph
gg <- graph.edgelist(el, directed=F)
#partition the graph into disjoint sets
split(V(gg)$name, clusters(gg)$membership)

# $`1`
# [1] "b" "a" "c" "d" "n"
# 
# $`2`
# [1] "h" "g" "k" "i"

我们可以查看结果

V(gg)$color=c("green","purple")[clusters(gg)$membership]
plot(gg)

在此处输入图像描述

score 17 · Accepted Answer

这是另一种仅使用基础 R 的方法

更新

在 akrun 的评论和他的示例数据之后的下一次更新：

data <- list(v1=c('g', 'k'), v2= letters[1:4], v3= c('b', 'c', 'd', 'a'))

修改功能：

x <- lapply(seq_along(data), function(i) {
  if(!any(data[[i]] %in% unlist(data[-i]))) {
    data[[i]]
  } else if (any(data[[i]] %in% unlist(data[seq_len(i-1)]))) {
    NULL 
  } else {
    z <- lapply(data[-seq_len(i)], intersect,  data[[i]]) 
    z <- names(z[sapply(z, length) >= 1L])
    if (is.null(z)) NULL else union(data[[i]], unlist(data[z]))
  }
})
x[!sapply(x, is.null)]
#[[1]]
#[1] "g" "k"
#
#[[2]]
#[1] "a" "b" "c" "d"

这适用于原始样本数据、MrFlick 的样本数据和 akrun 的样本数据。

score 9 · Accepted Answer

效率该死，你们还睡觉吗？仅以 R 为基础，并且比最快的答案慢得多。既然写了，那就贴吧。

f.union = function(x) {
  repeat{
    n = length(x)
    m = matrix(F, nrow = n, ncol = n)
    for (i in 1:n){
      for (j in 1:n) {
        m[i,j] = any(x[[i]] %in% x[[j]])
      }
    }
    o = apply(m, 2, function(v) Reduce(union, x[v]))
    if (all(apply(m, 1, sum)==1)) {return(o)} else {x=unique(o)}
  }
}

f.union(data)

[[1]]
[1] "a" "b" "c" "d" "n"

[[2]]
[1] "g" "h" "k" "i"

因为我喜欢慢。（在基准测试之外加载库）

Unit: microseconds
    expr      min        lq      mean    median        uq       max neval
   vlo()  896.435 1070.6540 1315.8194 1129.4710 1328.6630  7859.999  1000
 akrun()  596.263  658.6590  789.9889  694.1360  804.9035  3470.158  1000
 flick()  805.854  928.8160 1160.9509 1001.8345 1172.0965  5780.824  1000
  josh() 2427.752 2693.0065 3344.8671 2943.7860 3524.1550 16505.909  1000 <- deleted :-(
   doc()  254.462  288.9875  354.6084  302.6415  338.9565  2734.795  1000

score 8 · Accepted Answer

一种选择是使用combn然后找到相交。会有更简单的选择。

indx <- combn(names(data),2)
lst <- lapply(split(indx, col(indx)), 
        function(i) Reduce(`intersect`,data[i]))
indx1 <- names(lst[sapply(lst, length)>0])
indx2 <- indx[,as.numeric(indx1)]
indx3 <- apply(indx2,2, sort)
lapply(split(1:ncol(indx3), indx3[1,]),
   function(i) unique(unlist(data[c(indx3[,i])], use.names=FALSE)))
#$v1
#[1] "a" "b" "c" "d" "n"

#$v2
#[1] "g" "h" "k" "i"

更新

您可以使用combnPrimfromlibrary(gRbase)来加快速度。使用稍大的数据集

library(gRbase)
set.seed(25)
data <- setNames(lapply(1:1e3,function(i)sample(letters,
         sample(1:20), replace=FALSE)), paste0("v", 1:1000))

并与fastest. 这些是根据 OP 对@docendo discimus 的评论修改的功能。

akrun2M <- function(){
     ind <- sapply(seq_along(data), function(i){#copied from @docendo discimus
            !any(data[[i]] %in% unlist(data[-i]))
              })
     data1 <- data[!ind] 
     indx <- combnPrim(names(data1),2)
     lst <- lapply(split(indx, col(indx)), 
              function(i) Reduce(`intersect`,data1[i]))
     indx1 <- names(lst[sapply(lst, length)>0])
     indx2 <- indx[,as.numeric(indx1)]
     indx3 <- apply(indx2,2, sort)
     c(data[ind],lapply(split(1:ncol(indx3), indx3[1,]),
        function(i) unique(unlist(data[c(indx3[,i])], use.names=FALSE))))
   } 

doc2 <- function(){
      x <- lapply(seq_along(data), function(i) {
          if(!any(data[[i]] %in% unlist(data[-i]))) {
               data[[i]]
           } 
          else {
            z <- unlist(data[names(unlist(lapply(data[-c(1:i)],
                                     intersect, data[[i]])))]) 
          if (is.null(z)){ 
               z
               }
          else union(data[[i]], z)
        }
   })
x[!sapply(x, is.null)]
}

基准

 microbenchmark(doc2(), akrun2M(), times=10L)
 # Unit: seconds
 #    expr      min       lq     mean   median       uq      max neval  cld
 #   doc2() 35.43687 53.76418 54.77813 54.34668 62.86665 67.76754    10   b
 #akrun2M() 26.64997 28.74721 38.02259 35.35081 47.56781 49.82158    10   a

score 4 · Accepted Answer

我遇到了一个类似的问题，促使我到处寻找解决方案。感谢这里的许多出色的贡献者，我终于找到了一个非常好的函数，但是当我看到这篇文章时，我想我会为此编写自己的自定义函数。它实际上并不优雅，而且速度太慢，但我认为它非常有效，现在可以解决问题，直到我做出一些改进：

anoush <- function(x) {
# First we check whether x is a list

  stopifnot(is.list(x)) 

# Then we take every element of the input and calculate the intersect between
# that element & others. In case there were some we would store the indices 
# in `vec` vector. So in the end we have a list called `ind` whose elements 
# are all the indices connected with the corresponding elements of the original 
# list for example first element of `ind` is `1`, `2`, `3` which means in 
# the original list these elements have common values.
  
  ind <- lapply(1:length(x), function(a) {
    vec <- c()
    for(i in 1:length(x)) {
      if(length(unique(base::intersect(x[[a]], x[[i]]))) > 0) {
        vec <- c(vec, i)
      }
    }
    vec 
    })

# Then we go on to again compare each element of `ind` with other elements
# in case there were any intersect, we will calculate the `union` of them.
# for each element we will end up with a list of accumulated values but
# but in the end we use `Reduce` to capture only the last one. So for each
# element of `ind` we end up having a collection of indices that also 
# result in duplicated values. For example elements `1` through `5` of 
# `dup_ind` contains the same value cause in the original list these 
# elements have common values.

  dup_ind <- lapply(1:length(ind), function(a) {
    out <- c()
    for(i in 1:length(ind)) {
      if(length(unique(base::intersect(ind[[a]], ind[[i]]))) > 0) {
        out[[i]] <- union(ind[[a]], ind[[i]])
      }
      vec2 <- Reduce("union", out)
    }
    vec2
  }) 

# Here we get rid of the duplicated elements of the list by means of 
# `relist` funciton and since in this process all the duplicated elements
# will turn to `integer(0)` I have filtered those out.
  
  un <- unlist(dup_ind)
  res <- Map(`[`, dup_ind, relist(!duplicated(un), skeleton = dup_ind))
  res2 <- Filter(length, res)
  
  sapply(res2, function(a) unique(unlist(lapply(a, function(b) `[[`(x, b)))))
  
}

OP 的数据样本

> anoush(data)

[[1]]
[1] "a" "b" "c" "d" "n"

[[2]]
[1] "g" "h" "k" "i"

亲爱的@akrun 的数据样本

data <- list(v1=c('g', 'k'), v2= letters[1:4], v3= c('b', 'c', 'd', 'a'))

> anoush(data)
[[1]]
[1] "g" "k"

[[2]]
[1] "a" "b" "c" "d"

score 0 · Accepted Answer

一般来说，你不能比 Floyd-Warshall-Algorithm 做得更好/更快，如下所示：

library(Rcpp)

cppFunction(
  "LogicalMatrix floyd(LogicalMatrix w){
    int n = w.nrow();
    for( int k = 0; k < n; k++ )
     for( int i = 0; i < (n-1); i++ )
      for( int j = i+1; j < n; j++ ) 
       if( w(i,k) && w(k,j) ) {
        w(i,j) = true;
        w(j,i) = true;
       }
   return w;
}")

fw.union<-function(x) {
  n<-length(x)
  w<-matrix(F,nrow=n,ncol=n)
  for( i in 1:n ) {
   w[i,i]<-T
  }
  for( i in 1:(n-1) ) {
   for( j in (i+1):n ) {
     w[i,j]<-w[j,i]<- any(x[[i]] %in% x[[j]])
   }
  }
 apply( unique( floyd(w) ), 1, function(y) { Reduce(union,x[y]) } )
}

不过，运行基准测试会很有趣。初步测试表明我的实现比 Vlo 快 2-3 倍。

r - R中列表中相交向量的并集

6 回答 6

更新

更新

基准

Related

Reference