在这些问题1、2中受到启发。我试图把 adata.table
变成一个对象adjacency matrix/edgelist
,然后再变成一个igraph
对象。我有一个包含两列 ( A
, B
)的数据集,IDs
用于配对。换句话说,A
表示links
,并且B
包含nodes
或 顶点。在我的数据集中,每列的唯一长度是 25352
x 75352
。这些将创建一个大网络,因此,我试图找到最有效的方法来获得一个adjacency matrix
或一个edgelist
. 到目前为止,我已经尝试过这些方法:
library(data.table)
library(dplyr)
library(microbenchmark)
n <- 1000
set.seed(123634)
DT <- data.table(A=replicate(n, paste0(sample(LETTERS, 2), collapse = "")),
B=replicate(n, paste0(sample(LETTERS, 4), collapse = "")))
lapply(DT, function(x){length(unique(x))})
$A
[1] 503
$B
[1] 998
### `table + crossprod` Method (adjecency matrix):
fn1 <- function(DT) {
crossprod(table(DT))
}
### `dcast + crossprod` Method (adjecency matrix):
fn2 <-
function(DT) {
crossprod(as.matrix(dcast(
DT, A ~ B, value.var = "B", fun.aggregate = length
)[, -1]))
}
### `xtabs + tcrossprod` Method (adjecency matrix):
fn3 <- function(DF) {
tcrossprod(xtabs( ~ B + A, DT))
}
### `merge` Method (edge list):
fn4 <-
function(DT) {
temp <- merge(DT, DT, by = "A", allow.cartesian = TRUE)
temp[temp$B.x != temp$B.y , -1]
}
### `dplyr` Method (edgelist):
fn5 <- function(DT) {
DT %>% group_by(A) %>%
filter(n() >= 2) %>% group_by(A) %>%
do(data.frame(t(combn(.$B, 2)), stringsAsFactors = FALSE))
}
更新 1:@Axeman 的以下评论
### `merge` Method (edge list):
fn4 <-
function(DT) {
setkey(DT, A)
temp <- merge(DT, DT, by = "A", allow.cartesian = TRUE)
temp[temp$B.x != temp$B.y , ]
}
### `full_join + filter`
fn6 <- function(DT) {
full_join(DT, DT, by = 'A') %>% filter(B.x != B.y)
}
结果 1
microbenchmark(fn1(DT), fn2(DT), fn3(DT), fn4(DT), fn5(DT), fn6(DT), times = 100)
expr min lq mean median uq max neval cld
fn1(DT) 291.754120 293.959476 304.203825 294.875436 300.686430 373.804013 100 d
fn2(DT) 346.626929 349.101024 367.754884 350.903514 370.477299 448.036178 100 e
fn3(DT) 9.969924 10.420903 14.692905 10.784544 11.451784 78.009518 100 b
fn4(DT) 1.816473 2.156643 2.430527 2.366402 2.504144 4.551233 100 a
fn5(DT) 125.481956 133.189609 157.177028 137.107701 195.092453 297.355731 100 c
fn6(DT) 2.339659 2.719236 3.058402 2.985036 3.138265 5.468647 100 a
输入速度更快merge
,(fn4)
任何想法或建议将不胜感激。
警告:
和fn4
,fn6
它们更快地依赖于cartesian product
并且merge
它们创建了重复的连接。此外,由于temp$B.x != temp$B.y
,所有未连接的顶点都从图中删除,这也可能会产生误导。
n <- 5
set.seed(123634)
DT <- data.table(A=replicate(n, sample(1:2, 1)),
B=replicate(n, paste0(sample(LETTERS[1:3], 2), collapse = "")))
DT
A B
1: 2 AB
2: 2 AC
3: 1 AC
4: 1 AB
5: 2 BA
## Method 1
get.adjacency(a)
a <- graph_from_adjacency_matrix(fn1(DT), mode = "undirected")
a <- simplify(a, remove.multiple = F, remove.loops = TRUE)
get.adjacency(a)
AB AC BA
AB . 2 1
AC 2 . 1
BA 1 1
## Method 4
c <- graph_from_data_frame(fn4(DT), directed=F)
get.adjacency(c)
AB AC BA
AB . 4 2
AC 4 . 2
BA 2 2 .
## Method 6
f <- graph_from_data_frame(fn6(DT)[,2:3], directed=F)
get.adjacency(f)
AB AC BA
AB . 4 2
AC 4 . 2
BA 2 2 .
更新 2:更正重复并考虑断开的节点。
fn4 <- function(DT) {
setkey(DT, A)
temp <- merge(DT, DT, by = "A", allow.cartesian = TRUE)[, 2:3]
setorder(temp,+B.x)
get.adjacency(simplify(
graph_from_data_frame(temp, directed = F),
remove.multiple = F,
remove.loops = TRUE)) * 1 / 2
}
fn6 <- function(DT) {
full_join(DT, DT, by = 'A')[2:3] %>%
setorder(+B.x) %>%
graph_from_data_frame(directed = F) %>%
simplify(remove.multiple = F, remove.loops = TRUE) %>%
get.adjacency * 1 / 2
}
结果 2
expr min lq mean median uq max neval cld
fn1(DT) 292.755855 295.047878 301.545026 295.890292 297.364117 382.01720 100 c
fn2(DT) 349.139294 351.886946 371.612651 353.392465 394.686377 528.48418 100 d
fn3(DT) 10.075716 10.500732 15.642757 10.767010 11.379872 79.36882 100 a
fn4(DT) 7.382669 7.968354 8.494499 8.204351 8.585933 18.17826 100 a
fn5(DT) 126.307694 134.317938 152.548209 135.883273 177.473529 210.14054 100 b
fn6(DT) 8.540844 9.119288 9.833154 9.637090 10.055865 18.84172 100 a