0

在 R 中,我有多个非常大(大约 140e6)的 IP 地址列表。多个列表之间有很多重叠的IP。我想创建一个数据框或数据表,其中包含 ip 地址作为行名(不重复),列表名称作为列,0 或 1 表示该列表中是否存在 ip。

例如,我们有以下两个列表,两者之间有一些 % 交集。

a <- c("192.168.0.1","192.168.0.2","192.168.0.3","192.168.0.4","192.168.0.5","192.168.0.6","192.168.0.7","192.168.0.8","192.168.0.9","192.168.0.10")
b <- c("192.168.1.1","192.168.1.2","192.168.1.3","192.168.1.4","192.168.0.5","192.168.0.6","192.168.0.7","192.168.0.8","192.168.0.9","192.168.0.10")

我想要的是这样的:

             a b
192.168.0.1  1 0
192.168.0.2  1 0
192.168.0.3  1 0
192.168.0.4  1 0
192.168.0.5  1 1
192.168.0.6  1 1
192.168.0.7  1 1
192.168.0.8  1 1
192.168.0.9  1 1
192.168.0.10 1 1
192.168.1.1  0 1
192.168.1.2  0 1
192.168.1.3  0 1
192.168.1.4  0 1

我尝试过使用 reshape2、tidyr、model.matrix、intersect 和 good ol' for 循环。我发现了一些人们从数据帧创建虚拟矩阵的例子,但不是将向量名称作为列,将值作为行名,而不是重复。

4

3 回答 3

2

首先我将介绍2个新的解决方案

合并的解决方案

df1 <- merge(data.frame(ip=a,a=1), data.frame(ip=b,b=1),all=TRUE) %>%
set_rownames(.,`[`(.,,'ip')) %>% select(-ip) %>% replace(.,is.na(.),0)

#              a b
# 192.168.0.1  1 0
# 192.168.0.10 1 1
# 192.168.0.2  1 0
# 192.168.0.3  1 0
# 192.168.0.4  1 0
# 192.168.0.5  1 1
# 192.168.0.6  1 1
# 192.168.0.7  1 1
# 192.168.0.8  1 1
# 192.168.0.9  1 1
# 192.168.1.1  0 1
# 192.168.1.2  0 1
# 192.168.1.3  0 1
# 192.168.1.4  0 1

这也是一个重塑的解决方案

这个很酷的一点是,当你有两个以上的源向量时它可以工作:

df2 <- list(data.frame(a),data.frame(b)) %>%
  lapply(. %>% transform(source = names(.)) %>% rename_("ip" = names(.)[1])) %>%
  do.call(rbind,.) %>%
  transform(v=1) %>%
  reshape(idvar="ip",timevar="source",direction="wide",sep="") %>%
  replace(.,is.na(.),0) %>%
  setNames(gsub("v","",colnames(.))) %>%
  set_rownames(.,`[`(.,,'ip')) %>% select(-ip)

#              a b
# 192.168.0.1  1 0
# 192.168.0.2  1 0
# 192.168.0.3  1 0
# 192.168.0.4  1 0
# 192.168.0.5  1 1
# 192.168.0.6  1 1
# 192.168.0.7  1 1
# 192.168.0.8  1 1
# 192.168.0.9  1 1
# 192.168.0.10 1 1
# 192.168.1.1  0 1
# 192.168.1.2  0 1
# 192.168.1.3  0 1
# 192.168.1.4  0 1

2 个向量的所有解决方案的基准

让我们对迄今为止提供的解决方案进行基准测试。我使用from和from添加了我的第一个解决方案的data.table变体以及我的第二个解决方案的变体dcastreshape2spreadtidyR

microbenchmark(
merge = merge(data.frame(ip=a,a=1), data.frame(ip=b,b=1),all=TRUE) %>%
  set_rownames(.,`[`(.,,'ip')) %>% select(-ip) %>% replace(.,is.na(.),0),
merge_dt = merge(data.table(ip=a,a=1,key="ip"), data.table(ip=b,b=1,key="ip"),all=TRUE) %>%
  as.data.frame %>% # to go back to desired output format
  set_rownames(.,`[`(.,,'ip')) %>% select(-ip) %>% replace(.,is.na(.),0),
dcast = list(data.frame(a),data.frame(b)) %>%
  lapply(. %>% transform(source = names(.)) %>% rename_("ip" = names(.)[1])) %>%
  do.call(rbind,.) %>%
  transform(v=1) %>%
  dcast(ip ~ source,value.var="v") %>%
  replace(.,is.na(.),0) %>%
  setNames(gsub("v","",colnames(.))) %>%
  set_rownames(.,`[`(.,,'ip')) %>% select(-ip),
spread = list(data.frame(a),data.frame(b)) %>%
  lapply(. %>% transform(source = names(.)) %>% rename_("ip" = names(.)[1])) %>%
  do.call(rbind,.) %>%
  transform(v=1) %>%
  spread(source,v) %>%
  replace(.,is.na(.),0) %>%
  setNames(gsub("v","",colnames(.))) %>%
  set_rownames(.,`[`(.,,'ip')) %>% select(-ip),
reshape = list(data.frame(a),data.frame(b)) %>%
  lapply(. %>% transform(source = names(.)) %>% rename_("ip" = names(.)[1])) %>%
  do.call(rbind,.) %>%
  transform(v=1) %>%
  reshape(idvar="ip",timevar="source",direction="wide",sep="") %>%
  replace(.,is.na(.),0) %>%
  setNames(gsub("v","",colnames(.))) %>%
  set_rownames(.,`[`(.,,'ip')) %>% select(-ip),
akrun   = {lvl <- unique(c(a,b));mapply(table, list(a = factor(a, levels = lvl),b = factor(b, levels = lvl)))},
p_routh = {df <- data.frame("IP" = unique(c(a,b)));df2 <- df%>%mutate(a = ifelse(df$IP %in% a,1,0),b = ifelse(df$IP %in% b,1,0))},
d.b     = {ALL  <- unique(c(a,b));data.frame(sapply(list(a = a, b = b), function(x) as.numeric(ALL %in% x)), row.names = ALL)},
times = 100
)

对于给定的示例:

# Unit: microseconds
#     expr      min        lq      mean    median        uq       max neval
#    merge 2368.754 2670.8205 3866.2288 2942.6280 3685.1415 38459.947   100
# merge_dt 4220.084 4702.4700 5547.1978 5222.3705 6239.1685  9170.293   100
#    dcast 6153.875 6870.3760 9031.8770 7521.7570 8793.9045 46529.917   100
#   spread 4329.090 4814.6610 6023.5993 5313.3275 6301.9890 38972.416   100
#  reshape 4376.514 5007.1905 5995.1480 5694.1395 6811.4495  8744.180   100
#    akrun  238.893  304.3680  366.0376  327.7265  416.3815   654.744   100
#  p_routh 1013.967 1190.9255 1418.8037 1296.7450 1651.7220  2162.775   100
#      d.b  133.072  183.8595  228.7220  207.0415  278.1780   417.974   100

举个更大的例子:140E6 的基准测试有点多,所以我尝试使用 1E5。我随意选择了 a 和 b 之间大约 50% 的重叠。

n <- 1E5
set.seed(1)
a <- sample(2*n,n)
b <- sample(2*n,n)

我运行基准测试 10 次

# Unit: milliseconds
#     expr       min       lq      mean    median       uq      max neval
#    merge 582.41885 617.4348 676.40615 651.84618 698.1091 911.8320    10
# merge_dt  98.72318 100.6648 114.72754 103.57925 119.9722 176.5360    10
#    dcast 267.51729 347.8337 366.85554 360.17472 411.5002 454.1912    10
#   spread 425.26005 447.7959 471.03577 477.02525 490.0484 502.8333    10
#  reshape 697.14005 738.6921 763.31876 751.01547 791.3207 818.0778    10
#    akrun 791.00964 815.5621 838.08296 832.31382 849.5231 923.6849    10
#  p_routh  78.77724  82.8646  98.38296  84.34238 101.7304 151.0339    10
#      d.b 191.00546 194.5754 209.02133 200.35484 207.1666 279.7900    10

我们看到 P Routh 的解对于 2 个向量是最快的,并且dcast是最快的一般解。merge然而,对于 140E6 行, withdata.table可能是最快的。


一般解决方案

Hopefulle 最终编辑:

我根据我最好的受限解决方案设计了 2 个通用解决方案,并在 3 个大小为 10E6 的向量上运行它们。

merge_dt_gen <- function(...){
  args <- as.character(substitute(list(...)))[-1]
  dts <- args %>% lapply(.%>% data.table(ip=get(.),key="ip"))
  all_ips <- data.table(ip = unique(c(...)),key="ip") # all_ips <- data.table(ip = unique(c(a,b)))
  for(dt in dts){
    all_ips <- merge(all_ips,dt,all.x = TRUE,by="ip")
  }
  all_ips %>%
    as.data.frame %>%
    set_rownames(.,`[`(.,,'ip')) %>%
    select(-ip) %>%
    setNames(args) %>%
    replace(.,!is.na(.),1) %>%
    replace(.,is.na(.),0) 
}

d_cast_gen <- function(...){
  args <- as.character(substitute(list(...)))[-1]
  args %>%
    lapply(.%>% data.frame(get(.)) %>% setNames(c("src","ip"))) %>% 
    do.call(rbind,.) %>%
    transform(v=1) %>%
    dcast(ip ~ src,value.var="v") %>%
    replace(.,is.na(.),0) %>%
    setNames(gsub("v","",colnames(.))) %>%
    set_rownames(.,`[`(.,,'ip')) %>% select(-ip)  
}

n <- 10E6
set.seed(1)
a <- sample(2*n,n)
b <- sample(2*n,n)
d <- sample(unique(a,b),n)

microbenchmark(
  d_cast_gen   = d_cast_gen(a,b,d),
  merge_dt_gen = merge_dt_gen(a,b,d),
  times = 1
)

# Unit: seconds
#         expr      min       lq     mean   median       uq      max neval
#   d_cast_gen 70.99771 70.99771 70.99771 70.99771 70.99771 70.99771     1
# merge_dt_gen 47.41809 47.41809 47.41809 47.41809 47.41809 47.41809     1

mergewithdata.table是最快的

于 2017-07-12T23:53:06.110 回答
0

我们可以通过将“a”、“b”转换factorlevels指定为unique“a”、“b”组合中的元素并获取频率来做到这一点

lvl <- unique(c(a,b))
mapply(table, list(a = factor(a, levels = lvl),b = factor(b, levels = lvl)))
#             a b
#192.168.0.1  1 0
#192.168.0.2  1 0
#192.168.0.3  1 0
#192.168.0.4  1 0
#192.168.0.5  1 1
#192.168.0.6  1 1
#192.168.0.7  1 1
#192.168.0.8  1 1
#192.168.0.9  1 1
#192.168.0.10 1 1
#192.168.1.1  0 1
#192.168.1.2  0 1
#192.168.1.3  0 1
#192.168.1.4  0 1
于 2017-07-13T00:58:23.853 回答
0

一个 dplyr 解决方案:

df <- data.frame("IP" = unique(c(a,b)))
df2 <- df%>%mutate(a = ifelse(df$IP %in% a,1,0),b = ifelse(df$IP %in% b,1,0))

输出:

> df2
             IP a b
1   192.168.0.1 1 0
2   192.168.0.2 1 0
3   192.168.0.3 1 0
4   192.168.0.4 1 0
5   192.168.0.5 1 1
6   192.168.0.6 1 1
7   192.168.0.7 1 1
8   192.168.0.8 1 1
9   192.168.0.9 1 1
10 192.168.0.10 1 1
11  192.168.1.1 0 1
12  192.168.1.2 0 1
13  192.168.1.3 0 1
14  192.168.1.4 0 1
于 2017-07-12T23:30:44.720 回答