22

我想根据两列纬度和经度在 R 中创建一个唯一 ID,以便重复的位置具有相同的集群 ID。

例如:

LAT        LONG    Cluster_ID
13.5330 -15.4180   1
13.5330 -15.4180   1
13.5330 -15.4180   1
13.5330 -15.4180   1
13.5330 -15.4170   2
13.5330 -15.4170   2
13.5330 -15.4170   2
13.5340 -14.9350   3
13.5340 -14.9350   3
13.5340 -15.9170   4
13.3670 -14.6190   5
4

4 回答 4

26

下面是使用 interaction() 的一种方法。

# Sample coordinates; rows that repeat a (LAT, LONG) pair must share one ID.
d <- read.table(text='LAT LONG
13.5330 -15.4180 
13.5330 -15.4180 
13.5330 -15.4180 
13.5330 -15.4180 
13.5330 -15.4170 
13.5330 -15.4170 
13.5330 -15.4170 
13.5340 -14.9350 
13.5340 -14.9350 
13.5340 -15.9170 
13.3670 -14.6190', header=TRUE)

# interaction() creates one factor level per LAT/LONG combination, and
# drop = TRUE discards combinations that never occur, so the integer codes
# run 1..k over the observed locations. The codes follow factor-level order,
# not order of first appearance.
#
# sep: interaction() merges levels whose pasted labels are equal. With the
# default "." separator, distinct pairs such as (1.2, 3) and (1, 2.3) both
# produce the label "1.2.3" and would silently share an ID. A control
# character that never appears in a formatted number keeps labels distinct.
d <- transform(
  d,
  Cluster_ID = as.numeric(interaction(LAT, LONG, drop = TRUE, sep = "\x1f"))
)

#       LAT    LONG Cluster_ID
# 1  13.533 -15.418          2
# 2  13.533 -15.418          2
# 3  13.533 -15.418          2
# 4  13.533 -15.418          2
# 5  13.533 -15.417          3
# 6  13.533 -15.417          3
# 7  13.533 -15.417          3
# 8  13.534 -14.935          4
# 9  13.534 -14.935          4
# 10 13.534 -15.917          1
# 11 13.367 -14.619          5

编辑:采纳 @Spacedman 的建议,为 interaction() 加上 drop=TRUE 参数。

于 2012-11-26T14:22:11.653 回答
13

数据:

# Example input: 11 coordinate rows, several of which repeat a location.
dat <- read.table(
  header = TRUE,
  text = "
LAT        LONG
13.5330 -15.4180
13.5330 -15.4180
13.5330 -15.4180
13.5330 -15.4180
13.5330 -15.4170
13.5330 -15.4170
13.5330 -15.4170
13.5340 -14.9350
13.5340 -14.9350
13.5340 -15.9170
13.3670 -14.6190"
)

这些命令创建一个从 1 开始编号的 id 变量:

# Build one text key per row from its two coordinates, then number each row
# by the position of its key among the distinct keys, so the first location
# seen gets 1, the next new one 2, and so on. The result is shown, not stored.
comb <- paste(dat[["LAT"]], dat[["LONG"]])
cbind(dat, Cluster_ID = match(comb, unique(comb)))

输出:

      LAT    LONG Cluster_ID
1  13.533 -15.418          1
2  13.533 -15.418          1
3  13.533 -15.418          1
4  13.533 -15.418          1
5  13.533 -15.417          2
6  13.533 -15.417          2
7  13.533 -15.417          2
8  13.534 -14.935          3
9  13.534 -14.935          3
10 13.534 -15.917          4
11 13.367 -14.619          5
于 2012-11-26T14:24:10.120 回答
13

.GRP 已在 data.table 1.8.3 中加入,它允许您执行以下操作:

# Example input, held as an ordinary data.frame
dat <- read.table(
  header = TRUE,
  text = 'LAT LONG
13.5330 -15.4180 
13.5330 -15.4180 
13.5330 -15.4180 
13.5330 -15.4180 
13.5330 -15.4170 
13.5330 -15.4170 
13.5330 -15.4170 
13.5340 -14.9350 
13.5340 -14.9350 
13.5340 -15.9170 
13.3670 -14.6190'
)

# Turn it into a data.table keyed on both coordinate columns. Keying sorts
# the rows by LAT, then LONG, so the printed order differs from the input.
library(data.table)
DT <- data.table(dat, key = c("LAT", "LONG"))
# .GRP is the running group counter; := stores it as a new column by reference
DT[, Cluster_ID := .GRP, by = key(DT)]
DT
#        LAT    LONG Cluster_ID
#  1: 13.367 -14.619          1
#  2: 13.533 -15.418          2
#  3: 13.533 -15.418          2
#  4: 13.533 -15.418          2
#  5: 13.533 -15.418          2
#  6: 13.533 -15.417          3
#  7: 13.533 -15.417          3
#  8: 13.533 -15.417          3
#  9: 13.534 -15.917          4
# 10: 13.534 -14.935          5
# 11: 13.534 -14.935          5
于 2012-11-27T04:52:04.453 回答
3

比较上述各解决方案的性能:

# Benchmark input: the same 11 coordinate rows used in the question.
df <- read.table(
  header = TRUE,
  text = 'LAT LONG
13.5330 -15.4180 
13.5330 -15.4180 
13.5330 -15.4180 
13.5330 -15.4180 
13.5330 -15.4170 
13.5330 -15.4170 
13.5330 -15.4170 
13.5340 -14.9350 
13.5340 -14.9350 
13.5340 -15.9170 
13.3670 -14.6190'
)
#' Add a group id built from the combination of the selected columns.
#'
#' @param df A data frame.
#' @param cols Columns (names or positions) that jointly identify a group.
#' @return `df` with an extra numeric column `id`. Rows that agree on every
#'   column in `cols` share an id. Ids follow factor-level order, not the
#'   order in which combinations first appear.
f1 <- function(df, cols) {
    # interaction() merges levels whose pasted labels are equal. With the
    # default "." separator, distinct rows such as (1.2, 3) and (1, 2.3)
    # both become "1.2.3" and would share an id. A control character keeps
    # the labels distinct for any numeric input.
    df$id <- as.numeric(interaction(df[cols], drop = TRUE, sep = "\x1f"))
    df
}
#' Add a group id numbered in order of first appearance.
#'
#' @param df A data frame.
#' @param cols Columns (names or positions) that jointly identify a group.
#' @return `df` with an extra integer column `id`. Rows that agree on every
#'   column in `cols` share an id; the first combination seen gets 1.
f2 <- function(df, cols) {
    # unname() stops a column called "sep" or "collapse" from being taken
    # as an argument of paste(). The control-character separator avoids key
    # collisions: with "." the rows (1.2, 3) and (1, 2.3) both give "1.2.3".
    comb <- do.call(paste, c(unname(as.list(df[cols])), sep = "\x1f"))
    df$id <- match(comb, unique(comb))
    df
}
f2(df, 1:2)
#>       LAT    LONG id
#> 1  13.533 -15.418  1
#> 2  13.533 -15.418  1
#> 3  13.533 -15.418  1
#> 4  13.533 -15.418  1
#> 5  13.533 -15.417  2
#> 6  13.533 -15.417  2
#> 7  13.533 -15.417  2
#> 8  13.534 -14.935  3
#> 9  13.534 -14.935  3
#> 10 13.534 -15.917  4
#> 11 13.367 -14.619  5
microbenchmark::microbenchmark(f1(df, 1:2), f2(df, 1:2))
#> Unit: microseconds
#>         expr     min      lq      mean   median       uq      max neval cld
#>  f1(df, 1:2) 486.400 510.422 575.26659 573.3945 594.1165 1622.243   100   b
#>  f2(df, 1:2)  72.952  79.208  86.09265  83.5275  89.7195  159.740   100  a 
于 2016-01-06T14:43:23.973 回答