1

我有一个包含两列的数据框,我想在这两列(address 和 gmap_var)之间逐行计算 Jaccard 相似度指数:

这是我尝试过的代码(基于这篇帖子:"Mutate with a list column function in dplyr")。它返回 jaccard_sim = 0。

# Attempt from the question: row-wise Jaccard index between `address` and
# `gmap_var`. Under rowwise() each column value is a single string, so
# intersect()/union() compare the two whole strings rather than their words;
# whenever the strings differ the intersection is empty and the result is 0.
example <- example %>%
  rowwise() %>%
  mutate(jaccard_sim = length(intersect(address, gmap_var))/length(union(address, gmap_var)))

示例数据框:

# Example data (looks like dput() output): 24 rows with two character columns,
# `address` and `gmap_var`, carrying the rowwise_df class with one group per row.
structure(list(address = c("AVENUE DES AVIATEURS, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO", 
"MPIKA, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO", "H775+677, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO", 
"H7QQ+VX8, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO", "AVE DE KATO, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO", 
"J974+373, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO", "COMMUNE DE KIMBANSEKE QUARTIER 17, MAIN, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO", 
"COMMUNE DE KIMBANSEKE QUARTIER 17, MAIN, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO", 
"AVE DE LA SCIENCE, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO", 
"10 NSELE, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO", "J974+373, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO", 
"J85X+J4W, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO", "39HM+RFQ, MUANDA, DEMOCRATIC REPUBLIC OF THE CONGO", 
"FMPM+M22, KWILU NGONGO, DEMOCRATIC REPUBLIC OF THE CONGO", "QV6F+HGW, MBANZA-NGUNGU, DEMOCRATIC REPUBLIC OF THE CONGO", 
"QV6F+HGW, MBANZA-NGUNGU, DEMOCRATIC REPUBLIC OF THE CONGO", 
"QV46+X55, MBANZA-NGUNGU, DEMOCRATIC REPUBLIC OF THE CONGO", 
"QV6F+HGW, MBANZA-NGUNGU, DEMOCRATIC REPUBLIC OF THE CONGO", 
"UNNAMED ROAD, SONGOLOLO, DEMOCRATIC REPUBLIC OF THE CONGO", 
"AV. MOBUTU, INKISI, DEMOCRATIC REPUBLIC OF THE CONGO", "QV46+X55, MBANZA-NGUNGU, DEMOCRATIC REPUBLIC OF THE CONGO", 
"M8R8+9W2, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO", "NGILIMA II, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO", 
"M8R8+9W2, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO"), gmap_var = c("PAROISE ST ANNE, GOMBE, KINSHASA, CD", 
"AVENUE MOSSAMBA, 116, NGIRI-NGIRI, KINSHASA, CD", "6E ET 7EME RUE CITE VERTE/PAROISSE ST., SELEMBAO, KINSHASA, CD", 
"ROUTE MATADIPAROISSE MATERDEI, MONT-NGAFULA, KINSHASA, CD", 
"AVENUE MARINE, 362, NGALIEMA, KINSHASA, CD", "AVENUE DU MARCHE NO. 46 PAROISSE ST. THOMAS, KIMBANSEKE, KINSHASA, CD", 
"AVENUE LINGOMO, NO 123/124, KIMBANSEKE, KINSHASA, CD", "24,LINGOMO Q/ NGAMAYAMA, KIMBANSEKE, KINSHASA, CD", 
"CENTRE MISSIONNAIRE BUSIRA DIMI, NON LOIN DU, MALUKU, KINSHASA, CD", 
"AVENUE MBIMI NO 10, NSELE, KINSHASA, CD", "PAROISSE ST AMBROISE, 1. Q. KABILA, KISENSO, KINSHASA, CD", 
"LOC. KINZAZI/PAROISSE ST ALPHONSE, MATETE, KINSHASA, CD", "PAROISSE KIMBANGUISTE, LUKULA, CONGO CENTRAL, CD", 
"PAROISSE NOTRES DAME, KWILU-NGONGO, CONGO CENTRAL, CD", "PAROISSE SAINTE MARIE DE LOMA, MBANZA-NGUNGU, CONGO CENTRAL, CD", 
"PAROISSE CATHOLIQUE STE THERESE, MBANZA-NGUNGU, CONGO CENTRAL, CD", 
"PAROISSE CATHOLIQUE CHRIST ROI, MBANZA-NGUNGU, CONGO CENTRAL, CD", 
"PAROISSE NKAMBA, NTIMANSI, CONGO CENTRAL, CD", "PAROISSE CATHOLIQUE SONGOLOLO, SONGOLOLO, CONGO CENTRAL, CD", 
"PAROISSE ST JOSEPH, INKISI, CONGO CENTRAL, CD", "PAROISSE STE FAMILLE, INKISI, CONGO CENTRAL, CD", 
"PAROISSE ARMEE DU SALUT, INKISI, CONGO CENTRAL, CD", "PAROISSE CEC KILOMBO, INKISI, CONGO CENTRAL, CD", 
"PAROISSE ARMEE DU SALUT, KASANGULU, CONGO CENTRAL, CD")), row.names = c(NA, 
-24L), groups = structure(list(.rows = structure(list(1L, 2L, 
    3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 
    16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), row.names = c(NA, -24L), class = c("tbl_df", 
"tbl", "data.frame")), class = c("rowwise_df", "tbl_df", "tbl", 
"data.frame"))
4

1 回答 1

1

首先将句子分成单词,然后您可以使用其他帖子中的公式。

library(dplyr)

# Row-wise, word-level Jaccard similarity between `address` and `gmap_var`.
#
# Each string is first split into words; then, for every row,
#   jaccard_sim = (number of words present in both columns) /
#                 (number of distinct words across both columns)
#
# The split pattern has to consume at least one character. A pattern such as
# ',?\\s*' can match the empty string, which makes strsplit() cut the text
# into single characters (plus an empty string for every separator run), so
# the score would measure shared letters instead of shared words.
df %>%
  ungroup() %>%
  mutate(across(
    # Name the columns explicitly: strsplit() only accepts character input.
    c(address, gmap_var),
    ~ strsplit(.x, "[,[:space:]]+"),
    .names = "{col}_vec"
  )) %>%
  # In a rowwise data frame a list-column is handed to the expression one
  # element at a time, so intersect()/union() see a single row's words.
  rowwise() %>%
  mutate(
    jaccard_sim = length(intersect(address_vec, gmap_var_vec)) /
      length(union(address_vec, gmap_var_vec))
  ) %>%
  # Drop the row-wise grouping so later verbs work on whole columns again.
  ungroup()

# Word counts and scores for the first 10 rows of the example data
# (worked out by hand from the example strings, not a console printout):
#
#  row  words(address)  words(gmap_var)  shared  jaccard_sim
#    1        9               6             1     1/14 = 0.0714
#    2        7               6             1     1/12 = 0.0833
#    3        7              10             1     1/16 = 0.0625
#    4        7               6             1     1/12 = 0.0833
#    5        9               6             1     1/14 = 0.0714
#    6        7              11             1     1/17 = 0.0588
#    7       12               7             2     2/17 = 0.1176
#    8       12               7             2     2/17 = 0.1176
#    9       10              10             1     1/19 = 0.0526
#   10        8               7             3     3/12 = 0.2500
# ... 14 more rows
于 2021-08-25T10:30:08.027 回答