我有一个包含两列的数据框,我想在这两列(address
和gmap_var
)之间逐行计算 Jaccard 的相似度索引:
这是我尝试过的(基于此贡献Mutate with a list column function in dplyr)。它返回 jaccard_sim = 0。
example <- example %>%
rowwise() %>%
mutate(jaccard_sim = length(intersect(address, gmap_var))/length(union(address, gmap_var)))
示例数据框:
structure(list(address = c("AVENUE DES AVIATEURS, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO",
"MPIKA, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO", "H775+677, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO",
"H7QQ+VX8, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO", "AVE DE KATO, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO",
"J974+373, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO", "COMMUNE DE KIMBANSEKE QUARTIER 17, MAIN, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO",
"COMMUNE DE KIMBANSEKE QUARTIER 17, MAIN, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO",
"AVE DE LA SCIENCE, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO",
"10 NSELE, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO", "J974+373, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO",
"J85X+J4W, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO", "39HM+RFQ, MUANDA, DEMOCRATIC REPUBLIC OF THE CONGO",
"FMPM+M22, KWILU NGONGO, DEMOCRATIC REPUBLIC OF THE CONGO", "QV6F+HGW, MBANZA-NGUNGU, DEMOCRATIC REPUBLIC OF THE CONGO",
"QV6F+HGW, MBANZA-NGUNGU, DEMOCRATIC REPUBLIC OF THE CONGO",
"QV46+X55, MBANZA-NGUNGU, DEMOCRATIC REPUBLIC OF THE CONGO",
"QV6F+HGW, MBANZA-NGUNGU, DEMOCRATIC REPUBLIC OF THE CONGO",
"UNNAMED ROAD, SONGOLOLO, DEMOCRATIC REPUBLIC OF THE CONGO",
"AV. MOBUTU, INKISI, DEMOCRATIC REPUBLIC OF THE CONGO", "QV46+X55, MBANZA-NGUNGU, DEMOCRATIC REPUBLIC OF THE CONGO",
"M8R8+9W2, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO", "NGILIMA II, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO",
"M8R8+9W2, KINSHASA, DEMOCRATIC REPUBLIC OF THE CONGO"), gmap_var = c("PAROISE ST ANNE, GOMBE, KINSHASA, CD",
"AVENUE MOSSAMBA, 116, NGIRI-NGIRI, KINSHASA, CD", "6E ET 7EME RUE CITE VERTE/PAROISSE ST., SELEMBAO, KINSHASA, CD",
"ROUTE MATADIPAROISSE MATERDEI, MONT-NGAFULA, KINSHASA, CD",
"AVENUE MARINE, 362, NGALIEMA, KINSHASA, CD", "AVENUE DU MARCHE NO. 46 PAROISSE ST. THOMAS, KIMBANSEKE, KINSHASA, CD",
"AVENUE LINGOMO, NO 123/124, KIMBANSEKE, KINSHASA, CD", "24,LINGOMO Q/ NGAMAYAMA, KIMBANSEKE, KINSHASA, CD",
"CENTRE MISSIONNAIRE BUSIRA DIMI, NON LOIN DU, MALUKU, KINSHASA, CD",
"AVENUE MBIMI NO 10, NSELE, KINSHASA, CD", "PAROISSE ST AMBROISE, 1. Q. KABILA, KISENSO, KINSHASA, CD",
"LOC. KINZAZI/PAROISSE ST ALPHONSE, MATETE, KINSHASA, CD", "PAROISSE KIMBANGUISTE, LUKULA, CONGO CENTRAL, CD",
"PAROISSE NOTRES DAME, KWILU-NGONGO, CONGO CENTRAL, CD", "PAROISSE SAINTE MARIE DE LOMA, MBANZA-NGUNGU, CONGO CENTRAL, CD",
"PAROISSE CATHOLIQUE STE THERESE, MBANZA-NGUNGU, CONGO CENTRAL, CD",
"PAROISSE CATHOLIQUE CHRIST ROI, MBANZA-NGUNGU, CONGO CENTRAL, CD",
"PAROISSE NKAMBA, NTIMANSI, CONGO CENTRAL, CD", "PAROISSE CATHOLIQUE SONGOLOLO, SONGOLOLO, CONGO CENTRAL, CD",
"PAROISSE ST JOSEPH, INKISI, CONGO CENTRAL, CD", "PAROISSE STE FAMILLE, INKISI, CONGO CENTRAL, CD",
"PAROISSE ARMEE DU SALUT, INKISI, CONGO CENTRAL, CD", "PAROISSE CEC KILOMBO, INKISI, CONGO CENTRAL, CD",
"PAROISSE ARMEE DU SALUT, KASANGULU, CONGO CENTRAL, CD")), row.names = c(NA,
-24L), groups = structure(list(.rows = structure(list(1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L,
16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -24L), class = c("tbl_df",
"tbl", "data.frame")), class = c("rowwise_df", "tbl_df", "tbl",
"data.frame"))