我正在做 LDA 主题分析,目前已经得到了主题,但还需要根据 Hellinger 距离对这些主题进行聚类:将 LDA 模型生成的 20 个主题分组,并用树状图(dendrogram)呈现。下面分享部分代码。
# Keep only the Articulo, Evento and Ano columns of the source table.
textos <- Base_Articulos %>%
  select(Articulo, Evento, Ano)

# Copy of the selection with its rows ordered by Ano.
textorder <- textos[order(textos$Ano), ]

# Rows that exactly repeat an earlier row of the selection.
bd_duplicados <- textos[duplicated(textos), ]

# De-duplicated selection with every row containing an NA removed.
# NOTE(review): the tokenisation further down starts from `textos`, not from
# `bd_unicos` -- confirm which of the two is meant to feed the model.
bd_unicos <- na.omit(unique(textos))
# Tokenise the Evento text into one word per row and drop stop words.
# Every statement sits on its own line: the original ran several statements
# together on a single line, which R cannot parse.
ap_td <- tibble(textos)
ap_td

# unnest_tokens() expects character input, so coerce Evento *before*
# tokenising (the original coerced it only after the tokens were produced,
# where it no longer had any effect on them).
ap_td <- mutate(ap_td, Evento = as.character(Evento))

tidy_articulo <- ap_td %>%
  unnest_tokens(word, Evento)

# Stop-word tables, each with a single `word` column so they can be
# anti-joined against the token table.
espstopwords <- tibble(word = stopwords(kind = "es"))
enpstopwords <- tibble(word = stopwords(kind = "en"))
miastopwords <- tibble(word = c("colombia", "study", "bogota", "colombian", "colombiano", "t", "medellin", "n", "k", "b", "hom", "cc", "92", "85", "m", "1", "l", "sp", "50", "155.000", "155", "59", "64", "70", "80", "18", "ri", "2", "3", "4", "5", "6", "7","8", "9"))

# Remove Spanish, English and custom stop words. The join key is spelled out
# so the match does not depend on whichever columns happen to share a name.
tidy_articulo <- tidy_articulo %>%
  anti_join(espstopwords, by = "word") %>%
  anti_join(enpstopwords, by = "word") %>%
  anti_join(miastopwords, by = "word")

# Show the most frequent remaining words.
tidy_articulo %>%
  count(word, sort = TRUE)
# Count how often each word occurs within each Articulo, most frequent first.
word_counts <- count(tidy_articulo, Articulo, word, sort = TRUE)
word_counts <- ungroup(word_counts)
word_counts

# Cast the counts into a document-term matrix: one document per Articulo,
# one term per word, cell value n.
desc_dtm <- cast_dtm(
  word_counts,
  document = Articulo,
  term = word,
  value = n
)
desc_dtm

# Fit a 20-topic LDA model; the fixed seed keeps the fit reproducible.
ap_lda <- LDA(
  desc_dtm,
  k = 20,
  control = list(seed = 1234)
)
ap_lda

# Tidy views of the fitted model: "beta" (topic-term) and "gamma"
# (document-topic) matrices.
ap_topics <- tidy(ap_lda, matrix = "beta")
ap_documents <- tidy(ap_lda, matrix = "gamma")