0

我正在做 LDA 分析，已经得到了主题，但还需要根据 Hellinger 距离对这些主题进行聚类：也就是把 LDA 模型生成的 20 个主题分组，并以树状图的形式呈现。下面分享部分代码。

# Keep only the columns used downstream: article id, event text and year.
textos <- select(Base_Articulos, Articulo, Evento, Ano)
# Copy of the selection ordered by year.
textorder <- textos[order(textos$Ano), ]

# Rows that exactly repeat an earlier row.
bd_duplicados <- textos[duplicated(textos), ]
# De-duplicated rows, then rows containing any NA dropped.
bd_unicos <- unique(textos)
bd_unicos <- na.omit(bd_unicos)

# NOTE(review): the tibble is built from `textos`, not from the cleaned
# `bd_unicos`, so duplicated and incomplete rows still reach the model.
# Confirm whether `bd_unicos` was meant to be used here.
ap_td <- tibble(textos)
ap_td

# Coerce the text column to character BEFORE tokenising; doing it afterwards
# (as the original ordering did) cannot influence the tokens.
ap_td <- mutate(ap_td, Evento = as.character(Evento))

# One row per token: split `Evento` into a `word` column.
tidy_articulo <- ap_td %>%
  unnest_tokens(word, Evento)

# Stop-word lists for Spanish and English, as one-column tibbles so they can
# be anti-joined on `word`.
espstopwords <- tibble(word = stopwords(kind = "es"))
enpstopwords <- tibble(word = stopwords(kind = "en"))

# Corpus-specific stop words (place names, stray letters and numbers).
miastopwords <- tibble(word = c("colombia", "study", "bogota", "colombian", "colombiano", "t", "medellin", "n", "k", "b", "hom", "cc", "92", "85", "m", "1", "l", "sp", "50", "155.000", "155", "59", "64", "70", "80", "18", "ri", "2", "3", "4", "5", "6", "7","8", "9"))

# Drop every token that appears in any of the three stop-word lists. The join
# key is spelled out so the joins are explicit and silent.
tidy_articulo <- tidy_articulo %>%
  anti_join(espstopwords, by = "word") %>%
  anti_join(enpstopwords, by = "word") %>%
  anti_join(miastopwords, by = "word")

# Corpus-wide word frequencies, most frequent first (printed for inspection).
count(tidy_articulo, word, sort = TRUE)

# Frequency of each word within each article, most frequent first.
word_counts <- ungroup(count(tidy_articulo, Articulo, word, sort = TRUE))
word_counts

# Cast the per-article counts into a document-term matrix:
# documents = Articulo, terms = word, cell values = n.
desc_dtm <- cast_dtm(word_counts, Articulo, word, n)
desc_dtm

# Fit a 20-topic LDA model on the document-term matrix; the seed is fixed so
# repeated runs give the same fit.
ap_lda <- LDA(
  desc_dtm,
  k = 20,
  control = list(seed = 1234)
)
ap_lda

# Tidy the fitted model into long tables: the "beta" matrix (topic-term) and
# the "gamma" matrix (document-topic).
ap_topics <- ap_lda %>% tidy(matrix = "beta")
ap_documents <- ap_lda %>% tidy(matrix = "gamma")
4

0 回答 0