I was assigned a similar task and am also learning as I go. I have made some progress, so I am sharing my code snippet in the hope that it helps you.
library("topicmodels")
library("tm")
# Toy example documents
x<-c("I like to eat broccoli and bananas.",
"I ate a banana and spinach smoothie for breakfast.",
"Chinchillas and kittens are cute.",
"My sister adopted a kitten yesterday.",
"Look at this cute hamster munching on a piece of broccoli.")
# Alternative manual preprocessing (kept commented out):
# lowercase the whole text
#text<-tolower(x)
# remove common English words from the text
#text2<-setdiff(text,stopwords("english"))
# split the text into vectors of individual words
#text3<-strsplit(text2," ")
# Generate a structured representation of the text, i.e. a corpus
docs<-Corpus(VectorSource(x))
# Create a content transformer, i.e. a function that modifies the content of an R object
toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
# Replace special characters with a space
docs <- tm_map(docs, toSpace, "/")
docs <- tm_map(docs, toSpace, "@")
docs <- tm_map(docs, toSpace, "\\|")
docs <- tm_map(docs, removeNumbers)
# Remove common English stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
# Remove punctuation
docs <- tm_map(docs, removePunctuation)
# Eliminate extra white spaces
docs <- tm_map(docs, stripWhitespace)
# Remove leftover tabs and empty tokens
docs<-tm_map(docs,removeWords,c("\t"," ",""))
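To sanity-check the cleaning before building the matrix, you can print the processed documents. This is just an optional inspection step, not part of the pipeline itself:
# Optional: look at the cleaned documents
lapply(docs, as.character)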
# Build a document-term matrix (documents as rows, terms as columns);
# this is the orientation that colSums() and topicmodels::LDA() expect
dtm<- DocumentTermMatrix(docs, control = list(removePunctuation = TRUE, stopwords=TRUE))
#print(dtm)
freq<-colSums(as.matrix(dtm))
print(names(freq))
ord<-order(freq,decreasing=TRUE)
write.csv(freq[ord],"word_freq.csv")
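For a quick look at the result in the console (the exact terms and counts depend on the toy sentences above):
# Optional: show the ten most frequent terms
head(freq[ord], 10)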
# Set parameters for Gibbs sampling LDA
burnin<-4000
iter<-2000
thin<-500
seed<-list(2003,5,63,100001,765)
nstart<-5
best<-TRUE
#Number of Topics
k<-3
# Docs to topics
ldaOut<-LDA(dtm,k,method="Gibbs",control=list(nstart=nstart,seed=seed,best=best,burnin=burnin,iter=iter,thin=thin))
ldaOut.topics<-as.matrix(topics(ldaOut))
write.csv(ldaOut.topics,file=paste("LDAGibbs",k,"DocsToTopics.csv"))
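If you also want the top terms that make up each topic and the per-document topic probabilities, topicmodels exposes them through terms() and posterior(). A small optional extension; the object and file names below are just examples:
# Optional: top 6 terms in each topic
ldaOut.terms<-as.matrix(terms(ldaOut,6))
write.csv(ldaOut.terms,file=paste("LDAGibbs",k,"TopicsToTerms.csv"))
# Optional: per-document topic probabilities
topicProbabilities<-as.data.frame(posterior(ldaOut)$topics)
write.csv(topicProbabilities,file=paste("LDAGibbs",k,"TopicProbabilities.csv"))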