I am doing text mining on about 30,000 tweets. To make the results more reliable, I want to map "synonyms" and spelling variants onto a single word: for example, some users write "girl", some write "girls", and some write "gal"; likewise "give" and "giving" mean the same thing, as do "come" and "coming"; and some users use short forms such as "plz" and "pls". Also, stemDocument from the tm package is not working well for me: it turns "dancing" into "danc" and "table" into "tabl". Is there another good stemming (or lemmatization) package? I want to replace all these variants with one canonical word so that term frequencies are counted correctly and my sentiment analysis is more reliable. Below is reproducible code (I cannot include the full 30000x1 data frame here):
content<-c("n.n.t.t.t.t.t.t.girl.do.it.to.me.t.t.n.t.t.t.t.t.t.n.n.t.t.t.t.t.t.n.n.t.t.t.t.t.t.t.n.n.t.t.t.t.t.t.t.n.t.n.t.t.n.t.t.t.n.t.t.t.tajinkx.said..n.t.t.t.n.t.t.n.t.n.t.n.t.t.n.t.t.n.t.t.n.t.t.tok.guyz...srry.to.sound.dumb.toilets.i.dnt.drink.while.m.just.searching.for.fun..nso.is.going.to.bar.good.for.me.i.dnt.knw.what.washroom.all.happens.there.inside...so.would.like.if.someone.gals.helps.me.thankuu..n.t.t.n.t.t.t.tClick.to.expand....n.t.nBhai.tu.plz.rehne.de.....n.n.t.n.n.t.t.n.t.t.t.n.t.t.n.n.t.t.n.t.n.n.t.t.t.t.t.t.t.t..n.t.t.t.t.t.t.t.t.n.toilet.is.not .t.t.t.t.t.t.t.n.n.t.t.t.t.t.t.n.n.t.t.t.t.t.t.n.t.n.n.t.t.n.t.t.t.n.t.t.n.n.t.t.n.t.n.n.n.t.n.n.n.t.n.n.t.t.n.t.t.t.n.t.t.n.n.t.t.n.t.n.n.t.t.t.t.t.................................................................................................................................................. \n\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\t\tajinkx said:\n\t\t\t\n\t\t\n\t\n\t\n\t\t\n\t\t\n\t\t\n\t\t\tok guyz...srry to sound dumb!i dnt drink while m just searching for fun!\nso is going to bar good for me?i dnt knw what all happens there inside...so would like if someone helps me.thankuu!\n\t\t\n\t\t\t\tClick to expand...\n\t\nBhai,tu plz rehne de....\n\n\t\n\n\t\t\n\t\t\t\n\t\t\n\n\t\t\n\t\n\n\t\t\t\t\t\t\t\t \n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\n\n\t\t\n\t\t\t\n\t\t\n\n\t\t\n\t\n\n\n\t\n\n\n\t\n\n\t\t\n\t\t\t\n\t\t\n\n\t\t\n\t\n\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t is this da bar which u guys r talking about???\nSent from my SM-N900 using Tapatalk\n\n\t\n\n\t\t\n\t\t\t\n\t\t\n\n\t\t\n\t\n\n\t\t\t\t\t\t\t\t \n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\n\n\t")
library(spacyr); library(quanteda); library(tm); library(syuzhet); library(ggplot2); library(magrittr)   # packages used below
content <- data.frame(txt = content, stringsAsFactors = FALSE)   # wrap the sample vector so that content$txt below works
np <- 600; postop <- 1200; fr <- "yes"   # input from GUI
#wbpage<-function (np,postop,fr){
#load("data_mpdb.Rdata")
#content<-as.data.frame(raw_dat[np:postop,],stringsAsFactors = FALSE)
#last<-rbind(tail(content,1),head(content,1));colnames(last)<-c("stopdate","startdate")
message("Initializing part-1")
#---------------------data cleaning-----------------------------------------------------
# strip quoted ("replied") post text between "said:" and "Click to expand..."
content2 <- as.data.frame(content$txt, stringsAsFactors = FALSE); colnames(content2) <- c("txt")
content2 <- as.data.frame(lapply(content$txt, gsub,
                                 pattern = '(?is)(said:).*?(click to expand\\.{3})',
                                 replacement = "\\1 \\2", perl = TRUE),
                          stringsAsFactors = FALSE)   # (?is) = case-insensitive, and . also matches newlines
content2 <- as.data.frame(t(as.matrix(content2))); colnames(content2) <- c("txt"); rownames(content2) <- NULL
#----------------Ken's addition: lemmatization---------------------------
sp <- spacy_parse(as.character(content2$txt), lemma = TRUE)   # this is the call that triggers the spaCy/python lookup error quoted at the end of this post
sp$token <- ifelse(!grepl("^\\-[A-Z]+\\-$", sp$lemma), sp$lemma, sp$token)   # replace each token by its lemma, except spaCy placeholder lemmas such as -PRON-
# map equivalent words/short forms onto a single key (please, girl, toilet)
dict <- dictionary(list(
please = c("please", "pls", "plz"),
girl = c("girl", "gal"),
toilet=c("toilet","shit","shitty","washroom")
))
toks <- as.tokens(sp) %>%
tokens(remove_punct = TRUE)
toks
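# Note: dict defined above is not applied anywhere yet. If the intent is to collapse
# the variants onto their keys (plz/pls -> please, gal -> girl, ...), a minimal sketch
# using quanteda's tokens_lookup(), keeping all unmatched tokens, would be:
# toks <- tokens_lookup(toks, dictionary = dict, exclusive = FALSE, capkeys = FALSE)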
new_stopwords<-c("said","one","click","expand","sent","using","attachment",
"tapatalk","will","can","hai","forum","like","just",
"get","know","also","now","bro","bhai","back","wat",
"ur","naa","nai","sala","email","urself","arnd","sim",
"pl","kayko","ho","gmail","sm","ll","g7102","iphone","yeah","time","asked","went","want","look","call","sit",
"even","first","place","left","visit","guy","around","started","came","dont","got","took","see","take","see","come")
toks <- tokens_remove(toks, c(stopwords("en"), new_stopwords))
#--------I have to make toks the same as content2 so that I can use it in further corpus building---------------------------
# clean the data: lowercase, then remove punctuation, stopwords, and extra whitespace
docs <- Corpus(VectorSource(content2$txt));#mname<-Corpus(VectorSource(content2$name))
message("Initializing part-1.2")
docs <- tm_map(docs, content_transformer(tolower));#mname<-tm_map(mname,content_transformer(tolower))
docs <- tm_map(docs, removePunctuation,preserve_intra_word_contractions=TRUE,preserve_intra_word_dashes=TRUE);#mname <- tm_map(mname, removePunctuation)
message("Initializing part-1.3")
docs <- tm_map(docs, removeWords, c(stopwords("english"),new_stopwords))
docs <- tm_map(docs, stripWhitespace);#mname <- tm_map(mname, stripWhitespace)
message("Initializing part-1.4")
docs <- tm_map(docs, removeWords,new_stopwords)
#------------------------Text stemming------------------------------------------
#docs <- tm_map(docs, stemDocument,language="english")
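#------------------------Alternative to stemming (sketch only)--------------------
# stemDocument truncates words ("danc", "tabl"); a lemmatization-based alternative,
# assuming the textstem package is acceptable here, would keep whole words:
# library(textstem)
# lemmatize_strings(c("dancing", "tables", "girls", "giving"))   # should return whole-word lemmas such as "dance", "table", "girl", "give"
# docs <- tm_map(docs, content_transformer(lemmatize_strings))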
#-------------sentiment analysis--------------------------------------------------
message("Initializing part-2")
n <- 4
rnorm(10000, 0, 1)   # dummy work for the GUI progress bar (see incProgress below)
#incProgress(1/n, detail = paste("Finished section 1"))
docs_df <- data.frame(matrix(unlist(docs), nrow=length(docs), byrow=F), stringsAsFactors=FALSE)   # flatten the cleaned corpus into a data frame
docs_df <- docs_df[-c(2)]; content2$editedtxt <- docs_df   # drop the unwanted second column and attach the cleaned text to content2
#----------------fr|fr:----------------------------------------------
if (fr=="yes"){
frlogic<-grepl("fr\\s|fr:", docs_df$X1);docs_df<-as.data.frame(docs_df[frlogic=="TRUE",],stringsAsFactors = FALSE);
docs_df[order(nchar(as.character(docs_df)),decreasing = FALSE),]
}
colnames(docs_df)<-c("txt")
d <- get_nrc_sentiment(as.character(docs_df$txt))
td<-data.frame(t(d))
td_new <- data.frame(rowSums(td))
#Transformation and cleaning
names(td_new)[1] <-"count"
td_new <- cbind("sentiment"=rownames(td_new), td_new)
rownames(td_new) <- NULL
td_new2<-td_new[1:8,]
sentimentplot<-qplot(sentiment, data=td_new2, weight=count, geom="bar",fill=sentiment)+ggtitle("sentiments")
sentimentplot
Now I am getting this error:

Finding a python executable with spaCy installed...
Error in set_spacy_python_option(python_executable, virtualenv, condaenv, : python was not found on system PATH
Also, I still have to make toks the same as content2 so that I can use it for further corpus building and further analysis.
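To make that concrete, what I mean is something along these lines (only a sketch; I am assuming that as.list() on the quanteda tokens object plus paste() is an acceptable way to get back to one cleaned string per post):

content2$editedtxt <- sapply(as.list(toks), paste, collapse = " ")   # one lemmatized string per post
docs <- Corpus(VectorSource(content2$editedtxt))                     # rebuild the tm corpus from the lemmatized text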
Waiting for your reply. Thanks.