我有两个包含相似单词的语料库。足够相似,使用setdiff
并不能真正帮助我的事业。所以我已经转向寻找一种方法来提取更频繁的单词列表或语料库(最终制作一个 wordcloud)(假设这样的东西会有一个阈值 - 所以可能像 50% 更频繁?)在语料库中# 1,与语料库#2相比。
这就是我现在所拥有的一切:
> install.packages("tm")
> install.packages("SnowballC")
> install.packages("wordcloud")
> install.packages("RColorBrewer")
> library(tm)
> library(SnowballC)
> library(wordcloud)
> library(RColorBrewer)
> UKDraft = read.csv("UKDraftScouting.csv", stringsAsFactors=FALSE)
> corpus = Corpus(VectorSource(UKDraft$Report))
> corpus = tm_map(corpus, tolower)
> corpus = tm_map(corpus, PlainTextDocument)
> corpus = tm_map(corpus, removePunctuation)
> corpus = tm_map(corpus, removeWords, c("strengths", "weaknesses", "notes", "kentucky", "wildcats", stopwords("english")))
> frequencies = DocumentTermMatrix(corpus)
> allReports = as.data.frame(as.matrix(frequencies))
> SECDraft = read.csv("SECMinusUKDraftScouting.csv", stringsAsFactors=FALSE)
> SECcorpus = Corpus(VectorSource(SECDraft$Report))
> SECcorpus = tm_map(SECcorpus, tolower)
> SECcorpus = tm_map(SECcorpus, PlainTextDocument)
> SECcorpus = tm_map(SECcorpus, removePunctuation)
> SECcorpus = tm_map(SECcorpus, removeWords, c("strengths", "weaknesses", "notes", stopwords("english")))
> SECfrequencies = DocumentTermMatrix(SECcorpus)
> SECallReports = as.data.frame(as.matrix(SECfrequencies))
因此,如果单词“wingspan”在语料库#2('SECcorpus')中的计数频率为 100,但在语料库#1('corpus')中的计数频率为 150,我们希望该词出现在我们的结果语料库/列表中。