下面是一种方法……它适用于您的数据吗?使用 OP 数据的具体解法请参阅下文。
# load text mining library
library(tm)

# make first corpus for text mining (data comes from package, for reproducibility)
data("crude")
corpus1 <- Corpus(VectorSource(crude[1:10]))

# process text (your methods may differ)
# tm_reduce() calls every element of tmFuns as a function, so the list must
# hold functions only; a frequency threshold does not belong in it.
skipWords <- function(x) removeWords(x, stopwords("english"))
funcs <- list(tolower, removePunctuation, removeNumbers,
              stripWhitespace, skipWords)
crude1 <- tm_map(corpus1, FUN = tm_reduce, tmFuns = funcs)
crude1.dtm <- TermDocumentMatrix(crude1, control = list(wordLengths = c(3, 10)))

# find the high-frequency terms of the first corpus
# NOTE(review): the original snippet used crude1.dtm.freq without defining it.
# The threshold 5 is borrowed from the stray `MinDocFrequency=5` entry that
# used to sit in `funcs`; adjust it to the cut-off you actually want.
crude1.dtm.freq <- findFreqTerms(crude1.dtm, lowfreq = 5)

# prepare 2nd corpus and process its text exactly as above
corpus2 <- Corpus(VectorSource(crude[11:20]))
crude2 <- tm_map(corpus2, FUN = tm_reduce, tmFuns = funcs)
# build the matrix from crude2 (the original passed crude1 here, so the second
# corpus was never analysed)
crude2.dtm <- TermDocumentMatrix(crude2, control = list(wordLengths = c(3, 10)))
crude2.dtm.mat <- as.matrix(crude2.dtm)

# subset second corpus by words in first corpus
# drop = FALSE keeps a matrix even when only one term matches.
# NOTE(review): the sample output printed after this snippet appears to come
# from a run over the first ten documents with a different threshold, so the
# rows and columns shown there will differ from what this code prints.
crude2.dtm.mat[rownames(crude2.dtm.mat) %in% crude1.dtm.freq, , drop = FALSE]
Docs
Terms reut-00001.xml reut-00002.xml reut-00004.xml reut-00005.xml reut-00006.xml
oil 5 12 2 1 1
opec 0 15 0 0 0
prices 3 5 0 0 0
Docs
Terms reut-00007.xml reut-00008.xml reut-00009.xml reut-00010.xml reut-00011.xml
oil 7 4 3 5 9
opec 8 1 2 2 6
prices 5 1 2 1 9
在 OP 提供数据和评论后的更新:我认为这更接近你的问题。
下面是相同的过程,只是改用文档-词项矩阵(DTM),而不是我在上面使用的 TDM(两者略有不同):
# load text mining library
library(tm)

# make corpus for text mining (data comes from package, for reproducibility)
data("crude")
corpus1 <- Corpus(VectorSource(crude[1:10]))

# process text (your methods may differ)
# tm_reduce() calls every element of tmFuns as a function, so the list must
# hold functions only; a frequency threshold does not belong in it.
skipWords <- function(x) removeWords(x, stopwords("english"))
funcs <- list(tolower, removePunctuation, removeNumbers,
              stripWhitespace, skipWords)
crude1 <- tm_map(corpus1, FUN = tm_reduce, tmFuns = funcs)
crude1.dtm <- DocumentTermMatrix(crude1, control = list(wordLengths = c(3, 10)))

# find the high-frequency terms of the first corpus
# NOTE(review): the original snippet used crude1.dtm.freq without defining it.
# The threshold 5 is borrowed from the stray `MinDocFrequency=5` entry that
# used to sit in `funcs`; adjust it to the cut-off you actually want.
crude1.dtm.freq <- findFreqTerms(crude1.dtm, lowfreq = 5)

# second corpus, processed with the same function list
corpus2 <- Corpus(VectorSource(crude[11:20]))
crude2 <- tm_map(corpus2, FUN = tm_reduce, tmFuns = funcs)
# build the matrix from crude2 (the original passed crude1 here, so the second
# corpus was never analysed)
crude2.dtm <- DocumentTermMatrix(crude2, control = list(wordLengths = c(3, 10)))
crude2.dtm.mat <- as.matrix(crude2.dtm)

# keep only the columns (terms) that are high-frequency in the first corpus
# drop = FALSE keeps a matrix even when only one term matches.
# NOTE(review): the sample output printed after this snippet appears to come
# from a run over the first ten documents with a different threshold, so the
# rows and columns shown there will differ from what this code prints.
crude2.dtm.mat[, colnames(crude2.dtm.mat) %in% crude1.dtm.freq, drop = FALSE]
Terms
Docs oil opec prices
reut-00001.xml 5 0 3
reut-00002.xml 12 15 5
reut-00004.xml 2 0 0
reut-00005.xml 1 0 0
reut-00006.xml 1 0 0
reut-00007.xml 7 8 5
reut-00008.xml 4 1 1
reut-00009.xml 3 2 2
reut-00010.xml 5 2 1
reut-00011.xml 9 6 9
下面是使用 OP 在问题中补充的数据的解决方案:
# sentences of the first corpus, whose frequent terms are looked up below
text <- c(
  "saying text is good",
  "saying text once and saying text twice is better",
  "saying text text text is best",
  "saying text once is still ok",
  "not saying it at all is bad",
  "because text is a good thing",
  "we all like text",
  "even though sometimes it is missing"
)

# sentences of the second corpus, in which those terms are counted later
validationText <- c(
  "This has different words in it.",
  "But I still want to count",
  "the occurence of text",
  "for example"
)

# wrap both character vectors as tm corpora
TextCorpus <- Corpus(VectorSource(text))
ValiTextCorpus <- Corpus(VectorSource(validationText))

# one shared control list so both matrices are built the same way
# NOTE(review): `MinDocFrequency` does not look like a documented
# DocumentTermMatrix control option -- confirm whether it has any effect.
Control <- list(
  stopwords = TRUE,
  removePunctuation = TRUE,
  removeNumbers = TRUE,
  MinDocFrequency = 5
)
TextDTM <- DocumentTermMatrix(TextCorpus, Control)
ValiTextDTM <- DocumentTermMatrix(ValiTextCorpus, Control)

# find high frequency terms in TextDTM (lower bound 5); the outer parentheses
# print the result as well as storing it
(TextDTM.hifreq <- findFreqTerms(TextDTM, 5))
[1] "saying" "text"
# find out how many times each high freq word occurs in TextDTM
TextDTM.mat <- as.matrix(TextDTM)
# drop = FALSE keeps the subset a matrix even when only one term qualifies;
# without it the subset collapses to a vector and colSums() fails
colSums(TextDTM.mat[, TextDTM.hifreq, drop = FALSE])
saying text
6 9
以下是关键行:根据第一个 DTM 的高频词列表对第二个 DTM 取子集。在这种情况下,我使用了 `intersect` 函数,因为高频词向量中包含一个根本不在第二个语料库中的词(而 `intersect` 处理这种情况似乎比 `%in%` 更好)。
# now look into second DTM
ValiTextDTM.mat <- as.matrix(ValiTextDTM)
# high-frequency terms of the first DTM that also occur in the second one
shared_terms <- intersect(colnames(ValiTextDTM.mat), TextDTM.hifreq)
# keep the matrix shape while subsetting, then label the columns with the
# exact term names
common <- data.frame(ValiTextDTM.mat[, shared_terms, drop = FALSE],
                     check.names = FALSE)
names(common) <- shared_terms
text
1 0
2 0
3 1
4 0
如何找到第二个语料库中高频词的总数:
# sum each column of `common` (one column per shared high-frequency term)
colSums(common)
text
1