
I computed bags of words for "yelp.csv", "yelpp.csv" and "yelpn.csv" and built a term-frequency matrix for each dataset. Now I want to compare the bag of words of yelp with yelpn, check how many words from yelp occur in yelpn and with what frequency, and store the result in a variable as a matrix; then do the same for yelpp. yelp contains both positive and negative reviews; yelpp contains only positive ones, and yelpn only negative ones. Can anyone complete the code? I don't know whether the code below is relevant; I hope it is.

library(tm)    # removeNumbers(), removePunctuation(), stopword lists
library(tau)   # remove_stopwords()
library(qdap)  # bag_o_words(), freq_terms()

setwd("/Users/ash/RProjects/exc")
df <- read.csv("yelp.csv", header = TRUE, quote = "\"", stringsAsFactors = TRUE,
               strip.white = TRUE)
df
dfd <- as.character(df[, 2])   # column 2 holds the review text
dfd
df2 <- as.character(df[, 1])
df2
words <- readLines(system.file("stopwords", "english.dat",
                               package = "tm"))
s <- remove_stopwords(dfd, words, lines = TRUE)
s
print("****Stopwords are removed successfully****")
n <- removeNumbers(s)
n
# note: "t" masks base R's t(); a more descriptive name would be safer
t <- removePunctuation(n, preserve_intra_word_dashes = FALSE)
t

#pos
dfp <- read.csv("yelpp.csv", header = TRUE, quote = "\"", stringsAsFactors = TRUE,
                strip.white = TRUE)
dfp
dfdp <- as.character(dfp[, 2])
dfdp
df2p <- as.character(dfp[, 1])
df2p
wordsp <- readLines(system.file("stopwords", "english.dat",
                                package = "tm"))
sp <- remove_stopwords(dfdp, wordsp, lines = TRUE)
sp
print("****Stopwords are removed successfully****")
np <- removeNumbers(sp)
np
tp <- removePunctuation(np, preserve_intra_word_dashes = FALSE)
tp

#neg
dfn <- read.csv("yelpn.csv", header = TRUE, quote = "\"", stringsAsFactors = TRUE,
                strip.white = TRUE)
dfn
dfdn <- as.character(dfn[, 2])
dfdn
df2n <- as.character(dfn[, 1])
df2n
wordsn <- readLines(system.file("stopwords", "english.dat",
                                package = "tm"))
sn <- remove_stopwords(dfdn, wordsn, lines = TRUE)
sn
print("****Stopwords are removed successfully****")
nn <- removeNumbers(sn)
nn
tn <- removePunctuation(nn, preserve_intra_word_dashes = FALSE)
tn



#bag
b <- bag_o_words(t, apostrophe.remove = TRUE)
b
b.mat <- as.matrix(b)
b.mat
bp <- bag_o_words(tp, apostrophe.remove = TRUE)
bp
bp.mat <- as.matrix(bp)
bp.mat
bn <- bag_o_words(tn, apostrophe.remove = TRUE)
bn
bn.mat <- as.matrix(bn)
bn.mat

#frequent terms
frequent_terms <- freq_terms(t, 2000)
frequent_terms
frequent_termsp <- freq_terms(tp, 2000)
frequent_termsp
frequent_termsn <- freq_terms(tn, 2000)
frequent_termsn

1 Answer


I'll take text as an example, using passages from the Wikipedia article on Text mining as the corpora. The tm package with its findFreqTerms function, together with agrep, is the gist of this approach.

agrep

Searches for approximate matches to the pattern (first argument) within each element of the string x (second argument), using the generalised Levenshtein edit distance (the minimal possibly weighted number of insertions, deletions and substitutions needed to transform one string into another).
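
For example, a quick illustration of its fuzzy matching (the input strings here are made up for the demo):

agrep("pattern", c("pattern", "patterns", "patten", "banana"))
# returns 1 2 3: the first three strings match within the default
# max.distance of 0.1; "banana" is too far away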

Method steps:

Text -> corpus -> data cleaning -> findFreqTerms -> compare against the other term-document matrices

library(tm)

c1 <- Corpus(VectorSource("Text mining, also referred to as text data mining, roughly equivalent to text analytics, is the process of deriving high-quality information from text. High-quality information is typically derived through the devising of patterns and trends through means such as statistical pattern learning"))

c2 <- Corpus(VectorSource("Text mining usually involves the process of structuring the input text (usually parsing, along with the addition of some derived linguistic features and the removal of others, and subsequent insertion into a database), deriving patterns within the structured data, and finally evaluation and interpretation of the output"))

c3 <- Corpus(VectorSource("Typical text mining tasks include text categorization, text clustering, concept/entity extraction, production of granular taxonomies, sentiment analysis, document summarization, and entity relation modeling (i.e., learning relations between named entities)"))

# Data Cleaning and transformation
c1 <- tm_map(c1, content_transformer(tolower))
c2 <- tm_map(c2, content_transformer(tolower))
c3 <- tm_map(c3, content_transformer(tolower))

c1 <- tm_map(c1, removePunctuation)
c1 <- tm_map(c1, removeNumbers)
c1 <- tm_map(c1, removeWords, stopwords("english"))
c1 <- tm_map(c1, stripWhitespace)

c2 <- tm_map(c2, removePunctuation)
c2 <- tm_map(c2, removeNumbers)
c2 <- tm_map(c2, removeWords, stopwords("english"))
c2 <- tm_map(c2, stripWhitespace)

c3 <- tm_map(c3, removePunctuation)
c3 <- tm_map(c3, removeNumbers)
c3 <- tm_map(c3, removeWords, stopwords("english"))
c3 <- tm_map(c3, stripWhitespace)

dtm1 <- DocumentTermMatrix(c1, control = list(weighting = weightTfIdf, stopwords = TRUE))
dtm2 <- DocumentTermMatrix(c2, control = list(weighting = weightTfIdf, stopwords = TRUE))
dtm3 <- DocumentTermMatrix(c3, control = list(weighting = weightTfIdf, stopwords = TRUE))

ft1 <- findFreqTerms(dtm1)
ft2 <- findFreqTerms(dtm2)
ft3 <- findFreqTerms(dtm3)

#similarity between c1 and c2
common.c1c2 <- data.frame(term = character(0), freq = integer(0))
for(t in ft1){
  find <- agrep(t, ft2)
  if(length(find) != 0){
    common.c1c2 <- rbind(common.c1c2, data.frame(term = t, freq = length(find)))
  }
}
# Note: this for loop can be replaced with apply-family functions if it becomes slow on large texts
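
For instance, a vectorised sketch of the same comparison (the helper name common_terms is my own):

common_terms <- function(ft.a, ft.b) {
  # for every term in ft.a, count its approximate matches in ft.b
  hits <- sapply(ft.a, function(term) length(agrep(term, ft.b)))
  data.frame(term = names(hits[hits > 0]), freq = unname(hits[hits > 0]))
}
common.c1c2 <- common_terms(ft1, ft2)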

common.c1c2 contains the words common to corpus1 and corpus2, together with their frequencies:

> common.c1c2
      term freq
1     also    1
2     data    2
3  derived    1
4 deriving    1
5   mining    1
6  pattern    1
7 patterns    1
8  process    1
9     text    1

> ft1
 [1] "also"        "analytics"   "data"        "derived"     "deriving"    "devising"    "equivalent" 
 [8] "highquality" "information" "learning"    "means"       "mining"      "pattern"     "patterns"   
[15] "process"     "referred"    "roughly"     "statistical" "text"        "trends"      "typically"  

> ft2
 [1] "addition"       "along"          "data"           "database"       "derived"        "deriving"      
 [7] "evaluation"     "features"       "finally"        "input"          "insertion"      "interpretation"
[13] "involves"       "linguistic"     "mining"         "others"         "output"         "parsing"       
[19] "patterns"       "process"        "removal"        "structured"     "structuring"    "subsequent"    
[25] "text"           "usually"        "within"        

This is not the most efficient solution, but I hope it helps.
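
To connect this back to your data: assuming the cleaned character vectors t, tp and tn from your question are in scope, a sketch of the comparison you asked for could look like this (the dtm.*, ft.* and common.* names are placeholders, and common_terms is the helper defined above):

# build a document-term matrix per dataset from the cleaned text
dtm.yelp <- DocumentTermMatrix(Corpus(VectorSource(t)))
dtm.pos  <- DocumentTermMatrix(Corpus(VectorSource(tp)))
dtm.neg  <- DocumentTermMatrix(Corpus(VectorSource(tn)))

ft.yelp <- findFreqTerms(dtm.yelp)
ft.pos  <- findFreqTerms(dtm.pos)
ft.neg  <- findFreqTerms(dtm.neg)

# words of yelp that also occur in yelpn, with match counts, stored as a matrix
common.yelp.neg <- as.matrix(common_terms(ft.yelp, ft.neg))
# the same comparison against yelpp
common.yelp.pos <- as.matrix(common_terms(ft.yelp, ft.pos))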

Answered on 2017-08-24 at 07:07