
I am doing text mining on about 30,000 tweets, and the problem now is that, to make the results more reliable, I want to convert "synonyms" to a single similar word. For example, some users use the word "girl", some use "girls", and some use "gal". Likewise, "give" and "gave" mean only one thing, and the same goes for "come" and "came". Some users also use short forms like "plz" and "pls". In addition, stemDocument from the tm package is not working properly: it converts "dance" to "danc" and "table" to "tabl". Is there any other good stemming package? I want to replace all these words with one similar word so that I can count the correct frequencies in this data and my sentiment analysis will be more reliable. Below is reproducible code (I cannot include the whole 30000x1 data frame here):

 content<-c("n.n.t.t.t.t.t.t.girl.do.it.to.me.t.t.n.t.t.t.t.t.t.n.n.t.t.t.t.t.t.n.n.t.t.t.t.t.t.t.n.n.t.t.t.t.t.t.t.n.t.n.t.t.n.t.t.t.n.t.t.t.tajinkx.said..n.t.t.t.n.t.t.n.t.n.t.n.t.t.n.t.t.n.t.t.n.t.t.tok.guyz...srry.to.sound.dumb.toilets.i.dnt.drink.while.m.just.searching.for.fun..nso.is.going.to.bar.good.for.me.i.dnt.knw.what.washroom.all.happens.there.inside...so.would.like.if.someone.gals.helps.me.thankuu..n.t.t.n.t.t.t.tClick.to.expand....n.t.nBhai.tu.plz.rehne.de.....n.n.t.n.n.t.t.n.t.t.t.n.t.t.n.n.t.t.n.t.n.n.t.t.t.t.t.t.t.t..n.t.t.t.t.t.t.t.t.n.toilet.is.not .t.t.t.t.t.t.t.n.n.t.t.t.t.t.t.n.n.t.t.t.t.t.t.n.t.n.n.t.t.n.t.t.t.n.t.t.n.n.t.t.n.t.n.n.n.t.n.n.n.t.n.n.t.t.n.t.t.t.n.t.t.n.n.t.t.n.t.n.n.t.t.t.t.t..................................................................................................................................................                                                                                       \n\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\t\tajinkx said:\n\t\t\t\n\t\t\n\t\n\t\n\t\t\n\t\t\n\t\t\n\t\t\tok guyz...srry to sound dumb!i dnt drink while m just searching for fun!\nso is going to bar good for me?i dnt knw what all happens there inside...so would like if someone helps me.thankuu!\n\t\t\n\t\t\t\tClick to expand...\n\t\nBhai,tu plz rehne de....\n\n\t\n\n\t\t\n\t\t\t\n\t\t\n\n\t\t\n\t\n\n\t\t\t\t\t\t\t\t \n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\n\n\t\t\n\t\t\t\n\t\t\n\n\t\t\n\t\n\n\n\t\n\n\n\t\n\n\t\t\n\t\t\t\n\t\t\n\n\t\t\n\t\n\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t is this da bar which u guys r talking about???\nSent from my SM-N900 using Tapatalk\n\n\t\n\n\t\t\n\t\t\t\n\t\t\n\n\t\t\n\t\n\n\t\t\t\t\t\t\t\t \n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\n\n\t")  


    library("tm")
    library("spacyr")
    library("quanteda")
    library("syuzhet")   # for get_nrc_sentiment()
    library("ggplot2")   # for qplot()

    np<-600;postop<-1200;fr<-"yes" #input from GUI

    #wbpage<-function (np,postop,fr){
    #load("data_mpdb.Rdata")
    #content<-as.data.frame(raw_dat[np:postop,],stringsAsFactors = FALSE)
    #last<-rbind(tail(content,1),head(content,1));colnames(last)<-#c("stopdate","startdate")
    message("Initializing part-1")
    #---------------------data cleaning-----------------------------------------------------
    #replied post
    content<-data.frame(txt=content,stringsAsFactors = FALSE) # wrap the raw character vector so content$txt exists
    # strip the body of quoted replies (keep the "said:" and "click to expand..." markers)
    content2<-as.data.frame(lapply(content$txt, gsub, pattern = '(said:).*?(click to expand\\.{3})', replacement ="\\1 \\2", perl=TRUE),stringsAsFactors = FALSE);
        content2<- as.data.frame(t(as.matrix(content2)));colnames(content2)<-c("txt");rownames(content2)<-NULL
    #----------------ken's addition: lemmatization---------------------------
    sp <- spacy_parse(as.character(content2$txt), lemma = TRUE)    
    sp$token <- ifelse(!grepl("^\\-[A-Z]+\\-$", sp$lemma), sp$lemma, sp$token)    
    # define equivalencies for please variants
    dict <- dictionary(list(
      please = c("please", "pls", "plz"),
      girl = c("girl", "gal"),
      toilet=c("toilet","shit","shitty","washroom")
    ))    
    toks <- as.tokens(sp) %>%
      tokens(remove_punct = TRUE)
    toks
    new_stopwords<-c("said","one","click","expand","sent","using","attachment",
                     "tapatalk","will","can","hai","forum","like","just",
                     "get","know","also","now","bro","bhai","back","wat",
                     "ur","naa","nai","sala","email","urself","arnd","sim",
                     "pl","kayko","ho","gmail","sm","ll","g7102","iphone","yeah","time","asked","went","want","look","call","sit",
                     "even","first","place","left","visit","guy","around","started","came","dont","got","took","see","take","see","come")

    toks <- tokens_remove(toks, c(stopwords("en"), new_stopwords))
#--------I have to make toks to be same as content2 so that i can use it in # 
 further corpus buildin---------------------------        

    #clean the data: remove punctuation, digits, stopwords, and whitespace, and convert to lowercase.
    docs <- Corpus(VectorSource(content2$txt));#mname<-Corpus(VectorSource(content2$name))
    message("Initializing part-1.2")
    docs <- tm_map(docs, content_transformer(tolower));#mname<-tm_map(mname,content_transformer(tolower))
    docs <- tm_map(docs, removePunctuation,preserve_intra_word_contractions=TRUE,preserve_intra_word_dashes=TRUE);#mname <- tm_map(mname, removePunctuation)
    message("Initializing part-1.3")
    docs <- tm_map(docs, removeWords, c(stopwords("english"),new_stopwords))
    docs <- tm_map(docs, stripWhitespace);#mname <- tm_map(mname, stripWhitespace)
    message("Initializing part-1.4")
    #docs <- tm_map(docs, removeWords, new_stopwords) # redundant: new_stopwords already removed above
    #------------------------Text stemming------------------------------------------
        #docs <- tm_map(docs, stemDocument,language="english")

    #-------------sentiment analysis--------------------------------------------------
    message("Initializing part-2")
    n <- 4
    rnorm(10000, 0, 1) # placeholder work for the Shiny progress bar (see incProgress below)
    #incProgress(1/n, detail = paste("Finished section 1"))

    docs_df <- data.frame(matrix(unlist(docs),nrow=length(docs), byrow=F),stringsAsFactors=FALSE)
    docs_df<-docs_df[-c(2)];content2$editedtxt<-docs_df;

    #----------------fr|fr:----------------------------------------------
    if (fr=="yes"){
    frlogic<-grepl("fr\\s|fr:", docs_df$X1);docs_df<-as.data.frame(docs_df[frlogic,],stringsAsFactors = FALSE);
    docs_df<-docs_df[order(nchar(as.character(docs_df)),decreasing = FALSE),] # assign the sorted result back
    }

    colnames(docs_df)<-c("txt")
    d<-get_nrc_sentiment(as.character(docs_df))
    td<-data.frame(t(d))
    td_new <- data.frame(rowSums(td))
    #Transformation and cleaning
    names(td_new)[1] <-"count"
    td_new <- cbind("sentiment"=rownames(td_new), td_new)
    rownames(td_new) <- NULL
    td_new2<-td_new[1:8,]
    sentimentplot<-qplot(sentiment, data=td_new2, weight=count, geom="bar",fill=sentiment)+ggtitle("sentiments")
    sentimentplot

Now I am getting the error:

    Finding a python executable with spaCy installed...
    Error in set_spacy_python_option(python_executable, virtualenv, condaenv, :
      No python was found on system PATH

Also,

I have to make toks the same as content2 so that I can use it in further corpus building for further analysis.

Awaiting your reply. Thanks.


1 Answer


The code is not reproducible because we do not have the input content2, but here is an example you can use.

What you call "converting synonyms" for variants such as "give" vs. "gave" or "girl" vs. "girls" is not just a question of stemming but of lemmatization (a stemmer can never map "gave" to "give", for example). For lemmatization, you need functionality that does not exist in the tm package.
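You can see both problems directly with the Porter stemmer. A minimal check using wordStem() from the SnowballC package (the stemmer that tm's stemDocument() wraps): it reproduces the "danc"/"tabl" truncations you describe and leaves "gave" untouched.

library("SnowballC")
wordStem(c("dance", "table", "gave", "girls"))
## [1] "danc" "tabl" "gave" "girl"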

I suggest you try spacyr for the lemmatization and quanteda for the rest. Here's how.
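One note first, since you hit the "No python was found on system PATH" error: spacyr needs a Python installation with spaCy in it before spacy_parse() will run. A minimal setup sketch, assuming you let spacyr create its own conda environment (en_core_web_sm is spaCy's small English model):

library("spacyr")
spacy_install()   # one-time: creates a conda environment with spaCy and the English model
spacy_initialize(model = "en_core_web_sm")   # once per session, before spacy_parse()
# if you already have Python with spaCy elsewhere, point spacyr at it instead:
# spacy_initialize(python_executable = "/path/to/python")

With that in place, we start with some text, then parse it using spacy_parse().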

txt <- c(
  "The girl and the girls gave all they had to give.",
  "Pls say plz, please, gal."
)
new_stopwords <- c(
  "yeah", "time", "asked", "went", "want", "look", "call",
  "sit", "even", "first", "place", "left", "visit", "guy",
  "around", "started", "came", "dont", "got", "took", "see",
  "take", "see", "come"
)


library("spacyr")
sp <- spacy_parse(txt, lemma = TRUE)
## Found 'spacy_condaenv'. spacyr will use this environment
## successfully initialized (spaCy Version: 2.2.3, language model: en_core_web_sm)
## (python options: type = "condaenv", value = "spacy_condaenv")
sp
##    doc_id sentence_id token_id  token  lemma   pos entity
## 1   text1           1        1    The    the   DET       
## 2   text1           1        2   girl   girl  NOUN       
## 3   text1           1        3    and    and CCONJ       
## 4   text1           1        4    the    the   DET       
## 5   text1           1        5  girls   girl  NOUN       
## 6   text1           1        6   gave   give  VERB       
## 7   text1           1        7    all    all   DET       
## 8   text1           1        8   they -PRON-  PRON       
## 9   text1           1        9    had   have   AUX       
## 10  text1           1       10     to     to  PART       
## 11  text1           1       11   give   give  VERB       
## 12  text1           1       12      .      . PUNCT       
## 13  text2           1        1    Pls    pls  INTJ       
## 14  text2           1        2    say    say  VERB       
## 15  text2           1        3    plz    plz  INTJ       
## 16  text2           1        4      ,      , PUNCT       
## 17  text2           1        5 please please  INTJ       
## 18  text2           1        6      ,      , PUNCT       
## 19  text2           1        7    gal    gal PROPN       
## 20  text2           1        8      .      . PUNCT

We will convert this into quanteda tokens, but first, let's replace each token with its lemma (unless the lemma is a part-of-speech placeholder such as "-PRON-").

# replace the token with its lemma (unless it's "-PRON-" for instance)
sp$token <- ifelse(!grepl("^\\-[A-Z]+\\-$", sp$lemma), sp$lemma, sp$token)

For your slang variants, we need to define the equivalencies manually, which we can do with a quanteda "dictionary".

library("quanteda", warn.conflicts = FALSE)
## Package version: 2.0.1
## Parallel computing: 2 of 8 threads used.
## See https://quanteda.io for tutorials and examples.

# define equivalencies for please variants
dict <- dictionary(list(
  please = c("please", "pls", "plz"),
  girl = c("girl", "gal")
))

We will use that in a minute. First, let's create a tokens object from the spacyr parsed output, and remove the punctuation.

toks <- as.tokens(sp) %>%
  tokens(remove_punct = TRUE)
toks
## Tokens consisting of 2 documents.
## text1 :
##  [1] "the"  "girl" "and"  "the"  "girl" "give" "all"  "they" "have" "to"  
## [11] "give"
## 
## text2 :
## [1] "pls"    "say"    "plz"    "please" "gal"

Removing stopwords is easy using the tokens_remove() function.

# now remove stopwords
toks <- tokens_remove(toks, c(stopwords("en"), new_stopwords))
toks
## Tokens consisting of 2 documents.
## text1 :
## [1] "girl" "girl" "give" "give"
## 
## text2 :
## [1] "pls"    "say"    "plz"    "please" "gal"

Now, to make the variants of "girl" and "please" equivalent, we use tokens_lookup():

toks <- tokens_lookup(toks, dictionary = dict, exclusive = FALSE, capkeys = FALSE)
toks
## Tokens consisting of 2 documents.
## text1 :
## [1] "girl" "girl" "give" "give"
## 
## text2 :
## [1] "please" "say"    "please" "please" "girl"

For the sentiment analysis, you can apply tokens_lookup() again with a sentiment dictionary, and create a dfm (document-feature matrix) from the result. (Note: "say" is not really a negative word, but I use it here for the sake of the example.)

sentdict <- dictionary(list(
    positive = c("nice", "good", "please", "give"),
    negative = c("bad", "say")
))
tokens_lookup(toks, dictionary = sentdict) %>%
    dfm()
## Document-feature matrix of: 2 documents, 2 features (25.0% sparse).
##        features
## docs    positive negative
##   text1        2        0
##   text2        3        1
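Finally, to get back to a sentiment bar plot like your qplot() call, you can turn the column sums of that dfm into a data frame. A minimal sketch reusing the toy sentdict above (with a real lexicon such as NRC you would get your eight emotion categories instead of just positive/negative):

library("ggplot2")
sent_dfm <- tokens_lookup(toks, dictionary = sentdict) %>%
  dfm()
sent_counts <- data.frame(
  sentiment = featnames(sent_dfm),
  count = colSums(sent_dfm)
)
ggplot(sent_counts, aes(sentiment, count, fill = sentiment)) +
  geom_col() +
  ggtitle("sentiments")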