2

在 RStudio 中为阿拉伯语文本生成词云(wordcloud)

我想在 RStudio 中为一些阿拉伯语推文生成词云(wordcloud)。我搜索了很久,也自己尝试排查,但仍然不清楚为什么得到的文本是乱码。代码和系统信息如下。

如果有人能给我一些指导,我将不胜感激。对于英文推文,同样的做法可以正常生成词云。

sessionInfo()

R version 3.4.2 (2017-09-28)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows >= 8 x64 (build 9200)

Matrix products: default

locale:
[1] LC_COLLATE=English_United States.1252  LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

.

# Build a word cloud from up to 500 Arabic-language tweets matching "syria".
# NOTE(review): the session info reported with this script shows a
# Windows-1252 locale; Arabic terms may be mangled during tokenisation or
# plotting in a non-UTF-8 locale -- confirm the locale before relying on
# the output.
tweets <- searchTwitter("syria", n = 500, lang = "ar")

# vapply() always yields a character vector, even when no tweets come back
# (sapply() would return an empty list in that case).
tweets_text <- vapply(tweets, function(x) x$getText(), character(1))

# Re-encode UTF-8 -> UTF-8 with sub = "" so that non-convertible bytes are
# dropped instead of turning the whole string into NA.
tweets_text <- iconv(tweets_text, "UTF-8", "UTF-8", sub = "")

# create a corpus
corpus <- Corpus(VectorSource(tweets_text))

# create the term-document matrix, applying some transformations
# (the stop-word list is English plus a few custom terms, although the
# tweets were requested with lang = "ar")
tdm <- TermDocumentMatrix(
  corpus,
  control = list(
    removePunctuation = TRUE,
    stopwords = c(
      "machine", "learning", "machinelearning", "https",
      stopwords("english")
    ),
    removeNumbers = TRUE,
    tolower = TRUE
  )
)

# define tdm as matrix
m <- as.matrix(tdm)
# get word counts in decreasing order
word_freqs <- sort(rowSums(m), decreasing = TRUE)
# create a data frame with words and their frequencies
dm <- data.frame(word = names(word_freqs), freq = word_freqs)

# plot wordcloud, most frequent words first
wordcloud(dm$word, dm$freq, random.order = FALSE, colors = brewer.pal(8, "Dark2"))

另外,我还尝试了下面的代码,但运行时报错:Error: all(!is.na(match(c("doc_id", "text"), names(x)))) is not TRUE

library(wordcloud2)

# Remember the current LC_CTYPE locale and show it.
# NOTE(review): original_ctype is never restored in the visible code --
# presumably Sys.setlocale("LC_CTYPE", original_ctype) is called later; confirm.
original_ctype <- Sys.getlocale(category = "LC_CTYPE")
print(original_ctype)

## Switch to the appropriate locale for the script
Sys.setlocale("LC_CTYPE", "arabic")

# read text with UTF-8 encoding
arabic_text <- readLines("C:/......./tweets.txt", encoding = "UTF-8")

# clean each line: punctuation, digits, newline characters, extra whitespace
arabic_text <- removePunctuation(arabic_text)
arabic_text <- removeNumbers(arabic_text)
arabic_text <- removeNewlineChars(arabic_text)
arabic_text <- stripWhitespace(arabic_text)

# create arabic text corpus
# DataframeSource() requires the first two columns to be named "doc_id" and
# "text"; a single column called "arabic_text" fails with
#   all(!is.na(match(c("doc_id", "text"), names(x)))) is not TRUE
# stringsAsFactors = FALSE keeps the text column as character on R < 4.0.
arabic_corpus <- Corpus(DataframeSource(data.frame(
  doc_id = as.character(seq_along(arabic_text)),
  text = arabic_text,
  stringsAsFactors = FALSE
)))
4

0 回答 0