train <- read.delim('train.tsv', header= T, fileEncoding= "windows-1252",stringsAsFactors=F)
Train.tsv 包含 1,56,060 行文本,其中包含 4 个列名称 Phrase、PhraseID、SentenceID 和 Sentiment(范围为 0 到 4)。Phrase 列具有文本行。(Tm 包已经加载) R 版本: 3.1.2 ; 操作系统:Windows 7、64 位、4 GB RAM。
> dput(head(train,6))
structure(list(PhraseId = 1:6, SentenceId = c(1L, 1L, 1L, 1L,
1L, 1L), Phrase = c("A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .",
"A series of escapades demonstrating the adage that what is good for the goose",
"A series", "A", "series", "of escapades demonstrating the adage that what is good for the goose"
), Sentiment = c(1L, 2L, 2L, 2L, 2L, 2L)), .Names = c("PhraseId",
"SentenceId", "Phrase", "Sentiment"), row.names = c(NA, 6L), class = "data.frame")
这是火车文件的前 6 行。
clean_corpus <- function(corpus)
{
mycorpus <- tm_map(corpus, removeWords,stopwords("english"))
mycorpus <- tm_map(mycorpus, removeWords,c("movie","actor","actress"))
mycorpus <- tm_map(mycorpus, stripWhitespace)
mycorpus <- tm_map(mycorpus, tolower)
mycorpus <- tm_map(mycorpus, removeNumbers)
mycorpus <- tm_map(mycorpus, removePunctuation)
mycorpus <- tm_map(mycorpus, PlainTextDocument )
return(mycorpus)
}
# Build DTM
generateDTM <- function(df)
{
m <- list(Sentiment = "Sentiment", Phrase = "Phrase")
myReader <- readTabular(mapping = m)
mycorpus <- Corpus(DataframeSource(df), readerControl = list(reader = myReader))
#Code to attach sentiment label with every text line
for (i in 1:length(mycorpus))
{
attr(mycorpus[[i]], "Sentiment") <- df$Sentiment[i]
}
mycorpus <- clean_corpus(mycorpus)
dtm <- DocumentTermMatrix(mycorpus)
return(dtm)
}
dtm1 <- generateDTM(train)
这里我做了两个函数。一个用于清理语料库,另一个用于制作 DTM(文档术语矩阵)。我还将每个情绪值与每一行文本联系起来。现在当我使用 dtm1 的尺寸时;它显示 156060 行但 0 列。
那么,如何生成带有情感标签的 DTM?