我正在使用 R 做一篇关于文本挖掘的学期论文。我们的任务是猜测一篇文章的语气(正面/负面)。文章存储在各自的文件夹中。我需要创建一个分类系统,该系统将通过训练样本进行学习。我重用了来自http://www.youtube.com/watch?v=j1V2McKbkLo 的代码 除了最后一行之外的整个代码都成功执行了。以下是代码。
tone<- c("Positive", "Negative")
folderpath <- "C:/Users/Tanmay/Desktop/R practice/Week8"
options(stringAsFactors = FALSE)
corpus<-Corpus(DirSource(folderpath))
corpuscopy<-corpus
summary(corpus)
inspect(corpus)
#Clean data
CleanCorpus <- function(corpus){
corpustemp <- tm_map(corpus, removeNumbers)
corpustemp <- tm_map(corpus, removePunctuation)
corpustemp <- tm_map(corpus, tolower)
corpustemp <- tm_map(corpus, removeWords, stopwords("english"))
corpustemp <- tm_map(corpus, stemDocument,language="english")
corpustemp <- tm_map(corpus, stripWhitespace)
return(corpustemp )
}
#Document term matrix
generateTDM <- function(tone,path) {
corpusdir <- sprintf("%s/%s",path,tone)
corpus<- Corpus(DirSource( directory=corpusdir ,encoding = "ANSI"))
corpustemp <- CleanCorpus(corpus)
corpusclean <- DocumentTermMatrix(corpustemp)
corpusclean <- removeSparseTerms(corpusclean , 0.7)
result <- list(Tone = tone, tdm = corpusclean)
}
tdm <- lapply(tone,generateTDM,path=folderpath)
#Attach tone
ToneBindTotdm <- function(tdm){
temp.mat <- data.matrix(tdm[["tdm"]])
temp.df <- as.data.frame(temp.mat)
temp.df <- cbind(temp.df,rep(tdm[["Tone"]]),nrow(temp.df))
colnames(temp.df)[ncol(temp.df)] <- "PredictTone"
return(temp.df)
}
Tonetdm <- lapply(tdm,ToneBindTotdm)
#Stack
Stacktdm <- do.call(rbind.fill,Tonetdm)
Stacktdm[is.na(Stacktdm)] <- 0
#Holdout
trainid <- sample(nrow(Stacktdm),ceiling(nrow(Stacktdm) * 0.7))
testid <- (1:nrow(Stacktdm)) [- trainid]
#knn
tdmone <- Stacktdm[,"PredictTone"]
tdmone.nl <- Stacktdm[, !colnames(Stacktdm) %in% "PredictTone"]
knnPredict <- knn(tdmone.nl[trainid,],tdmone.nl[testid,],tdmone[trainid],k=5)
当我尝试执行此操作时,最后一行 (knn) 出现错误:
**Error in knn(tdmone.nl[trainid, ], tdmone.nl[testid, ], tdmone[trainid], :
NA/NaN/Inf in foreign function call (arg 6)
In addition: Warning messages:
1: In knn(tdmone.nl[trainid, ], tdmone.nl[testid, ], tdmone[trainid], :
NAs introduced by coercion
2: In knn(tdmone.nl[trainid, ], tdmone.nl[testid, ], tdmone[trainid], :
NAs introduced by coercion**
谁能帮帮我。另外,如果还有其他更简单更好的分类方法,请指点我。感谢和抱歉,很长的帖子。