我正在分析一个约 7000 条推文的数据集,用于 Twitter 情感分析,并尝试用朴素贝叶斯模型预测一条推文是否为负面。但混淆矩阵显示模型没有给出任何有效预测,结果只反映了基准比率(base rate),也就是说模型实际上没有做出任何预测。怎样才能让模型做出预测?也许需要调整 removeSparseTerms 的稀疏度参数。如果朴素贝叶斯在这个数据集上无法预测任何结果,还有哪些模型适用于这个数据集?
# Label each tweet: TRUE when its sentiment score is -1 or lower.
tweets$Negative <- as.factor(tweets$Sentiment <= -1)
# Create corpus, convert to lower-case, remove punctuation, remove stopwords,
# stem document, create frequency matrix.
# NOTE(review): those steps are not shown here; they are expected to define
# `frequencies` before the next line runs.
# 0.995 is the maximal allowed sparsity handed to removeSparseTerms().
sparse <- removeSparseTerms(frequencies, 0.995)
tweetsSparse <- as.data.frame(as.matrix(sparse))
# Terms may be reserved words or otherwise non-syntactic (e.g. "next",
# "function"); such column names break the `Negative ~ .` formula used for
# model training, so convert them to valid, unique R names.
colnames(tweetsSparse) <- make.names(colnames(tweetsSparse), unique = TRUE)
tweetsSparse$Negative <- tweets$Negative
# NOTE(review): sample.split() presumably draws a random split; call
# set.seed() beforehand if the partition must be reproducible.
split <- sample.split(tweetsSparse$Negative, SplitRatio = 0.7)
# Index with the logical vector directly. subset() evaluates its condition
# inside the data frame, so a term column named "split" would silently be
# used in place of the split vector. which() ignores NA entries, matching
# how subset() treats them.
trainSparse <- tweetsSparse[which(split), , drop = FALSE]
testSparse <- tweetsSparse[which(!split), , drop = FALSE]
#' Align a test feature table with the features a model was trained on
#'
#' Builds a data frame that has exactly one column per training feature, in
#' training order. Features that also exist in the test data are copied from
#' it; features missing from the test data are filled with zeros. Columns
#' that exist only in the test data are dropped.
#'
#' @param model.training.data Object whose `names()` are the training
#'   feature names (typically a data frame of predictors).
#' @param test.dtm Data frame or matrix of test observations with column
#'   names.
#' @return A data frame with `nrow(test.dtm)` rows, the row names of
#'   `test.dtm`, and one column per training feature.
prepare_testData <- function(model.training.data, test.dtm) {
  train.features <- names(model.training.data)

  # Start from an all-zero matrix shaped rows(test) x training features.
  testData <- matrix(0,
                     nrow = nrow(test.dtm),
                     ncol = length(train.features))
  colnames(testData) <- train.features
  row.names(testData) <- row.names(test.dtm)

  # colnames() works for both data frames and matrices; names() would return
  # NULL for a matrix and no feature would ever match.
  common.features <- intersect(train.features, colnames(test.dtm))

  # Iterating over the names themselves is a no-op when nothing is shared,
  # whereas 1:length(x) would yield c(1, 0) and fail on an NA column name.
  for (feature in common.features) {
    testData[, feature] <- test.dtm[, feature]
  }

  as.data.frame(testData)
}
########### Naive Bayes model training ###########
# Fit a Naive Bayes classifier (caret method "nb") on the training split,
# resampled with 5-fold cross-validation.
naive.bayes.model <- train(Negative ~ .,
                           data = trainSparse,
                           trControl = trainControl(method = "cv", number = 5),
                           method = "nb")
# caret keeps the response in the stored training data under the name
# ".outcome". Drop it by name rather than by position: its column position
# is not guaranteed to be last, and dropping the last column could discard a
# real predictor while keeping the outcome as a "feature".
naive.bayes.testData <- prepare_testData(
  naive.bayes.model$trainingData[
    , setdiff(names(naive.bayes.model$trainingData), ".outcome"),
    drop = FALSE
  ],
  testSparse
)
# Predict on the aligned test features and compare with the true labels.
naive.bayes.pred <- predict(naive.bayes.model, naive.bayes.testData)
naive.bayes.metrics <- confusionMatrix(naive.bayes.pred, testSparse$Negative)