所以我使用 text2vec R 包来构建用于特征选择的词向量化。我是按照 Dmitriy Selivanov 的页面 http://text2vec.org/vectorization.html 来做的,该页面解释了如何在构建分类器之前正确使用 text2vec。
这是我的代码:
# Load packages and the movie review data ----
# library() is used instead of require(): require() only warns and returns
# FALSE when a package is missing, so the script would carry on and fail later
# at an unrelated line; library() stops immediately with a clear error.
library(text2vec)
library(data.table)
data("movie_review")
library(tidyverse)

# Convert the movie reviews to a data.table by reference (no copy is made)
setDT(movie_review)
# Key the table by id; this sorts it and enables the J() lookups below
setkey(movie_review, id)

# Set seed for reproducible results
set.seed(2016L)

# Split the data set into training and testing data ----
all_ids <- movie_review$id
# 4000 ids are sampled for training; every remaining id goes to the test set
train_ids <- sample(all_ids, 4000)
test_ids <- setdiff(all_ids, train_ids)
train <- movie_review[J(train_ids)]
test <- movie_review[J(test_ids)]
# Vocabulary-based vectorization
# Preprocessing step applied to the raw text before it is tokenized.
#   x: the text to clean (the script passes the review column to it via
#      itoken()).
#   Returns x lower-cased, with every non-letter character replaced by a
#   space and each run of whitespace collapsed to a single space.
prep_fun <- function(x) {
  lowered <- str_to_lower(x)
  # "[^[:alpha:]]" matches anything that is not a letter, so digits are
  # blanked out together with punctuation
  letters_only <- str_replace_all(lowered, "[^[:alpha:]]", " ")
  # collapse the runs of spaces left behind by the replacement above
  str_replace_all(letters_only, "\\s+", " ")
}
# Tokenization function: text2vec's word tokenizer
tok_fun <- word_tokenizer

# Iterator over the tokens of the training reviews. It is consumed twice
# below: once to build the vocabulary and once to build the document-term
# matrix (DTM).
it_train <- itoken(
  train$review,
  preprocessor = prep_fun,
  tokenizer = tok_fun,
  ids = train$id,
  progressbar = FALSE
)
vocab <- create_vocabulary(it_train)
# The created vocabulary consists of 35070 unique words.

# With the vocabulary in place, construct the DTM.
# vocab_vectorizer() turns the vocabulary into the vectorizer that
# create_dtm() uses to map tokens to columns.
vectorizer <- vocab_vectorizer(vocab)

# Time the DTM construction
t1 <- Sys.time()
dtm_train <- create_dtm(it_train, vectorizer)
print(difftime(Sys.time(), t1, units = "sec"))

# Vectorization and DTM creation of the training data is now complete.
# Inspect the dimensions of the DTM
dim(dtm_train)
# Check that the DTM row names match the training ids, in the same order
identical(rownames(dtm_train), train$id)
然后我继续使用 glmnet R 包拟合 LASSO 回归模型。
# Train the model using LASSO regression to avoid high variance in
# coefficients ----
# library() rather than require(): a missing glmnet should stop the script
# here instead of surfacing later as "could not find function cv.glmnet".
library(glmnet)

# Number of cross-validation folds passed to cv.glmnet() below
NFOLDS <- 4

t1 <- Sys.time()
glmnet_classifier <- cv.glmnet(
  x = dtm_train,
  y = train[["sentiment"]],
  family = "binomial",
  # L1 penalty
  alpha = 1,
  # interested in the area under the ROC curve
  type.measure = "auc",
  # NFOLDS-fold cross-validation (4 folds with the value set above)
  nfolds = NFOLDS,
  # high value is less accurate, but has faster training
  thresh = 1e-3,
  # again lower number of iterations for faster training
  maxit = 1e3
)
print(difftime(Sys.time(), t1, units = "sec"))

# With the classifier trained, plot the cross-validated AUC curve.
plot(glmnet_classifier)
# Report the best cross-validated AUC as a number.
print(paste("max AUC =", round(max(glmnet_classifier$cvm), 4)))
# Validate the fitted model on the held-out test data ----
# The test reviews go through the same preprocessing function, the same
# tokenizer and the same vectorizer as the training reviews, so the test DTM
# is built with the vectorizer derived from the training vocabulary.

# Preprocess and tokenize the test data
it_test <- itoken(
  test$review,
  preprocessor = prep_fun,
  tokenizer = tok_fun,
  ids = test$id,
  progressbar = FALSE
)

# Build the DTM of the test data
dtm_test <- create_dtm(it_test, vectorizer)

# Predict on the test data; predict() returns a matrix, so its first column
# is extracted as a plain vector
preds <- predict(glmnet_classifier, dtm_test, type = "response")[, 1]

# AUC on the test set.
# NOTE(review): glmnet:::auc reaches into an unexported internal of glmnet and
# may change between package versions -- confirm it exists in the installed
# version.
glmnet:::auc(test$sentiment, preds)
现在我想用这个分类器对我自己的数据进行情绪分类。我的问题是:我是否必须对我的数据应用相同的预处理和分词(tokenization)函数,然后还要对我的数据中的单词进行向量化?还是可以直接对数据使用已经训练好的分类器?
我真的希望能得到一些解答,谢谢!