以下代码:
library(text2vec)
text8_file = "text8"
if (!file.exists(text8_file)) {
download.file("http://mattmahoney.net/dc/text8.zip", "text8.zip")
unzip ("text8.zip", files = "text8")
}
wiki = readLines(text8_file, n = 1, warn = FALSE)
# Create iterator over tokens
tokens <- space_tokenizer(wiki)
# Create vocabulary. Terms will be unigrams (simple words).
it = itoken(tokens, progressbar = FALSE)
vocab <- create_vocabulary(it)
vocab <- prune_vocabulary(vocab, term_count_min = 5L)
# Use our filtered vocabulary
vectorizer <- vocab_vectorizer(vocab)
# use window of 5 for context words
tcm <- create_tcm(it, vectorizer, skip_grams_window = 5L)
RcppParallel::setThreadOptions(numThreads = 4)
glove_model = GloVe$new(word_vectors_size = 50, vocabulary = vocab, x_max = 10, learning_rate = .25)
word_vectors_main = glove_model$fit_transform(tcm, n_iter = 20)
word_vectors_context = glove_model$components
word_vectors = word_vectors_main + t(word_vectors_context)
导致错误:
qlst <- prepare_analogy_questions("questions-words.txt", rownames(word_vectors))
> Error in (function (fmt, ...) :
invalid format '%d'; use format %s for character objects
来自 word2vec 来源的文件 questions-words.txt https://github.com/nicholas-leonard/word2vec/blob/master/questions-words.txt