我正在尝试与https://thiloshon.wordpress.com/2018/03/11/build-your-own-word-sentence-prediction-application-part-02/中相同的代码来进行单词级预测。输入文本数据也在提到的链接中,我使用en_US.news.txt
文件作为我唯一的输入文件。
library(quanteda)
library(data.table)
#read the .txt file
df=readLines('en_US.news.txt')
#take a sample of the df
sampleHolderNews <- sample(length(df), length(df) * 0.1)
US_News_Sample <- df[sampleHolderNews]
#build the corpus of the data
corp <- corpus(US_News_Sample)
#Preprocessing
master_Tokens <- tokens(x = tolower(corp),remove_punct =
TRUE,remove_numbers = TRUE,remove_hyphens = TRUE,remove_symbols = TRUE)
stemed_words <- tokens_wordstem(master_Tokens, language = "english")
#tokenization#
bi_gram <- tokens_ngrams(stemed_words, n = 2)
tri_gram <- tokens_ngrams(stemed_words, n = 3)
uni_DFM <- dfm(stemed_words)
bi_DFM <- dfm(bi_gram)
tri_DFM <- dfm(tri_gram)
uni_DFM <- dfm_trim(uni_DFM, 3)
bi_DFM <- dfm_trim(bi_DFM, 3)
tri_DFM <- dfm_trim(tri_DFM, 3)
sums_U <- colSums(uni_DFM)
sums_B <- colSums(bi_DFM)
sums_T <- colSums(tri_DFM)
# Create data tables with individual words as columns
uni_words <- data.table(word_1 = names(sums_U), count = sums_U)
bi_words <- data.table(
word_1 = sapply(strsplit(names(sums_B), "_", fixed = TRUE), '[[', 1),
word_2 = sapply(strsplit(names(sums_B), "_", fixed = TRUE), '[[', 2),
count = sums_B)
tri_words <- data.table(
word_1 = sapply(strsplit(names(sums_T), "_", fixed = TRUE), '[[', 1),
word_2 = sapply(strsplit(names(sums_T), "_", fixed = TRUE), '[[', 2),
word_3 = sapply(strsplit(names(sums_T), "_", fixed = TRUE), '[[', 3),
count = sums_T)
#indexing#
setkey(uni_words, word_1)
setkey(bi_words, word_1, word_2)
setkey(tri_words, word_1, word_2, word_3)
######## Finding Bi-Gram Probability #################
discount_value <- 0.75
# Finding number of bi-gram words
numOfBiGrams <- nrow(bi_words[.(word_1, word_2)])
# Dividing number of times word 2 occurs as second part of bigram, by total number of bigrams.
# Finding probability for a word given the number of times it was second word of a bigram
ckn <- bi_words[, .(Prob = ((.N) / numOfBiGrams)), by = word_2]
setkey(ckn, word_2)
# Assigning the probabilities as second word of bigram, to unigrams
uni_words[, Prob := ckn[word_1, Prob]]
uni_words <- uni_words[!is.na(uni_words$Prob)]
# Finding number of times word 1 occurred as word 1 of bi-grams
n1wi <- bi_words[, .(N = .N), by = word_1]
setkey(n1wi, word_1)
# Assigning total times word 1 occured to bigram cn1
bi_words[, Cn1 := uni_words[word_1, count]]
# Kneser Kney Algorithm
bi_words[, Prob := ((count - discount_value) / Cn1 + discount_value / Cn1 *
n1wi[word_1, N] * uni_words[word_2, Prob])]
######## End of Finding Bi-Gram Probability #################
######## Finding Tri-Gram Probability #################
# Finding count of word1-word2 combination in bigram
tri_words[, Cn2 := bi_words[.(word_1, word_2), .N]]
n1w12 <- tri_words[, .N, by = .(word_1, word_2)]
setkey(n1w12, word_1, word_2)
# Kneser Kney Algorithm
tri_words[, Prob := ((count - discount_value) / Cn2 + discount_value / Cn2 *
n1w12[.(word_1, word_2), .N] * bi_words[.(word_1, word_2), Prob])]
在这里,我得到以下关于三元组 Kneser 算法的错误:
Error in `[.data.table`(tri_words, , `:=`(Prob, ((count - discount_value)/Cn2 + :
Supplied 13867 items to be assigned to 3932 items of column 'Prob'. If you wish to 'recycle'
the RHS please use rep() to make this intent clear to readers of your code.
In addition: Warning messages:
1: In discount_value/Cn2 * n1w12[list(word_1, word_2), .N] * bi_words[list(word_1, :
longer object length is not a multiple of shorter object length
2: In (count - discount_value)/Cn2 + discount_value/Cn2 * n1w12[list(word_1, :
longer object length is not a multiple of shorter object length
我可以找到一些与数据表错误相关的类似问题,但我不明白我应该如何在代码中解决这个错误。