
I am trying to do word-level prediction with the same code as in https://thiloshon.wordpress.com/2018/03/11/build-your-own-word-sentence-prediction-application-part-02/. The input text data is also linked in that post; I am using the en_US.news.txt file as my only input file.

library(quanteda)
library(data.table)

#read the .txt file
df <- readLines('en_US.news.txt')

#take a sample of the df
sampleHolderNews <- sample(length(df), length(df) * 0.1)
US_News_Sample <- df[sampleHolderNews]

#build the corpus of the data 
corp <- corpus(US_News_Sample)

#Preprocessing

master_Tokens <- tokens(x = tolower(corp), remove_punct = TRUE,
    remove_numbers = TRUE, remove_hyphens = TRUE, remove_symbols = TRUE)
stemed_words <- tokens_wordstem(master_Tokens, language = "english")

#n-gram generation#
bi_gram <- tokens_ngrams(stemed_words, n = 2)
tri_gram <- tokens_ngrams(stemed_words, n = 3)

uni_DFM <- dfm(stemed_words)
bi_DFM <- dfm(bi_gram)
tri_DFM <- dfm(tri_gram)

uni_DFM <- dfm_trim(uni_DFM, 3)
bi_DFM <- dfm_trim(bi_DFM, 3)
tri_DFM <- dfm_trim(tri_DFM, 3)

sums_U <- colSums(uni_DFM)
sums_B <- colSums(bi_DFM)
sums_T <- colSums(tri_DFM)

# Create data tables with individual words as columns
uni_words <- data.table(word_1 = names(sums_U), count = sums_U)

bi_words <- data.table(
    word_1 = sapply(strsplit(names(sums_B), "_", fixed = TRUE), '[[', 1),
    word_2 = sapply(strsplit(names(sums_B), "_", fixed = TRUE), '[[', 2),
    count = sums_B)

tri_words <- data.table(
    word_1 = sapply(strsplit(names(sums_T), "_", fixed = TRUE), '[[', 1),
    word_2 = sapply(strsplit(names(sums_T), "_", fixed = TRUE), '[[', 2),
    word_3 = sapply(strsplit(names(sums_T), "_", fixed = TRUE), '[[', 3),
    count = sums_T)

#indexing#
setkey(uni_words, word_1)
setkey(bi_words, word_1, word_2)
setkey(tri_words, word_1, word_2, word_3)

######## Finding Bi-Gram Probability #################

discount_value <- 0.75
# Finding number of bi-gram words
numOfBiGrams <- nrow(bi_words[.(word_1, word_2)])
# Dividing number of times word 2 occurs as second part of bigram, by total number of bigrams.  
# Finding probability for a word given the number of times it was second word of a bigram
ckn <- bi_words[, .(Prob = ((.N) / numOfBiGrams)), by = word_2]
setkey(ckn, word_2)
# Assigning the probabilities as second word of bigram, to unigrams
uni_words[, Prob := ckn[word_1, Prob]]
uni_words <- uni_words[!is.na(uni_words$Prob)]
# Finding number of times word 1 occurred as word 1 of bi-grams
n1wi <- bi_words[, .(N = .N), by = word_1]
setkey(n1wi, word_1)
# Assigning total times word 1 occurred to bigram column Cn1
bi_words[, Cn1 := uni_words[word_1, count]]
# Kneser-Ney algorithm
bi_words[, Prob := ((count - discount_value) / Cn1 + discount_value / Cn1 * 
n1wi[word_1, N] * uni_words[word_2, Prob])]

######## End of Finding Bi-Gram Probability #################

######## Finding Tri-Gram Probability #################

# Finding count of word1-word2 combination in bigram 
tri_words[, Cn2 := bi_words[.(word_1, word_2), .N]]
n1w12 <- tri_words[, .N, by = .(word_1, word_2)]
setkey(n1w12, word_1, word_2)

# Kneser-Ney algorithm
tri_words[, Prob := ((count - discount_value) / Cn2 + discount_value / Cn2 * 
n1w12[.(word_1, word_2), .N] * bi_words[.(word_1, word_2), Prob])]

Here I get the following error on the trigram Kneser-Ney step:

 Error in `[.data.table`(tri_words, , `:=`(Prob, ((count - discount_value)/Cn2 +  : 
 Supplied 13867 items to be assigned to 3932 items of column 'Prob'. If you wish to 'recycle' 
 the RHS please use rep() to make this intent clear to readers of your code.
 In addition: Warning messages:
 1: In discount_value/Cn2 * n1w12[list(word_1, word_2), .N] * bi_words[list(word_1,  :
 longer object length is not a multiple of shorter object length
 2: In (count - discount_value)/Cn2 + discount_value/Cn2 * n1w12[list(word_1,  :
 longer object length is not a multiple of shorter object length

I can find some similar questions related to this data.table error, but I don't understand how I should fix it in my code.


1 Answer


The problem is in the quantities you are trying to multiply in the last line. This expression:

(count - discount_value) / Cn2 + discount_value / Cn2

has length 20, the same as tri_words. But the next expression,

n1w12[.(word_1, word_2), .N]

has length 19. And then the last part,

bi_words[.(word_1, word_2), Prob])

has length 155 (and contains a lot of NAs).
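
You can verify this yourself by evaluating each piece inside the frame of tri_words and comparing lengths (a diagnostic sketch only, assuming the objects from your question are in scope; the exact numbers will depend on your sample):

# Diagnostic sketch: for := to recycle cleanly, each term of the RHS
# must have either one value per row of tri_words or length 1
tri_words[, .(
    len_main  = length((count - discount_value) / Cn2),
    len_n1w12 = length(n1w12[.(word_1, word_2), .N]),
    len_biPr  = length(bi_words[.(word_1, word_2), Prob]),
    n_rows    = .N
)]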

The error message says that the shorter items cannot be recycled into the longer ones, because the longer lengths are not multiples of the shorter lengths. To fix this, you need to implement the algorithm more carefully, so that every term on the right-hand side of := has exactly one value per row of tri_words; see the sketch below.
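
A minimal sketch of one way to restructure it, using keyed joins to materialize each quantity as a column of tri_words first (Nw12 and Pbi are helper column names I made up for this sketch). Note also that Kneser-Ney interpolates with the lower-order bigram (word_2, word_3), whereas the last line in the question looks up the prefix (word_1, word_2):

# Join each quantity into tri_words first, so every term has one value per row

# Cn2: count of the (word_1, word_2) prefix bigram
# (.N would give the number of matched rows, not the bigram's count)
tri_words[, Cn2 := bi_words[.(word_1, word_2), count]]

# Nw12: number of distinct trigrams starting with each (word_1, word_2),
# taken from the n1w12 table already built above
tri_words[, Nw12 := n1w12[.(word_1, word_2), N]]

# Pbi: Kneser-Ney probability of the lower-order bigram (word_2, word_3)
tri_words[, Pbi := bi_words[.(word_2, word_3), Prob]]

# All columns now have nrow(tri_words) values, so nothing is recycled
tri_words[, Prob := (count - discount_value) / Cn2 +
    (discount_value / Cn2) * Nw12 * Pbi]

Prefixes and bigrams that were trimmed out of bi_words come back from these joins as NA, so the resulting Prob column still needs an NA-handling/backoff step, which this sketch leaves out.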
