

#read the .txt file

#take a sample of the df
sampleHolderNews <- sample(length(df), length(df) * 0.1)
US_News_Sample <- df[sampleHolderNews]

#build the corpus of the data 
corp <- corpus(US_News_Sample)


master_Tokens <- tokens(x = tolower(corp),remove_punct = 
TRUE,remove_numbers = TRUE,remove_hyphens = TRUE,remove_symbols = TRUE)
stemed_words <- tokens_wordstem(master_Tokens, language = "english")

bi_gram <- tokens_ngrams(stemed_words, n = 2)
tri_gram <- tokens_ngrams(stemed_words, n = 3)

uni_DFM <- dfm(stemed_words)
bi_DFM <- dfm(bi_gram)
tri_DFM <- dfm(tri_gram)

uni_DFM <- dfm_trim(uni_DFM, 3)
bi_DFM <- dfm_trim(bi_DFM, 3)
tri_DFM <- dfm_trim(tri_DFM, 3)

sums_U <- colSums(uni_DFM)
sums_B <- colSums(bi_DFM)
sums_T <- colSums(tri_DFM)

# Create data tables with individual words as columns
uni_words <- data.table(word_1 = names(sums_U), count = sums_U)

bi_words <- data.table(
word_1 = sapply(strsplit(names(sums_B), "_", fixed = TRUE), '[[', 1),
word_2 = sapply(strsplit(names(sums_B), "_", fixed = TRUE), '[[', 2),
count = sums_B)

tri_words <- data.table(
word_1 = sapply(strsplit(names(sums_T), "_", fixed = TRUE), '[[', 1),
word_2 = sapply(strsplit(names(sums_T), "_", fixed = TRUE), '[[', 2),
word_3 = sapply(strsplit(names(sums_T), "_", fixed = TRUE), '[[', 3),
count = sums_T)

setkey(uni_words, word_1)
setkey(bi_words, word_1, word_2)
setkey(tri_words, word_1, word_2, word_3)

######## Finding Bi-Gram Probability #################

discount_value <- 0.75
# Finding number of bi-gram words
numOfBiGrams <- nrow(bi_words[.(word_1, word_2)])
# Dividing number of times word 2 occurs as second part of bigram, by total number of bigrams.  
# Finding probability for a word given the number of times it was second word of a bigram
ckn <- bi_words[, .(Prob = ((.N) / numOfBiGrams)), by = word_2]
setkey(ckn, word_2)
# Assigning the probabilities as second word of bigram, to unigrams
uni_words[, Prob := ckn[word_1, Prob]]
uni_words <- uni_words[!is.na(uni_words$Prob)]
# Finding number of times word 1 occurred as word 1 of bi-grams
n1wi <- bi_words[, .(N = .N), by = word_1]
setkey(n1wi, word_1)
# Assigning total times word 1 occured to bigram cn1
bi_words[, Cn1 := uni_words[word_1, count]]
# Kneser Kney Algorithm
bi_words[, Prob := ((count - discount_value) / Cn1 + discount_value / Cn1 * 
n1wi[word_1, N] * uni_words[word_2, Prob])]

######## End of Finding Bi-Gram Probability #################

######## Finding Tri-Gram Probability #################

# Finding count of word1-word2 combination in bigram 
tri_words[, Cn2 := bi_words[.(word_1, word_2), .N]]
n1w12 <- tri_words[, .N, by = .(word_1, word_2)]
setkey(n1w12, word_1, word_2)

# Kneser Kney Algorithm
tri_words[, Prob := ((count - discount_value) / Cn2 + discount_value / Cn2 * 
n1w12[.(word_1, word_2), .N] * bi_words[.(word_1, word_2), Prob])]

在这里,我得到以下关于三元组 Kneser 算法的错误:

 Error in `[.data.table`(tri_words, , `:=`(Prob, ((count - discount_value)/Cn2 +  : 
 Supplied 13867 items to be assigned to 3932 items of column 'Prob'. If you wish to 'recycle' 
 the RHS please use rep() to make this intent clear to readers of your code.
 In addition: Warning messages:
 1: In discount_value/Cn2 * n1w12[list(word_1, word_2), .N] * bi_words[list(word_1,  :
 longer object length is not a multiple of shorter object length
 2: In (count - discount_value)/Cn2 + discount_value/Cn2 * n1w12[list(word_1,  :
 longer object length is not a multiple of shorter object length



(count - discount_value) / Cn2 + discount_value / Cn2

长度为 20,如tri_words. 但接下来的表达

n1w12[.(word_1, word_2), .N]

长度为 19。然后是最后一部分,

bi_words[.(word_1, word_2), Prob])

长度为 155(并且包含很多 NA)。


