我正在尝试在 R 中创建一个基于字符级二元组(bigram)的文档-特征矩阵(DFM)。代码的最后一行运行了很久,始终无法完成,而其他各行最多只需不到一分钟。我不知道该怎么办。如有任何建议,将不胜感激。
代码:
library(quanteda)
#Tokenise corpus by characters
character_level_tokens = quanteda::tokens(corpus,
what = "character",
remove_punct = T,
remove_symbols = T,
remove_numbers = T,
remove_url = T,
remove_separators = T,
split_hyphens = T)
#Convert tokens to characters
character_level_tokens = as.character(character_level_tokens)
#Keep A-Z, a-z letters
character_level_tokens = gsub("[^A-Za-z]","",character_level_tokens)
#Extract character-level bigrams
final_data_char_bigram = char_ngrams(character_level_tokens, n = 2L, concatenator = "")
#Create document-feature matrix (DFM)
dfm.final_data_char_bigram = dfm(final_data_char_bigram)
length(final_data_char_bigram)
[1] 37115571
head(final_data_char_bigram)
[1] "lo" "ov" "ve" "el" "ly" "yt"