我正在尝试处理一个 tf-idf 加权的语料库(我希望 tf 是词项在所属文档中所占的比例,而不是简单的出现次数)。我原本以为所有经典的文本挖掘库都会返回相同的值,但实际得到的结果却各不相同。是我的代码有错误(例如,是否需要对某个对象进行转置?),还是 tf-idf 计算的默认参数因包而异?
library(tm)
library(tidyverse)
library(tidytext)
library(quanteda)
# Two-document toy corpus: one row per document, with a document id (`doc`)
# and the raw text (`text`). Both columns are kept as character vectors.
df <- data.frame(
  doc = c("doc1", "doc2"),
  text = c(
    "the quick brown fox jumps over the lazy dog",
    "The quick brown foxy ox jumps over the lazy god"
  ),
  stringsAsFactors = FALSE
)
# Variant 1 (tidytext): tokenise, count terms per document, weight with
# bind_tf_idf(), then reshape to one row per document and one column per term.
# The printed values match tf = n / (tokens in the document) and
# idf = natural log of (n documents / n documents containing the term),
# e.g. (1 / 9) * log(2) = 0.0770 for "dog" in doc1.
df.count1 <- df %>%
  unnest_tokens(word, text) %>%
  count(doc, word) %>%
  bind_tf_idf(word, doc, n) %>%
  select(doc, word, tf_idf) %>%
  # names_sort = TRUE keeps the term columns in alphabetical order;
  # absent document/term pairs are filled with 0.
  pivot_wider(
    names_from = word,
    values_from = tf_idf,
    values_fill = 0,
    names_sort = TRUE
  )
# Variant 2 (tm): same per-document token counts, cast to a
# DocumentTermMatrix weighted with tm's weightTfIdf, then flattened to a
# data frame with documents as row names.
# The printed values match a length-normalised tf and a base-2 log idf,
# e.g. (1 / 9) * log2(2) = 0.1111 for "dog" in doc1.
df.count2 <- df %>%
  unnest_tokens(output = word, input = text) %>%
  count(doc, word) %>%
  cast_dtm(
    document = doc,
    term = word,
    value = n,
    weighting = weightTfIdf
  ) %>%
  as.matrix() %>%
  as.data.frame()
# Variant 3 (quanteda): same per-document token counts, cast to a dfm and
# weighted with dfm_tfidf() using its default arguments.
# The printed values match a raw-count tf and a base-10 log idf,
# e.g. 1 * log10(2) = 0.30103 for "dog" in doc1.
# NOTE(review): for a proportional tf, dfm_tfidf() takes a `scheme_tf`
# argument (e.g. "prop") and a `base` argument -- confirm against the
# installed quanteda version before relying on it.
df.count3 <- df %>%
  unnest_tokens(word, text) %>%
  count(doc, word) %>%
  cast_dfm(document = doc, term = word, value = n) %>%
  dfm_tfidf() %>%
  # Go through as.matrix() rather than calling as.data.frame() on the dfm
  # directly: the direct coercion is deprecated in quanteda, and this keeps
  # the result shaped like df.count2 (documents as row names).
  as.matrix() %>%
  as.data.frame()
> df.count1
# A tibble: 2 x 12
doc brown dog fox foxy god jumps lazy over ox quick the
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 doc1 0 0.0770 0.0770 0 0 0 0 0 0 0 0
2 doc2 0 0 0 0.0693 0.0693 0 0 0 0.0693 0 0
> df.count2
brown dog fox jumps lazy over quick the foxy god ox
doc1 0 0.1111111 0.1111111 0 0 0 0 0 0.0 0.0 0.0
doc2 0 0.0000000 0.0000000 0 0 0 0 0 0.1 0.1 0.1
> df.count3
brown dog fox jumps lazy over quick the foxy god ox
doc1 0 0.30103 0.30103 0 0 0 0 0 0.00000 0.00000 0.00000
doc2 0 0.00000 0.00000 0 0 0 0 0 0.30103 0.30103 0.30103