0

我刚刚开始使用 R 进行文本挖掘并且遇到了一个问题。

我已经成功地为数据集中的单个单词绘制了 tf_idf 图表。数据集按名为“Box”的列分为 3 个不同的类别(正面、负面和银行)。

我正在尝试对二元组和三元组做同样的事情,并使用相同的代码:

# Plot the highest tf-idf trigrams per Box value as faceted horizontal bars.
# Reads the columns trigram, Box and tf_idf from Trigram_tibble.
Trigram_tibble %>%
  # Sort so the largest tf_idf scores come first.
  arrange(desc(tf_idf)) %>%
  # Turn trigram into a factor whose levels are the reversed order of first
  # appearance after sorting; this fixes the order of the bars on the axis.
  mutate(trigram = factor(trigram, levels = rev(unique(trigram)))) %>% 
  group_by(Box) %>% 
  # Keep the rows ranked in the top 10 by tf_idf within each Box.
  # NOTE(review): top_n() appears to keep every row tied at the cut-off, so a
  # Box with many identical tf_idf scores may return far more than 10 rows;
  # slice_max(tf_idf, n = 10, with_ties = FALSE) would cap it -- confirm.
  top_n(10, tf_idf) %>% 
  ungroup %>%
  ggplot(aes(trigram, tf_idf, fill = Box)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  # One panel per Box; scales = "free" lets each panel use its own axes.
  facet_wrap(~Box, ncol = 2, scales = "free") +
  # Flip the axes so the trigram labels run down the vertical axis.
  coord_flip()

我发现(我认为)'top_n' 函数返回排名最高的三元组,并且它会自动使用 tibble 中的最后一个变量(在我的情况下是 tf_idf,我选择了 n = 10)。但是,当对二元组运行这段代码时,我只能得到沿 y 轴似乎有数百(甚至上千??)个二元组的图表。

tf-idf 按变量分组

在图片中,您可以看到“负面”这一组看起来不错(我已对其进行了编辑以保护数据),但其他两组却不正常!

这段代码最初取自 Tidy Text Mining 一书(《Text Mining with R: A Tidy Approach》)。

编辑 - 添加数据样本

样本数据

我现在最好的猜测是:'top_n' 选出的 tf_idf 分数中恰好有许多完全相同(并列)。如果是这样,我不确定这个计算是否还有用;我也想知道为什么它在 Tidy Text Mining 一书中运行得这么好,却不适用于我的数据。

编辑 2

我将 Trigram_tibble 减少到 50 个观察值,下面是 dput 的输出(我已对调查回复文本中的三元组做了匿名化处理)

a<-Trigram_tibble [1:50, 1:8] dput(a) structure(list(Respondent = c(1294L, 2693L, 42L, 463L, 463L, 1481L, 1706L, 1891L, 1917L, 2442L, 2693L, 3590L, 3590L , 3916L, 4454L, 4682L, 5996L, 6283L, 6283L, 6568L, 9101L, 2L, 3L, 4L, 4L, 4L, 8L, 12L, 12L, 13L, 13L, 13L, 13L, 13L, 13L, 8L, 13, , 18L, 18L, 18L, 20L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L ), Box = c("正", "负", "负", "负", "负", “负”、“银行”、“正”、“负”、“负”、“负”、“银行”、“银行”、“负”、“正”、“负”、“负”、“负” ", "负", "负", "负", "负", "负",“负”、“负”、“负”、“负”、“银行”、“银行”、“负”、“负”、“负”、“负”、“负”、“正”、“正” "、"正"、"负"、"负"、"负"、"负"、"负"、"银行"、"银行"、"银行"、"负"、"负"、"负"、 "负", "负"), trigram = c("xxx xxx xxx", "xxx xxx xxx", "xxx xxx xxx", "xxx xxx xxx", "xxx xxx xxx", "xxx xxx xxx", " xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx” xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx” 、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“ xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx xxx”、“xxx xxx” xxx", "xxx xxx xxx”, “xxx xxx xxx”, “xxx xxx xxx”, “xxx xxx xxx”, “xxx xxx xxx”, “xxx xxx xxx”), n = c(4L, 3L, 2L, 2L, 2L, 2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,2L,1L,1L,1L,1L,1L,1L,1L,1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),总计 = c(1714L, 2899L, 2899L , 2899L, 2899L, 2899L, 836L, 1714L, 2899L, 2899L, 2899L, 836L, 836L, 2899L, 1714L, 2899L, 2899L, 2899L, 2899L, 2899L, 2899L, 2899L, 2899L, 2899L, 2899L, 2899L, 2899L, 836L , 836L, 2899L, 2899L, 2899L, 2899L, 2899L, 1714L, 1714L, 1714L, 2899L, 2899L, 2899L, 2899L, 2899L, 836L, 836L, 836L, 2899L, 2899L, 2899L, 2899L, 2899L), tf = c( 0.00233372228704784, 0.00103483959986202, 0.000689893066574681, 0.000689893066574681, 0.000689893066574681, 0.000689893066576.00239234449760766, 
0.00116686114352392, 0.000689893066574681, 0.000689893066574681, 0.000689893066574681, 0.00239234449760766, 0.00239234449760766, 0.000689893066574681, 0.00116686114352392, 0.000689893066574681, 0.000689893066574681, 0.000689893066574681, 0.000689893066574681, 0.000689893066574681, 0.000689893066574681, 0.00034494653328734, 0.00034494653328734, 0.00034494653328734, 0.00034494653328734, 0.00034494653328734, 0.00034494653328734, 0.00119617224880383, 0.00119617224880383, 0.00034494653328734, 0.00034494653328734, 0.00034494653328734, 0.00034494653328734, 0.00034494653328734, 0.00058343057176196, 0.00058343057176196, 0.00058343057176196, 0.00034494653328734, 0.00034494653328734, 0.00034494653328734, 0.00034494653328734, 0.00034494653328734, 0.00119617224880383, 0.00119617224880383, 0.00119617224880383, 0.00034494653328734, 0.00034494653328734, 0.00034494653328734, 0.00034494653328734, 0.00034494653328734), idf = c(-2.07944154167984, 0.405465108108164, 1.09861228866811, 1.09861228866811, 1.09861228866811, 1.09861228866811, 0, 1.09861228866811, 1.09861228866811, -2.07944154167984, 1.09861228866811, 0.405465108108164, 1.09861228866811, -0.693147180559945, 1.09861228866811, 1.09861228866811, 0, 1.09861228866811, 1.09861228866811, -1.29928298413026, 1.09861228866811, 1.09861228866811, 1.09861228866811, 1.09861228866811, 1.09861228866811, 1.09861228866811, 1.09861228866811, 1.09861228866811, 1.09861228866811, 1.09861228866811, 1.09861228866811, 1.09861228866811, 1.09861228866811, 1.09861228866811, 1.09861228866811, 1.09861228866811, 1.09861228866811, 1.09861228866811, 1.09861228866811, 1.09861228866811, 1.09861228866811, 1.09861228866811 , 1.09861228866811, 1.09861228866811,1.09861228866811, 1.09861228866811, 1.09861228866811, 1.09861228866811, 1.09861228866811, 1.09861228866811), tf_idf = c(-0.00485283907043135, 0.000419591350232664, 0.000757925000805871, 0.000757925000805871, 0.000757925000805871, 0.000757925000805871, 0, 0.0012819279914447, 0.000757925000805871, 
-0.00143459230195228, 0.000757925000805871, 0.00097001222035446, 0.00262825906379931, -0.000478197433984095, 0.0012819279914447, 0.000757925000805871, 0, 0.000757925000805871, 0.000757925000805871, -0.000896366322269928, 0.000757925000805871, 0.000378962500402935, 0.000378962500402935, 0.000378962500402935, 0.000378962500402935, 0.000378962500402935, 0.000378962500402935, 0.00131412953189965, 0.00131412953189965, 0.000378962500402935, 0.000378962500402935, 0.000378962500402935, 0.000378962500402935, 0.000378962500402935, 0.000640963995722351, 0.000640963995722351, 0.000640963995722351, 0.000378962500402935, 0.000378962500402935, 0.000378962500402935, 0.000378962500402935, 0.000378962500402935, 0.00131412953189965, 0.00131412953189965, 0.00131412953189965, 0.000378962500402935, 0.000378962500402935, 0.000378962500402935, 0.000378962500402935, 0.000378962500402935)), row.names = c(NA, -50L), class = c("tbl_df ", "tbl", "data.frame"))

4

0 回答 0