
I am experimenting with tidytext (text mining with R) and I want to use the functions pairwise_count and pairwise_cor from the widyr package. My corpus comes from a processed text file.

library(readr)
library(dplyr)
library(tidytext)
library(widyr)

set.seed(2017)

Korpus <- read_file("/home/knecht/korpus.res")
print(Korpus)

Korpus_DF <- data_frame(document = 1, text = Korpus)

spon_words <- Korpus_DF %>%
  unnest_tokens(word, text)
print(spon_words)

spon_words %>%
  count(word, sort=TRUE)

word_cors <- spon_words %>%
  group_by(word) %>%
  filter(n() >= 10) %>%
  pairwise_cor(word, document, sort = TRUE, upper = FALSE)
word_cors

pair_test <- spon_words %>%
  pairwise_count(word, document)
print(pair_test)

I think I am not getting correct results, because the corpus contains several phrases such as "spiegel online" or "spiegel plus", yet these do not appear in the result table:

> library(readr)

> library(dplyr)

> library(tidytext)

> library(widyr)

> set.seed(2017)

> Korpus <- read_file("/home/knecht/korpus.res")

> print(Korpus)
[1] "29.12.2017 17:24:57 Results of ResultWriter 'Write as Text' [1]: \n29.12.2017 17:24:57 SimpleExampleSet:\n1 examples,\n0 regular attributes,\nspecial attributes = {\n    text = #0: text (text/single_value)/values=[SPIEGEL ONLINE Aktuelle Nachrichten Nachrichten SPIEGEL ONLINE Mein SPIEGEL 29. Dezember 2017 TV-Programm Wetter Schlagzeilen Themenwochen Wahl Verbraucher Service Unternehmen Märkte Staat Soziales LOTTO 6aus49 Spielerindex SPIX Champions League Formel Bundesliga präsentiert von Continental Uno-Klimakonferenz 2017 Diagnose Therapie Ernährung Fitness Sex Partnerschaft Schwangerschaft Kind Erster Weltkrieg Zweiter Weltkrieg Leben und Lernen Deals der Woche IAA 2017 Front Page SPIEGEL Plus SPIEGEL Plus Deutschland SPIEGEL Plus Wirtschaft SPIEGEL Plus Gesellschaft SPIEGEL Plus Ausland SPIEGEL Plus Sport SPIEGEL Plus Wissenschaft SPIEGEL Plus Kultur SPIEGEL AKADEMIE DER SPIEGEL live DER SPIEGEL DER SPIEGEL digitales Magazin Titelbilder Heftarchive SPIEGEL SPIEGEL Magazin SPIE... <truncated>

> Korpus_DF <-data_frame(document= 1, text=Korpus)

> spon_words <- Korpus_DF %>%
+   unnest_tokens(word, text)

> print(spon_words)
# A tibble: 3,267 x 2
   document         word
      <dbl>        <chr>
 1        1   29.12.2017
 2        1           17
 3        1           24
 4        1           57
 5        1      results
 6        1           of
 7        1 resultwriter
 8        1        write
 9        1           as
10        1         text
# ... with 3,257 more rows

> spon_words %>%
+   count(word, sort=TRUE)
# A tibble: 1,645 x 2
      word     n
     <chr> <int>
 1    mehr    84
 2     die    78
 3     und    75
 4     der    63
 5 spiegel    58
 6     von    35
 7     sie    32
 8     das    31
 9     ein    31
10     für    31
# ... with 1,635 more rows

> word_cors <- spon_words %>%
+   group_by(word) %>%
+  filter(n()>= 10) %>%
+   pairwise_cor(word, document, sort = TRUE, upper= FALSE)

> word_cors
# A tibble: 561 x 3
     item1  item2 correlation
     <chr>  <chr>       <dbl>
 1 spiegel online         NaN
 2 spiegel   2017         NaN
 3  online   2017         NaN
 4 spiegel    von         NaN
 5  online    von         NaN
 6    2017    von         NaN
 7 spiegel    und         NaN
 8  online    und         NaN
 9    2017    und         NaN
10     von    und         NaN
# ... with 551 more rows

> pair_test <- spon_words %>%
+   pairwise_count(word, document)

> print(pair_test)
# A tibble: 2,704,380 x 3
          item1      item2     n
          <chr>      <chr> <dbl>
 1           17 29.12.2017     1
 2           24 29.12.2017     1
 3           57 29.12.2017     1
 4      results 29.12.2017     1
 5           of 29.12.2017     1
 6 resultwriter 29.12.2017     1
 7        write 29.12.2017     1
 8           as 29.12.2017     1
 9         text 29.12.2017     1
10            1 29.12.2017     1
# ... with 2,704,370 more rows

Could anyone here give me a hint, please?

Regards, Tobias


1 Answer


I notice that you have the same value of document for all of your words, which makes counting word pairs or trying to calculate correlations not very meaningful.
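A quick way to see this on the data from the question (a minimal sketch, not part of the original answer, assuming the spon_words tibble shown above) is to count the distinct values of document. With only one document, every word has an identical presence pattern, so pairwise_cor() has no variation to correlate and returns NaN for every pair.

spon_words %>%
    summarise(n_documents = n_distinct(document))
# every row has document == 1, so this reports a single distinct document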

Here is an example to show you what I mean. Let's take Jane Austen's novels and set up a tidy data frame with two columns: a document column that always has the value 1, like yours, and a section column that divides the text into chunks.

library(dplyr)
library(janeaustenr)
library(tidytext)
library(widyr)

austen_section_words <- austen_books() %>%
    filter(book == "Pride & Prejudice") %>%
    mutate(section = row_number() %/% 10,
           document = 1) %>%
    filter(section > 0) %>%
    unnest_tokens(word, text) %>%
    filter(!word %in% stop_words$word)

austen_section_words
#> # A tibble: 37,240 x 4
#>    book              section document word        
#>    <fctr>              <dbl>    <dbl> <chr>       
#>  1 Pride & Prejudice    1.00     1.00 truth       
#>  2 Pride & Prejudice    1.00     1.00 universally 
#>  3 Pride & Prejudice    1.00     1.00 acknowledged
#>  4 Pride & Prejudice    1.00     1.00 single      
#>  5 Pride & Prejudice    1.00     1.00 possession  
#>  6 Pride & Prejudice    1.00     1.00 fortune     
#>  7 Pride & Prejudice    1.00     1.00 wife        
#>  8 Pride & Prejudice    1.00     1.00 feelings    
#>  9 Pride & Prejudice    1.00     1.00 views       
#> 10 Pride & Prejudice    1.00     1.00 entering    
#> # ... with 37,230 more rows

Both of these columns have the value 1 at the start, but section goes on to take many other values while document stays at 1 the whole time. If we try to compare sets of words using widyr::pairwise_count() or widyr::pairwise_cor(), we get very different results depending on which of these two columns we use. In the first case, we are asking, "How often are these words used together within the sections I defined?" In the second case, we are asking, "How often are these words used together within the whole document?" And the answer to that is, by definition, 1 for all the words.

word_pairs <- austen_section_words %>%
    pairwise_count(word, section, sort = TRUE)

word_pairs
#> # A tibble: 796,008 x 3
#>    item1     item2         n
#>    <chr>     <chr>     <dbl>
#>  1 darcy     elizabeth 144  
#>  2 elizabeth darcy     144  
#>  3 miss      elizabeth 110  
#>  4 elizabeth miss      110  
#>  5 elizabeth jane      106  
#>  6 jane      elizabeth 106  
#>  7 miss      darcy      92.0
#>  8 darcy     miss       92.0
#>  9 elizabeth bingley    91.0
#> 10 bingley   elizabeth  91.0
#> # ... with 795,998 more rows

word_pairs <- austen_section_words %>%
    pairwise_count(word, document, sort = TRUE)

word_pairs
#> # A tibble: 36,078,042 x 3
#>    item1         item2     n
#>    <chr>         <chr> <dbl>
#>  1 universally   truth  1.00
#>  2 acknowledged  truth  1.00
#>  3 single        truth  1.00
#>  4 possession    truth  1.00
#>  5 fortune       truth  1.00
#>  6 wife          truth  1.00
#>  7 feelings      truth  1.00
#>  8 views         truth  1.00
#>  9 entering      truth  1.00
#> 10 neighbourhood truth  1.00
#> # ... with 36,078,032 more rows

So I think you need to step back and reconsider the analytical question you want to answer. Do you want to identify bigrams? Do you want to see which words are used near each other more often? You will need to change your approach depending on what you want to achieve.
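Depending on which of those questions you care about, two possible directions (a minimal sketch, not part of the original answer, assuming the Korpus_DF tibble from the question) would be to tokenize into bigrams, so that phrases like "spiegel online" become countable units, or to split the single document into artificial sections and count co-occurrence within those sections, as in the Jane Austen example above.

library(dplyr)
library(tidytext)
library(widyr)

# Option 1: tokenize into bigrams so two-word phrases can be counted directly
spon_bigrams <- Korpus_DF %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    count(bigram, sort = TRUE)

# Option 2: cut the single document into artificial sections of ten words each
# and count which words appear together within the same section
spon_sections <- Korpus_DF %>%
    unnest_tokens(word, text) %>%
    mutate(section = row_number() %/% 10)

word_pairs <- spon_sections %>%
    pairwise_count(word, section, sort = TRUE)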

Answered on 2018-01-01 at 22:23:11