0

我想从这个链接中收集诗歌并保存为 txt 文件,具体要求如下:

  1. 用诗人的名字创建文件夹,
  2. 逐一点击红圈内的诗歌,并将每首诗保存为 txt 格式,
  3. 文件名应为诗歌标题加上 txt 扩展名

在此处输入图像描述

我是 R 网络爬虫的新手,有人可以帮忙吗?我会感谢您的建议或帮助。

代码:

library(Rcrawler)
library(rvest)

# Crawl the site starting at the top-poems page, using 4 cores and
# 4 simultaneous connections, and honouring the site's robots.txt.
Rcrawler(
  Website = "http://famouspoetsandpoems.com/top_poems.html",
  no_cores = 4,
  no_conn = 4,
  Obeyrobots = TRUE
)

# Fetch the top-poems page on its own and extract its links.
# `ExternalLInks` is the argument name as spelled by the Rcrawler package.
page <- LinkExtractor(
  url = "http://famouspoetsandpoems.com/top_poems.html",
  ExternalLInks = TRUE
)

# Print the links that point inside the same site.
page[["InternalLinks"]]

输出:

  [1] "http://famouspoetsandpoems.com/"                                      
  [2] "http://famouspoetsandpoems.com/poets.html"                            
  [3] "http://famouspoetsandpoems.com/month_poem.html"                       
  [4] "http://famouspoetsandpoems.com/month_poet.html"                       
  [5] "http://famouspoetsandpoems.com/top_poems.html"                        
  [6] "http://famouspoetsandpoems.com/poets_quotes.html"                     
  [7] "http://famouspoetsandpoems.com/love_poems.html"                       
  [8] "http://famouspoetsandpoems.com/thematic_poems.html"                   
  [9] "http://famouspoetsandpoems.com/thematic_quotes.html"                  
 [10] "http://famouspoetsandpoems.com/thematic_poems/birthday_poems.html"    
 [11] "http://famouspoetsandpoems.com/thematic_poems/death_poems.html"       
 [12] "http://famouspoetsandpoems.com/thematic_poems/mother_poems.html"      
 [13] "http://famouspoetsandpoems.com/thematic_poems/family_poems.html"      
 [14] "http://famouspoetsandpoems.com/thematic_poems/thank_you_poems.html"   
 [15] "http://famouspoetsandpoems.com/thematic_poems/sympathy_poems.html"    
 [16] "http://famouspoetsandpoems.com/thematic_poems/retirement_poems.html"  
 [17] "http://famouspoetsandpoems.com/thematic_poems/sorry_poems.html"       
 [18] "http://famouspoetsandpoems.com/thematic_poems/angel_poems.html"       
 [19] "http://famouspoetsandpoems.com/thematic_poems/relationship_poems.html"
 [20] "http://famouspoetsandpoems.com/poets/langston_hughes"                 
 [21] "http://famouspoetsandpoems.com/poets/shel_silverstein"                
 [22] "http://famouspoetsandpoems.com/poets/pablo_neruda"                    
 [23] "http://famouspoetsandpoems.com/poets/maya_angelou"                    
 [24] "http://famouspoetsandpoems.com/poets/edgar_allan_poe"                 
 [25] "http://famouspoetsandpoems.com/poets/robert_frost"                    
 [26] "http://famouspoetsandpoems.com/poets/emily_dickinson"                 
 [27] "http://famouspoetsandpoems.com/poets/elizabeth_barrett_browning"      
 [28] "http://famouspoetsandpoems.com/poets/e__e__cummings"                  
 [29] "http://famouspoetsandpoems.com/poets/walt_whitman"                    
 [30] "http://famouspoetsandpoems.com/poets/william_wordsworth"              
 [31] "http://famouspoetsandpoems.com/poets/allen_ginsberg"                  
 [32] "http://famouspoetsandpoems.com/poets/sylvia_plath"                    
 [33] "http://famouspoetsandpoems.com/poets/jack_prelutsky"                  
 [34] "http://famouspoetsandpoems.com/poets/william_butler_yeats"            
 [35] "http://famouspoetsandpoems.com/poets/thomas_hardy"                    
 [36] "http://famouspoetsandpoems.com/poets/robert_hayden"                   
 [37] "http://famouspoetsandpoems.com/poets/amy_lowell"                      
 [38] "http://famouspoetsandpoems.com/poets/oscar_wilde"                     
 [39] "http://famouspoetsandpoems.com/poets/theodore_roethke"                
 [40] "http://famouspoetsandpoems.com/poets_by_nationality.html"             
 [41] "http://famouspoetsandpoems.com/poets_african_american.html"           
 [42] "http://famouspoetsandpoems.com/poets_women.html"                      
 [43] "http://famouspoetsandpoems.com/poets_contemporary.html"               
 [44] "http://famouspoetsandpoems.com/poets_nobel_prize.html"                
 [45] "http://famouspoetsandpoems.com/country/America/American_poets.html"   
 [46] "http://famouspoetsandpoems.com/country/England/English_poets.html"    
 [47] "http://famouspoetsandpoems.com/poets/maya_angelou/poems/492"          
 [48] "http://famouspoetsandpoems.com/poets/shel_silverstein/poems/14836"    
 [49] "http://famouspoetsandpoems.com/poets/pablo_neruda/poems/15705"        
 [50] "http://famouspoetsandpoems.com/poets/e__e__cummings/poems/14130"      
 [51] "http://famouspoetsandpoems.com/poets/robert_frost/poems/528"          
 [52] "http://famouspoetsandpoems.com/poets/edgar_allan_poe/poems/18847"     
 [53] "http://famouspoetsandpoems.com/poets/emily_dickinson/poems/5212"      
 [54] "http://famouspoetsandpoems.com/poets/langston_hughes/poems/16946"     
 [55] "http://famouspoetsandpoems.com/poets/ezra_pound/poems/18774"          
 [56] "http://famouspoetsandpoems.com/poets/ezra_pound"                      
 [57] "http://famouspoetsandpoems.com/poets/shel_silverstein/poems/14818"    
 [58] "http://famouspoetsandpoems.com/poets/oscar_wilde/poems/11040"         
 [59] "http://famouspoetsandpoems.com/poets/maya_angelou/poems/482"          
 [60] "http://famouspoetsandpoems.com/poets/langston_hughes/poems/16944"     
 [61] "http://famouspoetsandpoems.com/poets/walt_whitman/poems/17543"        
 [62] "http://famouspoetsandpoems.com/poets/robert_frost/poems/530"          
 [63] "http://famouspoetsandpoems.com/poets/william_wordsworth/poems/10951"  
 [64] "http://famouspoetsandpoems.com/poets/mark_strand/poems/11833"         
 [65] "http://famouspoetsandpoems.com/poets/mark_strand"                     
 [66] "http://famouspoetsandpoems.com/poets/w__h__auden/poems/10095"         
 [67] "http://famouspoetsandpoems.com/poets/w__h__auden"                     
 [68] "http://famouspoetsandpoems.com/poets/maya_angelou/poems/496"          
 [69] "http://famouspoetsandpoems.com/poets/edgar_allan_poe/poems/18848"     
 [70] "http://famouspoetsandpoems.com/poets/dylan_thomas/poems/11395"        
 [71] "http://famouspoetsandpoems.com/poets/dylan_thomas"                    
 [72] "http://famouspoetsandpoems.com/poets/ogden_nash/poems/19570"          
 [73] "http://famouspoetsandpoems.com/poets/ogden_nash"                      
 [74] "http://famouspoetsandpoems.com/poets/shel_silverstein/poems/14820"    
 [75] "http://famouspoetsandpoems.com/poets/emily_dickinson/poems/6104"      
 [76] "http://famouspoetsandpoems.com/poets/edgar_allan_poe/poems/18849"     
 [77] "http://famouspoetsandpoems.com/poets/e__e__cummings/poems/14135"      
 [78] "http://famouspoetsandpoems.com/poets/anna_akhmatova/poems/31"         
 [79] "http://famouspoetsandpoems.com/poets/anna_akhmatova"                  
 [80] "http://famouspoetsandpoems.com/poets/pablo_neruda/poems/15708"        
 [81] "http://famouspoetsandpoems.com/poets/seamus_heaney/poems/12699"       
 [82] "http://famouspoetsandpoems.com/poets/seamus_heaney"                   
 [83] "http://famouspoetsandpoems.com/poets/william_butler_yeats/poems/10173"
 [84] "http://famouspoetsandpoems.com/poets/william_barnes/poems/20551"      
 [85] "http://famouspoetsandpoems.com/poets/william_barnes"                  
 [86] "http://famouspoetsandpoems.com/poets/ted_kooser/poems/17900"          
 [87] "http://famouspoetsandpoems.com/poets/ted_kooser"                      
 [88] "http://famouspoetsandpoems.com/poets/gwendolyn_brooks/poems/4176"     
 [89] "http://famouspoetsandpoems.com/poets/gwendolyn_brooks"                
 [90] "http://famouspoetsandpoems.com/poets/sylvia_plath/poems/18897"        
 [91] "http://famouspoetsandpoems.com/poets/jack_prelutsky/poems/18767"      
 [92] "http://famouspoetsandpoems.com/poets/sara_teasdale/poems/17949"       
 [93] "http://famouspoetsandpoems.com/poets/sara_teasdale"                   
 [94] "http://famouspoetsandpoems.com/poets/charles_bukowski/poems/13062"    
 [95] "http://famouspoetsandpoems.com/poets/charles_bukowski"                
 [96] "http://famouspoetsandpoems.com/poets/allen_ginsberg/poems/8318"       
 [97] "http://famouspoetsandpoems.com/poets/robert_hayden/poems/4406"        
 [98] "http://famouspoetsandpoems.com/poets/william_shakespeare/poems/1317"  
 [99] "http://famouspoetsandpoems.com/poets/william_shakespeare"             
[100] "http://famouspoetsandpoems.com/poets/william_blake/poems/1002"        
[101] "http://famouspoetsandpoems.com/poets/william_blake"                   
[102] "http://famouspoetsandpoems.com/poets/sylvia_plath/poems/18899"        
[103] "http://famouspoetsandpoems.com/poets/jack_prelutsky/poems/18768"      
[104] "http://famouspoetsandpoems.com/poets/walt_whitman/poems/17466"        
[105] "http://famouspoetsandpoems.com/poets/robert_burns/poems/4971"         
[106] "http://famouspoetsandpoems.com/poets/robert_burns"                    
[107] "http://famouspoetsandpoems.com/poets/maya_angelou/poems/494"          
[108] "http://famouspoetsandpoems.com/poets/stephen_crane/poems/13266"       
[109] "http://famouspoetsandpoems.com/poets/stephen_crane"                   
[110] "http://famouspoetsandpoems.com/poets/raymond_carver/poems/4592"       
[111] "http://famouspoetsandpoems.com/poets/raymond_carver"                  
[112] "http://famouspoetsandpoems.com/poets/e__e__cummings/poems/14131"      
[113] "http://famouspoetsandpoems.com/poets/langston_hughes/poems/16947"     
[114] "http://famouspoetsandpoems.com/about_project.html"                    
[115] "http://famouspoetsandpoems.com/privacy_policy.html"                   
[116] "http://famouspoetsandpoems.com/copyright_notice.html"                 
[117] "http://famouspoetsandpoems.com/links_poetry.html"                     
[118] "http://famouspoetsandpoems.com/link_to_us.html"                       
[119] "http://famouspoetsandpoems.com/tell_a_friend.html"                    
[120] "http://famouspoetsandpoems.com/contact_us.html"
4

1 回答 1

1

这需要用到不少零散的知识点,我认为初学者很难把它们串联起来。所以下面直接给出代码,并在注释中做了说明:

library(rvest)
library(dplyr)

# Download the "top poems" index page once; it is reused below for both the
# table contents and the per-title link lookup.
pg <- read_html("http://famouspoetsandpoems.com/top_poems.html")

# Quote an arbitrary string as an XPath 1.0 string literal.
# XPath has no escape character, so a value containing both quote kinds has
# to be assembled with concat().
xpath_literal <- function(s) {
  if (!grepl('"', s, fixed = TRUE)) {
    return(paste0('"', s, '"'))
  }
  if (!grepl("'", s, fixed = TRUE)) {
    return(paste0("'", s, "'"))
  }
  # Appending a delimiter keeps a trailing empty piece that strsplit() would
  # otherwise drop, so a title ending in a double quote is preserved.
  parts <- strsplit(paste0(s, '"'), '"', fixed = TRUE)[[1]]
  paste0("concat(", paste0('"', parts, '"', collapse = ", '\"', "), ")")
}

# One row per poem: rank (`top`), title (`poem`), author (`poet`) and the
# absolute URL of the poem page (`link`).
tbl <- pg %>%
  # The table holding the poems and poets is the second one with width 436.
  html_nodes(xpath = "//table[@width='436']") %>%
  .[[2]] %>%
  # There are blank rows between the poems' rows, hence fill = TRUE.
  html_table(fill = TRUE) %>%
  setNames(c("top", "poem", "poet")) %>%
  # Drop the blank rows.
  filter(!is.na(top)) %>%
  mutate(
    # For each title, find the first <a> inside a <td> whose text contains
    # the title and prefix its href with the site root.
    # vapply() guarantees a character vector even when the table is empty.
    link = vapply(
      poem,
      function(x) {
        href <- pg %>%
          html_node(
            xpath = paste0("//td/a[contains(., ", xpath_literal(x), ")]")
          ) %>%
          html_attr("href")
        paste0("http://famouspoetsandpoems.com", href)
      },
      character(1),
      USE.NAMES = FALSE
    )
  )

dir <- "~/poems" # base directory where the poems are saved

# Create one folder per poet. recursive = TRUE also creates `dir` itself when
# it does not exist yet (otherwise every dir.create() and every later write
# fails); showWarnings = FALSE keeps re-runs quiet when folders already exist.
for (poet in unique(tbl$poet)) {
  dir.create(file.path(dir, poet), recursive = TRUE, showWarnings = FALSE)
}

# Download each poem and write it to <dir>/<poet>/<title>.txt.
# seq_len() makes the loop a no-op when `tbl` has no rows.
for (i in seq_len(nrow(tbl))) {
  poem_content <-
    read_html(tbl$link[i]) %>% # read the poem page
    html_nodes(xpath = "//td/div[@style='padding-left:14px;padding-top:20px;font-family:Arial;font-size:13px;']/text()") %>%
    html_text(trim = TRUE) # one element per poem line

  if (length(poem_content) == 0) {
    warning("No poem text found at ", tbl$link[i], call. = FALSE)
  }

  # Titles may contain characters that are path separators or are not allowed
  # in file names on some systems (e.g. "/", "?", ":"); replace them with "_".
  file_name <- paste0(gsub('[/\\\\:*?"<>|]', "_", tbl$poem[i]), ".txt")
  file_path <- file.path(dir, tbl$poet[i], file_name)
  writeLines(poem_content, con = file_path)
}
于 2021-01-10T03:32:09.470 回答