您可以在没有同位素的情况下抓取表格,然后在您决定使用同位素时模仿页面所做的发布请求;然后左加入两个Name
列。您将获得比左表中更多的行(无同位素),因为有多个Change values
,但这与您在查看您描述的同位素的方法中看到的内容相匹配,其中有逗号分隔的同位素值列表,在图中,而不是按行拆分。
我选择一个更具选择性的 css 选择器来最初定位感兴趣的特定表,而不是索引到列表中。
我用来 write_excel_csv
在写出时保留标题的字符编码(我从@stefan得到的一个想法)。
joint_table
您可以在写出之前删除输出中不需要的列(子集/选择等)。
r
library(dyplr)
library(httr)
library(rvest)
library(readr)
library(magrittr)
library(stringr)
webpage <- read_html("https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0")
no_isotopes <- webpage %>%
html_node("#maintable") %>%
html_table(fill = T)
data <- list(
'sfor' = "names",
'stype' = "contains",
'country' = "All",
'categ' = "Ungrouped achondrites",
'page' = "0",
'map' = "ge",
'srt' = "name",
'lrec' = "200",
'pnt' = "Oxygen isotopes",
'mblist' = "All",
'snew' = "0",
'sea' = "*"
)
r <- httr::POST(url = "https://www.lpi.usra.edu/meteor/metbull.php", body = data)
isotopes <- content(r, "text") %>%
read_html(encoding = "UTF-8") %>%
html_node("#maintable") %>%
html_table(fill = T)
joint_table <- dplyr::left_join(no_isotopes, isotopes, by = "Name", copy = FALSE)
write_excel_csv(x = joint_table, path = "joint.csv", col_names = T, na = "")
示例输出:
编辑:
根据您在评论中的要求添加来自其他网址的附加信息。我必须动态确定要选择哪个表号,以及处理没有表的情况。
library(tidyverse)
#> Warning: package 'tibble' was built under R version 4.0.3
#> Warning: package 'forcats' was built under R version 4.0.3
library(httr)
#> Warning: package 'httr' was built under R version 4.0.3
library(rvest)
#> Loading required package: xml2
#> Warning: package 'xml2' was built under R version 4.0.3
#>
#> Attaching package: 'rvest'
#> The following object is masked from 'package:purrr':
#>
#> pluck
#> The following object is masked from 'package:readr':
#>
#> guess_encoding
library(readr)
library(furrr)
get_table <- function(url) {
page <- read_html(url)
test_list <- page %>%
html_nodes("#maintable tr > .inside:nth-child(odd)") %>%
html_text() # get left hand column %>%
index <- match(TRUE, stringr::str_detect(test_list, "Data from:")) + 1
table <- page %>%
html_node(paste0("#maintable tr:nth-of-type(", index, ") table")) %>%
html_table() %>%
as_tibble()
temp <- set_names(data.frame(t(table[, -1]), row.names = c()), t(table[, 1])) # https://www.nesono.com/node/456 ; https://stackoverflow.com/a/7970267/6241235
return(temp)
}
start_url <- "https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0"
base <- "https://www.lpi.usra.edu"
webpage <- read_html(start_url)
no_isotopes <- webpage %>%
html_node("#maintable") %>%
html_table(fill = T)
data <- list(
"sfor" = "names",
"stype" = "contains",
"country" = "All",
"categ" = "Ungrouped achondrites",
"page" = "0",
"map" = "ge",
"srt" = "name",
"lrec" = "200",
"pnt" = "Oxygen isotopes",
"mblist" = "All",
"snew" = "0",
"sea" = "*"
)
r <- httr::POST(url = "https://www.lpi.usra.edu/meteor/metbull.php", body = data)
isotopes <- content(r, "text") %>%
read_html(encoding = "UTF-8") %>%
html_node("#maintable") %>%
html_table(fill = T)
joint_table <- dplyr::left_join(no_isotopes, isotopes, by = "Name", copy = FALSE)
lookups <- webpage %>%
html_node("#maintable") %>%
html_nodes("td:nth-of-type(1) a") %>%
map_df(~ c(html_text(.), html_attr(., "href")) %>%
set_names("Name", "Link")) %>%
mutate(Link = paste0(base, gsub("\\s+", "%20", Link)))
error_df <- tibble(
`State/Prov/County:` = NA_character_,
`Origin or pseudonym:` = NA_character_,
`Date:` = NA_character_,
`Latitude:` = NA_character_,
`Longitude:` = NA_character_,
`Mass (g):` = NA_character_,
`Pieces:` = NA_character_,
`Class:` = NA_character_,
`Shock stage:` = NA_character_,
`Fayalite (mol%):` = NA_character_,
`Ferrosilite (mol%):` = NA_character_,
`Wollastonite (mol%):` = NA_character_,
`Magnetic suscept.:` = NA_character_,
`Classifier:` = NA_character_,
`Type spec mass (g):` = NA_character_,
`Type spec location:` = NA_character_,
`Main mass:` = NA_character_,
`Finder:` = NA_character_,
`Comments:` = NA_character_,
)
no_cores <- future::availableCores() - 1
future::plan(future::multisession, workers = no_cores)
df <- furrr::future_map_dfr(lookups$Link, ~ tryCatch(get_table(.x), error = function(e) error_df))
colnames(df) <- sub(":", "", colnames(df))
df2 <- df %>%
mutate(
`Mass (g)` = gsub(",", "", `Mass (g)`),
across(c(`Mass (g)`, `Magnetic suscept.`), as.numeric)
)
if (nrow(df2) == nrow(no_isotopes)) {
additional_info <- cbind(lookups, df2)
joint_table$Name <- gsub(" \\*\\*", "", joint_table$Name)
final_table <- dplyr::left_join(joint_table, additional_info, by = "Name", copy = FALSE)
write_excel_csv(x = final_table, file = "joint.csv", col_names = T, na = "")
}
由reprex 包于 2021-02-27 创建(v0.3.0)
注意
由于某种原因,OP 在查找变量方面存在问题,所以这是我为他们编写的替代方法:
lookups <- map_df(
webpage %>% html_node("#maintable") %>% html_nodes("td:nth-of-type(1) a") , ~
data.frame(
Name = .x %>% html_text(),
Link = paste0(base, gsub("\\s+", "%20", .x %>% html_attr("href")))
)
) %>% as_tibble()