1

这是我需要的数据:

https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect =&photo=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0

我已经将表格导入到 R 中:

library(tidyverse)
library(rvest)

webpage <- read_html("https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0")

tbls <- html_nodes(webpage, "table")
tbls_ls <- webpage %>%
  html_nodes("table") %>%
  .[5] %>%
  html_table(fill = TRUE)

data = as.tibble(tbls_ls[[1]]) 

然而,我需要在表格中再添加一件事。对于某些陨石,有可用的氧同位素值。当点击“地块”部分下的陨石名称时,可以看到这一点。单击绘图时,我们会被重定向到具有三个同位素值的页面。我想要做的是在我的表中添加三列,其中包含每个陨石各自的同位素值。我尝试分别为每个“情节”部分编写代码,但我觉得可能有一个更优雅的解决方案。

4

1 回答 1

0

您可以在没有同位素的情况下抓取表格,然后在您决定使用同位素时模仿页面所做的发布请求;然后左加入两个Name列。您将获得比左表中更多的行(无同位素),因为有多个Change values,但这与您在查看您描述的同位素的方法中看到的内容相匹配,其中有逗号分隔的同位素值列表,在图中,而不是按行拆分。

我选择一个更具选择性的 css 选择器来最初定位感兴趣的特定表,而不是索引到列表中。

我用来 write_excel_csv在写出时保留标题的字符编码(我从@stefan得到的一个想法)。

joint_table您可以在写出之前删除输出中不需要的列(子集/选择等)。


r

library(dyplr)
library(httr)
library(rvest)
library(readr)
library(magrittr)
library(stringr)

webpage <- read_html("https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0")

no_isotopes <- webpage %>%
  html_node("#maintable") %>%
  html_table(fill = T) 

data <- list(
  'sfor' = "names",
  'stype' = "contains",
  'country' = "All",
  'categ' = "Ungrouped achondrites",
  'page' = "0",
  'map' = "ge",
  'srt' = "name",
  'lrec' = "200",
  'pnt' = "Oxygen isotopes",
  'mblist' = "All",
  'snew' = "0",
  'sea' = "*"
)


r <- httr::POST(url = "https://www.lpi.usra.edu/meteor/metbull.php", body = data)

isotopes <- content(r, "text") %>%
  read_html(encoding = "UTF-8") %>%
  html_node("#maintable") %>%
  html_table(fill = T)


joint_table <- dplyr::left_join(no_isotopes, isotopes, by = "Name", copy = FALSE)

write_excel_csv(x = joint_table, path = "joint.csv", col_names = T, na = "")

示例输出:

在此处输入图像描述


编辑:

根据您在评论中的要求添加来自其他网址的附加信息。我必须动态确定要选择哪个表号,以及处理没有表的情况。

library(tidyverse)
#> Warning: package 'tibble' was built under R version 4.0.3
#> Warning: package 'forcats' was built under R version 4.0.3
library(httr)
#> Warning: package 'httr' was built under R version 4.0.3
library(rvest)
#> Loading required package: xml2
#> Warning: package 'xml2' was built under R version 4.0.3
#> 
#> Attaching package: 'rvest'
#> The following object is masked from 'package:purrr':
#> 
#>     pluck
#> The following object is masked from 'package:readr':
#> 
#>     guess_encoding
library(readr)
library(furrr)

get_table <- function(url) {
  page <- read_html(url)
  test_list <- page %>%
    html_nodes("#maintable tr > .inside:nth-child(odd)") %>%
    html_text() # get left hand column %>%
  index <- match(TRUE, stringr::str_detect(test_list, "Data from:")) + 1
  table <- page %>%
    html_node(paste0("#maintable tr:nth-of-type(", index, ") table")) %>%
    html_table() %>%
    as_tibble()
  temp <- set_names(data.frame(t(table[, -1]), row.names = c()), t(table[, 1])) # https://www.nesono.com/node/456 ; https://stackoverflow.com/a/7970267/6241235
  return(temp)
}


start_url <- "https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0"
base <- "https://www.lpi.usra.edu"
webpage <- read_html(start_url)

no_isotopes <- webpage %>%
  html_node("#maintable") %>%
  html_table(fill = T)

data <- list(
  "sfor" = "names",
  "stype" = "contains",
  "country" = "All",
  "categ" = "Ungrouped achondrites",
  "page" = "0",
  "map" = "ge",
  "srt" = "name",
  "lrec" = "200",
  "pnt" = "Oxygen isotopes",
  "mblist" = "All",
  "snew" = "0",
  "sea" = "*"
)

r <- httr::POST(url = "https://www.lpi.usra.edu/meteor/metbull.php", body = data)

isotopes <- content(r, "text") %>%
  read_html(encoding = "UTF-8") %>%
  html_node("#maintable") %>%
  html_table(fill = T)

joint_table <- dplyr::left_join(no_isotopes, isotopes, by = "Name", copy = FALSE)

lookups <- webpage %>%
  html_node("#maintable") %>%
  html_nodes("td:nth-of-type(1) a") %>%
  map_df(~ c(html_text(.), html_attr(., "href")) %>%
    set_names("Name", "Link")) %>%
  mutate(Link = paste0(base, gsub("\\s+", "%20", Link)))

error_df <- tibble(
  `State/Prov/County:` = NA_character_,
  `Origin or pseudonym:` = NA_character_,
  `Date:` = NA_character_,
  `Latitude:` = NA_character_,
  `Longitude:` = NA_character_,
  `Mass (g):` = NA_character_,
  `Pieces:` = NA_character_,
  `Class:` = NA_character_,
  `Shock stage:` = NA_character_,
  `Fayalite (mol%):` = NA_character_,
  `Ferrosilite (mol%):` = NA_character_,
  `Wollastonite (mol%):` = NA_character_,
  `Magnetic suscept.:` = NA_character_,
  `Classifier:` = NA_character_,
  `Type spec mass (g):` = NA_character_,
  `Type spec location:` = NA_character_,
  `Main mass:` = NA_character_,
  `Finder:` = NA_character_,
  `Comments:` = NA_character_,
)

no_cores <- future::availableCores() - 1

future::plan(future::multisession, workers = no_cores)

df <- furrr::future_map_dfr(lookups$Link, ~ tryCatch(get_table(.x), error = function(e) error_df))

colnames(df) <- sub(":", "", colnames(df))

df2 <- df %>%
  mutate(
    `Mass (g)` = gsub(",", "", `Mass (g)`),
    across(c(`Mass (g)`, `Magnetic suscept.`), as.numeric)
  )

if (nrow(df2) == nrow(no_isotopes)) {
  additional_info <- cbind(lookups, df2)
  joint_table$Name <- gsub(" \\*\\*", "", joint_table$Name)
  final_table <- dplyr::left_join(joint_table, additional_info, by = "Name", copy = FALSE)
  write_excel_csv(x = final_table, file = "joint.csv", col_names = T, na = "")
}

reprex 包于 2021-02-27 创建(v0.3.0)


注意

由于某种原因,OP 在查找变量方面存在问题,所以这是我为他们编写的替代方法:

lookups <- map_df(
  webpage %>% html_node("#maintable") %>% html_nodes("td:nth-of-type(1) a") , ~
    data.frame(
      Name = .x %>% html_text(),
      Link =  paste0(base, gsub("\\s+", "%20", .x %>%  html_attr("href")))
    )
) %>% as_tibble()
于 2021-02-04T22:33:33.197 回答