我正在尝试使用 4 个下拉菜单从以下网站抓取数据 - 单击每个下拉菜单后,它们会显示一个我想要从中抓取数据的表格。我想结合所有下拉菜单中所有表格的信息。
我正在使用 RSelenium 包,但由于我对网络抓取非常陌生,我不明白如何对这四个下拉菜单的所有可用选项进行循环,以得到合并后的最终表格。
https://hindi.iocl.com/lpgdistributors.aspx
我尝试了之前关于网页抓取的讨论并相应地修改了代码。
library(RSelenium)
library(rvest)
# Force-terminate every process whose image name is java.exe (Windows
# `taskkill`); presumably this clears a leftover Selenium server from an
# earlier run -- confirm, since it also kills unrelated Java programs.
system(
  "taskkill /im java.exe /f",
  intern = FALSE,
  ignore.stdout = FALSE
)

# Launch the Selenium driver with Firefox and keep a handle on the client.
rD <- rsDriver(browser = "firefox")
remDr <- rD[["client"]]

# Open the distributor search page.
remDr$navigate("https://hindi.iocl.com/lpgdistributors.aspx")
# Return the `value` attribute of every <option> inside the <select> whose
# id is `select_id`.
#
# The earlier approach called getPageSource() on the dropdown element, but a
# webElement inherits that method from the driver and it returns the source
# of the WHOLE page, so the regex picked up unrelated numbers. It also relied
# on stringr functions although stringr is never loaded. Reading the
# attribute from each <option> node avoids both problems and needs only
# base R.
#
# driver:    an RSelenium remote driver (e.g. `remDr`).
# select_id: id of the <select> element, without the leading "#".
# Returns a character vector (possibly empty) of non-empty option values.
get_option_values <- function(driver, select_id) {
  option_nodes <- driver$findElements(
    using = "css selector",
    value = paste0("#", select_id, " option")
  )
  values <- vapply(
    option_nodes,
    function(node) {
      raw <- node$getElementAttribute("value")
      if (length(raw) == 0L || is.null(raw[[1]])) {
        NA_character_
      } else {
        as.character(raw[[1]])
      }
    },
    character(1)
  )
  # Drop placeholder entries that carry no value.
  values[!is.na(values) & nzchar(values)]
}

# NOTE(review): the district / market / area lists are presumably filled in
# only after their parent dropdown has been chosen -- confirm on the live
# page. If so, call get_option_values() again after each selection inside
# the loop instead of relying on the vectors captured here.

# 1st dropdown: state
option_values <- get_option_values(remDr, "cmbState")
# 2nd dropdown: district
district_values <- get_option_values(remDr, "cmbDistrict")
# 3rd dropdown: market
market_values <- get_option_values(remDr, "cmbMarket")
# 4th dropdown: area
option_values_2 <- get_option_values(remDr, "cmbArea")
#### select one option in each dropdown, then submit
#### (wrap the four calls below in nested loops to visit every combination)

# Click the <option> with value `option_value` inside the <select> whose id
# is `select_id`, and return the clicked element invisibly.
#
# The XPath is scoped to the given <select>; an unscoped
# "//*/option[@value = 'X']" matches the first option with that value
# anywhere on the page, which may belong to a different dropdown.
#
# The lookup is retried until `timeout` seconds have passed, so a dropdown
# that is populated only after the previous selection (presumably via a
# postback -- confirm) has time to receive its options.
#
# driver:       an RSelenium remote driver (e.g. `remDr`).
# select_id:    id of the <select> element.
# option_value: value attribute of the option to click (must not contain
#               a single quote, since it is embedded in the XPath).
# timeout:      seconds to keep retrying before raising an error.
select_dropdown_option <- function(driver, select_id, option_value,
                                   timeout = 10) {
  xpath <- sprintf(
    "//select[@id='%s']/option[@value='%s']",
    select_id, option_value
  )
  deadline <- Sys.time() + timeout
  repeat {
    # findElements() returns an empty list instead of erroring when
    # nothing matches, which makes it suitable for polling.
    matches <- driver$findElements(using = "xpath", value = xpath)
    if (length(matches) > 0L) {
      break
    }
    if (Sys.time() >= deadline) {
      stop(
        "Option '", option_value, "' not found in dropdown '", select_id,
        "' within ", timeout, " seconds",
        call. = FALSE
      )
    }
    Sys.sleep(0.25)
  }
  matches[[1]]$clickElement()
  invisible(matches[[1]])
}

# Replace the literal values below with the loop variables.
option <- select_dropdown_option(remDr, "cmbState", "1")
option2 <- select_dropdown_option(remDr, "cmbDistrict", "185")
option3 <- select_dropdown_option(remDr, "cmbMarket", "2314")
option4 <- select_dropdown_option(remDr, "cmbArea", "57")

# click submit
submit <- remDr$findElement(using = "id", value = "btnSearch")
submit$clickElement()
# get table
# getPageSource() on a webElement returns the source of the whole page, so
# parsing it yields every table on the page. Take the element's own
# outerHTML instead so that only the results grid is parsed.
tb <- remDr$findElement(using = "id", value = "grdDistributors")
tb_tables <- tb$getElementAttribute("outerHTML")[[1]] %>%
  read_html() %>%
  html_table(fill = TRUE)

# The fragment's first table is the grid itself; any further entries would be
# tables nested inside it.
distributor_table <- tb_tables[[1]]
distributor_table