0

我正在尝试使用 4 个下拉菜单从以下网站抓取数据 - 单击每个下拉菜单后,它们会显示一个我想要从中抓取数据的表格。我想结合所有下拉菜单中所有表格的信息。

我正在使用 RSelenium 包,但由于我对网页抓取非常陌生,不知道该如何遍历这四个下拉菜单中的所有选项来得到最终的表格。

https://hindi.iocl.com/lpgdistributors.aspx

我尝试了之前关于网页抓取的讨论并相应地修改了代码。

library(RSelenium)
library(rvest)

# Force-terminate every running java.exe process via the Windows `taskkill`
# command; presumably this clears a Selenium server left from an earlier run.
system("taskkill /im java.exe /f", intern = FALSE, ignore.stdout = FALSE)

# Start a Selenium session that drives Firefox and keep its client handle.
rD <- rsDriver(browser = "firefox")
remDr <- rD[["client"]]

# Open the page that holds the dropdowns.
remDr$navigate("https://hindi.iocl.com/lpgdistributors.aspx")


# Return the `value` attribute of every <option> nested under the element
# with the given id.
#
#   remDr       - an open RSelenium client
#   dropdown_id - id attribute of the dropdown element
#
# Returns a character vector; it is empty when the dropdown currently holds
# no <option> elements.
get_option_values <- function(remDr, dropdown_id) {
  dropdown <- remDr$findElement(using = "id", value = dropdown_id)
  option_elems <- dropdown$findChildElements(
    using = "tag name", value = "option"
  )
  # getElementAttribute() returns a list per element, so flatten the results.
  as.character(unlist(lapply(
    option_elems,
    function(elem) elem$getElementAttribute("value")
  )))
}

# Option values of the four dropdowns, kept side by side instead of
# overwriting a single variable.
# NOTE(review): the dependent dropdowns are presumably filled only after a
# choice is made in the one before them - confirm, and re-run this after each
# selection if so.
dropdown_ids <- c(
  state = "cmbState",
  district = "cmbDistrict",
  market = "cmbMarket",
  area = "cmbArea"
)
option_values <- lapply(
  dropdown_ids,
  function(id) get_option_values(remDr, id)
)

# Values of the 4th dropdown under the name used by the comments further down.
option_values_2 <- option_values[["area"]]

#### Select one combination of the four dropdowns and read its table.
#### Wrap the calls below in loops over the dropdown values to cover all tables.

# Click the <option> carrying `option_value` inside the dropdown whose id is
# `dropdown_id`. The XPath is anchored to the dropdown's id so that the same
# value appearing in another dropdown cannot be matched by mistake.
select_option <- function(remDr, dropdown_id, option_value) {
  xpath <- sprintf(
    "//*[@id='%s']//option[@value='%s']", dropdown_id, option_value
  )
  remDr$findElement(using = "xpath", value = xpath)$clickElement()
  # Give the page time to react before touching the next dropdown.
  # NOTE(review): assumes one second is enough for the dependent dropdown to
  # be refreshed - confirm against the live page.
  Sys.sleep(1)
  invisible(NULL)
}

select_option(remDr, "cmbState", "1")
select_option(remDr, "cmbDistrict", "185")
select_option(remDr, "cmbMarket", "2314")
select_option(remDr, "cmbArea", "57")

# click submit
submit <- remDr$findElement(using = "id", value = "btnSearch")
submit$clickElement()
Sys.sleep(1)

# get table: parse only the markup of the result element rather than the
# whole page, and keep the result so it can be combined with other tables
tb <- remDr$findElement(using = "id", value = "grdDistributors")
distributors <- tb$getElementAttribute("outerHTML")[[1]] %>%
  read_html() %>%
  html_table(fill = TRUE)
distributors

4

2 回答 2

0

您可以轻松使用 RSelenium 完成这件事,但在此之前必须先查看下拉菜单中的所有值。

在创建示例之前,有必要了解要与下拉菜单交互,您必须研究页面背后的 html 代码并了解菜单的每个元素。

这是一个例子:

    library(RSelenium)
    # Start Selenium with a Firefox session; `check = FALSE` is handed to
    # rsDriver() as in the original answer (see the RSelenium docs for its effect).
    rD <- rsDriver(check = FALSE, browser = "firefox")
    remDr <- rD$client
    remDr$navigate("https://hindi.iocl.com/lpgdistributors.aspx")
    
    # Entries to try in each dropdown, outermost first. Add further entries to
    # a list to visit more combinations.
    menu_1 <- list("Assam")
    menu_2 <- list("Goalpara")
    menu_3 <- list("Goalpara")
    menu_4 <- list("Guwahati")

    # Pick one entry in a dropdown: click the element with the given id to
    # activate it, send the entry's text (converted to a factor, as the answer
    # prescribes), then pause for a second so the page can update.
    choose_entry <- function(dropdown_id, entry) {
      xpath <- sprintf('//*[@id="%s"]', dropdown_id)
      remDr$findElement(using = "xpath", value = xpath)$clickElement()
      remDr$findElement(using = "xpath", value = xpath)$sendKeysToElement(
        list(as.factor(entry))
      )
      Sys.sleep(1)
    }

    # Four nested loops, one per dropdown. The search button is clicked inside
    # the innermost loop so that every combination is submitted, not just the
    # last one selected.
    for (i in menu_1) {
      choose_entry("cmbState", i)
      for (j in menu_2) {
        choose_entry("cmbDistrict", j)
        for (e in menu_3) {
          choose_entry("cmbMarket", e)
          for (f in menu_4) {
            choose_entry("cmbArea", f)
            # Click the button for searching
            remDr$findElement(
              using = "xpath", value = '//*[@id="btnSearch"]'
            )$clickElement()
            # NOTE(review): read the result table here before the next
            # iteration; if the search resets the dropdowns, the outer
            # selections must be repeated - confirm against the live page.
          }
        }
      }
    }
于 2021-10-12T15:35:22.963 回答
0

这是使用 RSelenium 的部分解决方案

library(RSelenium)
# Start a Selenium session that drives Chrome and keep its client handle.
driver <- rsDriver(browser = "chrome")
remDr <- driver$client

# Load the page with the dropdowns.
remDr$navigate("https://hindi.iocl.com/lpgdistributors.aspx")

获取所有州的列表

# Read the visible text of the state dropdown and flatten the returned list
# into a character vector.
state_dropdown <- remDr$findElement(
  using = "xpath", value = '//*[@id="cmbState"]'
)
state_text <- unlist(state_dropdown$getElementText())

# Split the text at every newline-plus-space separator; sapply() with `[`
# simplifies the result into the one-column matrix printed below.
states <- sapply(strsplit(state_text, split = "\n ", fixed = TRUE), `[`)
      [,1]                   
 [1,] " Andaman & Nicobar"   
 [2,] "Andhra Pradesh"       
 [3,] "Arunachal Pradesh"    
 [4,] "Assam"                
 [5,] "Bihar"                
 [6,] "Chandigarh"           
 [7,] "Chhatisgarh"          
 [8,] "Goa"                  
 [9,] "Gujarat"              
[10,] "Haryana"              
[11,] "Himachal Pradesh"     
[12,] "Jammu & Kashmir"      
[13,] "Jharkhand"            
[14,] "Karnataka"    

 

现在您必须遍历州以获取所有地区的列表,例如,

### Choose the state
remDr$navigate("https://hindi.iocl.com/lpgdistributors.aspx")
state_dropdown <- remDr$findElement(
  using = "xpath", value = '//*[@id="cmbState"]'
)
state_dropdown$clickElement()
state_dropdown$sendKeysToElement(list("Karnataka"))

### Collect the names of all districts
district_dropdown <- remDr$findElement(
  using = "xpath", value = '//*[@id="cmbDistrict"]'
)
district_dropdown$clickElement()
district_text <- unlist(district_dropdown$getElementText())
# Split at every newline-plus-space separator, as done for the states.
district <- sapply(strsplit(district_text, split = "\n ", fixed = TRUE), `[`)
      [,1]              
 [1,] " Bagalkot"       
 [2,] "Bangalore"       
 [3,] "Belgaum"         
 [4,] "Bellary"         
 [5,] "Bidar"           
 [6,] "Bijapur"         
 [7,] "Chamrajnagar"    
 [8,] "Chickmagalur"    
 [9,] "Chitradurga"     
[10,] "Coorg"           
[11,] "Dakshina Kannada"
[12,] "Davangere"       
[13,] "Dharwad" 

### Collect the names of all markets
market_dropdown <- remDr$findElement(
  using = "xpath", value = '//*[@id="cmbMarket"]'
)
market_dropdown$clickElement()
market_text <- unlist(market_dropdown$getElementText())
# Split at every newline-plus-space separator, as done for the states.
market <- sapply(strsplit(market_text, split = "\n ", fixed = TRUE), `[`)
       [,1]               
  [1,] " Afzalpur"        
  [2,] "Ajjampur"         
  [3,] "Aland"            
  [4,] "Ankola"           
  [5,] "B. Mathikere"     
  [6,] "Bagalkot"         
  [7,] "Bailhongal"       
  [8,] "Bandipur"         
  [9,] "Bangalore"        
 [10,] "Basavakalyan"     
 [11,] "Belgaum"          
 [12,] "Bellary"          
 [13,] "Belthangadi"      
 [14,] "Bhadravathi"      
 [15,] "Bhadravati"       
 [16,] "Bhatkal"          
 [17,] "Bidar"            
 [18,] "Bijapur"          
 [19,] "Challakere"    
于 2021-10-12T10:21:58.377 回答