0

我正在使用 RSelenium 在联合国条约收集网站上提交表单并保存结果。一切正常,只是条约名称在我的最终表格中被截断了。这是因为 readHTMLTable 能读取的字符数有限制,还是我哪里做错了?

这是一个(希望)可重现的示例:

###
###   RSelenium scraping of UN treaty collection
###

# https://cran.r-project.org/web/packages/RSelenium/vignettes/RSelenium-basics.html

# NOTE(review): clearing the global workspace inside a script is an
# anti-pattern; prefer running in a fresh R session instead.
rm(list=ls())

###
# NOTE(review): hard-coded user-specific path; only works on the author's machine.
setwd("C:/Users/HP/Desktop/BIT/UN_treaties")

library("XML")         # htmlParse(), readHTMLTable()
library("RCurl")
library("RSelenium")   # remote-controlled browser session
library("RHTMLForms")


# Start Selenium Server --------------------------------------------------------

checkForServer()  ## check if the server is there
startServer() ## start the selenium server
remDrv <- remoteDriver() ## connect server
remDrv$open() ## connect server
Sys.sleep(5)  # give the browser/server a moment to come up
#remDrv$getStatus()  ## info connection, not necessary

# Simulate browser session and fill out form -----------------------------------

## go to the UNTS online search page
remDrv$navigate('https://treaties.un.org/Pages/UNTSOnline.aspx?id=2')

## check out what is of interest:
## (the getElementAttribute() calls below only print values interactively;
## their results are not stored or used later)

## additional attributes: radio button with value 'cnd2'
## presumably the "match all search terms" option — confirm on the page
webElem1 <- remDrv$findElement(using = 'xpath', "//*/input[@value = 'cnd2']")
webElem1$getElementAttribute("class")
webElem1$getElementAttribute("type")
webElem1$getElementAttribute("id")
webElem1$clickElement()  ## match all the elements
Sys.sleep(5) 

## results-per-page dropdown
webElem2 <- remDrv$findElement(using = 'css selector', "select.basicPullDn")    ## . denotes class
webElem2$getElementAttribute("class")
webElem2$getElementAttribute("type")
webElem2$getElementAttribute("id")
Sys.sleep(5) 

## results per page more in detail: select the '500' option so a single
## results page holds (almost) all hits
webElem3 <- remDrv$findElement(using = 'xpath', "//*/select[@class = 'basicPullDn']/option[@value = '500']")
webElem3$getElementAttribute("class")
webElem3$getElementAttribute("type")
webElem3$getElementAttribute("id")
webElem3$clickElement()  ## number of pages
Sys.sleep(5) 

## element showing the total record count (inspected only, not used later)
webElem5 <- remDrv$findElement(using = 'css selector', "span.RecordCount")    ## . denotes class
webElem5$getElementAttribute("class")
webElem5$getElementAttribute("type")
webElem5$getElementAttribute("id")

## find first element of interest: the free-text search box
## (the site styles it with class "login")
webElem0 <- remDrv$findElement(using = 'css selector', "input.login")    ## . denotes class
webElem0$getElementAttribute("class")
webElem0$getElementAttribute("type")
webElem0$getElementAttribute("id")
Sys.sleep(5) 

## Search the UNTS site once per country, collect the results grid of each
## search, and write the combined table to un_bits.csv.
## Results are accumulated in a list and bound once at the end instead of
## growing df_all with rbind() inside the loop.
results <- list()

###### need to run search for multiple countries
country_list <- c("Morocco", "Italy", "France")

for (i in country_list) {
  Sys.sleep(5)
  ## query string: "<country> Agreement promotion investment"
  keys <- paste(i, "Agreement promotion investment", sep = " ")

  ## type the query into the search box and submit
  webElem0$clearElement()
  webElem0$sendKeysToElement(list(keys, key = "enter"))
  Sys.sleep(20)  # wait for the results table to render

  ## parse the rendered page; readHTMLTable() names each table after its id,
  ## so the results grid can be looked up by name instead of the fragile
  ## positional index [[37]] used originally
  doc <- htmlParse(remDrv$getPageSource()[[1]])
  tables <- readHTMLTable(doc, stringsAsFactors = FALSE, skip.rows = c(1))

  grid_name <- grep("ctl00_ContentPlaceHolder1_dgSearch", names(tables), value = TRUE)

  if (length(grid_name) > 0) {
    results[[i]] <- tables[[grid_name[1]]]
  } else {
    print("caccadicane")  # original "no results" marker, kept verbatim
  }
}

## rev() keeps the original ordering (each new result was rbind-ed on top)
df_all <- if (length(results) > 0) do.call(rbind, rev(results)) else data.frame()

## keep at most the first 6 columns; guard against ncol(df_all) < 7,
## where the original -(7:ncol(df_all)) index would error
keep_cols <- seq_len(min(6L, ncol(df_all)))
write.csv(df_all[, keep_cols, drop = FALSE], "un_bits.csv")

结果是:

       V1                                     V2          V3         V4         V5        V6
1 I-42051 Agreement between the Government of... See Details 08/07/1996 27/07/2000 Bilateral
2 I-35582 Agreement between the Government of... See Details 11/10/1995 22/06/1997 Bilateral
3 I-35481 Agreement between the Government of... See Details 30/11/1995 30/05/1997 Bilateral
4 I-23169 Agreement concerning the establishm... See Details 28/06/1980 28/06/1980 Bilateral
5 I-29086 Exchange of notes constituting an a... See Details 12/08/1985 12/08/1985 Bilateral
6 I-43258 Agreement on the promotion and prot... See Details 27/01/1999 08/05/2001 Bilateral

为什么 V2 中的字符串会被截断?

4

1 回答 1

0

好的,研究了一段时间后我发现,即使 readHTMLTable 命令有长度限制,它也不是这个例子中文本被截断的原因。更仔细地检查 HTML 文件后,我发现单元格里的文本本身已经被截断了,而完整名称保存在元素的 "title" 属性中。

因此,解决方案是读取每个单元格 "title" 属性中的文本,以获取协议的全名。如果有人感兴趣,下面是代码,其中还添加了一些其他功能。

###
###   RSelenium scraping of UN treaty collection
###

# https://cran.r-project.org/web/packages/RSelenium/vignettes/RSelenium-basics.html

# NOTE(review): clearing the global workspace inside a script is an
# anti-pattern; prefer running in a fresh R session instead.
rm(list=ls())

###
# NOTE(review): hard-coded user-specific path; only works on the author's machine.
setwd("C:/Users/HP/Desktop/BIT/UN_treaties")

library("XML")         # htmlParse(), readHTMLTable()
library("RCurl")
library("RSelenium")   # remote-controlled browser session
library("RHTMLForms")


# Start Selenium Server --------------------------------------------------------

checkForServer()  ## check if the server is there
startServer() ## start the selenium server
remDrv <- remoteDriver() ## connect server
remDrv$open() ## connect server
Sys.sleep(5)  # give the browser/server a moment to come up
#remDrv$getStatus()  ## info connection, not necessary

# Simulate browser session and fill out form -----------------------------------

## go to the UNTS online search page
remDrv$navigate('https://treaties.un.org/Pages/UNTSOnline.aspx?id=2')

## check out what is of interest:
## (the getElementAttribute() calls below only print values interactively;
## their results are not stored or used later)

## additional attributes: radio button with value 'cnd2'
## presumably the "match all search terms" option — confirm on the page
webElem1 <- remDrv$findElement(using = 'xpath', "//*/input[@value = 'cnd2']")
webElem1$getElementAttribute("class")
webElem1$getElementAttribute("type")
webElem1$getElementAttribute("id")
webElem1$clickElement()  ## match all the elements
Sys.sleep(5) 

## results-per-page dropdown
webElem2 <- remDrv$findElement(using = 'css selector', "select.basicPullDn")    ## . denotes class
webElem2$getElementAttribute("class")
webElem2$getElementAttribute("type")
webElem2$getElementAttribute("id")
Sys.sleep(5) 

## results per page more in detail: select the '500' option so a single
## results page holds (almost) all hits
webElem3 <- remDrv$findElement(using = 'xpath', "//*/select[@class = 'basicPullDn']/option[@value = '500']")
webElem3$getElementAttribute("class")
webElem3$getElementAttribute("type")
webElem3$getElementAttribute("id")
webElem3$clickElement()  ## number of pages
Sys.sleep(5) 

## element showing the total record count (inspected only, not used later)
webElem5 <- remDrv$findElement(using = 'css selector', "span.RecordCount")    ## . denotes class
webElem5$getElementAttribute("class")
webElem5$getElementAttribute("type")
webElem5$getElementAttribute("id")

## find first element of interest: the free-text search box
## (the site styles it with class "login")
webElem0 <- remDrv$findElement(using = 'css selector', "input.login")    ## . denotes class
webElem0$getElementAttribute("class")
webElem0$getElementAttribute("type")
webElem0$getElementAttribute("id")
Sys.sleep(5) 

## Search the UNTS site once per country, recover the full (un-truncated)
## treaty titles from each result row's "title" attribute, build the detail
## URL for each row, and write everything to un_bits.csv.

###### need to run search for multiple countries
#country_list <- c("Morocco", "Italy", "Brutto porco", "France")
## NOTE: renamed from `names` — that shadowed base::names(), which is
## called later on the parsed tables.
participants <- read.csv("participants_clean.csv")
country_list <- participants$names
current_search <- length(country_list)

## accumulate per-country tables here; bound once after the loop instead of
## growing df_all with rbind() on every iteration
results <- list()

for (i in country_list) {

  print("-------------------------")
  print("-------------------------")
  print(paste("Still", current_search, "searches to do... ", sep = " "))
  print(paste("Now looking for treaties signed by...  ", i, " ----------------------->>", sep = " "))
  Sys.sleep(5)

  ## query string: "<country> Agreement promotion investment"
  keys <- paste(i, "Agreement promotion investment", sep = " ")

  ## type the query into the search box and submit
  webElem0$clearElement()
  webElem0$sendKeysToElement(list(keys, key = "enter"))
  Sys.sleep(20)  # wait for the results table to render

  ## parse the page; readHTMLTable() names each table after its id, so the
  ## results grid can be looked up by name instead of the fragile
  ## positional index [[37]] used originally
  doc <- htmlParse(remDrv$getPageSource()[[1]])
  tables <- readHTMLTable(doc, stringsAsFactors = FALSE)

  grid_name <- grep("ctl00_ContentPlaceHolder1_dgSearch", names(tables), value = TRUE)

  if (length(grid_name) > 0) {
    tabledat <- tables[[grid_name[1]]]
    treatfou <- nrow(tabledat)  # row 1 is the header; data rows are 2..treatfou
    print(paste("Amazing, I just found", treatfou - 1, " !!", sep = " "))

    names_new <- vector(mode = "character", length = treatfou)
    urls <- vector(mode = "character", length = treatfou)

    ## Guard: 2:treatfou would iterate c(2, 1) when the table has only a
    ## header row. Fetch the full title and detail URL in a single pass
    ## (the original used two separate loops over the same rows).
    if (treatfou >= 2) {
      for (jj in 2:treatfou) {
        ## full treaty name lives in the cell's "title" attribute; the
        ## visible cell text is already truncated by the site
        cell_xpath <- paste0("//*[@id='ctl00_ContentPlaceHolder1_dgSearch']/tbody/tr[", jj, "]/td[2]")
        cell_elem <- remDrv$findElement(using = 'xpath', cell_xpath)
        names_new[[jj]] <- unlist(cell_elem$getElementAttribute("title"))

        ## "See Details" link: strip the javascript:void window.open('...')
        ## wrapper to recover the relative URL
        link_xpath <- paste0("//*[@id='ctl00_ContentPlaceHolder1_dgSearch']/tbody/tr[", jj, "]/td[3]/a")
        link_elem <- remDrv$findElement(using = 'xpath', link_xpath)
        href <- unlist(link_elem$getElementAttribute("href"))
        href <- gsub("javascript:void%20window.open\\('", "", href)  ## need to escape the parenthesis with \\
        href <- gsub("\\'.*", "", href)  ## drop everything after '
        urls[[jj]] <- paste0("https://treaties.un.org/Pages/", href)
      }
    }

    ## attach the recovered titles, searched party, and detail URLs
    tabledat$title <- names_new
    tabledat$party <- i
    tabledat$url <- urls

    ## drop the header row before storing
    results[[length(results) + 1]] <- tabledat[-1, ]
  } else {
    print("Too bad, there is nothing, I'll try with the next one :) ")
  }
  current_search <- current_search - 1
}

## rev() keeps the original ordering (each new result was rbind-ed on top)
df_all <- if (length(results) > 0) do.call(rbind, rev(results)) else data.frame()

write.csv(df_all[, -(7:10)], "un_bits.csv")
于 2015-12-12T17:31:21.107 回答