4

我正在使用 phantomJS 从不同的站点收集数据。在数据抓取过程中,我在解析站点或站点元素时遇到了很多次崩溃。不幸的是,phantomJS 和 RSelenium 都没有在控制台或包的报告中提供任何信息。脚本只是挂起,没有任何警告。我能看到它仍处于执行状态,但实际上什么也没发生。停止脚本执行的唯一方法是手动重启 R。经过几次测试,我发现 phantomJS 通常会在执行 remDr$findElements() 命令时挂起。我尝试改用 firefox 和 RSelenium 重新运行我的代码——它工作正常。所以问题出在 phantomJS 的工作方式上。

有没有人在运行 phantomJS 时遇到过类似的情况?有没有办法纠正这种异常行为?

我使用的环境:

  1. Windows 7
  2. Selenium 2.0
  3. R 版本 3.1.3
  4. phantomjs-2.0.0-windows

我的代码:

# Start the PhantomJS server process and attach an RSelenium driver to it.
# `mywd` has to be defined beforehand; the PhantomJS folder is looked up
# underneath it.
phantomjsdir <- paste0(mywd, "/phantomjs-2.0.0-windows/bin/phantomjs.exe")
phantomjsUserAgent <- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36 OPR/28.0.1750.48"

# Extra capabilities handed to the driver: where the binary lives and which
# user agent string the pages should see.
eCap <- list(
  phantomjs.binary.path = phantomjsdir,
  phantomjs.page.settings.userAgent = phantomjsUserAgent
)

pJS <- phantom(pjs_cmd = phantomjsdir)
remDr <- remoteDriver(browserName = "phantomjs", extraCapabilities = eCap)
remDr$open(silent = FALSE)


# Search terms; each one is submitted to Google in turn.
mywords <- c("canon 600d", "sony 58k", "nikon", "nikon2", "nikon 800", "nikon 80", "nikon 8")
# Seconds to sleep between two document.readyState polls.
timeout <- 3
# Upper bound (seconds) on the accumulated polling sleep for one search.
maxwait <- 10

#
# Executing script
#
# For every search term: open Google, submit the query, wait until the page
# reports readyState "complete" (or the wait budget is used up), then walk
# over all result snippets and read their markup, text, location and size.
# Progress is printed after every step so a hang can be traced to one call.
#

for (word in mywords) {

  print(paste0("searching for: ", word))
  ss.word <- word
  remDr$navigate("http://google.com")

  # Locator strategy written out in full ("class name"), matching the
  # other lookups in this script.
  webElem <- remDr$findElement(using = "class name", value = "gsfi")
  webElem$sendKeysToElement(list(enc2utf8(ss.word), key = "enter"))
  Sys.sleep(1)

  print(remDr$executeScript("return document.readyState;")[[1]])

  # Poll until the document is fully loaded. `totalwait` has to be reset for
  # every search and advanced on every poll; otherwise the loop either fails
  # on an undefined object or never reaches its limit.
  totalwait <- 0
  while (remDr$executeScript("return document.readyState;")[[1]] != "complete" &&
         totalwait < maxwait) {
    Sys.sleep(timeout)
    totalwait <- totalwait + timeout
  }

  print(paste0("search completed: ", ss.word))
  elem.snippet <- remDr$findElements(using = "class name", value = "rc")

  # seq_along() yields an empty sequence when no snippet was found, so the
  # body is skipped instead of indexing a non-existent first element.
  for (i in seq_along(elem.snippet)) {

    print(paste0("element opened: ", ss.word, "  pos", i))
    print(elem.snippet[[i]])
    ss.snippet.code <- elem.snippet[[i]]$getElementAttribute("innerHTML")
    print(paste0("element element innerHTML ok"))

    # Child element with class "r" inside the snippet, and its markup.
    elemtitle <- elem.snippet[[i]]$findChildElement(using = "class name", value = "r")
    print(paste0("element title ok"))

    elemcode <- elemtitle$getElementAttribute("innerHTML")
    print(paste0("element innerHTML ok"))

    # Child element with class "st" inside the snippet, and its visible text.
    elemtext <- elem.snippet[[i]]$findChildElement(using = "class name", value = "st")
    ss.text <- elemtext$getElementText()[[1]]
    print(paste0("element loaded: ", ss.word, "  pos", i))

    # Location and size of the snippet as reported by the driver.
    elemloc <- elem.snippet[[i]]$getElementLocation()
    elemsize <- elem.snippet[[i]]$getElementSize()
    print(paste0("element location parsed: ", ss.word, "  pos", i))

  }

  print(paste0("data collected: ", ss.word))
}
4

0 回答 0