我想抓取的页面是 http://stoptb.org/countries/tbteam/searchExperts.asp ,但需要先在 http://stoptb.org/countries/tbteam/experts.asp 这个页面中提交参数,才能把数据取出来。由于参数没有嵌入 URL 中,我不知道如何用 R 传递它们。有没有办法在 R 中做到这一点?
(顺便说一句,我对 ASP 几乎一无所知,所以也许这就是我缺少的组件。)
我想抓取的页面是 http://stoptb.org/countries/tbteam/searchExperts.asp ,但需要先在 http://stoptb.org/countries/tbteam/experts.asp 这个页面中提交参数,才能把数据取出来。由于参数没有嵌入 URL 中,我不知道如何用 R 传递它们。有没有办法在 R 中做到这一点?
(顺便说一句,我对 ASP 几乎一无所知,所以也许这就是我缺少的组件。)
您可以使用RHTMLForms
您可能需要先安装它:
# install.packages("RHTMLForms", repos = "http://www.omegahat.org/R")
或者在 Windows 下您可能需要:
# install.packages("RHTMLForms", repos = "http://www.omegahat.org/R", type = "source")
require(RHTMLForms)
require(RCurl)
require(XML)
# Read the description of the forms on the experts page and build an R
# function from the form named sExperts.
experts_url <- "http://stoptb.org/countries/tbteam/experts.asp"
forms <- getHTMLFormDescription(experts_url)
fun <- createFunction(forms$sExperts)

# Find experts with expertise in "Infection control: Engineering Consultant".
results <- fun(Expertise = "Infection control: Engineering Consultant")

# Parse the response, select the tables with class "data", and convert the
# first one to a data frame.
parsedPage <- htmlParse(results)
tableData <- getNodeSet(parsedPage, "//*/table[@class = 'data']")
readHTMLTable(tableData[[1]])
# V1 V2 V3
#1 <NA> <NA>
#2 Name of Expert Country of Residence Email
#3 Girmay, Desalegn Ethiopia deskebede@yahoo.com
#4 IVANCHENKO, VARVARA Estonia v.ivanchenko81@mail.ru
#5 JAUCOT, Alex Belgium alex.jaucot@gmail.com
#6 Mulder, Hans Johannes Henricus Namibia hmulder@iway.na
#7 Walls, Neil Australia neil@nwalls.com
#8 Zuccotti, Thea Italy thea_zuc@yahoo.com
# V4
#1 <NA>
#2 Number of Missions
#3 0
#4 3
#5 0
#6 0
#7 0
#8 1
或者创建一个 reader 函数,让查询直接返回表格:
# Reader for a form-query function: parse the HTML in `results` and return
# the first table whose class attribute is "data", converted by
# readHTMLTable().
returnTable <- function(results) {
  parsed_page <- htmlParse(results)
  data_tables <- getNodeSet(parsed_page, "//*/table[@class = 'data']")
  readHTMLTable(data_tables[[1]])
}
# Rebuild the query function, this time passing returnTable as the reader.
fun <- createFunction(forms$sExperts, reader = returnTable)

# Find experts based in Bhutan.
fun(CBased = "Bhutan")
# V1 V2 V3
#1 <NA> <NA>
#2 Name of Expert Country of Residence Email
#3 Wangchuk, Lungten Bhutan drlungten@health.gov.bt
# V4
#1 <NA>
#2 Number of Missions
#3 2
将表单发送到 Web 服务器通常是通过所谓的 HTTP POST 请求完成的(获取普通网页是 HTTP GET 请求)。
POST 请求规范允许您将所有参数捆绑到 HTTP 标头的一部分中。
RCurl 可以做到这一点,或者您也可以尝试 httr 包,它提供了一个 POST 函数,该函数接受一个 R 参数列表,并随 POST 请求一起发送。
另一个提示:使用 Firebug 或其他浏览器调试工具,检查页面在 POST 请求中实际发送了哪些参数。
我试图为“ http://hp2010.nhlbihin.net/atpiii/calculator.asp?usertype=prof ”准备一个完整的工作示例。
# Example values for the inputs the calculator form requires.
# The c(...) notes list the values given for each categorical field.
age <- 20
sex <- "male" # one of c("female", "male")
smoker <- 1 # one of c("0", "1")
medication <- 0 # one of c("0", "1")
cholesterol <- 130
hdl <- 20
systolic <- 130
# Thanks to http://www.omegahat.org/RHTMLForms/
# Download the RHTMLForms source tarball and install it from the local file.
# download.file() needs a destination *file* path (not a directory), and
# install.packages() must be given that same path; file.path() joins the temp
# directory and the file name with the correct separator.
rhtmlforms_tarball <- file.path(tempdir(), "RHTMLForms_0.6-0.tar")
download.file(
  "http://www.omegahat.org/RHTMLForms/RHTMLForms_0.6-0.tar",
  destfile = rhtmlforms_tarball,
  mode = "wb" # binary mode so the archive is not corrupted on Windows
)
install.packages(rhtmlforms_tarball, repos = NULL, type = "source")
#----------------------------------------------------------------------------
#libraries
library(RHTMLForms)
library(xlsx)
library(XML)
library(RCurl)
# http://stackoverflow.com/questions/5396461/how-to-automate-multiple-requests-to-a-web-search-form-using-r
#
# For every row of MyData.csv, submit the row's values to the online
# calculator form and store the returned risk percentage; the collected
# results are written to MyResults.csv.
#
# NOTE(review): setwd() with a machine-specific path is kept from the
# original; adjust it (or use full file paths) before running.
setwd("C:\\MyPath")
data <- read.csv("MyData.csv")

# Get the form description from the calculator page.
url <- "http://hp2010.nhlbihin.net/atpiii/calculator.asp?usertype=prof"
forms <- getHTMLFormDescription(url)

# Create a function from the form description, used to query the url.
efun <- createFunction(forms[[1]])

# One result row per input row. seq_len() keeps the row names valid even
# when the input file has zero rows (1:0 would yield c(1, 0)).
n_rows <- nrow(data)
result <- matrix(
  NA,
  nrow = n_rows,
  ncol = 9,
  dimnames = list(
    seq_len(n_rows),
    c(
      "IDNO", "medication", "sex", "smoker", "age",
      "cholesterol", "hdl", "systolic", "risk_persent"
    )
  )
)

# The loop must be active: the body below indexes `result` and `data` by `i`
# and uses `next`, neither of which works outside a loop.
for (i in seq_len(n_rows)) {
  medication <- 0 # one of c("0", "1")
  # ifelse() (rather than if/else) is deliberate: a missing input yields NA
  # instead of an error, and the NA is caught by the check below.
  sex <- ifelse(data$gender1[i] == 0, "female", "male")
  smoker <- ifelse(data$cig1c[i] < 2, 0, 1)
  age <- data$age1c[i]
  cholesterol <- data$chol1[i]
  hdl <- data$hdl1[i]
  systolic <- round(data$sbp1c[i])

  # Rows with a missing or out-of-range value are recorded with an NA risk
  # and are not sent to the server.
  # NOTE(review): the limits are presumably the ranges the calculator
  # accepts -- confirm against the form.
  has_missing <- anyNA(c(sex, smoker, age, cholesterol, hdl, systolic))
  if (has_missing ||
    age < 20 || age > 99 ||
    systolic > 200 || systolic < 90 ||
    cholesterol < 130 || cholesterol > 320 ||
    hdl < 20 || hdl > 100) {
    result[i, ] <- c(
      data$IDNO[i], medication, sex, smoker, age,
      cholesterol, hdl, systolic, NA
    )
    next
  }

  # Extract the webpage by passing the required arguments to the function.
  page <- efun(
    medication = toString(medication),
    sex = toString(sex),
    smoker = toString(smoker),
    age = toString(age),
    cholesterol = toString(cholesterol),
    hdl = toString(hdl),
    systolic = toString(systolic)
  )

  # Pause so that the server is not queried too often.
  Sys.sleep(.1)

  # Parse the webpage and return the html tree.
  doc <- htmlTreeParse(page, asText = TRUE, useInternalNodes = TRUE)

  # Extract the tables from the html tree.
  tab <- readHTMLTable(doc)

  # NOTE(review): row 11, column 2 of the first table is presumably the cell
  # holding the risk percentage -- verify against the live page. Letters,
  # spaces and "%" are stripped before converting to a number.
  risk <- as.numeric(gsub("[%a-zA-Z ]", "", tab[[1]][11, 2]))
  result[i, ] <- c(
    data$IDNO[i], medication, sex, smoker, age,
    cholesterol, hdl, systolic, risk
  )
}

write.csv(result, file = "MyResults.csv")