我用 Python 编写了一个应用程序,用来爬取一个在服务器端使用 ASP.NET 的网站。
这就是我一直在做的(只是从浏览器复制 HTTP 标头和正文,因为我看不到其他方法):
(前段时间它还能正常工作……但现在会因“连接超时”而中止。)
def SBPageLoader(keyWord, timeout=30):
    """Fetch the sberbank-ast.ru purchase list by replaying a captured form POST.

    The request (headers and url-encoded body) was copied from a browser
    session; the literal "Toyota" in the captured body is replaced by
    ``keyWord`` before the request is sent.

    Args:
        keyWord: Search term substituted for "Toyota" in the form body. It is
            inserted verbatim, so it must already be safe to embed in an
            ``application/x-www-form-urlencoded`` body.
        timeout: Socket timeout in seconds passed to ``urllib2.urlopen``.

    Returns:
        The raw response body as returned by ``response.read()``.

    Raises:
        urllib2.URLError: If the request fails (including connect timeouts).
        socket.timeout: If reading the response exceeds ``timeout``.
    """
    # Headers copied from the browser, minus two that must not be replayed:
    #   * Content-Length: the captured value ('46203') only matched the
    #     captured body. After the keyword substitution (or after pasting a
    #     fresh body) the real length differs, and a server that trusts the
    #     header keeps waiting for bytes that never arrive. urllib2 fills in
    #     the correct length itself when the header is absent.
    #   * Accept-Encoding: urllib2 does not decompress responses, so
    #     advertising gzip/deflate/sdch could make read() return compressed
    #     bytes instead of HTML.
    headers = {
        'Host': 'www.sberbank-ast.ru',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Origin': 'http://www.sberbank-ast.ru',
        'User-Agent': 'Mozilla/5.0 (Linux i686)',
        'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Referer': 'http://www.sberbank-ast.ru/purchaseList.aspx',
        'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4',
        'Accept-Charset': 'utf-8',
        # NOTE(review): this session cookie was captured once from a browser
        # and will presumably expire server-side; a robust client would GET
        # the page first and reuse the cookie it receives -- confirm.
        'Cookie': 'ASP.NET_SessionId=d4ki4j55hsq3km45b4qbrgjs; __utma=99173852.1461595200.1340564818.1341685237.1341758931.11; __utmb=99173852.4.9.1341758978151; __utmc=99173852; __utmz=99173852.1340564818.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)'
    }

    # Captured form body. The __EVENTARGUMENT / __VIEWSTATE fields are the
    # hidden inputs an ASP.NET page posts back to the server; the values here
    # are a snapshot from one browser session (abbreviated in this listing).
    # NOTE(review): the first field is spelled '_EVENTTARGET' with a single
    # underscore, unlike '__EVENTARGUMENT' -- verify against the real capture.
    data = '_EVENTTARGET=&__EVENTARGUMENT=........&__VIEWSTATE=%2FwEPDwUJMzUwNDEzMjgxD2QWAmYPZBYCZg9kFgICAw9kFgQCAQ9kFgICAg8PFgIeB1Zpc2libGVoZGQCBQ9kFgICAQ9kFgYCAQ9kFgICAQ9kFgwCFQ8PZBYGHgdjb250ZW50BRRsZWFmOnB1YmxpY2RhdGVzdGFydB4JbWF4bGVuZ3RoBQIxMB4FY2xhc3MFCCBkYXRlUlVTZAIXDw9kFgYfAQUSbGVhZjpwdWJsaWNkYXRlpurchID400=887031'

    # The capture was made while searching for "Toyota"; swap in the caller's
    # keyword. No escaping is applied here (see the docstring).
    data = data.replace("Toyota", keyWord)

    log("Strat loading http://www.sberbank-ast.ru/purchaseList.aspx ...")
    req = urllib2.Request('http://www.sberbank-ast.ru/purchaseList.aspx', data, headers)
    # An explicit timeout keeps a stalled server from blocking indefinitely.
    response = urllib2.urlopen(req, timeout=timeout)
    try:
        page = response.read()
    finally:
        # Release the underlying socket even if read() raises.
        response.close()
    log(".. Loading is finished")
    return page
现在,即使我把旧的请求正文和标头都换成新抓取的,也会发生同样的情况。
欢迎任何关于它有什么问题的想法。