我试图抓取这个页面:http ://www.tyrepac.com.sg/Form/B2C/FittingLocation.aspx
有这个数据表,其中包含我要抓取的信息。比如姓名和地址。但是,它一次只显示一个区域,例如(北、东、西、南)。
当您从下拉列表中选择区域时,将调用 POST 请求并显示数据。
这是我的代码:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
import re
from scrapy.http import FormRequest, Request
import json
from tyres.items import TyrePacItem
class TyrePacSpider(BaseSpider):
name = "tyrepac"
allowed_domains = ["tyrepac.com.sg"]
start_urls = ["http://www.tyrepac.com.sg/Form/B2C/FittingLocation.aspx"]
def start_requests(self):
return FormRequest(
url="http://www.tyrepac.com.sg/Form/B2C/FittingLocation.aspx",
method='POST',
formdata={'ctl00$ContentPlaceHolder1$cpFittingLocationDdl$cpRegion$ddlRegion': 'North'},
callback=self.after_post
)
def after_post(self, response):
hxs = HtmlXPathSelector(response)
values = hxs.select('//table[@class="dxgvTable"][1]/tr')
for value in values:
item = TyrePacItem()
#name = value.select('.//label[@class="dxeBase"][1]//text()').extract()
#item['name'] = ' '.join(name)
address = value.select('.//label[@class="dxeBase"][2]//text()').re('[^\n\t\r]+')
item['address'] = ' '.join(address)
# item['postal'] = item['address'][-6:]
# hours = value.select('.//label[@class="dxeBase"][5]//text()').re('[^\n\t\r]+')
# hours = ' '.join(hours)
# item['hours'] = re.sub('\s+', ' ',hours)
yield item
追溯
Traceback (most recent call last):
File "C:\Python27\ArcGIS10.1\lib\site-packages\scrapy-0.16.5-py2.7.egg
\scrapy\cmdline.py", line 138, in _run_command
cmd.run(args, opts)
File "C:\Python27\ArcGIS10.1\lib\site-packages\scrapy-0.16.5-py2.7.egg
\scrapy\commands\crawl.py", line 44, in run
self.crawler.crawl(spider)
File "C:\Python27\ArcGIS10.1\lib\site-packages\scrapy-0.16.5-py2.7.egg
\scrapy\crawler.py", line 47, in crawl
return self.engine.open_spider(spider, requests)
File "C:\Python27\ArcGIS10.1\lib\site-packages\twisted-13.0.0-py2.7-wi
n32.egg\twisted\internet\defer.py", line 1213, in unwindGenerator
return _inlineCallbacks(None, gen, Deferred())
--- <exception caught here> ---
File "C:\Python27\ArcGIS10.1\lib\site-packages\twisted-13.0.0-py2.7-wi
n32.egg\twisted\internet\defer.py", line 1070, in _inlineCallbacks
result = g.send(result)
File "C:\Python27\ArcGIS10.1\lib\site-packages\scrapy-0.16.5-py2.7.egg
\scrapy\core\engine.py", line 220, in open_spider
slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
File "C:\Python27\ArcGIS10.1\lib\site-packages\scrapy-0.16.5-py2.7.egg
\scrapy\core\engine.py", line 27, in __init__
self.start_requests = iter(start_requests)
exceptions.TypeError: 'FormRequest' object is not iterable
为什么会发生这种情况,我该如何解决?提前谢谢