打开 start_urls 中的链接,可以看到搜索结果共有 13 页;但使用下面的代码,我只能抓取到第 1–6 页:
# Search-results page the crawl starts from. The URL is written without any
# surrounding whitespace so it is requested exactly as given.
start_urls = [
    "http://www.avocatsparis.org/Eannuaire/CMSListeRecherche.aspx?nom=&Pre=&ChReNom=2&Adr=&Arr=8&mail=False&Site=False&Toque=&etranger=False&Spec=41",
]
def parse(self, response):
    """Queue one pager postback per result page, all built from ``response``.

    Yields a ``FormRequest`` for each page number from 1 to 13. Each request
    submits the form found in ``response`` with ``__EVENTTARGET`` set to the
    pager control for that page, and is handled by ``parse_post``.
    """
    # NOTE(review): every request below reuses the form state of this single
    # response. If the server only honours pager targets that are rendered on
    # the page it is currently showing, the later pages would have to be
    # requested from a later response instead -- confirm against the site.
    last_page = 13  # page count reported for this search -- TODO confirm
    for page in range(1, last_page + 1):
        yield FormRequest.from_response(
            response,
            formdata={
                '__EVENTTARGET': '_ctl0$Corps$DataGridPager1$Page_' + str(page),
                '__EVENTARGUMENT': '',
            },
            callback=self.parse_post,
            dont_click=False,
        )
def parse_post(self, response):
    """Follow the detail link found in each 400px-wide table cell.

    Selects every ``<td width="400px">`` in ``response``, takes the first
    ``href`` of an ``<a>`` directly inside it, resolves it against the
    directory base URL and yields a ``Request`` handled by
    ``parse_details``.
    """
    hxs = HtmlXPathSelector(response)
    for cell in hxs.select("//td[@width='400px']"):
        hrefs = cell.select("a/@href").extract()
        if not hrefs:
            # A matching cell without a direct link would otherwise raise
            # IndexError and drop the remaining links on this page.
            continue
        yield Request(
            urljoin('http://www.avocatsparis.org/Eannuaire/', hrefs[0]),
            callback=self.parse_details,
        )