0

打开 start_urls 中的网址,可以看到结果共有 13 页,但使用以下代码,我只能抓取到第 1-6 页:

# Seed URL for the crawl (the search-results listing that gets paginated).
# Kept free of surrounding whitespace so the string starts with its scheme.
start_urls = ["http://www.avocatsparis.org/Eannuaire/CMSListeRecherche.aspx?nom=&Pre=&ChReNom=2&Adr=&Arr=8&mail=False&Site=False&Toque=&etranger=False&Spec=41"]

def parse(self, response):
    """Submit one pager postback per result page of the listing.

    For every page number from 1 to 13 inclusive, a form request is built
    from the form found in ``response`` with ``__EVENTTARGET`` pointing at
    that page's pager control; the reply is routed to ``parse_post``.

    Args:
        response: The response whose form is used as the postback template.

    Yields:
        One ``FormRequest`` per page number.
    """
    # NOTE(review): every request reuses the form state of this single
    # response. Presumably the server only honours pager targets that are
    # present on the page the form came from -- TODO confirm whether later
    # pages must be requested from a later page's response instead.
    for page in range(1, 14):
        yield FormRequest.from_response(
            response,
            formdata={
                '__EVENTTARGET': '_ctl0$Corps$DataGridPager1$Page_' + str(page),
                '__EVENTARGUMENT': '',
            },
            callback=self.parse_post,
            dont_click=False,
        )


def parse_post(self, response):
    """Follow the link found in each 400px-wide table cell of a listing page.

    Args:
        response: The response for one page of the listing.

    Yields:
        One ``Request`` per cell that has a direct ``<a href>`` child, with
        the href resolved against the directory base URL and handled by
        ``parse_details``.
    """
    hxs = HtmlXPathSelector(response)
    for cell in hxs.select("//td[@width='400px']"):
        hrefs = cell.select("a/@href").extract()
        if not hrefs:
            # A matching cell without a direct link has nothing to follow.
            # Indexing [0] here would raise IndexError and abort the
            # generator, dropping every remaining cell on the page.
            continue
        yield Request(
            urljoin('http://www.avocatsparis.org/Eannuaire/', hrefs[0]),
            callback=self.parse_details,
        )
4

0 回答 0