我一直在尝试从一个 ASP.NET 网站上抓取一些数据,起始页是下面这个: http://www.e3050.com/Items.aspx?cat=SON
首先,我想让每页显示 50 个项目(通过页面上的 select 元素设置);其次,我想逐页翻页,抓取所有分页。
为了实现每页显示 50 个项目,我尝试了以下代码,但没有奏效:
# URL(s) the crawl starts from: the "SON" category item listing.
start_urls = ["http://www.e3050.com/Items.aspx?cat=SON"]
def parse(self, response):
    """Build the follow-up requests for the category listing page.

    Looks up the text of the ``lbl_PageSize`` span in ``response``. When the
    span has text, one ``Request`` per computed page is queued, followed by a
    single form request built from ``response`` that sets the ``pagesddl``
    field to ``'50'`` and ``__EVENTTARGET`` to the pager control. Every
    request uses ``self.parse_product`` as its callback.

    Yields:
        The queued requests, in the order they were built. Nothing is
        yielded when the span is absent or has no text.

    Raises:
        ValueError: if the span text cannot be converted with ``int()``.
    """
    hxs = HtmlXPathSelector(response)

    # Evaluate the XPath once and reuse the result for both the presence
    # check and the value lookup.
    page_texts = hxs.select('//span[@id="ctl00_ctl00_ContentPlaceHolder1_ItemListPlaceHolder_lbl_PageSize"]/text()').extract()

    requests = []
    if page_texts:
        # Get last page number
        last_page = page_texts[0]

        # The bound does not change inside the loop, so compute it once.
        # ``/`` is kept as written so the iteration count matches the
        # original under whichever Python version runs this spider.
        # NOTE(review): confirm that "last_page / 5" is the intended number
        # of pages once 50 items per page are requested.
        page_limit = (int(last_page) / 5) + 1

        # Preparing requests for each page.
        # NOTE(review): every request below targets the same URL with no
        # page parameter; Scrapy's default duplicate filter presumably drops
        # the repeats -- verify, and consider issuing one pager postback per
        # page instead.
        page = 1
        while page < page_limit:
            requests.append(Request("http://www.e3050.com/Items.aspx?cat=SON", callback=self.parse_product))
            page += 1

        # Posting form data (50 items per page and the pager button as the
        # postback target). dont_click=True submits the form without
        # simulating a click on any submit control.
        requests.append(FormRequest.from_response(
            response,
            formdata={'ctl00$ctl00$ContentPlaceHolder1$ItemListPlaceHolder$pagesddl':'50',
                      '__EVENTTARGET':'ctl00$ctl00$ContentPlaceHolder1$ItemListPlaceHolder$pager1$ctl00$ctl01'},
            callback=self.parse_product,
            dont_click=True
        ))

    for request in requests:
        yield request