请指导我如何为 CrawlSpider 编写 Rule 和 SgmlLinkExtractor 的规则
我很困惑,看不懂英文文档。
我想爬取多个分页页面。
页面 URL 的规律是:
http://abctest.com/list.php?c=&&page=1
http://abctest.com/list.php?c=&&page=2
http://abctest.com/list.php?c=&&page=3 ...
这是我的代码:
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import re
class Spider(CrawlSpider):
    """Crawl the paginated listing and yield one item per table row.

    The spider starts on page 1 and follows every link whose URL matches
    the ``list.php?c=&&page=<number>`` pattern, so all listing pages that
    are reachable through pagination links get scraped.

    ``CrawlSpider`` uses its own ``parse`` method to apply ``rules``, so this
    class must not override ``parse``. The start page is handled through the
    ``parse_start_url`` hook instead.
    """

    name = "find"
    start_urls = ["http://abctest.com/list.php?c=&&page=1"]

    # ``allow`` takes regular expressions, so the literal ``?`` and ``.`` are
    # escaped and ``\d+`` matches any page number. ``&+`` accepts both the
    # raw ``c=&&page=`` form and a normalised ``c=&page=`` form of the URL.
    rules = [
        Rule(
            SgmlLinkExtractor(allow=(r'list\.php\?c=&+page=\d+',)),
            callback='parse_item',
            follow=True,
        ),
    ]

    def parse_start_url(self, response):
        """Scrape the start page (page 1) with the same logic as other pages.

        Responses for ``start_urls`` are not passed to the rule callback, so
        without this hook the rows on page 1 would be skipped.
        """
        return self.parse_item(response)

    def parse_item(self, response):
        """Yield one ``LAItem`` for every row of the listing table.

        Each item carries the raw ``extract()`` results (lists of strings)
        for the date cell text and the subject link ``href``.
        """
        # NOTE(review): LAItem is not imported in the visible part of the
        # file; it presumably lives in the project's items module. Confirm
        # that it is imported at the top of the file.
        sel = Selector(response)
        for row in sel.css("div#list table tr "):
            item = LAItem()
            item['day'] = row.css(" td.date::text ").extract()
            item['URL'] = row.css(" td.subject a::attr(href) ").extract()
            yield item