我无法完全理解 SGML Link Extractor 的工作原理。使用 Scrapy 制作爬虫时,我可以成功地从使用特定 URL 的链接中提取数据。问题是使用规则来跟踪特定 URL 中的下一页链接。
我认为问题出在 allow() 参数上。把规则加入代码后,命令行中不再输出结果,爬虫也不会跟随到下一页的链接。
如能提供任何帮助,将不胜感激。
这是代码...
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider
from scrapy.contrib.spiders import Rule
from tutorial.items import TutorialItem
class AllGigsSpider(CrawlSpider):
    """Crawl allgigs.co.uk London listing pages, following "more" pagination links.

    Every page reached via the rule is parsed by ``parse_me``, which returns
    one TutorialItem per event entry on the page.
    """
    name = "allGigs"
    # BUG FIX: allowed_domains must hold bare domain names, not full URLs.
    # With "http://www.allgigs.co.uk/" here, the offsite middleware treats
    # every extracted link (including the next-page links) as off-domain and
    # drops it -- which is why the rule never followed to the next page.
    allowed_domains = ["allgigs.co.uk"]
    start_urls = [
        "http://www.allgigs.co.uk/whats_on/London/clubbing-1.html",
        "http://www.allgigs.co.uk/whats_on/London/festivals-1.html",
        "http://www.allgigs.co.uk/whats_on/London/comedy-1.html",
        "http://www.allgigs.co.uk/whats_on/London/theatre_and_opera-1.html",
        "http://www.allgigs.co.uk/whats_on/London/dance_and_ballet-1.html",
    ]
    # Follow every link found inside <div class="more"> (the pagination
    # block) and hand each fetched page to parse_me. allow=() means
    # "no URL-pattern restriction"; restrict_xpaths does the narrowing.
    rules = (
        Rule(
            SgmlLinkExtractor(allow=(), restrict_xpaths=('//div[@class="more"]',)),
            callback="parse_me",
            follow=True,
        ),
    )

    def parse_me(self, response):
        """Extract one TutorialItem per <div class="entry vevent"> on the page.

        Returns:
            list of TutorialItem, one per event entry.
        """
        hxs = HtmlXPathSelector(response)
        items = []
        for info in hxs.xpath('//div[@class="entry vevent"]'):
            item = TutorialItem()
            # BUG FIX: query relative to each entry (info.xpath('.//...')).
            # The original used the page-wide selector hxs.xpath(...), so
            # every item received the concatenated text of ALL entries.
            item['artist'] = info.xpath('.//span[@class="summary"]//text()').extract()
            item['date'] = info.xpath('.//abbr[@class="dtstart dtend"]//text()').extract()
            item['endDate'] = info.xpath('.//abbr[@class="dtend"]//text()').extract()
            item['startDate'] = info.xpath('.//abbr[@class="dtstart"]//text()').extract()
            items.append(item)
        # NOTE: the original had an unreachable Python-2 `print items` after
        # the return; removed as dead code.
        return items