我无法完全理解 SGML Link Extractor 的工作原理。使用 Scrapy 制作爬虫时,我可以成功地从使用特定 URL 的链接中提取数据。问题是使用规则来跟踪特定 URL 中的下一页链接。
我认为问题出在 allow() 参数上。把规则（Rule）加入代码后，命令行中不显示任何结果，也不会跟随到下一页的链接。
非常感谢任何帮助。
这是代码...
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider
from scrapy.contrib.spiders import Rule
from tutorial.items import TutorialItem
class AllGigsSpider(CrawlSpider):
    """Crawl allgigs.co.uk London listing pages, following "more" pagination
    links and extracting one item per event entry.

    Each crawled page yields a list of TutorialItem with artist/date fields
    taken from the hCalendar-style markup (`entry vevent` divs).
    """
    name = "allGigs"
    # BUG FIX: allowed_domains must contain bare domain names, NOT URLs.
    # With "http://www.allgigs.co.uk/" here, Scrapy's OffsiteMiddleware
    # filtered out every extracted link, so the next-page links were never
    # followed — the symptom described in the question.
    allowed_domains = ["allgigs.co.uk"]
    start_urls = [
        "http://www.allgigs.co.uk/whats_on/London/clubbing-1.html",
        "http://www.allgigs.co.uk/whats_on/London/festivals-1.html",
        "http://www.allgigs.co.uk/whats_on/London/comedy-1.html",
        "http://www.allgigs.co.uk/whats_on/London/theatre_and_opera-1.html",
        "http://www.allgigs.co.uk/whats_on/London/dance_and_ballet-1.html",
    ]
    # Follow pagination links found inside the <div class="more"> container,
    # parse each followed page with parse_me, and keep following further pages.
    rules = (
        Rule(
            SgmlLinkExtractor(allow=(), restrict_xpaths=('//div[@class="more"]',)),
            callback="parse_me",
            follow=True,
        ),
    )

    def parse_me(self, response):
        """Extract a TutorialItem for every event entry on the page.

        Returns a list of TutorialItem; each field is the list of text nodes
        matched within that single entry.
        """
        hxs = HtmlXPathSelector(response)
        infos = hxs.xpath('//div[@class="entry vevent"]')
        items = []
        for info in infos:
            item = TutorialItem()
            # BUG FIX: query relative to the current entry ('.//' on `info`).
            # The original used absolute '//' paths on `hxs`, which re-matched
            # every entry on the whole page for every single item.
            item['artist'] = info.xpath('.//span[@class="summary"]//text()').extract()
            item['date'] = info.xpath('.//abbr[@class="dtstart dtend"]//text()').extract()
            item['endDate'] = info.xpath('.//abbr[@class="dtend"]//text()').extract()
            item['startDate'] = info.xpath('.//abbr[@class="dtstart"]//text()').extract()
            items.append(item)
        # Removed the unreachable `print items` that followed `return`.
        return items