我是 Scrapy 新手，在从网站提取数据时遇到了问题。我认为代码中存在逻辑错误：我的爬虫确实抓取了页面，但没有返回任何抓取到的数据。如能得到任何帮助，我将不胜感激！
rules = (
    # Rule 1 -- profile links: anchors that lead to the individual
    # profile pages.
    Rule(
        SgmlLinkExtractor(
            restrict_xpaths=('//div/div/div/span/a',),
            allow=(r'.*',),
        ),
        follow=True,
        callback='parse_item',
    ),
    # Rule 2 -- pagination: elements carrying the "on" class token, used
    # to step through the listing pages.
    Rule(
        SgmlLinkExtractor(
            restrict_xpaths=(
                '//*[contains(concat(" ", normalize-space(@class), " "), " on ")]',
            ),
            allow=(r'.*',),
        ),
        follow=True,
        callback='parse_item',
    ),
)
def parse_item(self, response):
    """Build a RealtorSpiderItem from a crawled page.

    Logs the page URL at INFO level, then fills the item's ``name``,
    ``link`` and ``city`` fields from the response.

    Args:
        response: The response handed to this callback by the crawl rules.

    Returns:
        The populated RealtorSpiderItem. ``name`` and ``city`` hold the
        result of ``.extract()`` on the matching elements; ``link`` is a
        one-element list containing the URL of the page.
    """
    self.log('parse_item called for: %s' % response.url, level=log.INFO)
    hxs = HtmlXPathSelector(response)
    item = RealtorSpiderItem()
    # Match elements whose class attribute contains the exact token
    # "screenname". There is no text() step, so the elements themselves
    # are extracted.
    item['name'] = hxs.select('//*[contains(concat(" ", normalize-space(@class), " "), " screenname ")]').extract()
    # The previous '@href' query ran against the document root, which has
    # no attributes, so it always produced an empty list. Record the URL of
    # the page being parsed instead, wrapped in a list to keep the same
    # shape as the other fields.
    # NOTE(review): assumes 'link' is meant to be the profile page URL --
    # confirm against the item's consumers.
    item['link'] = [response.url]
    # Same class-token match as above, for the "locality" class.
    item['city'] = hxs.select('//*[contains(concat(" ", normalize-space(@class), " "), " locality ")]').extract()
    return item