I have the following Scrapy spider. I need to scrape not only the top-level pages listed in my sitemap but also the pages that are first-level children of those pages (i.e. pages linked directly from them). I then need to concatenate the text scraped from each child page with the `body` field of the item built in the parent's `parse` method. Could anyone help me with code that does this?
from scrapy.contrib.spiders import SitemapSpider
from scrapy.selector import HtmlXPathSelector
from cvorgs.items import CvorgSite
class CvorgSpider(SitemapSpider):
    """Sitemap-based spider that produces one ``CvorgSite`` item per response."""

    name = 'cvorg_spider'
    sitemap_urls = ["http://www.urbanministry.org/cvorg_urls.xml"]

    def parse(self, response):
        """Build a ``CvorgSite`` item from a single response.

        The item receives:
          * ``url``   -- ``response.url``
          * ``title`` -- the extracted result of ``//title/text()``
          * ``meta``  -- the extracted ``content`` of the description meta tag
          * ``body``  -- all text found under ``<p>`` elements in the body,
                         joined with single spaces, with every double quote
                         replaced by a single quote
        """
        selector = HtmlXPathSelector(response)

        site = CvorgSite()
        site['url'] = response.url
        site['title'] = selector.select('//title/text()').extract()
        site['meta'] = selector.select(
            '/html/head/meta[@name="description"]/@content'
        ).extract()

        # Collapse the paragraph text fragments into one string, then swap
        # double quotes for single quotes before storing it on the item.
        paragraph_texts = selector.select('//body//p//text()').extract()
        joined_text = ' '.join(paragraph_texts)
        site['body'] = joined_text.replace('"', '\'')

        return site