I have been following some online tutorials to scrape emails from Craigslist with Scrapy. I have the code below, but when I run the crawl command and export to a JSON file, the file is created but the only thing in it is a single "[".
Any help would be appreciated. Here is my code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy_demo.items import ScrapyDemoItem
import urlparse
from scrapy.http.request import Request


class ScrapyDemoSpider(BaseSpider):
    name = "scrapy_demo"
    allowed_domains = ["buffalo.craigslist.org"]
    start_urls = ['http://buffalo.craigslist.org/search/cps/']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        listings = hxs.select('//....')
        links = []

        # scrape listings page to get listing links
        for listing in listings:
            link = listing.select('..../@href').extract()[0]
            links.append(link)

        # parse listing url to get content of the listing page
        for link in links:
            item = ScrapyDemoItem()
            item['link'] = link
            yield Request(urlparse.urljoin(response.url, link), meta={'item': item}, callback=self.parse_listing_page)

        # get next button link
        next_page = hxs.select("//..../@href").extract()[0]
        if next_page:
            yield Request(urlparse.urljoin(response.url, next_page), self.parse)

    # scrape listing page to get content
    def parse_listing_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.request.meta['item']
        item['title'] = hxs.select('//..../text()').extract()[0]
        item['content'] = hxs.select('//..../text()').extract()[0]
        yield item
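
In case it matters, my items.py defines the three fields the spider fills in. It looks roughly like this (reproduced from memory, so the exact declarations may differ slightly):

    from scrapy.item import Item, Field


    class ScrapyDemoItem(Item):
        # fields populated by the spider above
        link = Field()
        title = Field()
        content = Field()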
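And this is the command I run from the project directory to crawl and export to JSON (the output filename is just what I happen to use):

    scrapy crawl scrapy_demo -o items.json -t json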