我已成功合并 django 和 scrapy,并希望将我的 items 对象保存到数据库中。保存工作正常,但没有所有元素。
我对 python、scrapy 和 django 还很陌生,我想我错过了一些东西,但无法解决。
这是我的蜘蛛代码:
from scrapy.http import FormRequest, Request
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy import log
from scrapy.contrib.loader import XPathItemLoader
from datacrowdscrapy.items import DatacrowdItem
class DatacrowdSpider(BaseSpider):
name = 'datacrowd'
start_urls = ['https://www.exemple.com/login']
def parse(self, response):
parsed = [FormRequest.from_response(
response,
formdata={
'login': 'email@gmail.com',
'password': 'password'
},
callback=self.after_login)]
return parsed
def after_login(self, response):
# check login succeed before going on
if "authentication failed" in response.body:
self.log("Login failed", level=log.ERROR)
return
selector = HtmlXPathSelector(response)
investmentsLinks = selector.select('//a[contains(@class, "myClass")]/@href').extract()
for link in investmentsLinks:
curDatacrowdItem = XPathItemLoader(item=DatacrowdItem(), response=response)
curDatacrowdItem.add_value('url', link)
curRequest = Request(url=link, callback=self.parse_investments, meta={'item': curDatacrowdItem})
yield curRequest
def parse_investments(self, response):
selector = HtmlXPathSelector(response)
curDatacrowdItem = response.meta['item']
# Details
details = selector.select('//td/div[contains(@class, "myClass")]/text()').extract()
curDatacrowdItem.add_value('someVal', details[0].strip())
/* ... */
# Get nbInvestors
investorLink = selector.select('//ul[contains(@id, "myId")]/li/@onclick').re(r'window.location.href=\'(http.+/data.+)\'')
curRequest = Request(url=investorLink[0], callback=self.parse_investors, meta={'item': curDatacrowdItem})
yield curRequest
# Get last company details
detailsLink = selector.select('//ul[contains(@id, "myData")]/li/@onclick').re(r'window.location.href=\'(http.+/company-details.+)\'')
curRequest = Request(url=detailsLink[0], callback=self.parse_details, meta={'item': curDatacrowdItem})
yield curRequest
def parse_investors(self, response):
selector = HtmlXPathSelector(response)
curDatacrowdItem = response.meta['item']
nbInvestors = len(selector.select('//ul/li[contains(@class, "myClass")]'))
curDatacrowdItem.add_value('nbInvestors', nbInvestors)
return curDatacrowdItem
def parse_details(self, response):
selector = HtmlXPathSelector(response)
curDatacrowdItem = response.meta['item']
# Company name
name = selector.select('//div[contains(@class, "myClass")]/h2/text()').extract()
curDatacrowdItem.add_value('name', name[0].strip())
item = curDatacrowdItem.load_item()
item.save() # Here I'm persisiting datas
return item
我收到这样的错误日志:
[datacrowd] ERROR: Spider must return Request, BaseItem or None, got 'XPathItemLoader' in <GET http://www.exemple.com/url/slug>
知道我做错了什么吗?
干杯,
斯奈特