我有一个 CrawlSpider 脚本,它使用 splash 在 javascript 页面上登录。然而,在成功登录后,继承的 self.parse() 函数似乎没有被调用。爬取第一页后蜘蛛关闭。
我认为 CrawlSpider 在 start_requests 产生响应后会自动调用 self.parse 方法。但即使使用显式回调,self.parse 似乎也不会被调用。
我做错了什么?
剧本:
#!/usr/bin/env python3
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from harvest.items import HarvestItem
from scrapy_splash import SplashRequest
class TestSpider(CrawlSpider):
name = 'test'
allowed_domains = ['test.secure.force.com', 'login.salesforce.com']
login_url = 'https://test.secure.force.com/jSites_Home'
rules = (Rule(LinkExtractor(restrict_xpaths='//*[@id="nav"]/ul/li/a[@title="Assignments"]')),
Rule(LinkExtractor(restrict_xpaths='//*/table/tbody/tr[2]/td[1]/a'), callback='parse_item'),
)
def start_requests(self):
script = """
function main(splash)
local url = splash.args.url
assert(splash:go(url))
assert(splash:wait(10))
splash:set_viewport_full()
local search_input = splash:select('input[name=username]')
search_input:send_text("someuser")
local search_input = splash:select('input[name=pw]')
search_input:send_text("p4ssw0rd")
assert(splash:wait(5))
local submit_button = splash:select('input[class=btn]')
submit_button:click()
assert(splash:wait(10))
return {html = splash:html(),}
end
"""
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko'
') Chrome/55.0.2883.95 Safari/537.36'}
yield SplashRequest(url=self.login_url,
callback=self.parse,
endpoint='execute',
args={
'lua_source': script,
'wait': 5
},
splash_headers=headers,
headers=headers)
def parse_item(self, response):
items = HarvestItem()
items['start'] = response.xpath('(//*/table[@class="detailList"])[3]/tbody/tr[1]/td[1]/span/text()').extract()
return items