我一直在尝试登录一个网站,然后抓取一些只有在登录后才能访问的网址。
def start_requests(self):
    """Log in through Splash and pass the rendered result to ``after_login``.

    The Lua script loads the login page, fills in the username and
    password fields, clicks the sign-in button and returns the cookies,
    the headers of the last response and the rendered HTML.

    Yields:
        A single ``SplashRequest`` for the login page, sent to the
        ``execute`` endpoint.
    """
    # ``submit_button:click()`` only dispatches the click; it does not block
    # until the form submission and the following redirect are done.  The
    # script therefore waits after the click so that ``splash:get_cookies()``
    # sees the cookies set by the login response instead of the pre-login
    # ones.
    script = """
    function main(splash)
      splash:init_cookies(splash.args.cookies)
      assert(splash:go(splash.args.url))
      splash:set_viewport_full()
      local search_input = splash:select('input[name=username]')
      search_input:send_text("MY_USERNAME")
      splash:evaljs("document.getElementById('password').value = 'MY_PASSWORD';")
      local submit_button = splash:select('input[name=signin]')
      submit_button:click()
      assert(splash:wait(5))
      local entries = splash:history()
      local last_response = entries[#entries].response
      return {
        cookies = splash:get_cookies(),
        headers = last_response.headers,
        html = splash:html()
      }
    end
    """
    yield scrapy_splash.SplashRequest(
        url='https://www.website.com/login',
        callback=self.after_login,
        endpoint='execute',
        cache_args=['lua_source'],
        args={'lua_source': script},
    )
def after_login(self, response):
    """Save the post-login page and request the search page in the same session.

    Args:
        response: The Splash response produced by the login script; its
            ``data`` dict carries the ``cookies`` returned by that script.

    Yields:
        A ``SplashRequest`` for the search page whose result is handled by
        ``parse``.
    """
    # The file must be opened for writing (the default mode is read-only),
    # and the encoding is pinned so the dump does not depend on the locale.
    with open('after_login.html', 'w', encoding='utf-8') as out:
        out.write(response.body.decode('utf-8'))

    script = """
    function main(splash)
      splash:init_cookies(splash.args.cookies)
      assert(splash:go(splash.args.url))
      splash:set_viewport_full()
      assert(splash:wait(10))
      return {
        cookies = splash:get_cookies(),
        html = splash:html()
      }
    end
    """
    # Only the cookies are forwarded.  ``response.data['headers']`` holds the
    # *response* headers of the login page; they are not valid request
    # headers, so they are no longer passed as ``headers=`` here.
    yield scrapy_splash.SplashRequest(
        url='https://www.website.com/search?tools',
        callback=self.parse,
        endpoint='execute',
        cookies=response.data['cookies'],
        args={'lua_source': script},
    )
def parse(self, response):
    """Write the rendered search page to ``search_result.html``.

    Args:
        response: The response whose ``body`` holds UTF-8 encoded HTML.
    """
    html = response.body.decode('utf-8')
    # Pin the output encoding so non-ASCII pages are written correctly
    # regardless of the platform's default encoding.
    with open('search_result.html', 'w', encoding='utf-8') as out:
        out.write(html)
我正在按照 Session Handling 一节中的说明进行操作。首先,我登录并被重定向到主页,该页面已正确保存在 after_login.html 中(登录正常)。然后我获取 cookie,并把它们设置到第二个 SplashRequest 中进行搜索,但 search_result.html 中的响应显示用户未登录。为了在不同的 SplashRequest 之间保持会话,我遗漏了什么或做错了什么?
问候,