I am trying to use scrapy-splash with http://www.google.com. After following all the prerequisite steps given in the GitHub repo https://github.com/scrapy-plugins/scrapy-splash, I am able to render the Google page.
However, when I try the same http://www.google.com with Crawlera integrated into scrapy-splash, as described in the GitHub repo https://github.com/scrapinghub/sample-projects/tree/master/splash_crawlera_example, I always get a 504 Gateway Timeout exception.
The default example URL from splash_crawlera_example, http://quotes.toscrape.com/js/, renders successfully through Crawlera, but Google does not. Do I need to change anything in the script to render the Google page?
Here is quotes-js.py:
from pkgutil import get_data
import scrapy
from scrapy_splash import SplashRequest
from w3lib.http import basic_auth_header


class QuotesJsSpider(scrapy.Spider):
    name = 'quotes-js'

    def __init__(self, *args, **kwargs):
        # to be able to load the Lua script on Scrapy Cloud, make sure your
        # project's setup.py file contains the "package_data" setting, similar
        # to this project's setup.py
        self.LUA_SOURCE = get_data(
            'splash_crawlera_example', 'scripts/crawlera.lua'
        ).decode('utf-8')
        super(QuotesJsSpider, self).__init__(*args, **kwargs)

    def start_requests(self):
        yield SplashRequest(
            # url='http://quotes.toscrape.com/js/',
            url='http://www.google.com',
            endpoint='execute',
            splash_headers={
                'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''),
            },
            args={
                'lua_source': self.LUA_SOURCE,
                'crawlera_user': self.settings['CRAWLERA_APIKEY'],
                'wait': 0.5, 'viewport': '1024x2480', 'images': 0, 'timeout': 90,
            },
            # tell Splash to cache the lua script, to avoid sending it for every request
            cache_args=['lua_source'],
        )

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }
        next_page = response.css('li.next > a::attr(href)').extract_first()
        if next_page:
            yield SplashRequest(
                url=response.urljoin(next_page),
                endpoint='execute',
                splash_headers={
                    'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''),
                },
                args={
                    'lua_source': self.LUA_SOURCE,
                    'crawlera_user': self.settings['CRAWLERA_APIKEY'],
                },
                cache_args=['lua_source'],
            )
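To narrow down where the 504 originates, one check is to call Splash's /execute endpoint directly with the same Lua script, bypassing Scrapy entirely. The sketch below is only for isolating the problem; SPLASH_HOST, SPLASH_APIKEY and CRAWLERA_APIKEY are placeholders for your own values, and the script path assumes the sample project's layout:

import requests

SPLASH_HOST = 'http://localhost:8050'   # placeholder: your Splash instance URL
SPLASH_APIKEY = ''                      # placeholder: Splash API key, if hosted
CRAWLERA_APIKEY = ''                    # placeholder: Crawlera API key

# Load the same Lua script the spider sends to Splash.
with open('splash_crawlera_example/scripts/crawlera.lua') as f:
    lua_source = f.read()

# POST the same arguments that SplashRequest would send to /execute.
resp = requests.post(
    SPLASH_HOST + '/execute',
    auth=(SPLASH_APIKEY, ''),
    json={
        'url': 'http://www.google.com',
        'lua_source': lua_source,
        'crawlera_user': CRAWLERA_APIKEY,
        'wait': 0.5,
        'timeout': 90,
    },
)
print(resp.status_code)  # a 504 here means Splash/Crawlera is timing out, not Scrapy
print(resp.text[:500])   # Splash's error body usually names the failing component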
settings.py:
# -*- coding: utf-8 -*-
BOT_NAME = 'splash_crawlera_example'

SPIDER_MODULES = ['splash_crawlera_example.spiders']
NEWSPIDER_MODULE = 'splash_crawlera_example.spiders'

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DOWNLOADER_MIDDLEWARES = {
    'scrapy_crawlera.CrawleraMiddleware': 300,
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

CRAWLERA_APIKEY = ''  # Your Crawlera API key

# Splash settings
SPLASH_URL = 'http://localhost:8050/'  # Splash instance URL from Scrapy Cloud
SPLASH_APIKEY = ''  # Your API key for the Splash instance hosted on Scrapy Cloud

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'

CONCURRENT_REQUESTS = 100
CONCURRENT_REQUESTS_PER_DOMAIN = 100
AUTOTHROTTLE_ENABLED = False
DOWNLOAD_TIMEOUT = 1800
DOWNLOAD_DELAY = 1

DEFAULT_REQUEST_HEADERS = {
    'X-Crawlera-Max-Retries': 0
}
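One thing that makes the 504 hard to diagnose is that Scrapy's HttpErrorMiddleware drops the response before parse() ever sees its body. HTTPERROR_ALLOWED_CODES is a standard Scrapy setting that lets the 504 through; the logging below is my own sketch, not part of the sample project. In settings.py:

HTTPERROR_ALLOWED_CODES = [504]

and at the top of the spider's parse():

    if response.status == 504:
        # Splash's 504 body usually states whether its own timeout fired or
        # the upstream (Crawlera / the target site) never answered.
        self.logger.error('504 from Splash: %s', response.text[:300])
        return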
crawlera.lua:
function use_crawlera(splash)
    -- Make sure you pass your Crawlera API key in the 'crawlera_user' arg.
    -- Have a look at the file spiders/quotes-js.py to see how to do it.
    -- Find your Crawlera credentials in https://app.scrapinghub.com/
    local user = splash.args.crawlera_user
    local host = 'proxy.crawlera.com'
    local port = 8010
    local session_header = 'X-Crawlera-Session'
    local session_id = 'create'

    splash:on_request(function (request)
        -- The commented code below can be used to speed up the crawling
        -- process. It filters requests to undesired domains and useless
        -- resources. Uncomment the rules that make sense for your use case
        -- and add your own.

        -- Discard requests to advertising and tracking domains.
        -- if string.find(request.url, 'doubleclick%.net') or
        --    string.find(request.url, 'analytics%.google%.com') then
        --     request.abort()
        --     return
        -- end

        -- Avoid using Crawlera for subresource fetching to increase crawling
        -- speed. The example below avoids using Crawlera for URLs starting
        -- with 'static.' and ones ending with '.png'.
        -- if string.find(request.url, '://static%.') ~= nil or
        --    string.find(request.url, '%.png$') ~= nil then
        --     return
        -- end

        request:set_header('X-Crawlera-Cookies', 'disable')
        request:set_header(session_header, session_id)
        request:set_proxy{host, port, username=user, password=''}
    end)

    splash:on_response_headers(function (response)
        if response.headers[session_header] ~= nil then
            session_id = response.headers[session_header]
        end
    end)
end

function main(splash)
    use_crawlera(splash)
    splash:go(splash.args.url)
    return splash:html()
end
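Two timeout knobs may also be relevant for a slow, heavily defended page like Google; neither line below is part of the sample script. First, Crawlera applies its own per-request timeout (30 s by default), which, per the Crawlera docs, can be raised with the X-Crawlera-Timeout header (value in milliseconds). Second, a self-hosted Splash instance caps the timeout argument at its --max-timeout setting (60 s by default), so the 'timeout': 90 in the spider's args may be rejected unless Splash was started with a higher cap. The header would go in the same on_request handler:

splash:on_request(function (request)
    -- Raise Crawlera's per-request timeout from its 30 s default
    -- (milliseconds; this header is an addition, not in the sample script).
    request:set_header('X-Crawlera-Timeout', '90000')
    request:set_header('X-Crawlera-Cookies', 'disable')
    request:set_header(session_header, session_id)
    request:set_proxy{host, port, username=user, password=''}
end)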