I have the parse method given below. I use Selenium to load the page first, so that the spider can reach pages it cannot fetch directly, and I collect the individual URLs and pass them to another parse method that extracts the items from each page. The problem is that this parse method blocks the other parses until all of the pages have been visited, which stalls the whole system. I tried adding a sleep, but that stops the entire engine, not just this parse method.

Any pointers on how I could optimize this, or at least how to make the sleep work so that it does not stop the engine?
def parse(self, response):
    '''Parse first page and extract page links'''
    item_link_xpath = "/html/body/form/div[@class='wrapper']//a[@title='View & Apply']"
    pagination_xpath = "//div[@class='pagination']/input"
    page_xpath = pagination_xpath + "[@value=%d]"
    display = Display(visible=0, size=(800, 600))
    display.start()
    browser = webdriver.Firefox()
    browser.get(response.url)
    log.msg('Loaded search results', level=log.DEBUG)
    page_no = 1
    while True:
        log.msg('Scraping page: %d' % page_no, level=log.DEBUG)
        for link in [item_link.get_attribute('href')
                     for item_link in browser.find_elements_by_xpath(item_link_xpath)]:
            yield Request(link, callback=self.parse_item_page)
        page_no += 1
        log.msg('Using xpath: %s' % (page_xpath % page_no), level=log.DEBUG)
        # find_elements (plural) returns an empty list instead of raising
        # NoSuchElementException when the next-page button is missing
        page_elements = browser.find_elements_by_xpath(page_xpath % page_no)
        if not page_elements or page_no > settings['PAGINATION_PAGES']:
            break
        page_elements[0].click()
        if settings['PAGINATION_SLEEP_INTERVAL']:
            seconds = int(settings['PAGINATION_SLEEP_INTERVAL'])
            log.msg('Sleeping for %d' % seconds, level=log.DEBUG)
            time.sleep(seconds)
    log.msg('Scraped listing pages, closing browser.', level=log.DEBUG)
    browser.close()
    display.stop()
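
For reference, the two settings used above come from my project settings; they look roughly like this (the values here are only examples, not the real ones):

PAGINATION_PAGES = 10            # example value: stop after this many result pages
PAGINATION_SLEEP_INTERVAL = 5    # example value: seconds to wait after each pagination click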
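
To clarify what I mean by a sleep that does not stop the engine: Scrapy runs on the Twisted reactor, so I assume the non-blocking equivalent of time.sleep would be something scheduled on the reactor, e.g. twisted.internet.task.deferLater. A minimal sketch of the idea (not wired into my spider; the nonblocking_sleep name is just mine):

from twisted.internet import reactor
from twisted.internet.task import deferLater

def nonblocking_sleep(seconds):
    # Returns a Deferred that fires after `seconds`; the reactor keeps
    # running in the meantime, unlike time.sleep which freezes it.
    return deferLater(reactor, seconds, lambda: None)

What I don't know is how to use something like this from inside parse without breaking the generator of Requests, or whether the whole Selenium pagination should be moved out of the callback instead.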