I have the parse method given below. I use Selenium to load the page first, so I can reach pages the spider cannot fetch directly, and I collect the individual URLs and hand them to another parse method that extracts items from each page. The problem is that this parse method blocks the other parses until every paginated page has been visited, which stalls the whole system. I tried adding a sleep, but that stops the entire engine, not just this parse method.
Any pointers on how I could optimise this, or at least make the sleep work so that it does not stop the engine?
import time

from scrapy import log
from scrapy.conf import settings
from scrapy.http import Request
from selenium import webdriver
from pyvirtualdisplay import Display


def parse(self, response):
    '''Parse the first results page and extract item links.'''
    item_link_xpath = "/html/body/form/div[@class='wrapper']//a[@title='View & Apply']"
    pagination_xpath = "//div[@class='pagination']/input"
    page_xpath = pagination_xpath + "[@value=%d]"
    display = Display(visible=0, size=(800, 600))
    display.start()
    browser = webdriver.Firefox()
    browser.get(response.url)
    log.msg('Loaded search results', level=log.DEBUG)
    page_no = 1
    while True:
        log.msg('Scraping page: %d'%page_no, level=log.DEBUG)
        for link in [item_link.get_attribute('href') for item_link in browser.find_elements_by_xpath(item_link_xpath)]:
            yield Request(link, callback=self.parse_item_page)
        page_no += 1
        if page_no > settings['PAGINATION_PAGES']:
            break
        log.msg('Using xpath: %s' % (page_xpath % page_no), level=log.DEBUG)
        # find_element_by_xpath() raises NoSuchElementException rather than
        # returning None, so use the plural form to check for the next page.
        page_elements = browser.find_elements_by_xpath(page_xpath % page_no)
        if not page_elements:
            break
        page_elements[0].click()
        if settings['PAGINATION_SLEEP_INTERVAL']:
            seconds = int(settings['PAGINATION_SLEEP_INTERVAL'])
            log.msg('Sleeping for %d'%seconds, level=log.DEBUG)
            time.sleep(seconds)
    log.msg('Scraped listing pages, closing browser.', level=log.DEBUG)
    browser.quit()  # quit() ends the driver session, not just the current window
    display.stop()
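
One possible direction, sketched here under assumptions rather than as a verified fix: run the blocking Selenium work in Twisted's thread pool with twisted.internet.threads.deferToThread, so any sleeping happens off the reactor thread and the engine keeps scheduling other requests, then turn the collected links into Requests once the Deferred fires. The helper names _collect_item_links and _make_requests are placeholders, and this assumes Scrapy will wait on a Deferred returned from a spider callback, which is worth verifying against the Scrapy version in use.

from twisted.internet import threads

from scrapy.http import Request
from selenium import webdriver
from pyvirtualdisplay import Display


def parse(self, response):
    '''Hand the blocking Selenium session to a worker thread.'''
    # deferToThread() runs the function in Twisted's thread pool, so any
    # sleeping or waiting inside it no longer stalls the reactor.
    dfd = threads.deferToThread(self._collect_item_links, response.url)
    dfd.addCallback(self._make_requests)
    return dfd

def _collect_item_links(self, url):
    '''Blocking helper (placeholder name): gather item hrefs with Selenium.'''
    item_link_xpath = "/html/body/form/div[@class='wrapper']//a[@title='View & Apply']"
    display = Display(visible=0, size=(800, 600))
    display.start()
    browser = webdriver.Firefox()
    links = []
    try:
        browser.get(url)
        # The same pagination loop as above would go here, appending hrefs
        # to `links` instead of yielding Requests; time.sleep() is harmless
        # in this thread because the reactor is not waiting on it.
        for item_link in browser.find_elements_by_xpath(item_link_xpath):
            links.append(item_link.get_attribute('href'))
    finally:
        browser.quit()
        display.stop()
    return links

def _make_requests(self, links):
    '''Turn the collected hrefs into Requests once the Deferred fires.'''
    return [Request(link, callback=self.parse_item_page) for link in links]

The trade-off is that all item links arrive in one batch when the Deferred resolves, instead of trickling out page by page, but the reactor stays free for the other callbacks in the meantime.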