# Goal: scrape the job listings from every results page. The site has four
# pages of 25 items each, but this spider only ever harvested the first page —
# the "next page" navigation never actually advanced.
from shutil import which
from time import sleep

import scrapy
from scrapy.selector import Selector

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class JobSpider(scrapy.Spider):
    """Drive a Selenium-controlled Chrome browser through the paginated
    Taleo job-search results and yield one item per job title.

    The listing is rendered client-side, so Scrapy's own downloader sees
    nothing useful; instead every page is pulled from
    ``driver.page_source`` and parsed with a Scrapy ``Selector``.
    """

    name = "joblists"
    start_urls = ["https://epco.taleo.net/careersection/alljobs/jobsearch.ftl"]

    def __init__(self):
        super().__init__()
        self.driver = webdriver.Chrome()
        self.driver.maximize_window()
        # Implicit wait so element lookups tolerate slow AJAX rendering.
        self.driver.implicitly_wait(20)

    def parse(self, response):
        """Yield ``{'Title': ...}`` dicts for every job on every page.

        Bug fix: the original clicked "next" exactly once *before*
        scraping and never looped, so only a single page was ever
        harvested.  Here we scrape the current page first, then click
        "next" and repeat until the next-page link is no longer
        clickable (i.e. the last page has been reached).
        """
        self.driver.get(response.url)
        wait = WebDriverWait(self.driver, 20)

        while True:
            sleep(5)  # let the AJAX-rendered result rows settle
            page = Selector(text=self.driver.page_source)
            for row in page.xpath('//*[@class="ftlcopy ftlrow"]'):
                yield {
                    'Title': row.xpath('.//*[@class="titlelink"]/a/text()').get(),
                }
            try:
                # NOTE: WebDriverWait.until raises TimeoutException (not
                # NoSuchElementException) when the locator never becomes
                # clickable; the original only caught the latter, so it
                # could never detect the last page.  Catch both to be safe.
                next_button = wait.until(EC.element_to_be_clickable((By.XPATH, '(//*[contains(@id, "requisitionListInterface.pagerDivID4019.panel")])[7]/span/a')))
                # Scroll the pager into view before clicking so the click
                # is not intercepted by an overlapping element.
                self.driver.execute_script("return arguments[0].scrollIntoView();", next_button)
                next_button.click()
            except (TimeoutException, NoSuchElementException):
                break  # no clickable "next" link: this was the last page

        # Release the browser once all pages have been scraped
        # (previously commented out — the Chrome process leaked).
        self.driver.quit()