python - 带有javascript下一页的Scrapy和Selenium不起作用

Question

我将刮掉下一页的所有职位，但它只刮掉第一页。该网址仅包含四页，每页包含 25 个项目，但我从未到达下一页。

import scrapy
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from shutil import which
from time import sleep
from scrapy.selector import Selector
from selenium.common.exceptions import NoSuchElementException


class JobSpider(scrapy.Spider):
    name = "joblists"
    start_urls = ["https://epco.taleo.net/careersection/alljobs/jobsearch.ftl"]

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.driver.maximize_window()
        self.driver.implicitly_wait(20)
        
    def parse(self, response):
        
        self.driver.get(response.url)
        sleep(5)

        try:
            wait = WebDriverWait(self.driver, 20)
            next_button = wait.until(EC.element_to_be_clickable((By.XPATH, '(//*[contains(@id, "requisitionListInterface.pagerDivID4019.panel")])[7]/span/a')))
            self.driver.execute_script("return arguments[0].scrollIntoView();", next_button)
            next_button.click()
            #sleep(5)
        except NoSuchElementException:
            sleep(5)
                
        
        resp = Selector(text = self.driver.page_source)
        titles = resp.xpath('//*[@class="ftlcopy ftlrow"]') 
        for title in titles:
            yield{
                'Title': title.xpath('.//*[@class="titlelink"]/a/text()').get()
            }
        #self.driver.quit()

score 1 · Accepted Answer

在选择任何技术之前，您应该先查看 F12->Network。当您在页面轮播中单击“下一步”时，您应该会看到一个 Post 请求出现。

然后，当您了解它的工作原理时，您可以选择一个库。在这里，我只是建议你requests & beautifulsoup。

下面的两个函数（cleanData()和groupData()用于清理和格式化收集到的数据周围的垃圾。

import requests as rq
from bs4 import BeautifulSoup as bs
import re

def cleanData(datax, elements):
    new = []
    flagx = True
    for row in datax:
        for el in elements:
            if row.startswith(el):
                flagx = False
                break

        if re.match(r"\!\d+\!", row):
            flagx = False

        if flagx == True:
            new.append(row)
        flagx = True
    return new

def groupData(datax):
    new = []
    for i in range(5, len(datax), 6):
        new.append((datax[i-5][1:-1], datax[i][1:-1], datax[i-1][1:-1]))
    return new

这是主要的帖子请求。您需要在有效负载中循环“rlPager.currentPage”的值。例如，我将其设置为“3”。

url = "https://epco.taleo.net/careersection/alljobs/jobsearch.ajax"

headers = {"User-Agent": "User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"}

payload = {"iframemode":"1",
            "ftlpageid":"reqListBasicPage",
            "ftlinterfaceid":"requisitionListInterface",
            "ftlcompid":"rlPager",
            "jsfCmdId":"rlPager",
            "ftlcompclass":"PagerComponent",
            "ftlcallback":"ftlPager_processResponse",
            "ftlajaxid":"ftlx1",
            "rlPager.currentPage":"3",
            "lang":"en"
            }
resp = rq.post(url, headers=headers, data=payload)
soup = bs(resp.content, "lxml")

data = soup.find_all("td")[0].text

garbage_elements = ["!ftl", "ftl", "!!", "!false!", "!true!", " ", "! !", "!list", "!rlPager", "!Go to", "!Re-apply", "!Apply", "!Submission", "!Add ", "!https", "!true", "!The list of", "!Next", "!Previous", "! "]
data_cleaned = groupData(cleanData(data.split('|'), garbage_elements))

len(data_cleaned)

data_cleaned包含您需要的第 3 页数据。设置一个循环，您将获得所有页面数据。

[('000BBE', 'Manager, Fleet Administration', 'Jun 30, 2021'),
 ('000BCP', 'Technician, Measurement (Farmington, NM)', 'Jul 19, 2021'),
 ('000B92', 'Driver - ETC (Breaux Bridge)', 'Jun 2, 2021'),
 ('000B5D', 'Operator, Plant Trainee (Orla, TX)', 'Apr 15, 2021'),
 ('000B7B', 'Deckhand, Inland', 'Jun 28, 2021'),
 ('000B5C', 'Operator, Plant Trainee (Carlsbad, NM)', 'Apr 26, 2021'),
 ('000BCE', 'Accountant', 'Jul 12, 2021'),
 ('000BD3', 'Engineer, Senior Project', 'Jul 19, 2021'),
 ('000BE5', 'Gauger (Midland, TX)', 'Jul 21, 2021'),
 ('000BA1', 'Specialist, Senior Network Architecture', 'Jun 11, 2021'),
 ('000BCW', 'Engineer, Senior Measurement', 'Jul 15, 2021'),
 .......
 ('000B6Z', 'Driver - ETC (Baytown, TX)', 'Apr 30, 2021'),
 ('000B83', 'Specialist, Senior Systems', 'May 26, 2021'),
 ('000BA2', 'Specialist, Lead IT Security Engineering', 'Jun 11, 2021')]

score 0 · Accepted Answer

最后，我已经达到了所需的解决方案，并通过下拉选择解决了这个问题。

我的代码：

import scrapy
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from shutil import which
from time import sleep
from scrapy.selector import Selector
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.select import Select


class JobSpider(scrapy.Spider):
    name = "joblists"
    start_urls = ["https://epco.taleo.net/careersection/alljobs/jobsearch.ftl"]

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.driver.maximize_window()
        self.driver.implicitly_wait(20)
        
    def parse(self, response):
        
        self.driver.get(response.url)
        sleep(5)

        try:
            actions = ActionChains(self.driver)
            wait = WebDriverWait(self.driver, 20)
            dropdown = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="requisitionListInterface.dropListSize"]')))
            sleep(1)
            actions.move_to_element(dropdown).perform()
            sleep(0.5)
            dropdown = Select(self.driver.find_element_by_xpath('//*[@id="requisitionListInterface.dropListSize"]'))
            dropdown.select_by_visible_text('100')
            
        except NoSuchElementException:
            pass
                
        
        resp = Selector(text = self.driver.page_source)
        titles = resp.xpath('//*[@class="ftlcopy ftlrow"]') 
        for title in titles:
            yield{
                'Title': title.xpath('.//*[@class="titlelink"]/a/text()').get()
            }
        self.driver.quit()

python - 带有javascript下一页的Scrapy和Selenium不起作用

2 回答 2

Related

Reference