0

我似乎无法让我的爬虫跟随链接进入 def parse_jobs:我只得到一个结果,它不会对我抓取到的每个结果的 href 逐一发起请求。我得到了很多没有意义的输出,虽然响应是 200,但实际上并没有返回多少有用的信息。我想这可能是我的 xpath 写错了,或者是我设置请求的方式有问题?

更新:我已经解决了只得到单个结果的问题——我漏掉了 get() 后面的括号。但是,我现在只能抓取一个页面,爬虫不会进入下一页抓取任何信息。

这是我的爬虫:

import hashlib
from pathlib import Path
from scrapy.crawler import CrawlerProcess
import scrapy
from scrapy_playwright.page import PageCoroutine
from scrapy.http.response import Response
from pathlib import Path
from typing import Generator, Optional


class JobSpider(scrapy.Spider):
    """Scrape job listings from jobsite.co.uk.

    Flow: render the first results page with Playwright, schedule every
    remaining results page, then follow each job link to ``parse_jobs``,
    which extracts fields and saves a full-page screenshot.
    """

    name = 'job_play'
    start_urls = ['https://jobsite.co.uk/jobs/Degree-Accounting-and-Finance']

    custom_settings = {
        'USER_AGENT':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.2 Safari/605.1.15'
    }

    # Playwright meta shared by every results-page request: the job grid is
    # JS-rendered, so each listing page must go through Playwright and wait
    # for the results container before Scrapy parses it.
    _RESULTS_PAGE_META = dict(
        playwright=True,
        playwright_include_page=True,
        playwright_page_coroutines=[
            PageCoroutine('wait_for_selector', 'div.row.job-results-row')
        ],
    )

    def start_requests(self) -> Generator:
        """Issue the initial Playwright-rendered request for each start URL."""
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                dont_filter=True,
                meta=dict(self._RESULTS_PAGE_META),
            )

    def parse(self, response: Response, current_page: Optional[int] = None) -> Generator:
        """Parse one results page.

        From the first page only, read the last page number and schedule
        pages 2..last; on every page, follow each job link to ``parse_jobs``.
        """
        # Discover pagination only once, from the first page. On later pages
        # the selector may not match and int(None) would raise TypeError.
        if current_page is None:
            last_page = response.xpath(
                '//div[@class="row job-results-row"]//a[5]//text()'
            ).extract_first()
            if last_page and last_page.strip().isdigit():
                for page in range(2, int(last_page) + 1):
                    # NOTE(review): this URL drops the search path from
                    # start_urls — confirm it still targets the same query.
                    yield response.follow(
                        f"https://jobsite.co.uk/jobs?page={page}&action=paging_next.html",
                        cb_kwargs={"current_page": page},
                        # Bug fix: without Playwright meta these follow-up
                        # pages were fetched unrendered, so no jobs appeared
                        # and the spider never progressed past page one.
                        meta=dict(self._RESULTS_PAGE_META),
                    )

        current_page = current_page or 1
        for jobs in response.xpath("//article//div//div[position() mod 7 = 6]/a//@href"):
            yield response.follow(
                jobs,
                callback=self.parse_jobs,
                meta={
                    "playwright": True,
                    "playwright_include_page": True,
                    # One browser context per listing page keeps detail-page
                    # renders isolated from each other.
                    "playwright_context": f"page-{current_page}",
                },
            )

    async def parse_jobs(self, response):
        """Extract fields from a job detail page and screenshot it.

        The screenshot file is named by the SHA-256 of the page URL so
        concurrent jobs never collide on disk.
        """
        url_sha256 = hashlib.sha256(response.url.encode("utf-8")).hexdigest()
        page = response.meta["playwright_page"]
        await page.screenshot(
            path=Path(__file__).parent / "job_test" / f"{url_sha256}.png", full_page=True
        )
        # Release the Playwright page promptly to free browser resources.
        await page.close()
        yield {
            "url": response.url,
            "title": response.xpath("//h1[@class='brand-font']//text()").get(),
            "price": response.xpath("//li[@class='salary icon']//div//text()").get(),
            "organisation": response.xpath("//a[@id='companyJobsLink']//text()").get(),
            "image": f"job_test/{url_sha256}.png",
        }
if __name__ == "__main__":
    # Crawl configuration: route all http/https traffic through the
    # scrapy-playwright download handler on the asyncio reactor, stop
    # after 100 items, and write items as JSON Lines to jobs.jl.
    crawler_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        "CONCURRENT_REQUESTS": 32,
        "CLOSESPIDER_ITEMCOUNT": 100,
        "FEED_URI": 'jobs.jl',
        "FEED_FORMAT": 'jsonlines',
    }

    process = CrawlerProcess(settings=crawler_settings)
    process.crawl(JobSpider)
    process.start()  # blocks until the crawl finishes
4

0 回答 0