I can't seem to get my scraper to follow the links through to parse_jobs. I only ever get a single result, and the spider never visits the href of each result I scrape. I'm getting a lot of output that doesn't tell me much: the responses come back with status 200, but very little data is actually returned. I suspect it's either my XPaths or the way I'm building the scraper's requests?
Update:
I've fixed the single-result problem; I had missed the parentheses on get(). However, I can still only scrape the first page, and the crawler never moves on to the next page to scrape anything else.
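For context, this is the kind of slip I mean by the missing parentheses, shown with one of the XPaths from my spider:

# What I had: this assigns the bound get method itself, not the extracted text
title = response.xpath("//h1[@class='brand-font']//text()").get

# What it should be: calling get() extracts the first matching text node
title = response.xpath("//h1[@class='brand-font']//text()").get()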
Here is my spider:
import hashlib
from pathlib import Path
from typing import Generator, Optional

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.http.response import Response
from scrapy_playwright.page import PageCoroutine


class JobSpider(scrapy.Spider):
    name = 'job_play'
    start_urls = ['https://jobsite.co.uk/jobs/Degree-Accounting-and-Finance']
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.2 Safari/605.1.15'
    }

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                dont_filter=True,
                meta=dict(
                    playwright=True,
                    playwright_include_page=True,
                    playwright_page_coroutines=[
                        PageCoroutine('wait_for_selector', 'div.row.job-results-row')
                    ],
                ),
            )

    def parse(self, response: Response, current_page: Optional[int] = None) -> Generator:
        last_page = response.xpath('//div[@class="row job-results-row"]//a[5]//text()').extract_first()
        last_page = int(last_page)
        for page in range(2, last_page + 1):
            yield response.follow(
                f"https://jobsite.co.uk/jobs?page={page}&action=paging_next.html",
                cb_kwargs={"current_page": page},
            )

        current_page = current_page or 1
        for jobs in response.xpath("//article//div//div[position() mod 7 = 6]/a//@href"):
            yield response.follow(
                jobs,
                callback=self.parse_jobs,
                meta={
                    "playwright": True,
                    "playwright_include_page": True,
                    "playwright_context": f"page-{current_page}",
                },
            )

    async def parse_jobs(self, response):
        url_sha256 = hashlib.sha256(response.url.encode("utf-8")).hexdigest()
        page = response.meta["playwright_page"]
        await page.screenshot(
            path=Path(__file__).parent / "job_test" / f"{url_sha256}.png", full_page=True
        )
        await page.close()

        yield {
            "url": response.url,
            "title": response.xpath("//h1[@class='brand-font']//text()").get(),
            "price": response.xpath("//li[@class='salary icon']//div//text()").get(),
            "organisation": response.xpath("//a[@id='companyJobsLink']//text()").get(),
            "image": f"job_test/{url_sha256}.png",
        }


if __name__ == "__main__":
    process = CrawlerProcess(
        settings={
            "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
            "DOWNLOAD_HANDLERS": {
                "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
                "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            },
            "CONCURRENT_REQUESTS": 32,
            "CLOSESPIDER_ITEMCOUNT": 100,
            "FEED_URI": 'jobs.jl',
            "FEED_FORMAT": 'jsonlines',
        }
    )
    process.crawl(JobSpider)
    process.start()
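To make clear what I expect the flow to be, here is a stripped-down sketch of the pagination pattern I'm trying to reproduce. The URL template, page count and XPaths are placeholders rather than the real site's values, and Playwright is left out so only the intended request flow is shown:

import scrapy


class PaginationSketch(scrapy.Spider):
    # Stripped-down illustration of the listing -> next pages -> job links flow
    # I'm aiming for. The URL, page count and XPaths are placeholders, not the
    # real site's values; my actual spider also adds the playwright meta.
    name = "pagination_sketch"
    start_urls = ["https://example.com/jobs?page=1"]

    def parse(self, response, current_page=1):
        # Queue the remaining listing pages, passing the page number to the callback.
        for page in range(2, 5):
            yield response.follow(
                f"https://example.com/jobs?page={page}",
                callback=self.parse,
                cb_kwargs={"current_page": page},
            )

        # Follow every job link found on the current listing page.
        for href in response.xpath("//article//a/@href").getall():
            yield response.follow(href, callback=self.parse_jobs)

    def parse_jobs(self, response):
        # Each job page should end up here and yield one item.
        yield {"url": response.url}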