我有一个链接:https ://www.glassdoor.ca/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11_IP1.htm
我想增加这样的链接:https ://www.glassdoor.ca/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11_IP2.htm
然后 3,4,5.... 我的代码是:
# -*- coding: utf-8 -*-
import scrapy
class GlassdoorSpider(scrapy.Spider):
name = 'glassdoor'
#allowed_domains = ['https://www.glassdoor.ca/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11.htm']
start_urls = ['https://www.glassdoor.ca/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11_IP1.htm']
def parse(self, response):
#main_url = "https://www.glassdoor.ca"
urls = response.css('li.jl > div > div.flexbox > div > a::attr(href)').extract()
for url in urls:
url = "https://www.glassdoor.ca" + url
yield scrapy.Request(url = url, callback = self.parse_details)
next_page_url = "https://www.glassdoor.ca/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11_IP"
if next_page_url:
#next_page_url = response.urljoin(next_page_url)
yield scrapy.Request(url = next_page_url, callback = self.parse)
def parse_details(self,response):
yield{
'Job_Title' : response.css('div.header.cell.info > h2::text').extract()
}
self.log("reached22: "+ response.url)
我想在变量 next_page_url 中增加它。