-1

我正在尝试从网站https://tonaton.com/en/ads/ghana/electronics抓取。有一个“下一步”按钮,我想单击并抓取内容。问题是该按钮的xpath或css选择器在scrapy shell和splash中都没有返回任何值,我被卡住了。我不能进去刮我需要的东西。请问有什么帮助吗?这就是我能走多远,但我没有得到正确的结果。

# -*- coding: utf-8 -*-

进口scrapy 进口scrapy_selenium 从scrapy_selenium 进口SeleniumRequest

类 VisionSpider(scrapy.Spider): name = 'vision'

def start_requests(self):
    yield SeleniumRequest(
        url= 'https://tonaton.com',
        wait_time=3,
        screenshot=True,
        callback=self.parse
    )


def parse(self, response): 
    businesses = response.xpath(
        "//a[@class='link--1t8hM gtm-home-category-link-click']")
    for business in businesses:
        link = business.xpath(".//@href").get()
        category = business.xpath(".//div[2]/p/text()").get()

        yield response.follow(url=link, callback=self.parse_business, meta={'business_category': category})


def parse_business(self, response):
    
    category = response.request.meta['business_category']
    rows = response.xpath("//a[@class='card-link--3ssYv gtm-ad-item']")
    for row in rows:
        new_link = row.xpath(".//@href").get()

        yield response.follow(url=new_link, callback=self.next_parse, meta={'business_category': category})

    next_page = response.xpath("//div[@class = 'action-button--1O8tU']")
    if next_page:
        button = next_page.click()
        yield SeleniumRequest(
            url=button,
            wait_time=3,
            callback=self.parse
        )



def next_parse(self, response):
    category = response.request.meta['business_category']
    lines = response.xpath("//a[@class='member-link--IzDly gtm-visit-shop']")
    for line in lines:
        next_link = line.xpath(".//@href").get()

        yield response.follow(url=next_link, callback=self.another_parse, meta={'business_category': category})

def another_parse(self, response):
    category = response.request.meta['business_category']
    button = response.xpath("//button[@class = 'contact-section--1qlvP gtm-show-number']").click()
    
    yield response.follow(url=button, callback=self.new_parse, meta={'business_category': category})


def new_parse(self, response):
    category = response.request.meta['business_category']
    times = response.xpath("//div[@class='info-container--3pMhK']")
    for time in times:
        name = time.xpath(".//div/span/text()").get()
        location = time.xpath(".//div/div/div/span/text()").get()
        phone = time.xpath(".//div[3]/div/button/div[2]/div/text()").get()

        yield {
            'business_category': category,
            'business_name': name,
            'phone': phone,
            'location': location
        }
4

1 回答 1

0

我已经尝试过了,但分页仍然无法正常工作。此外,在我单击调用按钮进行抓取的地方,返回所需的输出需要相当长的时间。有没有办法让它更快?

class VisionSpider(scrapy.Spider):
    name = 'vision'
    main_domains = ['tonaton.com']
    start_urls =['https://tonaton.com']

def parse(self, response):   
    businesses = response.xpath("//a[@class='link--1t8hM gtm-home-category-link-click'][1]")
    for business in businesses:
        link = business.xpath(".//@href").get()
        category = business.xpath(".//div[2]/p/text()").get()

        yield response.follow(url=link, callback=self.parse_business, meta={'business_category': category})


def parse_business(self, response):
    category = response.request.meta['business_category']
    rows = response.xpath("//a[@class='card-link--3ssYv gtm-ad-item']")
    for row in rows:
        new_link = row.xpath(".//@href").get()
        if new_link:

            yield response.follow(url=new_link, callback=self.new_parse, meta={'business_category': category, 'newlink':new_link})

    chrome_options = Options()
    chrome_options.add_argument("--headless")

    chrome_path = which("chromedriver")
    driver = webdriver.Chrome(options=chrome_options, executable_path=chrome_path)
    driver.get(response.url)
    driver.maximize_window

    next_page = wait(driver, 300).until(EC.frame_to_be_available_and_switch_to_it((By.XPATH, "//div[@class='icon--3D09z extra-small--_AIuZ arrow-right--17oRn']"))) 
    if  next_page:
        next_page.click()

        yield SeleniumRequest(callback=self.parse_business)
    
    driver.close()



def new_parse(self, response):
    category = response.request.meta['business_category']
    chrome_options = Options()
    chrome_options.add_argument("--headless")
# options=chrome_options
    chrome_path = which("chromedriver")  
    driver = webdriver.Chrome(options=chrome_options, executable_path=chrome_path)
    driver.get(response.url)
    driver.maximize_window
    category = response.request.meta['business_category']

    call_button = wait(driver, 500).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='call-button--3uvWj']")))
    call_button.click()
    
    html = driver.page_source
    resp = Selector(text=html)

    driver.close()

    contacts = resp.xpath("//div[@class='call-button--3uvWj']/div[1]")
    for contact in contacts:
        phone = contact.xpath(".//text()").get()
    times = resp.xpath("//div[@class='details-section--2ggRy']")
    for time in times:
        name = time.xpath(".//div[2]/div/div[2]/div/div/div/div/div/div/div/div/text()").get()
        if name is None:
            name =time.xpath(".//div[2]/div/div[2]/div/div/div/div/div/div/div/div/div/text()").get()

        location = time.xpath(".//div/div/div/span/a/span/text()[1]").get()
        region = time.xpath(".//div/div/div/span/a[2]/span/text()").get()

        yield {
            'business_category': category,
            'business_name': name,
            'phone': phone,
            'region':region,
            'location': location
        }
于 2021-09-09T16:02:15.480 回答