0

我正在尝试将 scrapy 与 selenium 集成以抓取 instagram。首先,我想用 selenium 完成登录表单,然后把目标 URL 交给 scrapy.Request 来解析数据,因为 scrapy 的速度要快得多。我遇到的问题是:前一步(用 selenium 登录)是有效的,但是当我发出 scrapy.Request 时它不起作用,因为下一个解析方法中的屏幕截图显示浏览器仍然停留在上一页。

这是我尝试过的:

import scrapy
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC
import time

class InstaSpider(scrapy.Spider):
    """Log in to Instagram with Selenium, then fetch a tag page with Scrapy.

    The login form is filled in through the Selenium driver that
    scrapy-selenium attaches to the response of a ``SeleniumRequest``.
    The follow-up tag page is requested with a plain ``scrapy.Request``;
    the browser's cookies are copied onto that request so that it is sent
    with the logged-in session.
    """

    name = 'insta'
    allowed_domains = ['instagram.com']
    start_urls = ['http://instagram.com/']

    def start_requests(self):
        """Open every start URL in the Selenium-driven browser."""
        for url in self.start_urls:
            yield SeleniumRequest(url=url, callback=self.parse)

    def parse(self, response):
        """Submit the login form and request the tag page with the session.

        Args:
            response: Response of a ``SeleniumRequest``; its request meta
                carries the live WebDriver under the ``'driver'`` key.

        Yields:
            A ``scrapy.Request`` for the tag page, carrying the browser's
            cookies and handing the driver to ``parse_photos``.
        """
        # Imported locally so the block does not depend on an extra
        # top-of-file import.
        from selenium.common.exceptions import TimeoutException

        driver = response.request.meta['driver']
        wait = WebDriverWait(driver, 10)

        # The cookie-consent banner is not necessarily shown; its absence
        # must not abort the login.
        try:
            wait.until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//button[text()='Accept All']")
                )
            ).click()
        except TimeoutException:
            self.logger.debug("No cookie-consent button found; continuing.")

        # Wait for the form fields instead of looking them up immediately:
        # they may not be in the DOM yet right after the page loads.
        wait.until(
            EC.visibility_of_element_located(
                (By.XPATH, "//input[@name='username']")
            )
        ).send_keys("xxx")
        time.sleep(1)
        wait.until(
            EC.visibility_of_element_located(
                (By.XPATH, "//input[@name='password']")
            )
        ).send_keys("xxx")
        time.sleep(1)
        wait.until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, "#loginForm > div > div:nth-child(3) > button")
            )
        ).click()

        # Give the login round-trip time to finish before reading cookies.
        time.sleep(6)
        driver.get_screenshot_as_file("screenshot.png")

        # A plain scrapy.Request is downloaded by Scrapy, not by the
        # browser, so it does not share the browser session on its own.
        # Copy the cookies over to send the request as the logged-in user.
        session_cookies = {
            cookie['name']: cookie['value'] for cookie in driver.get_cookies()
        }
        yield scrapy.Request(
            url='https://www.instagram.com/explore/tags/tesla/',
            callback=self.parse_photos,
            cookies=session_cookies,
            cb_kwargs={'driver': driver},
        )

    def parse_photos(self, response, driver=None):
        """Extract post links from the tag page.

        Args:
            response: Response downloaded by Scrapy for the tag page.
            driver: Optional WebDriver, used only for a debug screenshot.

        Yields:
            One ``{'links': href}`` dict per matched anchor, where ``href``
            is the extracted attribute string.
        """
        if driver is not None:
            # NOTE: the browser was not navigated by the scrapy.Request, so
            # this screenshot shows the page the driver was last on, not
            # the page contained in ``response``.
            driver.get_screenshot_as_file("inTeslaPage.png")

        # NOTE(review): the class names below are generated by the site and
        # the page may be rendered client-side; if nothing matches, fetch
        # this URL with SeleniumRequest instead - TODO confirm.
        hrefs = response.xpath(
            ".//div[@class='Nnq7C weEfm']/div/a/@href"
        ).getall()
        for href in hrefs:
            # Yield the extracted string, not the Selector object, so the
            # item can be serialised by feed exporters.
            yield {'links': href}
4

0 回答 0