我正在尝试将 scrapy 与 selenium 集成以抓取 Instagram。首先,我想用 selenium 完成登录表单,然后把 URL 交给 scrapy.Request
来解析数据,
因为 scrapy 的处理速度要快得多。我遇到的问题是:前者(用 selenium 登录)有效,但是当我发出 scrapy.Request
时它不起作用,因为下一个解析方法中的屏幕截图显示我仍停留在上一页。
这是我尝试过的:
import scrapy
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
class InstaSpider(scrapy.Spider):
    """Log in to Instagram with Selenium, then collect post links from a tag page.

    Every request is issued as a ``SeleniumRequest`` so that the page is loaded
    by the same browser that performed the login. A plain ``scrapy.Request``
    is fetched by Scrapy's own downloader: the browser never navigates and the
    logged-in session is not used, which is why a screenshot taken in the next
    callback would still show the previous page.
    """

    name = 'insta'
    allowed_domains = ['instagram.com']
    start_urls = ['http://instagram.com/']

    def start_requests(self):
        """Open each start URL in the Selenium-driven browser."""
        for url in self.start_urls:
            yield SeleniumRequest(url=url, callback=self.parse)

    def parse(self, response):
        """Fill in and submit the login form, then request the tag page.

        Args:
            response: Response produced for a ``SeleniumRequest``; the
                WebDriver is read from ``response.request.meta['driver']``.

        Yields:
            A ``SeleniumRequest`` for the tag page, handled by
            ``parse_photos``.
        """
        driver = response.request.meta['driver']

        # Dismiss the cookie-consent dialog before touching the form.
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(
                (By.XPATH, "//button[text()='Accept All']")
            )
        ).click()

        # NOTE: time.sleep() blocks the whole crawler while it waits; the
        # pauses are kept to preserve the original pacing of the form input.
        driver.find_element(By.XPATH, "//input[@name='username']").send_keys("xxx")
        time.sleep(1)
        driver.find_element(By.XPATH, "//input[@name='password']").send_keys("xxx")
        time.sleep(1)
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, "#loginForm > div > div:nth-child(3) > button")
            )
        ).click()

        # Give the login round-trip time to finish before moving on.
        time.sleep(6)
        driver.get_screenshot_as_file("screenshot.png")

        # Use SeleniumRequest (not scrapy.Request) so the logged-in browser
        # itself navigates to the tag page and the rendered DOM is returned.
        # NOTE(review): if the photo grid renders after page load, pass
        # wait_time/wait_until here -- confirm against the live page.
        yield SeleniumRequest(
            url='https://www.instagram.com/explore/tags/tesla/',
            callback=self.parse_photos,
        )

    def parse_photos(self, response, driver=None):
        """Screenshot the tag page and yield the link of every listed post.

        Args:
            response: Response for the tag page.
            driver: Optional WebDriver. Kept so callers that still pass it via
                ``cb_kwargs`` keep working; when omitted, the driver is looked
                up in ``response.request.meta``.

        Yields:
            ``{'links': href}`` with ``href`` as a plain string.
        """
        if driver is None:
            driver = response.request.meta.get('driver')
        if driver is not None:
            driver.get_screenshot_as_file("inTeslaPage.png")

        # .getall() extracts the attribute values as strings; iterating the
        # SelectorList directly would yield Selector objects instead.
        hrefs = response.xpath(
            ".//div[@class='Nnq7C weEfm']/div/a/@href"
        ).getall()
        for href in hrefs:
            yield {'links': href}