块引用
我正在尝试提取href链接,然后我加入了url,然后我给了seleniumRequest和加入的url,我的代码可以工作,它会爬取数据,但结果它会产生相同的重复数据,爬取的数据将是重复的
看起来一切都很好,没有错误,但输出是重复的,数据也来自不同的产品链接
############# STACK OVERFLOW PLAESE HELP I'm AN BEGGINER IN SCRAPY WITH SELENIUM ###########
############# I THINK SOMETHING WITH MY PRODUCT PRICE URL #############
############# SOMETHING WRONG WITH URL #############################
#########This is my code
import scrapy
from scrapy.selector import Selector
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.keys import Keys
from time import sleep
class AmazonSpider(scrapy.Spider):
name = 'Amazon'
def start_requests(self):
yield SeleniumRequest(
url='https://www.amazon.com',
wait_time=3,
screenshot=True,
callback=self.parse
)
def parse(self, response):
driver = response.meta['driver']
search_input =
driver.find_element_by_xpath("//input[@id='twotabsearchtextbox']")
search_input.send_keys('smartphones')
search_input.send_keys(Keys.ENTER)
html = driver.page_source
response_obj = Selector(text=html)
driver.set_window_size(1920, 1080)
links = response_obj.xpath("//h2[@class='a-size-mini a-spacing-none a-color-base s-line-clamp-2']/a")
for link in links:
product_link = link.xpath(".//@href").get()
absolute_url = response.urljoin(product_link)
yield SeleniumRequest(url=absolute_url, wait_time=8, callback=self.parse_price,
dont_filter=True )
def parse_price(self, response):
driver = response.meta['driver']
htmlbody = driver.page_source
response_object = Selector(text=htmlbody)
driver.set_window_size(1920, 1080)
sleep(5)
name = response_object.xpath("//h1[@class='a-size-large a-spacing-none']/span").get()
yield {
'name': name
}