我有一个用 Python 3.6 编写的 Scrapy 项目。项目中有 3 个爬虫,分别从 3 个不同的网站抓取数据,每个网站对应一个爬虫。每个爬虫都通过 `yield item` 产出在 items.py 中定义的 item,各爬虫的 item 略有不同。我使用 `scrapy crawl crawlera -o sample.json` 运行爬虫,并得到 sample.json 作为输出文件。我对每个爬虫都执行同样的操作,只是输出文件名不同。
但是,我希望用 `timestamp + website name` 作为每个网站的输出文件名,这样每次运行、每个网站生成的文件名都会不同。
三个爬虫的结构都相同,其中一个如下:
# -*- coding: utf-8 -*-
import scrapy
import logging
from time import sleep
from selenium import webdriver
from scrapy.selector import Selector
from scrapy.utils.log import configure_logging
from product_data_scraper.items import TakealotItem
from product_data_scraper.spiders.helper import Helper
class TakealotSpider(scrapy.Spider):
    """Spider that collects product data from www.takealot.com.

    Search result pages are loaded through a Selenium-driven Chrome
    instance; every product link found there is scheduled as a Scrapy
    request whose response ``parse`` turns into a ``TakealotItem``.
    """

    name = 'takealot'
    allowed_domains = ['www.takealot.com']
    # All XPath expressions used by this spider live on the helper class.
    takealothelper = Helper.TakealotHelper
    driver_path = './chromedriver'

    # Ask Scrapy not to install its own root handler, then route log
    # records to a file instead.
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='logs/log_takealot.txt',
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )
    # start_urls = ['http://www.takealot.com/']

    def start_requests(self):
        """Yield one request per product link found for each brand.

        For every row of ``Helper.brands`` the search pages are opened in
        Chrome, product links are extracted from the page source, and the
        next-page link is followed until none is found.

        Yields:
            scrapy.Request: a request for a product page, handled by
            ``parse``.
        """
        browser = webdriver.Chrome(self.driver_path)
        try:
            for brand in Helper.brands:
                # csv.reader yields an empty list for blank lines; skip
                # them instead of failing on brand[0].
                if not brand:
                    continue
                url = 'https://www.takealot.com/all?qsearch=' + brand[0]
                while True:
                    browser.get(url)
                    # Fixed pause before reading the page source --
                    # presumably to let the page finish rendering.
                    sleep(4)
                    page = Selector(text=browser.page_source)
                    links = page.xpath(self.takealothelper.pro_list).extract()
                    for link in links:
                        yield scrapy.Request(url=link, callback=self.parse)
                    next_page = page.xpath(
                        self.takealothelper.next_page).extract_first()
                    if not next_page:
                        break
                    url = 'https://www.takealot.com' + next_page
        finally:
            # Close the browser even when an error occurs or the generator
            # is closed before it is exhausted.
            browser.quit()

    def parse(self, response):
        """Build a ``TakealotItem`` from a product page response.

        Args:
            response: the response for a single product page.

        Yields:
            TakealotItem: the populated item. Fields whose XPath matches
            nothing fall back to ``None`` (or a fixed default where the
            code below sets one).
        """
        helper = self.takealothelper

        item = TakealotItem()
        item['source'] = 'www.takealot.com'
        item['URL'] = response.url
        item['brand'] = response.xpath(helper.brand).extract_first()
        item['product_name'] = response.xpath(
            helper.product_name).extract_first()
        item['selling_price'] = response.xpath(
            helper.selling_price).extract_first()

        # Drop the 'R ' prefix from the list price when one is present.
        list_price = response.xpath(helper.list_price).extract_first()
        item['list_price'] = (
            list_price.replace('R ', '') if list_price else None)

        item['barcode'] = response.xpath(helper.barcode).extract_first()

        review_stars = response.xpath(helper.review_stars).extract_first()
        item['review_stars'] = (
            review_stars.replace(' stars', '')
            if review_stars else '0 out of 5')

        # Several warehouses are collapsed into one 'or'-separated string.
        ware_houses = response.xpath(helper.warehouse).extract()
        item['warehouse'] = 'or'.join(ware_houses) if ware_houses else None

        # extract_first() returns None when nothing matches; treat that as
        # "not in stock" rather than failing on None.lower().
        in_stock = response.xpath(helper.in_stock).extract_first()
        if in_stock and 'in stock' in in_stock.lower():
            item['in_stock'] = 'yes'
        else:
            item['in_stock'] = 'no'

        yield item
items.py
class TakealotItem(scrapy.Item):
    """Item holding the data scraped for a single product page."""

    # Where the record came from.
    source = scrapy.Field()
    URL = scrapy.Field()

    # Product identity.
    brand = scrapy.Field()
    product_name = scrapy.Field()

    # Pricing.
    selling_price = scrapy.Field()
    list_price = scrapy.Field()

    # Remaining product details.
    barcode = scrapy.Field()
    review_stars = scrapy.Field()
    warehouse = scrapy.Field()
    in_stock = scrapy.Field()
helper.py
import csv
class Helper:
    """Shared lookup data for the spiders.

    ``brands`` holds the rows of ``brands.csv`` and ``TakealotHelper``
    holds the XPath expressions used against www.takealot.com pages.
    """

    # Read every row up front and close the file straight away. A list can
    # be iterated more than once, unlike a bare csv.reader, and the file
    # handle is no longer left open for the life of the process.
    # newline='' is what the csv module documentation asks for.
    with open('brands.csv', 'r', newline='') as brand_list:
        brands = list(csv.reader(brand_list))

    class TakealotHelper:
        """XPath expressions for takealot.com listing and product pages."""

        # Pagination and product links on listing pages.
        next_page = './/a[@class="page-current"]/following-sibling::a/@href'
        current_page = './/a[@class="page-current"]/@href'
        pro_list = './/ul[@class="product-list group"]/li//p[@class="p-title fn"]/a/@href'

        # Fields on a product page.
        brand = './/span[text()="Brand"]/../following-sibling::dd[1]/span/a/text()'
        product_name = './/span[text()="Title"]/../following-sibling::dd[1]/span/text()'
        selling_price = './/div[@class="box-summary group buybox-bordered"]//span[@class="amount"]/text()'
        list_price = './/div[@id="pdp-product-data"]//p[@class="price-was list-price-info"]/del/text()'
        barcode = './/span[text()="Barcode"]/../following-sibling::dd[1]/span/text()'
        review_stars = './/p[@class="product-rating left"]/span[1]/@title'
        warehouse = './/div[@id="pdp-product-data"]//span[@class="lozenges"]//span[@class="lozenge"]/text()'
        in_stock = './/div[@id="pdp-product-data"]//div[contains(@class,"shipping-information group")]//strong/text()'