1

我有一个用 Python 3.6 编写的 Scrapy 项目。项目中有 3 个爬虫,分别从 3 个不同的网站抓取数据,每个网站对应一个爬虫。每个爬虫都通过 `yield item` 产出在 items.py 中定义的 item,各爬虫的 item 略有不同。我使用 `scrapy crawl crawlera -o sample.json` 运行爬虫,并得到输出文件 sample.json。每个爬虫我都这样运行,只是输出文件名不同。

不过,我希望每个网站的输出文件名采用 `timestamp + website name`(时间戳 + 网站名)的形式,这样每次运行、每个网站生成的文件名都各不相同。

三个爬虫的结构都相同,下面是其中一个:

# -*- coding: utf-8 -*-
import scrapy
import logging
from time import sleep
from selenium import webdriver
from scrapy.selector import Selector
from scrapy.utils.log import configure_logging
from product_data_scraper.items import TakealotItem
from product_data_scraper.spiders.helper import Helper


class TakealotSpider(scrapy.Spider):
    """Spider that collects product data from www.takealot.com.

    Search-result pages are loaded in a Selenium-driven Chrome browser and
    their page source is parsed for product links; each product page is then
    fetched with a regular Scrapy request and turned into a ``TakealotItem``
    by ``parse``.
    """

    name = 'takealot'
    allowed_domains = ['www.takealot.com']
    takealothelper = Helper.TakealotHelper
    driver_path = './chromedriver'

    # Runs once, when the class body is executed: do not let Scrapy install
    # its root log handler and send log records to a file instead.
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='logs/log_takealot.txt',
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )

    def start_requests(self):
        """Yield one request per product link found in the brand searches.

        For every row in ``Helper.brands`` the first column is used as the
        search term; result pages are followed through the "next page" link
        until none is found.

        Yields:
            scrapy.Request: a request for a product page, handled by
            ``parse``.
        """
        helper = self.takealothelper
        browser = webdriver.Chrome(self.driver_path)
        try:
            for brand in Helper.brands:
                if not brand:
                    # Blank CSV rows have no search term to use.
                    continue
                url = 'https://www.takealot.com/all?qsearch=' + brand[0]
                while url:
                    browser.get(url)
                    # Fixed pause before reading the page source
                    # (presumably to let the page finish rendering).
                    sleep(4)
                    page = Selector(text=browser.page_source)
                    for link in page.xpath(helper.pro_list).extract():
                        yield scrapy.Request(url=link, callback=self.parse)
                    next_page = page.xpath(helper.next_page).extract_first()
                    url = (
                        'https://www.takealot.com' + next_page
                        if next_page else None
                    )
        finally:
            # Also runs when the generator is closed early or an error is
            # raised, so the browser process is never left behind.
            browser.quit()

    def parse(self, response):
        """Build a ``TakealotItem`` from a product page response.

        Args:
            response: the Scrapy response for a product page.

        Yields:
            TakealotItem: the populated item. Values whose XPath matches
            nothing are ``None``, except ``review_stars`` (``'0 out of 5'``)
            and ``in_stock`` (``'no'``).
        """
        helper = self.takealothelper

        item = TakealotItem()
        item['source'] = 'www.takealot.com'
        item['URL'] = response.url
        item['brand'] = response.xpath(helper.brand).extract_first()
        item['product_name'] = response.xpath(
            helper.product_name).extract_first()
        item['selling_price'] = response.xpath(
            helper.selling_price).extract_first()

        list_price = response.xpath(helper.list_price).extract_first()
        item['list_price'] = (
            list_price.replace('R ', '') if list_price else None
        )

        item['barcode'] = response.xpath(helper.barcode).extract_first()

        review_stars = response.xpath(helper.review_stars).extract_first()
        item['review_stars'] = (
            review_stars.replace(' stars', '') if review_stars
            else '0 out of 5'
        )

        ware_houses = response.xpath(helper.warehouse).extract()
        # The 'or' separator (no surrounding spaces) is kept exactly as the
        # output format has always been.
        item['warehouse'] = 'or'.join(ware_houses) if ware_houses else None

        in_stock = response.xpath(helper.in_stock).extract_first()
        # extract_first() returns None when nothing matches; treat a missing
        # stock label as "not in stock" instead of failing on None.lower().
        if in_stock and 'in stock' in in_stock.lower():
            item['in_stock'] = 'yes'
        else:
            item['in_stock'] = 'no'

        yield item

items.py

class TakealotItem(scrapy.Item):
    """Fields recorded for a single product page scraped by the Takealot spider."""

    # Site the record came from; the spider stores 'www.takealot.com'.
    source = scrapy.Field()
    # URL of the product page (the spider stores response.url).
    URL = scrapy.Field()
    # Brand text extracted from the product page, or None if not found.
    brand = scrapy.Field()
    # Product title text extracted from the product page, or None if not found.
    product_name = scrapy.Field()
    # Selling price text as extracted from the page (not converted to a number).
    selling_price = scrapy.Field()
    # List ("was") price text with the 'R ' prefix removed, or None if absent.
    list_price = scrapy.Field()
    # Barcode text extracted from the product page, or None if not found.
    barcode = scrapy.Field()
    # Rating title with ' stars' removed; the spider stores '0 out of 5' when absent.
    review_stars = scrapy.Field()
    # Warehouse labels concatenated with 'or', or None when the page lists none.
    warehouse = scrapy.Field()
    # 'yes' or 'no', as decided by the spider from the shipping information text.
    in_stock = scrapy.Field()

helper.py

import csv


class Helper:
    """Shared scraping configuration: brand search terms and per-site XPaths.

    Attributes:
        brand_list: the (now closed) file object ``brands.csv`` was read from.
        brands: list of rows from ``brands.csv``; each row is a list of
            strings.
    """

    # Read the brand rows once, when the class body is executed. The rows are
    # materialised into a list so they can be iterated more than once, and the
    # file is closed as soon as it has been read. The path is relative, so it
    # is resolved against the current working directory.
    with open('brands.csv', 'r', newline='') as brand_list:
        brands = list(csv.reader(brand_list))

    class TakealotHelper:
        """XPath expressions used to scrape www.takealot.com."""

        # Search-result page: pagination links and product links.
        next_page = './/a[@class="page-current"]/following-sibling::a/@href'
        current_page = './/a[@class="page-current"]/@href'
        pro_list = './/ul[@class="product-list group"]/li//p[@class="p-title fn"]/a/@href'

        # Product page: individual data points.
        brand = './/span[text()="Brand"]/../following-sibling::dd[1]/span/a/text()'
        product_name = './/span[text()="Title"]/../following-sibling::dd[1]/span/text()'
        selling_price = './/div[@class="box-summary group buybox-bordered"]//span[@class="amount"]/text()'
        list_price = './/div[@id="pdp-product-data"]//p[@class="price-was list-price-info"]/del/text()'
        barcode = './/span[text()="Barcode"]/../following-sibling::dd[1]/span/text()'
        review_stars = './/p[@class="product-rating left"]/span[1]/@title'
        warehouse = './/div[@id="pdp-product-data"]//span[@class="lozenges"]//span[@class="lozenge"]/text()'
        in_stock = './/div[@id="pdp-product-data"]//div[contains(@class,"shipping-information group")]//strong/text()'
4

1 回答 1

4

你可以这样做

scrapy crawl myscraper -o `date +\%d-\%m-\%Y-\%H:\%M:\%S`-websiteName.json
于 2018-03-18T12:20:46.267 回答