我使用 Python 和 Scrapy(Python 3.7.1、Scrapy 1.6.0)抓取一个阿拉伯语网站。当使用 “utf-16” 编码并将输出导出为 .csv 文件时,所有列都被合并到了同一列中,表头显示为:Company_Name,Phone,Activity,Website,Company_Type,所有结果也都显示在这一列下面。而使用其他任何编码时,每个字段都能正常显示在各自的列中,但文字显示为奇怪的格式(乱码)。
# spider.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from scrapy.loader import ItemLoader
from Arabic_Companies.items import ArabicCompaniesItem
class ArabicSpider(scrapy.Spider):
    """Spider that scrapes company listings from the egycompanies.com Arabic search page.

    Each listing block on the results page is turned into one
    ``ArabicCompaniesItem`` carrying the company's name, phone, activity,
    website and type.
    """

    name = 'arabic'
    # NOTE(review): Scrapy documents ``handle_httpstatus_all`` as a
    # ``Request.meta`` key; as a spider attribute it may have no effect --
    # confirm the intent before relying on it.
    handle_httpstatus_all = True

    # Item field name -> XPath (relative to one listing block) that yields
    # the field's value. Keeping extraction and loading in one table ensures
    # every extracted field is actually stored on the item.
    FIELD_XPATHS = (
        ('Company_Name', ".//div/div/div/h2[@class='f-listings-item__title']/a/text()"),
        ('Phone', ".//*[contains(text(),' هاتف الشركة: ')]/following-sibling::text()[1]"),
        ('Activity', ".//*[contains(text(),' النشاط: ')]/following-sibling::text()[1]"),
        ('Website', ".//*[contains(text(),' الموقع الإلكترونى: ')]/following-sibling::a[1]/@href"),
        ('Company_Type', ".//*[contains(text(),' التصنيفات : ')]/span/text()"),
    )

    def start_requests(self):
        """Yield the single search-results request that seeds the crawl."""
        url = "https://www.egycompanies.com/searchAr?name=&activity=&industry_id=0"
        yield Request(url, callback=self.parse)

    def parse(self, response):
        """Yield one ``ArabicCompaniesItem`` per listing block in ``response``.

        For every field the first XPath match (or ``None`` when nothing
        matches) is handed to the item loader.
        """
        for listing in response.xpath("//div[@class='f-listings-item']"):
            loader = ItemLoader(item=ArabicCompaniesItem(), response=response)
            for field_name, xpath in self.FIELD_XPATHS:
                # Previously Company_Type was extracted but never added to
                # the loader, leaving that exported column empty.
                loader.add_value(field_name, listing.xpath(xpath).extract_first())
            yield loader.load_item()
------------------------------------------------
# items.py
import scrapy
from w3lib.html import replace_escape_chars
from scrapy.loader.processors import MapCompose, TakeFirst
class ArabicCompaniesItem(scrapy.Item):
    """Item holding the scraped fields of a single company listing.

    Every field declares an input processor that maps each incoming value
    through w3lib's ``replace_escape_chars``.
    """

    # Company display name.
    Company_Name = scrapy.Field(
        input_processor=MapCompose(replace_escape_chars),
    )
    # Company phone number text.
    Phone = scrapy.Field(
        input_processor=MapCompose(replace_escape_chars),
    )
    # Business activity text.
    Activity = scrapy.Field(
        input_processor=MapCompose(replace_escape_chars),
    )
    # Company website URL.
    Website = scrapy.Field(
        input_processor=MapCompose(replace_escape_chars),
    )
    # Company classification text.
    Company_Type = scrapy.Field(
        input_processor=MapCompose(replace_escape_chars),
    )
------------------------------------------------
# settings.py file
# Encoding applied to the exported feed (here, the .csv output).
# NOTE(review): the description in this file reports that with this encoding
# the exported CSV shows up as a single column when opened -- confirm whether
# a different encoding is wanted before changing it.
FEED_EXPORT_ENCODING = "utf-16"

# Item fields written to the exported feed, in column order.
FEED_EXPORT_FIELDS = [
    "Company_Name",
    "Phone",
    "Activity",
    "Website",
    "Company_Type",
]
Company_Name, Phone, Activity, Website, Company_Type
أورينتال لمكسبات الطعم والرائحة, 0482590882, صناعات غذائية , http://www.oriental-off.com , شركات المواد الغذائية