我正在尝试下载一些没有压缩的图像。
例如http://p1.pstatp.com/origin/433c000159def0223671
这张图片大约是 2.0 MB，但当我使用 Scrapy 下载它时，得到的文件只有 120 KB。
settings.py
# Scrapy project settings for the "toutiao" crawler.
BOT_NAME = 'toutiao'

SPIDER_MODULES = ['toutiao.spiders']
NEWSPIDER_MODULE = 'toutiao.spiders'

# Storage location used by the images pipeline, and redirect handling for
# media downloads.
IMAGES_STORE = './images/'
MEDIA_ALLOW_REDIRECTS = True

ROBOTSTXT_OBEY = False

# Headers attached to every request unless the request overrides them.
# The original User-Agent value had "X-Requested-With:XMLHttpRequest" fused
# onto its end and lacked the space before "AppleWebKit"; the two headers are
# now sent separately.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/62.0.3202.94 Safari/537.36'
    ),
    'X-Requested-With': 'XMLHttpRequest',
}

ITEM_PIPELINES = {'toutiao.pipelines.ToutiaoPipeline': 300}
items.py
import scrapy
class ToutiaoItem(scrapy.Item):
    """One scraped article page together with the image links found on it."""

    # Search keyword the spider was run with.
    keyword = scrapy.Field()
    # Text of the article page's <title> element.
    title = scrapy.Field()
    # List of image URLs extracted from the article page.
    urls = scrapy.Field()
spider.py
import scrapy
from scrapy import Request
from toutiao.items import ToutiaoItem
from urllib.parse import urlencode
import json
import re
class ToutiaopicSpider(scrapy.Spider):
    """Search toutiao.com for a keyword and collect image URLs per article.

    ``parse`` handles the JSON search results and schedules one request per
    article; ``find_pic`` scans each article page for image URLs and yields a
    ``ToutiaoItem`` for the images pipeline.
    """

    name = 'toutiaopic'
    allowed_domains = ['toutiao.com']
    keyword = '佳片欣赏·人像'
    param = {
        'offset': 0,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(param)
    start_urls = [url]

    # Compiled once instead of on every find_pic() call.
    _GALLERY_URL_RE = re.compile('"url_list".*?"url":"(.*?)"},', re.S)
    _QUOTED_URL_RE = re.compile(r'"(http.*?)"', re.S)

    def parse(self, response):
        """Schedule article requests from one page of JSON search results.

        Yields a ``Request`` per entry that has an ``article_url`` and, while
        the current offset is below 20, one more request for the next page of
        search results.
        """
        if response.status != 200:
            return

        data = json.loads(response.body.decode('utf-8'))
        # 'data' may be absent or null; treat both as "no results".
        for entry in data.get('data') or []:
            article_url = entry.get('article_url')
            if article_url:
                yield Request(article_url, callback=self.find_pic)

        # Fetch the next page of results (only offsets 0 and 20 are visited).
        if self.param['offset'] < 20:
            # Rebind a copy on the instance rather than mutating the
            # class-level dict in place.
            self.param = dict(self.param, offset=self.param['offset'] + 20)
            next_url = ('https://www.toutiao.com/search_content/?'
                        + urlencode(self.param))
            yield Request(next_url, callback=self.parse)

    def find_pic(self, response):
        """Extract image URLs from an article page and yield a ToutiaoItem.

        Pages without a ``<title>`` are skipped with a warning, because the
        title is used downstream to name the stored files.
        """
        title = response.xpath('//title/text()').extract_first()
        if title is None:
            self.logger.warning('No <title> found on %s; skipping', response.url)
            return

        # Strip backslashes so URLs embedded in escaped JSON become plain.
        html = response.body.decode('utf-8').replace('\\', '')
        if 'gallery: JSON.parse' in html:
            urls = self._GALLERY_URL_RE.findall(html)
        else:
            # Fallback: every double-quoted string starting with "http".
            urls = self._QUOTED_URL_RE.findall(html)

        item = ToutiaoItem()
        item['keyword'] = self.keyword
        item['urls'] = urls
        item['title'] = title
        yield item
pipelines.py
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request
from scrapy.exceptions import DropItem
import re
class ToutiaoPipeline(ImagesPipeline):
    """Download every URL in ``item['urls']`` into ``<keyword>/<title><n>.png``.

    The data needed to name each file travels with its request in
    ``Request.meta`` instead of being kept on the pipeline instance: the
    pipeline object is shared by all items, so instance attributes could be
    overwritten by another item before ``file_path`` runs.

    NOTE(review): per the Scrapy media-pipeline documentation, ImagesPipeline
    re-encodes downloads to JPEG, which would explain stored files being much
    smaller than the originals (and the ``.png`` suffix not matching the
    content). To keep the original bytes, base this class on
    ``scrapy.pipelines.files.FilesPipeline`` and configure ``FILES_STORE``
    -- confirm against the installed Scrapy version.
    """

    # Characters removed from path components. The straight double quote is
    # included alongside the curly one that the original pattern contained.
    _UNSAFE_CHARS = re.compile(r'[?\\*|"“<>:/]')
    _META_KEY = 'toutiao_image'

    def get_media_requests(self, item, info):
        """Yield one download request per URL, tagged with its naming data."""
        for index, url in enumerate(item['urls']):
            naming = {
                'keyword': item['keyword'],
                'title': item['title'],
                'index': index,
            }
            yield Request(url=url, meta={self._META_KEY: naming})

    def item_completed(self, results, item, info):
        """Return the item, or drop it when no download succeeded.

        Raises:
            DropItem: if none of the results is marked successful.
        """
        image_paths = [result['path'] for ok, result in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item

    def file_path(self, request, response=None, info=None):
        """Build the storage path from the naming data carried by *request*.

        The result depends only on the request, so repeated calls for the
        same request return the same path.
        """
        naming = request.meta[self._META_KEY]
        keyword = self._UNSAFE_CHARS.sub('', naming['keyword'])
        title = self._UNSAFE_CHARS.sub('', naming['title'])
        image_name = title + str(naming['index'])
        return '%s/%s.png' % (keyword, image_name)
我想下载未经压缩的原图，应该怎么做？