我是 Scrapy 的新手,正在尝试编写一个爬虫(spider)来下载图片。要使用图片管道(ImagesPipeline),只安装 PIL 是否就足够了?我的 PIL 安装在:
/usr/lib/python2.7/dist-packages/PIL
我该如何在我的 Scrapy 项目中引入它?
设置文件:
# Scrapy project settings for the "paulsmith" image-scraping project.

BOT_NAME = 'paulsmith'
BOT_VERSION = '1.0'

# Enable the built-in image pipeline so that URLs listed in an item's
# ``image_urls`` field are downloaded.
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']

# Directory where downloaded images are written. The image pipeline reads
# the setting named IMAGES_STORE (plural "IMAGES"); under the old
# IMAGE_STORE spelling no storage directory was configured at all.
IMAGES_STORE = '/home/jay/Scrapy/paulsmith/images'
# Legacy misspelled name, kept only so existing references keep working.
IMAGE_STORE = IMAGES_STORE

SPIDER_MODULES = ['paulsmith.spiders']
NEWSPIDER_MODULE = 'paulsmith.spiders'
DEFAULT_ITEM_CLASS = 'paulsmith.items.PaulsmithItem'

# Identify the bot as "<name>/<version>".
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
Item 定义文件(items.py):
from scrapy.item import Item, Field
class PaulsmithItem(Item):
    """Item carrying the image URLs scraped from a product listing page.

    Fields:
        image_urls: URLs that the image pipeline should download.
        images: filled in by the image pipeline with its download results.
        image: legacy singular field, kept for backward compatibility.
    """

    image_urls = Field()
    # The image pipeline stores its results under the key "images"; the
    # field has to be declared here for those results to be kept on the item.
    images = Field()
    # Original field name; retained so code that already sets it still works.
    image = Field()
爬虫(spider)代码:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from paulsmith.items import PaulsmithItem
class PaulSmithSpider(BaseSpider):
    """Spider that collects product image URLs from one listing page."""

    name = "Paul"
    # allowed_domains expects bare domain names; a full URL (scheme + path)
    # never matches a request's host name.
    allowed_domains = ["www.paulsmith.co.uk"]
    start_urls = ["http://www.paulsmith.co.uk/uk-en/shop/mens/jeans"]

    def parse(self, response):
        """Build one item listing every product image URL on the page.

        Args:
            response: the downloaded listing page.

        Returns:
            A PaulsmithItem whose ``image_urls`` field holds the ``src``
            attribute of each ``<img>`` inside a link under the
            ``category-products`` div.
        """
        hxs = HtmlXPathSelector(response)
        item = PaulsmithItem()
        # NOTE(review): src values are passed through unchanged; if the page
        # uses relative URLs they would need to be made absolute - confirm.
        item['image_urls'] = hxs.select(
            "//div[@class='category-products']//a/img/@src").extract()
        return item
# Module-level spider instance.
# NOTE(review): presumably kept for older Scrapy releases that located
# spiders through a module-level SPIDER object - confirm whether the
# installed version still needs it.
SPIDER = PaulSmithSpider()