我正在尝试使用以下 Scrapy 代码从网站上抓取图片:
import urlparse
from PIL import Image
from scrapy.exceptions import DropItem, NotConfigured, IgnoreRequest
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.loader import XPathItemLoader
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from scrapy.contrib.pipeline.images import ImagesPipeline
from mobile.items import Website
class MobileSpider(CrawlSpider):
    """Spider that scrapes product data and image URLs from mobile-store.ro.

    Starting from the product listing page, ``parse`` follows the first
    product link it finds and yields one ``Website`` item per content
    container on every page it visits.
    """

    name = "mobile"
    allowed_domains = ["mobile-store.ro"]
    start_urls = ["http://www.mobile-store.ro/produse/"]
    # NOTE(review): this class overrides ``parse``. Scrapy's CrawlSpider
    # documentation warns that CrawlSpider uses ``parse`` itself to apply the
    # rules, so these rules are probably never evaluated. The second rule also
    # names a ``parse_item`` callback that is not defined on this class.
    # Confirm the intended crawl strategy.
    rules = (
        # ``\d+`` (digits); the previous ``d+`` matched the literal letter "d".
        Rule(SgmlLinkExtractor(allow=r"/produs/\d+"), follow=True),
        Rule(SgmlLinkExtractor(allow=r"/produse/\d+"), callback='parse_item'),
    )

    def parse(self, response, response2=None):
        """Extract product items from ``response`` and follow the first product link.

        Args:
            response: The downloaded page handed over by Scrapy.
            response2: Unused. Kept only for backward compatibility; it is
                optional because Scrapy invokes callbacks with the response
                as the only argument.

        Yields:
            A ``Request`` for the first product link found (if any), then one
            ``Website`` item per ``div#wrapper/div#content`` container.
        """
        hxs = HtmlXPathSelector(response)

        product_links = hxs.select("//ul[@class='products']/li/a/@href").extract()
        if product_links:
            # The href may be relative; Request needs an absolute URL.
            yield Request(urlparse.urljoin(response.url, product_links[0]), self.parse)

        # NOTE(review): the XPaths below start with ``//`` and therefore search
        # the whole document rather than only inside ``site`` - confirm that
        # this is intended.
        for site in hxs.select('//div[@id="wrapper"]/div[@id="content"]'):
            item = Website()
            item['nume'] = site.select('//div[@class="summary"]/h1[@class="product_title entry-title"]/text()').extract()
            item['categorie'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="posted_in"]/a/text()').extract()
            item['brand'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="tagged_as"]/a/text()').extract()
            item['descriere'] = site.select('//div[@class="woocommerce_tabs"]/div[@id="tab-description"]/p/text()').extract()

            # ``extract()`` returns a list of strings, while ``urljoin``
            # expects a single URL. Passing the (empty) list made ``urljoin``
            # fall back to the base URL, which is how the page URL ended up in
            # ``image_urls``. Join every extracted src individually; if the
            # XPath matches nothing the item simply carries no image URLs.
            image_srcs = site.select('//div[@class="ad-image-wrapper"]/div[@class="ad-image"]/img[@class="lightbox"]/@src').extract()
            item['image_urls'] = [urlparse.urljoin(response.url, src) for src in image_srcs]

            item['pret'] = site.select('//div[@class="summary"]/div[1]/p[@class="price"]/span[@class="amount"]/text()').extract()
            item['url'] = response.url
            yield item
settings.py:
# Scrapy project settings for the "mobile" project.

# Package(s) in which spider classes live.
SPIDER_MODULES = ['mobile.spiders']
# Package where newly generated spiders are placed.
NEWSPIDER_MODULE = 'mobile.spiders'
# Item class used when none is specified explicitly.
DEFAULT_ITEM_CLASS = 'mobile.items.Website'
# NOTE(review): this enables Scrapy's stock ImagesPipeline, not the
# MyImagesPipeline subclass defined in pipelines.py. No IMAGES_STORE setting
# is visible in this excerpt; the images pipeline needs a storage location to
# be configured - confirm it is set elsewhere.
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']
items.py:
from scrapy.item import Item, Field
class Website(Item):
    """Container for one scraped product page.

    Every field is a plain ``Field()`` with no metadata. The text fields are
    filled by the spider from XPath ``extract()`` calls, so they hold lists of
    strings.
    """

    # Product details taken from the page markup.
    nume = Field()       # product title (h1.product_title)
    descriere = Field()  # paragraphs of the description tab
    categorie = Field()  # links inside span.posted_in
    brand = Field()      # links inside span.tagged_as
    pret = Field()       # price amount text

    # Address of the page the item was scraped from.
    url = Field()

    # Image handling: source URLs, download results, and stored file paths
    # (``image_paths`` is written by MyImagesPipeline.item_completed).
    image_urls = Field()
    images = Field()
    image_paths = Field()
pipelines.py:
from mobile.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request
# NOTE(review): the import preceding this class pulls ImagesPipeline from
# ``mobile.contrib.pipeline.images``, whereas settings.py refers to it as
# ``scrapy.contrib.pipeline.images.ImagesPipeline`` - confirm the module path.
class MyImagesPipeline(ImagesPipeline):
    """Images pipeline that records stored image paths and drops image-less items."""

    def get_media_requests(self, item, info):
        """Yield one download ``Request`` for each URL in ``item['image_urls']``."""
        for url in item['image_urls']:
            yield Request(url)

    def item_completed(self, results, item, info):
        """Store the paths of successfully downloaded images on the item.

        Args:
            results: Iterable of ``(ok, data)`` pairs; for successful
                downloads ``data['path']`` is read.
            item: The item whose images were requested.
            info: Unused here.

        Returns:
            The item, with ``item['image_paths']`` set.

        Raises:
            DropItem: If no download succeeded.
        """
        stored_paths = []
        for succeeded, details in results:
            if succeeded:
                stored_paths.append(details['path'])

        if not stored_paths:
            raise DropItem("Item contains no images")

        item['image_paths'] = stored_paths
        return item
当我尝试使用以下代码获取图像 url 时,问题就出现了:
for site in sites:
item = Website()
item['nume'] = site.select('//div[@class="summary"]/h1[@class="product_title entry-title"]/text()').extract()
item['categorie'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="posted_in"]/a/text()').extract()
item['brand'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="tagged_as"]/a/text()').extract()
item['descriere'] = site.select('//div[@class="woocommerce_tabs"]/div[@id="tab-description"]/p/text()').extract()
image_relative_url = site.select('//div[@class="ad-image-wrapper"]/div[@class="ad-image"]/img[@class="lightbox"]/@src').extract()
item['image_urls'] = [urlparse.urljoin(response.url2,image_relative_url)]
#item['image_urls'] = site.select('//div[@class="ad-image-wrapper"]/div[@class="ad-image"]/img[@class="lightbox"]/@src').extract()
item['pret'] = site.select('//div[@class="summary"]/div[1]/p[@class="price"]/span[@class="amount"]/text()').extract()
item['url'] = response.url
items.append(item)
for item in items:
yield item
它返回给我的是页面 URL,而不是图片 URL。其他所有字段都已正确抓取。有什么办法可以解决这个问题并正确获取图片 URL 吗?