我正在尝试从下面提到的 URL 中提取某些字符串:
示例网址:
http://www.ladyblush.com/buy-sarees-online.html?p=1
http://www.ladyblush.com/buy-ladies-suits-online.html?p=1
http://www.ladyblush.com/buy-women-fashion-accessories.html?p=1
我想提取:
productCategory = "sarees" productSubCategory = ""
productCategory = "ladies" productSubCategory = "suits"
productCategory = "women" productSubCategory = "fashion-accessories"
等等。实际上我正在写一个爬虫(spider),需要从上面提到的 URL 中提取 productCategory 和 productSubCategory,所以我想在 parse 方法中从 response.url 里提取这两个字段。有人可以帮我吗?
我的代码:
import re
from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider
#------------------------------------------------------------------------------
class ESpider(CrawlSpider):
    """Spider that scrapes product listings from ladyblush.com.

    For every listing page in ``start_urls`` the spider builds one
    ``EscraperItem`` per product tile, then follows the product link and
    completes the item in ``parsePage2``.

    NOTE(review): Scrapy's documentation warns against using ``parse`` as a
    callback on a ``CrawlSpider`` because the base class implements its rule
    handling there. No ``rules`` are defined here, so the override is kept
    to preserve the existing interface -- confirm before adding rules.
    """

    name = "ladyblushSpider"
    allowed_domains = ["ladyblush.com"]

    # Listing pages to crawl (file name without the ".html" extension).
    _CATEGORY_PAGES = (
        'buy-sarees-online',
        'buy-ladies-suits-online',
        'buy-women-fashion-accessories',
        'buy-nightwear-lingerie-online',
        'buy-women-dress-online-skirts-suits-kurtis-tops',
        'buy-decor-online-wallclock-bedsheets-cushions-bedcovers',
        'buy-cosmetics-online-massage-oils-aromatherapy-perfumes-soaps',
        'buy-jewelery-online-art-fashion-semi-precious-antique-junk-jewellery',
    )

    # Captures the part of a listing URL between "/buy-" and ".html".
    _CATEGORY_SLUG_RE = re.compile(r'/buy-([^/?#]+)\.html')

    # Pages 1..99 of every listing, grouped by page number.
    URLSList = []
    for n in range(1, 100):
        for _page_name in _CATEGORY_PAGES:
            URLSList.append(
                'http://www.ladyblush.com/%s.html?p=%d' % (_page_name, n))
    del _page_name  # loop helper only; keep it out of the class namespace
    start_urls = URLSList

    @classmethod
    def _categories_from_url(cls, url):
        """Return ``(category, sub_category)`` derived from a listing URL.

        The leading ``buy-`` and every ``online`` word are dropped; the first
        remaining word is the category and the rest, re-joined with ``-``,
        is the sub-category. For example::

            .../buy-sarees-online.html?p=1            -> ("sarees", "")
            .../buy-ladies-suits-online.html?p=1      -> ("ladies", "suits")
            .../buy-women-fashion-accessories.html    -> ("women", "fashion-accessories")

        Returns ``("", "")`` when the URL does not look like a listing page.
        """
        match = cls._CATEGORY_SLUG_RE.search(url or '')
        if match is None:
            return '', ''
        words = [word for word in match.group(1).split('-')
                 if word and word != 'online']
        if not words:
            return '', ''
        return words[0], '-'.join(words[1:])

    @staticmethod
    def _pick_price(texts):
        """Return the stripped second text node, or the first if only one.

        The original code always read index 1 and crashed with
        ``IndexError`` when a single text node was extracted.
        ``texts`` must be non-empty.
        """
        chosen = texts[1] if len(texts) > 1 else texts[0]
        return chosen.strip()

    def parse(self, response):
        """Yield one detail-page ``Request`` per product tile on a listing page.

        Each request carries a partially filled ``EscraperItem`` in
        ``request.meta['item']``; ``parsePage2`` completes and returns it.
        """
        category, sub_category = self._categories_from_url(response.url)
        price_box = './/div[@class="salePrice"]//div[@class="price-box"]'

        hxs = HtmlXPathSelector(response)
        for site in hxs.select('//div[@class="third thumbnailSpillLarge"]'):
            links = site.select('./a/@href').extract()
            if not links:
                # Without a product link there is no detail page to follow;
                # skip this tile instead of aborting the whole page.
                continue

            item = EscraperItem()
            item['currency'] = 'INR'
            item['productCategory'] = [category]
            item['productSubCategory'] = [sub_category]
            item['productSite'] = ["http://ladyblush.com/"]
            item['productImage'] = site.select('./a/div/img/@src').extract()
            item['productTitle'] = site.select('./a/div/img/@title').extract()
            item['productURL'] = [links[0].replace(" ", "%20")]

            productMRP = site.select(
                price_box + '//p[@class="old-price"]//span[@class="price"]/text()'
            ).extract()
            productPrice = site.select(
                price_box + '//p[@class="special-price"]//span[@class="price"]/text()'
            ).extract()
            if productMRP and productPrice:
                # Discounted product: [old price, special price].
                price = [self._pick_price(productMRP),
                         self._pick_price(productPrice)]
            else:
                price = site.select(
                    price_box + '//span[@class="regular-price"]//span[@class="price"]/text()'
                ).extract()
            item['productPrice'] = price

            request = Request(item['productURL'][0], callback=self.parsePage2)
            request.meta['item'] = item
            yield request

    def parsePage2(self, response):
        """Complete the item started in ``parse`` using the product detail page.

        Sets ``availability``, ``hasVariants``, ``image_urls`` and
        ``productDesc``, extends ``productImage`` with the "more views"
        images, and returns the item.
        """
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']

        # Any text inside the add-to-cart container marks the product as
        # unavailable.
        cart_text = hxs.select(
            '//div[@class="addtocart-container"]/div/text()').extract()
        item['availability'] = not cart_text

        # A "required" label marks the product as having variants.
        required_labels = hxs.select('//label[@class="required"]/text()').extract()
        item['hasVariants'] = bool(required_labels)

        # De-duplicated listing images; taken before productImage is extended.
        item['image_urls'] = list(set(item['productImage']))

        description_parts = [
            re.sub(r'[\t\n\r]', "", text.strip())
            for text in hxs.select('//div[@class="std"]/text()').extract()
        ]
        item['productDesc'] = [" ".join(description_parts)]

        more_views = '//div[@class="more-views"]/ul/li/a'
        item['productImage'] = (
            item['productImage']
            + hxs.select(more_views + '/img/@src').extract()
            + hxs.select(more_views + '/@href').extract()
        )
        return item
#------------------------------------------------------------------------------