我爬了一个网站的每个页面,但是现在出现了这个问题。
如果页面包含类“td-cell align-right gray”和“td-cell align-right gray row-border”,则在 item['price'] 中写入两者的 text()。
但是,如果一个页面只包含“td-cell align-right gray row-border”,那么在 item['price'] 中只写 text()。
编码:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from Test01.items import Test01Item
from scrapy.utils.url import urljoin_rfc
from scrapy.utils.response import get_base_url
import urlparse
class ScrapyOrgSpider(BaseSpider):
name = "oeticket"
allowed_domains = ["oeticket.com"]
start_urls = ["http://www.oeticket.com/de/suche/?search_string=amaretto"]
def parse(self, response):
hxs = HtmlXPathSelector(response)
items = []
next_page = hxs.select("//li[@class='next-page navigation']/a/@href").extract()
abs_page = []
for g in next_page:
abs_page.append("http://oeticket.com" + g )
if not not abs_page:
for e in abs_page:
yield Request(e, self.parse)
next_event = hxs.select("//li[@class='event-item vevent']/a/@href").extract()
abs_event = []
for it in next_event:
abs_event.append("http://oeticket.com" + it)
if not not abs_event:
for s in abs_event:
yield Request(s, self.parse)
deeper = hxs.select("//li[@class='performance-item vevent']/a/@href").extract()
abs_deeper = []
for c in deeper:
abs_deeper.append("http://oeticket.com" + c)
if not not abs_deeper:
for d in abs_deeper:
yield Request(d, self.parse)
posts = hxs.select("//ul[@class='grid_10 horizontal-list clearfix']")
preis = hxs.select("//tbody/tr")
for post in posts:
item = Test01Item()
item["when"] = post.select("li[@class='when']/p/abbr/text()").extract() + post.select("li[@class='when']/h2/text()").extract()
items.append(item)
for post in posts:
item = Test01Item()
item["what"] = post.select("li[@class='what']/h2/text()").extract()
items.append(item)
for post in posts:
item = Test01Item()
item["where"] = post.select("li[@class='where']/h2/text()").extract()
items.append(item)
for prei in preis:
item = Test01Item()
item['url'] = response.url
item['price'] = prei.select("td[@class='ticket_price td-cell ucase black strong align-right']/text()").extract()
item['price'] = prei.select("td[@class='ticket_price td-cell ucase black strong align-right row-border']/text()").extract()
item["func"] = prei.select("td[@class='td-cell align-right gray']/text()").extract()
item["func"] = prei.select("td[@class='td-cell align-right gray row-border']/text()").extract()
items.append(item)
for item in items:
yield item
结果:
{"when": ["Donnerstag, 7. Feb 2013 ", "20:00"]},
{"what": ["Amaretto"]},
{"where": ["kleines theater"]},
{"url": "http://www.oeticket.com/de/tickets/amaretto-salzburg-kleines-theater-482435/performance.html", "price": [], "func": []},
{"url": "http://www.oeticket.com/de/tickets/amaretto-salzburg-kleines-theater-482435/performance.html", "price": [" 15,90 EUR "], "func": [" Erm\u00e4\u00dfigung lt. Info - ACHTUNG: Ausweiskontrolle! "]},
预期结果:
{"when": ["Donnerstag, 7. Feb 2013 ", "20:00"]},
{"what": ["Amaretto"]},
{"where": ["kleines theater"]},
{"url": "http://www.oeticket.com/de/tickets/amaretto-salzburg-kleines-theater-482435/performance.html", "price": [" 22,50 EUR "], "func": [" Normalpreis "},
{"url": "http://www.oeticket.com/de/tickets/amaretto-salzburg-kleines-theater-482435/performance.html", "price": [" 15,90 EUR "], "func": [" Erm\u00e4\u00dfigung lt. Info - ACHTUNG: Ausweiskontrolle! "]},
如何使用空白项目字段解决此问题?谢谢你!