0

我正在尝试来自https://github.com/monkeylearn/hotel-review-analysis的代码

import scrapy
from hotel_sentiment.items import HotelSentimentItem

#TODO use loaders
class TripadvisorSpider(scrapy.Spider):
name = "tripadvisor"
start_urls = [
    "https://www.tripadvisor.com/Hotels-g60763-New_York_City_New_York-Hotels.html"
]

def parse(self, response):
    for href in response.xpath('//div[@class="listing_title"]/a/@href'):
        url = response.urljoin(href.extract())
        yield scrapy.Request(url, callback=self.parse_hotel)

    next_page = response.xpath('//div[@class="unified pagination standard_pagination"]/child::*[2][self::a]/@href')
    if next_page:
        url = response.urljoin(next_page[0].extract())
        yield scrapy.Request(url, self.parse)

def parse_hotel(self, response):
    for href in response.xpath('//div[starts-with(@class,"quote")]/a/@href'):
        url = response.urljoin(href.extract())
        yield scrapy.Request(url, callback=self.parse_review)

    next_page = response.xpath('//div[@class="unified pagination "]/child::*[2][self::a]/@href')
    if next_page:
        url = response.urljoin(next_page[0].extract())
        yield scrapy.Request(url, self.parse_hotel)


#to get the full review content I open its page, because I don't get the full content on the main page
#there's probably a better way to do it, requires investigation
def parse_review(self, response):
    item = HotelSentimentItem()
    item['title'] = response.xpath('//div[@class="quotes"]/text()').extract()[0][1:-1] #strip the quotes (first and last char)
    item['content'] = response.xpath('//div[@class="[entry"]/p/text()').extract()[0]
    return item

我得到了这个错误

Traceback (most recent call last):
  File "c:\python27\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
  File "C:\Users\eg06\Desktop\hotel-review-analysis-master\hotel_sentiment\spiders\tripadvisor_spider.py", line 36, in parse_review
item['title'] = response.xpath('//div[@class="quotes"]/text()').extract()[0][1:-1] #strip the quotes (first and last char)
IndexError: list index out of range

我正在使用此命令调用蜘蛛

scrapy crawl tripadvisor -o itemsTripadvisor.csv -s CLOSESPIDER_ITEMCOUNT=100

我一直在尝试搜索和更改代码,但是这个错误一遍又一遍地发生。

4

0 回答 0