我正在尝试来自https://github.com/monkeylearn/hotel-review-analysis的代码
import scrapy
from hotel_sentiment.items import HotelSentimentItem
#TODO use loaders
class TripadvisorSpider(scrapy.Spider):
name = "tripadvisor"
start_urls = [
"https://www.tripadvisor.com/Hotels-g60763-New_York_City_New_York-Hotels.html"
]
def parse(self, response):
for href in response.xpath('//div[@class="listing_title"]/a/@href'):
url = response.urljoin(href.extract())
yield scrapy.Request(url, callback=self.parse_hotel)
next_page = response.xpath('//div[@class="unified pagination standard_pagination"]/child::*[2][self::a]/@href')
if next_page:
url = response.urljoin(next_page[0].extract())
yield scrapy.Request(url, self.parse)
def parse_hotel(self, response):
for href in response.xpath('//div[starts-with(@class,"quote")]/a/@href'):
url = response.urljoin(href.extract())
yield scrapy.Request(url, callback=self.parse_review)
next_page = response.xpath('//div[@class="unified pagination "]/child::*[2][self::a]/@href')
if next_page:
url = response.urljoin(next_page[0].extract())
yield scrapy.Request(url, self.parse_hotel)
#to get the full review content I open its page, because I don't get the full content on the main page
#there's probably a better way to do it, requires investigation
def parse_review(self, response):
item = HotelSentimentItem()
item['title'] = response.xpath('//div[@class="quotes"]/text()').extract()[0][1:-1] #strip the quotes (first and last char)
item['content'] = response.xpath('//div[@class="[entry"]/p/text()').extract()[0]
return item
我得到了这个错误
Traceback (most recent call last):
File "c:\python27\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "C:\Users\eg06\Desktop\hotel-review-analysis-master\hotel_sentiment\spiders\tripadvisor_spider.py", line 36, in parse_review
item['title'] = response.xpath('//div[@class="quotes"]/text()').extract()[0][1:-1] #strip the quotes (first and last char)
IndexError: list index out of range
我正在使用此命令调用蜘蛛
scrapy crawl tripadvisor -o itemsTripadvisor.csv -s CLOSESPIDER_ITEMCOUNT=100
我一直在尝试搜索和更改代码,但是这个错误一遍又一遍地发生。