0

我正在使用scrapy从网站的所有页面获取一些信息。这是我的 dmoz_spider.py 文件。当我执行这个时,我得到 IndentationError。请帮帮我。

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
import string
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
class EypItem(Item):
    title = Field()
    link = Field()
    price = Field()
    review = Field()
class eypSpider(CrawlSpider):
    name = "dmoz"
    allowed_domains =["http://www.walgreens.com"]
    start_urls =["http://www.walgreens.com/search/results.jsp?Ntt=allergy%20medicine"] 
rules = (Rule(SgmlLinkExtractor(allow=('/search/results\.jsp', )), callback='parse_item', follow= True),)
    def parse_item(self, response):
    self.log('Hi, this is an item page! %s' % response.url)
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@id="productGrid"]')
        items = []
        for site in sites:
            itemE = EypItem()
            itemE["title"] = site.select('//*[@class="image-container"]/a/img/@alt').extract()
            itemE["link"] = site.select('//*[@class="image-container"]/a/img/@src').extract()
            itemE["price"] = site.select('//*[@class="pricing"]/div/p/text()').extract()
            itemE["review"] = site.select('//*[@class="reviewSnippet"]/div/div/span/text()').extract()
            items.append(itemE)
        return items
4

1 回答 1

1

除了缩进错误之外,您的allowed_domains指定错误。更改如下(也就是说,从 URL 中删除“http://”前缀):

allowed_domains =["www.walgreens.com"]
于 2013-08-27T01:13:03.573 回答