
I am building a crawler to crawl a website recursively, but the problem is that the spider never enters the parse_item method. My spider's file is named example.py. The code is below:

from scrapy.spider import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from scrapy.http.request import Request
from scrapy.utils.response import get_base_url


class CrawlSpider(CrawlSpider):
    name = "example"
    download_delay = 2
    allowed_domains = ["dmoz.org"]
    print allowed_domains
    start_urls = [
        "http://www.dmoz.org/Arts/"
    ]
    print start_urls
    rules = (
        Rule(SgmlLinkExtractor(allow=('/Arts', )), callback='parse_item', follow=True),
    )

# The spider is not entering this parse_item method

    def parse_item(self, response):
        print "hello parse"
        sel = Selector(response)
        title = sel.xpath('//title/text()').extract()
        print title

1 Answer


Why define and call the function explicitly? Try this:

class CrawlSpider(CrawlSpider):
    name = "example"
    download_delay = 2
    allowed_domains = ["dmoz.org"]
    print allowed_domains
    start_urls = ["http://www.dmoz.org/Arts/"]

    def parse(self, response):
        print "hello parse"
        sel = Selector(response)
        title = sel.xpath('//title/text()').extract()
        print title
answered 2014-07-06T10:41:36.963
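
If you want to keep the Rule-based recursive crawl from the question, a minimal sketch along those lines, assuming a current Scrapy release (where the imports have moved out of scrapy.contrib) and an illustrative subclass name ArtsSpider, could look like this:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class ArtsSpider(CrawlSpider):
    # Use a subclass name other than CrawlSpider so the base class is not shadowed.
    name = "example"
    download_delay = 2
    allowed_domains = ["dmoz.org"]
    start_urls = ["http://www.dmoz.org/Arts/"]

    rules = (
        # Follow /Arts links recursively and hand each matched page to parse_item.
        Rule(LinkExtractor(allow=("/Arts",)), callback="parse_item", follow=True),
    )

    # parse() is deliberately not overridden: CrawlSpider uses it internally
    # to apply the rules above.
    def parse_item(self, response):
        title = response.xpath("//title/text()").extract()
        self.logger.info("Title: %s", title)
        yield {"title": title}

Run it with scrapy crawl example from the project directory; each page matched by the rule should then pass through parse_item.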