
I am building a crawler to crawl a website recursively, but the problem is that the spider never enters the parse_item method. My spider's file is named example.py. The code is below:

from scrapy.spider import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from scrapy.http.request import Request
from scrapy.utils.response import get_base_url


class CrawlSpider(CrawlSpider):
    name = "example"
    download_delay = 2
    allowed_domains = ["dmoz.org"]
    print allowed_domains
    start_urls = [
        "http://www.dmoz.org/Arts/"
    ]
    print start_urls
    rules = (
        Rule(SgmlLinkExtractor(allow=('/Arts', )), callback='parse_item', follow=True),
    )

# The spider is not entering this parse_item method

    def parse_item(self, response):
        print "hello parse"
        sel = Selector(response)
        title = sel.xpath('//title/text()').extract()
        print title

1 Answer


Why define and call the function explicitly? Try this:

class CrawlSpider(CrawlSpider):
    name = "example"
    download_delay = 2
    allowed_domains = ["dmoz.org"]
    print allowed_domains
    start_urls = ["http://www.dmoz.org/Arts/"]

    def parse(self, response):
        print "hello parse"
        sel = Selector(response)
        title = sel.xpath('//title/text()').extract()
        print title
answered 2014-07-06T10:41:36.963
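
If you want to keep the Rule-based recursive crawl from the question, a minimal sketch along those lines, assuming a current Scrapy release (where the imports have moved out of scrapy.contrib) and an illustrative subclass name ArtsSpider, could look like this:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class ArtsSpider(CrawlSpider):
    # Use a subclass name other than CrawlSpider so the base class is not shadowed.
    name = "example"
    download_delay = 2
    allowed_domains = ["dmoz.org"]
    start_urls = ["http://www.dmoz.org/Arts/"]

    rules = (
        # Follow /Arts links recursively and hand each matched page to parse_item.
        Rule(LinkExtractor(allow=("/Arts",)), callback="parse_item", follow=True),
    )

    # parse() is deliberately not overridden: CrawlSpider uses it internally
    # to apply the rules above.
    def parse_item(self, response):
        title = response.xpath("//title/text()").extract()
        self.logger.info("Title: %s", title)
        yield {"title": title}

Run it with scrapy crawl example from the project directory; each page matched by the rule should then pass through parse_item.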