from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from aibang.items import OrgItem


class OrgSpider(CrawlSpider):
    name = "org"
    allowed_domains = ["demo-site.com"]
    start_urls = [
        'http://demo-site.com/detail/17507640-419823665'
    ]

    rules = (
        # Item list pages: keep following their links
        Rule(SgmlLinkExtractor(allow=(r'list\/\d+$', ))),
        # Item detail pages: parse them, but do not follow their links
        Rule(SgmlLinkExtractor(allow=(r'detail\/\d+-\d+$', )), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        item = OrgItem()
        try:
            item['name'] = hxs.select('//div[@class="b_title"]/h1/text()')[0].extract()
        except:
            print 'Something went wrong, skip it'
        print item['name']
I am using Scrapy to crawl some pages, but I do not want it to follow the links found inside the detail/xxx-xxx pages. How can I disable that? I already added follow=False, but it does not work: the spider still follows the detail/xxx-xxx links inside those pages.
====== Note ======
I still need the detail pages to be crawled from the list pages; I just do not want any further detail pages to be followed from inside a detail page.
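
To make the intended crawl order concrete, here is a rough sketch in Python comments (the list/detail paths below are made-up placeholders, not real URLs on the site):

# Intended crawl flow:
#
#   /list/100                -> follow links to other list pages AND to detail pages
#       -> /list/101         -> another list page: keep following, same as above
#       -> /detail/123-456   -> scrape it with parse_item, but do NOT follow any
#                               detail/xxx-xxx links that appear on this page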