是的,你可以用 Scrapy 做到这一点,链接提取器(link extractor)可以帮你完成:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
class AustliiSpider(CrawlSpider):
    """Spider that crawls the AustLII ``au/cases/cth/HCA/1945/`` index.

    Starting from the index page in ``start_urls``, the single crawl rule
    extracts every link whose URL matches ``au/cases/cth/HCA/1945/<digits>.html``
    and hands each downloaded page to :meth:`parse_item`.
    """

    # Identifier used to launch the spider (e.g. ``scrapy crawl austlii``).
    name = "austlii"
    # Requests to hosts outside this domain are filtered out.
    allowed_domains = ["austlii.edu.au"]
    start_urls = ["http://www.austlii.edu.au/au/cases/cth/HCA/1945/"]

    rules = (
        Rule(
            # The dot before ``html`` is escaped so that it matches only a
            # literal "." rather than any character.
            SgmlLinkExtractor(allow=r"au/cases/cth/HCA/1945/\d+\.html"),
            # Keep extracting links from the matched pages as well.
            follow=True,
            # Must not be named ``parse``: CrawlSpider uses ``parse``
            # internally to implement its rule handling.
            callback="parse_item",
        ),
    )

    def parse_item(self, response):
        """Handle one page whose URL matched the crawl rule.

        Args:
            response: The downloaded page; its raw HTML is available as
                ``response.body``.

        Returns:
            None. This is a placeholder to be extended with the actual
            extraction logic.
        """
        # Selector wrapper for running XPath queries against the page.
        hxs = HtmlXPathSelector(response)
        # do whatever with html content (response.body variable)
希望对你有帮助。