python - 为什么 Scrapy 不抓取也不解析？

Question

我正在尝试抓取国会图书馆的 THOMAS 网站。这个 Python 脚本旨在访问该站点上 40 个法案的样本（URL 中的标识符 1-40）。我想解析每项法案的正文，在正文/内容中搜索，提取指向可能存在的多个版本的链接并跟进。

进入版本页面之后，我想解析每项法案的正文，在正文/内容中搜索，提取指向可能存在的各个部分的链接并跟进。

进入部分页面之后，我想解析法案每个部分的正文。

我认为我的代码中 Rules/LinkExtractor 部分存在问题。Python 代码能够执行，也会抓取起始 URL，但不会解析，也不执行任何后续任务。

三个问题：

有些法案没有多个版本（因此 URL 的正文部分没有链接）。
有些法案没有指向各部分的链接，因为它们太短了，而有些则只包含指向各部分的链接。
一些部分链接所包含的并不只是该部分的内容，其中大部分内容只是多余地重复了之前或之后各部分的内容。

我的问题又是，为什么 Scrapy 不抓取或解析？

from scrapy.item import Item, Field
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

class BillItem(Item):
    """Item filled in by parse_bills: a title and a body for one bill page."""
    title = Field()
    body = Field()

class VersionItem(Item):
    """Item filled in by parse_versions: a title and a body for one bill version."""
    title = Field()
    body = Field()

class SectionItem(Item):
    """Item filled in by parse_sections: the body of one bill section page."""
    body = Field()

class Lrn2CrawlSpider(CrawlSpider):
    """Spider for a sample of H.R. bills of the 107th Congress on thomas.loc.gov.

    NOTE(review): as posted, only `name`, `allowed_domains` and `start_urls`
    belong to this class; everything from `rules` onward is written at
    column 0 and is therefore module-level code, not part of the spider.
    """
    name = "lrn2crawl"
    allowed_domains = ["thomas.loc.gov"]
    # Sample of 40 bills; the total range of bills is 1-5767.
    # NOTE(review): in Python 2 integer literals with a leading zero are octal,
    # so 00040 is 32 and this range yields bills 1-31, not 1-40.
    start_urls = ["http://thomas.loc.gov/cgi-bin/query/z?c107:H.R.%s:" % bill for bill in xrange(000001,00040,00001)

    ]

# NOTE(review): `rules` is defined at module level here, so it never becomes
# an attribute of Lrn2CrawlSpider and the spider has no crawl rules of its own.
rules = (
        # Extract links matching the /query/ fragment (restricted to those inside
        # the content div of the page), hand each response to parse_bills and
        # keep following links from it (follow=True is set explicitly).
        # Desired result: scrape all bill text and, when there are multiple versions, follow them and parse.
        Rule(SgmlLinkExtractor(allow=(r'/query/'), restrict_xpaths=('//div[@id="content"]')), callback='parse_bills', follow=True),

        # Extract links in the body of a bill version and follow them.
        # Desired result: scrape all version text and, when there are multiple sections, follow them and parse.
        Rule(SgmlLinkExtractor(restrict_xpaths=('//div/a[2]')), callback='parse_versions', follow=True)
    )

# NOTE(review): the three parse_* functions below are also at column 0, i.e.
# plain module-level functions rather than methods of Lrn2CrawlSpider.
def parse_bills(self, response):
    """Return one BillItem per content div found in the response."""
    hxs = HtmlXPathSelector(response)
    bills = hxs.select('//div[@id="content"]')
    scraped_bills = []
    for bill in bills:
        scraped_bill = BillItem()  # BillItem is defined above
        scraped_bill['title'] = bill.select('p/text()').extract()
        # The body stored is the whole raw response, not just the matched div.
        scraped_bill['body'] = response.body
        scraped_bills.append(scraped_bill)
    return scraped_bills

def parse_versions(self, response):
    """Return one VersionItem per content div found in the response."""
    hxs = HtmlXPathSelector(response)
    versions = hxs.select('//div[@id="content"]')
    scraped_versions = []
    for version in versions:
        scraped_version = VersionItem()  # VersionItem is defined above
        scraped_version['title'] = version.select('center/b/text()').extract()
        # The body stored is the whole raw response, not just the matched div.
        scraped_version['body'] = response.body
        scraped_versions.append(scraped_version)
    return scraped_versions

def parse_sections(self, response):
    """Return one SectionItem per content div found in the response.

    NOTE(review): neither entry in `rules` names this function as a callback.
    """
    hxs = HtmlXPathSelector(response)
    sections = hxs.select('//div[@id="content"]')
    scraped_sections = []
    for section in sections:
        scraped_section = SectionItem()  # SectionItem is defined above
        # The body stored is the whole raw response, not just the matched div.
        scraped_section['body'] = response.body
        scraped_sections.append(scraped_section)
    return scraped_sections

# Module-level instantiation of the spider; the second answer removes this line
# before running the file with `scrapy runspider`.
spider = Lrn2CrawlSpider()

score 1 · Accepted Answer

仅作记录：你的脚本的问题在于变量 rules 不在 Lrn2CrawlSpider 的作用域内，因为它的缩进与类体不一致；alecxe 修复缩进之后，rules 才成为该类的属性。随后，继承来的 __init__() 方法会读取这个属性，编译这些规则并加以执行。

# Constructor excerpt quoted by the answer; the super(CrawlSpider, self) call
# suggests it is CrawlSpider.__init__ -- confirm against the Scrapy source.
def __init__(self, *a, **kw):
    super(CrawlSpider, self).__init__(*a, **kw)
    # Compile the rules when the spider is constructed.
    self._compile_rules()

删除最后一行与这个问题无关。

score 0 · Accepted Answer

我只是修复了缩进，删除了脚本末尾的 spider = Lrn2CrawlSpider() 这一行，然后用 scrapy runspider lrn2crawl.py 运行爬虫：它会抓取页面、跟进链接并返回 item——你的规则是有效的。

这是我正在运行的内容：

from scrapy.item import Item, Field
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

class BillItem(Item):
    """Item filled in by parse_bills: a title and a body for one bill page."""
    title = Field()
    body = Field()

class VersionItem(Item):
    """Item filled in by parse_versions: a title and a body for one bill version."""
    title = Field()
    body = Field()

class SectionItem(Item):
    """Item filled in by parse_sections: the body of one bill section page."""
    body = Field()

class Lrn2CrawlSpider(CrawlSpider):
    """Crawl a sample of H.R. bills of the 107th Congress on thomas.loc.gov.

    The spider starts from the pages of bills 1-40, follows the links selected
    by ``rules`` and turns each visited page into items via the ``parse_*``
    callbacks. Every callback returns a list of items, one per
    ``div#content`` element found in the response.
    """

    name = "lrn2crawl"
    allowed_domains = ["thomas.loc.gov"]

    # Sample of 40 bills (H.R. 1-40); the total range of bills is 1-5767.
    # Plain decimal bounds are used on purpose: a literal with a leading zero
    # such as 00040 is octal in Python 2 (== 32), which silently cut the
    # sample down to bills 1-31, and is a syntax error in Python 3.
    start_urls = [
        "http://thomas.loc.gov/cgi-bin/query/z?c107:H.R.%s:" % bill
        for bill in range(1, 41)
    ]

    rules = (
        # Links whose URL matches /query/ and that sit inside the content div
        # are handed to parse_bills; follow=True keeps extracting links from
        # those responses as well.
        # Desired result: scrape all bill text and, when a bill has several
        # versions, follow and parse them.
        Rule(
            SgmlLinkExtractor(
                allow=(r'/query/',),
                restrict_xpaths=('//div[@id="content"]',),
            ),
            callback='parse_bills',
            follow=True,
        ),

        # Links selected by //div/a[2] (links in the body of a bill version)
        # are handed to parse_versions and followed further.
        # Desired result: scrape all version text and, when a version has
        # several sections, follow and parse them.
        Rule(
            SgmlLinkExtractor(restrict_xpaths=('//div/a[2]',)),
            callback='parse_versions',
            follow=True,
        ),
    )

    def parse_bills(self, response):
        """Return a list with one BillItem per content div in ``response``.

        ``title`` is the extracted text of the div's direct ``p`` children;
        ``body`` is the complete raw response body, not just the matched div.
        """
        hxs = HtmlXPathSelector(response)
        scraped_bills = []
        for bill in hxs.select('//div[@id="content"]'):
            scraped_bill = BillItem()
            scraped_bill['title'] = bill.select('p/text()').extract()
            scraped_bill['body'] = response.body
            scraped_bills.append(scraped_bill)
        return scraped_bills

    def parse_versions(self, response):
        """Return a list with one VersionItem per content div in ``response``.

        ``title`` is the extracted text under ``center/b`` inside the div;
        ``body`` is the complete raw response body, not just the matched div.
        """
        hxs = HtmlXPathSelector(response)
        scraped_versions = []
        for version in hxs.select('//div[@id="content"]'):
            scraped_version = VersionItem()
            scraped_version['title'] = version.select('center/b/text()').extract()
            scraped_version['body'] = response.body
            scraped_versions.append(scraped_version)
        return scraped_versions

    def parse_sections(self, response):
        """Return a list with one SectionItem per content div in ``response``.

        ``body`` is the complete raw response body, not just the matched div.

        NOTE(review): no entry in ``rules`` names this method as its callback,
        so as written the crawl never invokes it; a rule pointing at section
        links would be needed to put it to use.
        """
        hxs = HtmlXPathSelector(response)
        scraped_sections = []
        for _section in hxs.select('//div[@id="content"]'):
            scraped_section = SectionItem()
            scraped_section['body'] = response.body
            scraped_sections.append(scraped_section)
        return scraped_sections

希望有帮助。

python - 为什么 Scrapy 不抓取也不解析？

2 回答 2

Related

Reference