0

我编写了一个爬虫(scraper),其爬取流程的末尾包含以下两个函数:

def parse_summary(self, response):
    hxs = HtmlXPathSelector(response)
    item = response.meta['item']
    soup = BeautifulSoup(hxs.select("//div[@class='PrimaryContent']").extract()[0])
    text = soup.get_text()
    item['main_summary'] = text

    summary_links = hxs.select("//ul[@class='module_leftnav']/li/a/@href").extract()
    chap_summary_links = [urljoin(response.url, link) for link in summary_links]

    for link in chap_summary_links:
        print 'yielding request to chapter summary.'
        yield Request(link, callback=self.parse_chap_summary_link, meta={'item': item})


def parse_chap_summary_link(self, response):
    """Collect this chapter's heading(s) and summary text into the shared item.

    NOTE(review): the assignment below replaces any previously collected
    summaries, so each response yields an item holding only its own chapter --
    this is exactly the behaviour the question asks about.
    """
    selector = HtmlXPathSelector(response)
    item = response.meta['item']

    # Page headings first, then the plain-text body appended at the end.
    headings = selector.select("//h1/text()").extract()
    body_html = selector.select("//div[@class='PrimaryContent']").extract()[0]
    item['chapter_summaries'] = headings + [BeautifulSoup(body_html).get_text()]
    yield item

在 parse_summary 的末尾,我发出请求,由 parse_chap_summary_link 从章节摘要页面中提取数据。这可以运行,但问题是输出给我的是:

{item 1, [chapter 1 summary]}
{item 1, [chapter 2 summary]}

但我想要:

{item 1, [Chapter 1 summary, Chapter 2 Summary]}
{item 2, [Chapter 1 summary, Chapter 2 Summary, Chapter 3 etc etc]}

如何将所有章节摘要信息放入同一个条目(item),而不是为每个章节摘要创建一个新的 item?

4

1 回答 1

0

一个选项是一个接一个地执行每个请求。例如

def parse_summary(self, response):
    """Entry callback: after extracting the item, crawl the chapter pages serially.

    NOTE(review): skeleton code -- ``item`` and ``summary_links`` come from the
    elided extraction logic (``# ...``); not runnable as-is.
    """
    # ...

    links = [urljoin(response.url, link) for link in summary_links]
    # Chain the requests one after another instead of yielding them all at
    # once, so a single item accumulates every chapter summary.
    return self._dispatch_summary_request(item, links)

def parse_chap_summary_link(self, response):
    """Per-chapter callback: append this page's summary, then request the next link."""
    item = response.meta['item']

    # ... collect summary into the item field

    # Hand the remaining links back to the dispatcher; the item is only
    # returned once that list is exhausted.
    return self._dispatch_summary_request(item, response.meta['summary_links'])

def _dispatch_summary_request(self, item, links):
    try:
        next_link = links.pop()
    except IndexError:
        # no links left
        return item
    else:
        # TODO: it might happen that one request fails and to not lose the item
        # the request must have an errback callback to handle the failure and
        # resume the next summary request.
        return Request(next_link, meta={'item': item, 'summary_links': links},
                       callback=self.parse_chap_summary_link)

另一种选择是使用inline_requests装饰器:

@inline_requests
def parse_summary(self, response):
    """Alternative using the inline_requests decorator: await each chapter
    response in-line, accumulating all summaries before yielding one item.

    NOTE(review): skeleton code -- ``item``, ``chap_summary_links`` and
    ``text`` come from the elided extraction logic (``# ...``).
    """
    # ...
    for link in chap_summary_links:
        try:
            # yield inside @inline_requests suspends until the response arrives.
            response = yield Request(link)
        except Exception:
            # TODO: handle the error, log or something
            pass
        else:
            # extract the summary as in parse_chap_summary_link ...
            item['chapter_summaries'] += [text]

    # Must use yield at the end as this callback is a generator
    # due the previous yield statements.
    yield item
于 2013-10-19T01:18:56.223 回答