2

在一个页面中,我想爬取两个链接,并进入每个链接去爬取一些信息,然后将它们收集到一个条目中,我的代码是:

 def parse(self, response):
    """Schedule one song-page request for every song link on the listing.

    The link text and the href of each matching anchor are extracted with
    two separate XPath queries and paired up positionally.  Every request
    carries a fresh ``BdmmsItem`` (its ``singer`` field set to the link
    text) in ``meta['item']`` and is handled by ``parse_single_song``.
    """
    # Common XPath prefix selecting the song anchors; the attribute or
    # text step is appended per query below.
    anchor_xpath = '/html/body/div[3]/div/div/div[3]/ul/li[position()>1]/ul/li/a/'
    # query() returns HtmlXPathSelector(response).select(xpath).extract()
    names = query(anchor_xpath + 'text()', response)
    hrefs = query(anchor_xpath + '@href', response)

    for singer, href in izip(names, hrefs):
        song_url = self.host + href
        carried = {'item': BdmmsItem(singer=singer)}
        yield Request(url=song_url,
                      meta=carried,
                      callback=self.parse_single_song)


def parse_single_song(self, response):
    """Follow the lyric and album links found on a single song page.

    ``query`` returns a list of extracted strings, so each result is kept
    as a list: it is checked for emptiness first and indexed exactly once.
    This avoids an IndexError on a page that lacks one of the links and
    passes the whole URL (rather than only its first character) to
    ``Request``.

    Both requests carry the same item in ``meta['item']``, and both
    ``parse_lrc`` and ``parse_album`` yield it, so the item is emitted
    once per followed link.
    """
    item = response.meta['item']

    album_links = query('a[contains(@href, "/album/")]/@href', response)
    lrc_links = query('//a[@lyricdata]/@lyricdata', response)

    # here, i want to go into the two different page to get different information
    if lrc_links:
        yield Request(
            url=lrc_links[0],
            meta={'item': item},
            callback=self.parse_lrc)
    if album_links:
        yield Request(
            url=album_links[0],
            meta={'item': item},
            callback=self.parse_album)
    # if use urllib2, but how do that in scrapy
    #   item['lrc'] = urllib2.urlopen(lrc_link).read()
    #   item['album'] = some_other_func(urllib2.urlopen(album_link).read())

def parse_lrc(self, response):
    """Store the raw response body as the item's lyrics and emit the item.

    The item is the object carried in ``response.meta['item']``; it is
    updated in place (``lrc`` field) and then yielded.
    """
    song = response.meta['item']
    lyrics = response.body
    song['lrc'] = lyrics
    yield song

def parse_album(self, response):
    """Attach the album-info query result to the carried item and emit it.

    The item is read from ``response.meta['item']``; its ``album`` field
    is set to whatever ``query`` returns for the ``album-info`` div XPath.
    """
    song = response.meta['item']
    album_info = query('div[@id="album-info"]', response)
    song['album'] = album_info
    yield song

这段代码会生成两个条目(item)。我该怎么做,才能把两个页面的信息合并到同一个条目中再生成?

4

1 回答 1

1

我会这样做:把两个请求串联起来——先请求专辑页,并通过 meta 把歌词链接一并传过去;在专辑页的回调中再请求歌词页,最后只生成一次条目:

def parse_single_song(self, response):
    """Start the album -> lyrics request chain for a single song page.

    Only the album page is requested here.  The item, and the lyric link
    when one is present, travel with that request in ``meta`` so that
    ``parse_album`` can decide whether to request the lyrics next.

    ``query`` returns a list of extracted strings, so each result is
    checked for emptiness before it is indexed.  If the page has no album
    link, nothing is yielded.
    """
    item = response.meta['item']

    album_links = query('a[contains(@href, "/album/")]/@href', response)
    lrc_links = query('//a[@lyricdata]/@lyricdata', response)

    if not album_links:
        return

    meta = {'item': item}
    if lrc_links:
        meta['lrc_link'] = lrc_links[0]

    # Pass the dict built above: constructing a fresh {'item': item} here
    # would drop 'lrc_link' and the lyrics page would never be requested.
    yield Request(
        url=album_links[0],
        meta=meta,
        callback=self.parse_album)

def parse_album(self, response):
    """Fill in the album field, then either chain to the lyrics or finish.

    The item comes from ``response.meta['item']``.  When the response meta
    also holds a non-empty ``lrc_link``, a follow-up request for that URL
    is yielded with the item attached, to be handled by ``parse_lrc``;
    otherwise the item itself is yielded.
    """
    carried = response.meta
    song = carried['item']
    song['album'] = query('div[@id="album-info"]', response)

    next_url = carried.get('lrc_link')
    if not next_url:
        # No lyrics page to visit: the item is complete as it stands.
        yield song
        return

    yield Request(url=next_url,
                  meta={'item': song},
                  callback=self.parse_lrc)

def parse_lrc(self, response):
    """Record the raw response body as the lyrics and emit the item.

    The item carried in ``response.meta['item']`` is updated in place
    (``lrc`` field) and then yielded.
    """
    song = response.meta['item']
    lyrics = response.body
    song['lrc'] = lyrics
    yield song

如果页面上没有专辑链接,这段代码将无法正常工作,不过思路你应该能明白。

于 2013-04-26T13:21:02.137 回答