在一个页面中,我想爬取两个链接,并进入每个链接去爬取一些信息,然后将它们收集到一个条目中,我的代码是:
def parse(self, response):
    """Schedule one follow-up request per song anchor on the listing page.

    The text of every matched anchor is stored in the ``singer`` field of a
    new ``BdmmsItem``; that item is attached to the request through ``meta``
    so that ``parse_single_song`` can continue filling it in.
    """
    anchor_xpath = '/html/body/div[3]/div/div/div[3]/ul/li[position()>1]/ul/li/a/'
    # query(xpath, response) returns
    # HtmlXPathSelector(response).select(xpath).extract()
    titles = query(anchor_xpath + 'text()', response)
    hrefs = query(anchor_xpath + '@href', response)
    # izip pairs the two lists positionally and stops at the shorter one.
    for title, href in izip(titles, hrefs):
        target = self.host + href
        payload = {'item': BdmmsItem(singer=title)}
        yield Request(url=target, meta=payload,
                      callback=self.parse_single_song)
def parse_single_song(self, response):
    """Follow the lyric link and the album link found on a song page.

    The item read from ``response.meta['item']`` is forwarded, again through
    ``meta``, with each follow-up request.

    NOTE: the lyric request and the album request are issued independently
    and both carry the same item, so ``parse_lrc`` and ``parse_album`` each
    yield that item once.
    """
    item = response.meta['item']
    # query() returns the list produced by .extract(). Keep the lists here so
    # that a page without a match simply produces no request, instead of
    # raising IndexError before the emptiness checks below are reached.
    album_links = query('a[contains(@href, "/album/")]/@href', response)
    lrc_links = query('//a[@lyricdata]/@lyricdata', response)
    # Visit the two different pages to collect different pieces of data.
    if lrc_links:
        yield Request(
            # Use the whole first link; indexing the string a second time
            # would pass only its first character as the URL.
            url=lrc_links[0],
            meta={'item': item},
            callback=self.parse_lrc)
    if album_links:
        yield Request(
            url=album_links[0],
            meta={'item': item},
            callback=self.parse_album)
    # With urllib2 this could be written as below, but how is the same thing
    # done in Scrapy?
    #     item['lrc'] = urllib2.urlopen(lrc_link).read()
    #     item['album'] = some_other_func(urllib2.urlopen(album_link).read())
def parse_lrc(self, response):
    """Store the raw response body in the item's ``lrc`` field and emit it.

    The item is the one carried in ``response.meta['item']``.
    """
    song = response.meta['item']
    song['lrc'] = response.body
    yield song
def parse_album(self, response):
    """Store the album-info query result in the item's ``album`` field and emit it.

    The item is the one carried in ``response.meta['item']``.
    """
    song = response.meta['item']
    # NOTE(review): the XPath has no leading '//' - confirm it matches the
    # intended element on the album page.
    album_info = query('div[@id="album-info"]', response)
    song['album'] = album_info
    yield song
这样会生成两个条目(item)。我该怎么做,才能把这些信息收集到同一个条目中?