因为我想格式化下面的 XML 输出,所以我在 Pipelines 上的代码:
class TutorialPipeline(object):
def __init__(self):
self.file = open('outs.xml', 'a')
self.file.write('<?xml version=\'1.0\' encoding=\'utf-8\'?>')
self.file.write('<Friends>')
dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
def spider_closed(self, spider):
self.file.write('</Friends>')
self.file.close()
def process_item(self, item, spider):
escape("< & >")
self.file.write('<friend id=\"' + item['id'] + '\">')
self.file.write('<birthdate>' + item['birthdate'] + '</date>')
self.file.write('<user>' + item['user'] + '</user>')
self.file.write('<review>' + escape(item['review'].encode('utf-8').strip()) + '</review>')
self.file.write('</item >')
return item
以下是我的 Spider 如何抓取多个页面:
class SavoySpider(BaseSpider):
# identifies of the Spider
name = "friend"
count = 0
allowed_domains = ["example.com"]
start_urls = [
"http://www.example.com/biz/social/"
]
def start_requests(self):
for i in range(0,1000,40):
yield self.make_requests_from_url("http://www.example.com/biz/social/?start=%d" % i)
def parse(self, response):
response = response.replace(body=response.body.replace('<br />', '\n'))
hxs = HtmlXPathSelector(response)
sites = hxs.select('//ul/li')
items = []
for site in sites:
item = FriendItem()
self.count += 1
item['id'] = str(self.count)
item['birthdate'] = str(site.select('.//div/div/meta[@itemprop="birthdate"]/@content').extract()[0])
item['user'] = site.select('h4/span/text()').extract()[0]
item['review'] = ''.join(site.select('.//div[@class="media-friend"]/p/text()').extract())
items.append(item)
return items
但是现在的问题是,如果我使用 Pipelines 自定义 xml 格式,当抓取另一个页面时,将附加到下面和后续页面。输出将如下所示:
<?xml version="1.0" encoding="utf-8"?>
<Friends>
<friend id = "1">
<name>Name1</name>
<birthdate>1988-04-03</birthdate>
<review>txt............</review>
</friend>
.....
</Friends>
<?xml version="1.0" encoding="utf-8"?>
<Friends>
<friend id = "40">
<name>Name41</name>
<birthdate>1988-04-13</birthdate>
<review>txt............</review>
</friend>
.....
</Friends>
<?xml version="1.0" encoding="utf-8"?>
<Friends>
<friend id = "81">
<name>Name81</name>
<birthdate>1988-04-23</birthdate>
<review>txt............</review>
</friend>
.....
</Friends>
任何人都可以帮忙吗?