如果你愿意，可以使用标准库中的 HTMLParser（这不是最好的选择，但可能很有趣）：
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Capture the text and one attribute value of a chosen element.

    Call ``feed(data, tag, attr)``.  Afterwards ``data`` holds the text found
    inside the most recent ``tag`` element that carried the attribute
    ``attr``, and ``attr`` holds that attribute's value.  Both are left
    untouched when no element matches.
    """

    data = ""           # text collected from the latest matching element
    search_tag = None   # tag name to look for; set by feed()
    search_attr = None  # attribute name required on that tag; set by feed()
    attr = None         # value of search_attr on the latest matching element
    grab_data = False   # True while the parser is inside a matching element

    def to_dict(self, attrs):
        """Return the ``(name, value)`` pairs in *attrs* as a dict.

        When a name occurs more than once, the last value wins.
        """
        return dict(attrs)

    def feed(self, data, tag, attr):
        """Parse *data*, looking for *tag* elements that have attribute *attr*.

        Args:
            data: markup text handed to ``HTMLParser.feed``.
            tag: name of the element to capture.
            attr: name of the attribute the element must carry.
        """
        # HTMLParser reports tag and attribute names in lowercase, so the
        # search terms are normalised the same way; otherwise "EM"/"CATEG"
        # would silently never match.
        self.search_tag = tag.lower() if tag else tag
        self.search_attr = attr.lower() if attr else attr
        # Explicit base-class call: this method shadows HTMLParser.feed with
        # a different signature.
        HTMLParser.feed(self, data)

    def handle_starttag(self, tag, attrs):
        """Start capturing when a matching tag with the attribute opens."""
        if tag != self.search_tag:
            return
        attrs = self.to_dict(attrs)
        if self.search_attr in attrs:
            self.attr = attrs[self.search_attr]
            # Start from a clean slate so ``data`` and ``attr`` always
            # describe the same element.
            self.data = ""
            self.grab_data = True

    def handle_data(self, data):
        """Append text to ``data`` while inside a matching element."""
        if self.grab_data:
            # The parser may deliver one element's text in several pieces
            # (e.g. around inline child tags), so accumulate rather than
            # overwrite.
            self.data += data

    def handle_endtag(self, tag):
        """Stop capturing when the searched tag closes."""
        if tag == self.search_tag:
            self.grab_data = False
xml = u'História do RFID A tecnologia de <EM ID="hub-30518" CATEG="PESSOA">RFID </EM>\
tem suas raízes nos sistemas de radares'
parser = MyHTMLParser()
parser.feed(xml, "em", "categ")
print parser.data, parser.attr