0

这是我的代码:

# -*- coding: utf-8 -*-

from lxml import etree
from StringIO import StringIO

def str_repr(el, enc='utf-8'):
    """Serialize ``el`` to pretty-printed XML markup using encoding ``enc``.

    Thin wrapper around ``etree.tostring`` called with ``method='xml'`` and
    ``pretty_print=True``; whatever ``etree.tostring`` returns is handed
    back to the caller unchanged.
    """
    return etree.tostring(
        el,
        encoding=enc,
        method='xml',
        pretty_print=True,
    )

if __name__ == "__main__":
    # Minimal reproduction: parse one HTML fragment, then print the
    # serialization of the whole tree and of a single <img> element so the
    # two outputs can be compared.

    # HTML fragment whose ``alt`` attribute holds non-ASCII (Cyrillic) text.
    txt = u'''<img width="280" style= "margin: 10px; float:left;" alt ="привет мир" src = "/[[template]]/image.jpg"><p>[[body]]</p>'''

    # Alternative (disabled): parse the text directly with etree.fromstring.
    #tree = etree.fromstring(txt)

    # Parse the fragment through lxml's HTML parser.
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(txt), parser)

    # Serialize starting from the root element; the accompanying description
    # reports this output as correctly encoded.
    print str_repr(tree.getroot())#GOOD

    # Select every <img> element that carries a ``src`` attribute.
    path = "//img[@src]"
    items = tree.xpath(path)

    for l in items:
        # Disabled check: print the raw ``alt`` attribute value.
        #y0 = l.attrib['alt']
        #print y0

        # Serialize the individual <img> element; the accompanying description
        # reports character references in this output instead of the text.
        x0 = str_repr(l)#BAD
        print x0

这一行:print str_repr(tree.getroot())#GOOD 的打印结果编码正确(UTF-8)。

而这一行:x0 = str_repr(l)#BAD 的打印结果却不是 UTF-8 文本,其中的非 ASCII 字符被输出成了数字字符引用。它打印:

<img width="280" style="margin: 10px;float:left;"alt="&#x43F;&#x440;&#x438;&#x432;&#x435;&#x442; &#x43C;&#x438;&#x440;" src=" 

为什么会这样?怎样才能让这两处输出使用相同的编码(UTF-8)?

4

0 回答 0