0

我需要列出 <product> 元素下的所有子元素,因为 <product> 的子元素是可变的(数量和名称都不固定)。

XML 文件:

<catalog>
   <product>
      <element1>text 1</element1>
      <element2>text 2</element2>
      <element..>text ..</element..>
   </product>
</catalog>

Python 解析器:我使用 fast_iter,因为我的 XML 文件很大……

import lxml.etree as etree
import configs.application as configs

# Path of the XML file to parse, built from the configured `tmp` value.
# NOTE(review): `configs` is already bound to the module `configs.application`
# by the import above, so `configs.application.tmp` looks up an attribute named
# `application` on that module -- confirm this is intended and not `configs.tmp`.
myfile = configs.application.tmp + '/xml_hug_file.xml'

def fast_iter(context, func, *args, **kwargs):
    """Apply ``func`` to each element from ``context``, pruning as it goes.

    Args:
        context: Iterable of ``(event, element)`` pairs, such as the object
            returned by ``etree.iterparse``.
        func: Callable invoked as ``func(elem, *args, **kwargs)`` for every
            element yielded by ``context``.
        *args: Extra positional arguments forwarded to ``func``.
        **kwargs: Extra keyword arguments forwarded to ``func``.

    After each callback the element is cleared and every sibling that
    precedes it is deleted from its parent.
    """
    for event, elem in context:
        func(elem, *args, **kwargs)
        elem.clear()
        parent = elem.getparent()
        if parent is None:
            # The root element has no parent to delete from.  Its
            # getprevious() can still be non-None (a top-level comment or
            # processing instruction before the root), which previously led
            # to ``del None[0]`` and a TypeError.
            continue
        while elem.getprevious() is not None:
            del parent[0]
    del context

def process_element(catalog):
    """Placeholder callback: print a fixed banner for the element received.

    The ``catalog`` argument is accepted but not inspected.
    """
    banner = "List all element of <product>"
    print(banner)

# Stream-parse the file, receiving only 'end' events for <catalog> elements,
# and hand each one to process_element through fast_iter.
# NOTE(review): in the sample XML shown above <catalog> is the root element,
# so its 'end' event fires only once the whole document has been read;
# filtering on the repeated child tag instead would let fast_iter release
# memory incrementally -- confirm against the real file layout.
context = etree.iterparse(myfile, tag='catalog', events = ('end', ))
fast_iter(context, process_element)
4

3 回答 3

1
def process_element(catalog, *args, **kwargs):
    """Print the text of each direct child of ``catalog``.

    Args:
        catalog: Element whose direct children are visited in document order.
        *args: Accepted for compatibility with ``fast_iter``; unused.
        **kwargs: Accepted for compatibility with ``fast_iter``; unused.
    """
    # Iterate the element itself rather than calling getchildren(): that
    # method is deprecated in lxml and was removed from the standard-library
    # ElementTree in Python 3.9.  Iteration yields the same children.
    for child in catalog:
        print(child.text)
于 2013-06-25T14:01:49.177 回答
1

这是我的问题的解决方案:

def process_element(catalog):
    """Print the tag, then the text, of every child element of each product.

    Only ``product`` elements that are direct children of ``catalog`` are
    visited; for each of them every direct child element is reported on two
    consecutive lines (tag first, text second).
    """
    for product in catalog.iterfind('product'):
        for child in product.iterfind('*'):
            print(child.tag, child.text, sep='\n')
于 2013-06-25T14:01:04.807 回答
1

您可以使用 XPath 表达式 'product/*[starts-with(local-name(),"element")]' 来选取 <product> 下所有标签名以 "element" 开头的子元素:


import lxml.etree as ET
import io

# Small in-memory sample document: one <catalog> holding a single <product>
# whose children are named element1, element2 and element3.
content = '''\
<catalog>
   <product>
      <element1>text 1</element1>
      <element2>text 2</element2>
      <element3>text ..</element3>
   </product>
</catalog>'''

def fast_iter(context, func, *args, **kwargs):
    """Apply ``func`` to each element from ``context``, pruning as it goes.

    http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
    Author: Liza Daly
    See also http://effbot.org/zone/element-iterparse.htm

    Args:
        context: Iterable of ``(event, element)`` pairs, such as the object
            returned by ``iterparse``.
        func: Callable invoked as ``func(elem, *args, **kwargs)``.
        *args: Extra positional arguments forwarded to ``func``.
        **kwargs: Extra keyword arguments forwarded to ``func``.

    After each callback the element is cleared, and for the element and each
    of its ancestors every preceding sibling is deleted from the parent.
    """
    for event, elem in context:
        func(elem, *args, **kwargs)
        # It's safe to call clear() here because no descendants will be
        # accessed
        elem.clear()
        # Also eliminate now-empty references from the root node to elem
        for ancestor in elem.xpath('ancestor-or-self::*'):
            parent = ancestor.getparent()
            if parent is None:
                # The root has no parent to delete from.  Its getprevious()
                # is non-None when a top-level comment or processing
                # instruction precedes it, which previously ended in
                # ``del None[0]`` and a TypeError.
                continue
            while ancestor.getprevious() is not None:
                del parent[0]
    del context


def process_element(catalog):
    """Print every child of a product whose local name starts with "element".

    The selection is delegated to ``catalog.xpath``; each node it returns is
    printed on its own line.
    """
    matches = catalog.xpath('product/*[starts-with(local-name(),"element")]')
    for match in matches:
        print(match)

# io.BytesIO only accepts bytes, so encode the sample text explicitly;
# handing it the str itself raises TypeError on Python 3.
context = ET.iterparse(io.BytesIO(content.encode('utf-8')), tag='catalog', events=('end', ))
fast_iter(context, process_element)

输出:

<Element element1 at 0xb7449374>
<Element element2 at 0xb744939c>
<Element element3 at 0xb74493c4>

顺便说一句,我对 Liza Daly 的 fast_iter 做了修改,使它能删除更多不再使用的元素。这应该可以在解析大型 XML 文件时降低内存占用。

下面的示例展示了上面修改后的 fast_iter 如何比原始的 fast_iter 删除更多的元素:

import logging
import textwrap
import lxml.etree as ET
import io

# Module-level logger for this demo.  At INFO the 'Deleting ...' messages
# logged by the iteration helpers below are shown; the 'Processing'/'Clearing'
# messages are logged at DEBUG and stay hidden.
logger = logging.getLogger(__name__)
level = logging.INFO
# level = logging.DEBUG  # uncomment to see more debugging information
logging.basicConfig(level=level)

def fast_iter(context, func, *args, **kwargs):
    """Apply ``func`` to each element from ``context``, pruning as it goes.

    http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
    Author: Liza Daly
    See also http://effbot.org/zone/element-iterparse.htm

    Args:
        context: Iterable of ``(event, element)`` pairs, such as the object
            returned by ``ET.iterparse``.
        func: Callable invoked as ``func(elem, *args, **kwargs)``.
        *args: Extra positional arguments forwarded to ``func``.
        **kwargs: Extra keyword arguments forwarded to ``func``.

    After each callback the element is cleared, and for the element and each
    of its ancestors every preceding sibling is deleted from the parent.
    Each deletion is logged at INFO level.
    """
    for event, elem in context:
        # ET.tostring() serialises the whole subtree, so only build the
        # message when DEBUG output will actually be emitted.
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('Processing {e}'.format(e=ET.tostring(elem)))
        func(elem, *args, **kwargs)
        # It's safe to call clear() here because no descendants will be
        # accessed
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('Clearing {e}'.format(e=ET.tostring(elem)))
        elem.clear()
        # Also eliminate now-empty references from the root node to elem
        for ancestor in elem.xpath('ancestor-or-self::*'):
            logger.debug('Checking ancestor: {a}'.format(a=ancestor.tag))
            parent = ancestor.getparent()
            if parent is None:
                # The root has no parent to delete from.  Its getprevious()
                # is non-None when a top-level comment or processing
                # instruction precedes it, which previously ended in a
                # TypeError on ``None[0]``.
                continue
            while ancestor.getprevious() is not None:
                logger.info('Deleting {p}'.format(p=parent[0].tag))
                del parent[0]
    del context

def orig_fast_iter(context, func, *args, **kwargs):
    """Baseline ``fast_iter``: prune only the siblings preceding each element.

    Args:
        context: Iterable of ``(event, element)`` pairs, such as the object
            returned by ``ET.iterparse``.
        func: Callable invoked as ``func(elem, *args, **kwargs)``.
        *args: Extra positional arguments forwarded to ``func``.
        **kwargs: Extra keyword arguments forwarded to ``func``.

    After each callback the element is cleared and every sibling that
    precedes it is deleted from its parent; ancestors are left untouched.
    Each deletion is logged at INFO level.
    """
    for event, elem in context:
        # ET.tostring() serialises the whole subtree, so only build the
        # message when DEBUG output will actually be emitted.
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('Processing {e}'.format(e=ET.tostring(elem)))
        func(elem, *args, **kwargs)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('Clearing {e}'.format(e=ET.tostring(elem)))
        elem.clear()
        parent = elem.getparent()
        if parent is None:
            # The root has no parent to delete from, even when a top-level
            # comment or processing instruction makes getprevious() non-None.
            continue
        while elem.getprevious() is not None:
            logger.info('Deleting {p}'.format(p=parent[0].tag))
            del parent[0]
    del context

def setup_ABC():
    """Return the sample XML document used by the comparison below.

    The document is a <root> with two children, A1 and A2; each of them
    holds a B element, a C element (with text and one child) and an E
    element, in that order.  Nesting is indented by two spaces per level and
    the text ends with a newline.
    """
    lines = (
        '<root>',
        '  <A1>',
        '    <B1></B1>',
        '    <C>1<D1></D1></C>',
        '    <E1></E1>',
        '  </A1>',
        '  <A2>',
        '    <B2></B2>',
        '    <C>2<D></D></C>',
        '    <E2></E2>',
        '  </A2>',
        '</root>',
    )
    return '\n'.join(lines) + '\n'

content = setup_ABC()
# io.BytesIO only accepts bytes, so encode the sample text explicitly;
# handing it the str itself raises TypeError on Python 3.
context = ET.iterparse(io.BytesIO(content.encode('utf-8')), events=('end', ), tag='C')
orig_fast_iter(context, lambda elem: None)
# Expected log output (the 'Deleting' messages are logged at INFO level):
# INFO:__main__:Deleting B1
# INFO:__main__:Deleting B2

print('-'*80)
# The improved fast_iter deletes A1. The original fast_iter does not.
content = setup_ABC()
context = ET.iterparse(io.BytesIO(content.encode('utf-8')), events=('end', ), tag='C')
fast_iter(context, lambda elem: None)
# Expected log output:
# INFO:__main__:Deleting B1
# INFO:__main__:Deleting A1
# INFO:__main__:Deleting B2

因此,您会看到修改后的 fast_iter 成功删除了 A1 元素,因为在处理第二个 C 元素时已不再需要它。原始的 fast_iter 只会删除 C 元素之前的兄弟元素(即 B 元素)。可以想象,在大型 XML 文件中,像 A1 这样的元素可能非常大,而且可能有很多个。因此,修改后的 fast_iter 能够回收大量原始 fast_iter 无法释放的内存。

于 2013-06-25T14:01:18.993 回答