这是一个 Python 3 脚本,利用 xmltodict 的流式解析功能,将特定结构的 XML 转换为 JSON。脚本在内存中只保留极少量数据,因此对输入的大小没有限制。脚本对输入结构做了不少假设,但可以作为一个起点;实际效果可能因具体情况而异,希望对你有所帮助。
#!/usr/bin/env python3
"""
Converts an XML file with a single outer list element
and a repeated list member element to JSON on stdout.
Processes large XML files with minimal memory using the
streaming feature of https://github.com/martinblech/xmltodict
which is required ("pip install xmltodict").
Expected input structure (element names are just examples):
<mylist attr="a">
<myitem name="foo"></myitem>
<myitem name="bar"></myitem>
<myitem name="baz"></myitem>
</mylist>
Output:
{
"mylist": {
"attr": "a",
"myitem": [
{
"name": "foo"
},
{
"name": "bar"
},
{
"name": "baz"
}
]
}
}
"""
import json
import os
import sys
import xmltodict
# True once the JSON preamble (root object, outer list and item array
# opener) has been printed by handle_item.
ROOT_SEEN = False


def handle_item(path, element):
    """
    Print one repeated list member to stdout as part of a JSON document.

    Called by xmltodict on every item found at the specified depth
    (this requires a depth >= 2). After parsing has finished, call
    ``handle_item(None, None)`` once to close the JSON document.

    Args:
        path: list of ``(name, attributes)`` tuples; ``path[0]`` is the
            root element and ``path[1]`` the current item. ``attributes``
            is a mapping or None.
        element: content of the current item. A mapping is merged after
            the item's attributes. NOTE(review): xmltodict appears to pass
            None for an empty element and a plain string for text-only
            content - confirm against the installed version. Both are
            handled here instead of raising.

    Returns:
        True to let the parser continue, False for the closing call.
    """
    global ROOT_SEEN

    if path is None and element is None:
        if not ROOT_SEEN:
            # No item was ever emitted, so no bracket was opened; print
            # an empty object rather than unbalanced closers.
            print('{}')
            return False
        # after element n
        print(']')  # list of items
        print('}')  # outer list
        print('}')  # root
        return False

    if ROOT_SEEN:
        # element 2..n
        print(",")
    else:
        # element 1: open the root object, the outer list and the array
        ROOT_SEEN = True
        root_name, root_attrs = path[0]
        print('{')  # root
        # json.dumps escapes quotes/backslashes so the output stays valid
        print(json.dumps(root_name) + ': {')  # outer list
        # emit any root element attributes
        for key, value in (root_attrs or {}).items():
            print('{}:{},'.format(json.dumps(key), json.dumps(value)))
        # use the repeated element name for the JSON list
        print(json.dumps(path[1][0]) + ': [')  # list of items

    # Emit attributes and contents by merging the contents into a copy
    # of the attributes so the attributes appear first. Copying avoids
    # mutating the dict owned by the parser's path.
    item_attrs = path[1][1]
    if item_attrs:
        merged = dict(item_attrs)
        if isinstance(element, str):
            # text-only content next to attributes; "#text" mirrors
            # xmltodict's default key for character data
            merged['#text'] = element
        elif element is not None:
            merged.update(element)
    else:
        merged = element
    print(json.dumps(merged, indent=2))
    return True
def usage(args, err=None):
    """
    Emit an optional error and the usage line on stderr, then exit.

    Args:
        args: the command-line argument list; ``args[0]`` is used as the
            program name in both messages.
        err: optional error text printed before the usage line.

    Raises:
        SystemExit: always, with status 2 so that callers (shells,
            scripts) can tell that the invocation failed.
    """
    program = args[0]
    if err:
        print("{}: {}".format(program, err), file=sys.stderr)
    print("Usage: {} <xml-file-name>".format(program), file=sys.stderr)
    # A bare sys.exit() would report success (status 0) after an error.
    sys.exit(2)
if __name__ == '__main__':
    # Exactly one argument is expected: the XML file to convert.
    argv = sys.argv
    if len(argv) != 2:
        usage(argv)

    xml_path = argv[1]
    if not os.path.isfile(xml_path):
        usage(argv, 'Not found or not a file: {}'.format(xml_path))

    parse_options = {
        'item_depth': 2,  # setting item_depth turns on the streaming feature
        'attr_prefix': '',  # do not prefix attribute keys with @
        'item_callback': handle_item,
    }
    with open(xml_path, 'rb') as xml_stream:
        xmltodict.parse(xml_stream, **parse_options)

    # Closing call: handle_item prints the closing brackets of the JSON.
    handle_item(None, None)