使用标准库的 xml.etree.ElementTree 将信息从 XML 提取到 Python 对象中(也可以使用 API 相同、功能更强的第三方库 lxml)。
我建议阅读 Mark Pilgrim 的《Dive Into Python 3》第 12 章“XML”( http://getpython3.com/diveintopython3/xml.html )。
下面是编写这类解析器/编译器的核心思路:递归遍历各个元素,收集必要的信息,并在条件允许时立即输出代码:
import xml.etree.ElementTree as ET
class Parser:
    '''Walk an XML element tree and collect indented lines of output.

    Each element is dispatched to the method named "parse_" + its tag.
    The handlers append text through out(), while indent() and dedent()
    track the current nesting depth.  Iterating over the instance yields
    the collected lines in the order they were produced.
    '''

    # Text emitted once per indentation level (class-level default so
    # that out() works even if a subclass skips __init__).
    indent_unit = ' '

    # Maps the "type" attribute of a <variable> element to the callable
    # that converts its "value" attribute; any other type stays a string.
    _value_types = {'Integer': int}

    def __init__(self, indent_unit=' '):
        '''Create an empty parser.

        indent_unit -- the string repeated once per indentation level in
                       front of every output line (default: one space).
        '''
        self.output_list = []           # collected output lines
        self.il = 0                     # indentation level
        self.indent_unit = indent_unit

    def __iter__(self):
        '''Iterate over the output lines collected so far.'''
        return iter(self.output_list)

    @staticmethod
    def _require(condition, message):
        '''Raise AssertionError(message) unless the condition holds.

        An explicit raise is used instead of the assert statement so
        that the structural checks are still performed when Python runs
        with -O, which removes assert statements.
        '''
        if not condition:
            raise AssertionError(message)

    def out(self, s):
        '''Append the string s, indented to the current level, to the output.'''
        self.output_list.append(self.indent_unit * self.il + s)

    def indent(self, num=1):
        '''Increase the indentation level by num.'''
        self.il += num

    def dedent(self, num=1):
        '''Decrease the indentation level by num.'''
        self.il -= num

    def parse(self, elem):
        '''Dispatch elem to the handler named after its tag.

        The handler name is "parse_" + elem.tag.  When no attribute of
        that name exists, self.parse_undefined() is used instead.
        Returns whatever the chosen handler returns.
        '''
        handler = getattr(self, 'parse_' + elem.tag, self.parse_undefined)
        return handler(elem)

    def loop(self, elem):
        '''Parse every child of elem (or every element of a sequence).'''
        for child in elem:
            self.parse(child)

    def parseXMLfile(self, fname):
        '''Read the XML file fname and parse it from the root element.

        Raises AssertionError when the root element is not <script>.
        '''
        root = ET.parse(fname).getroot()
        self._require(root.tag == 'script',
                      'root element must be <script>, got <{0}>'.format(root.tag))
        self.parse(root)

    ###################### ELEMENT PARSERS #######################

    def parse_undefined(self, elem):
        '''Report an element for which no parse_<tag> handler exists.'''
        self.out('PARSING UNDEFINED for ' + elem.tag)

    def parse_script(self, elem):
        '''Parse the children of the <script> element.'''
        self.loop(elem)

    def parse_stage(self, elem):
        '''Emit a stage header, then parse the children one level deeper.

        Raises KeyError when the element has no "id" attribute.
        '''
        self.out('')
        self.out('Parsing the stage: ' + elem.attrib['id'])
        self.indent()
        self.loop(elem)
        self.dedent()

    def _parse_section(self, elem):
        '''Emit a banner line naming the element, then parse its children.'''
        self.out('')
        self.out('#---------- ' + elem.tag + ' ----------')
        self.loop(elem)

    def parse_initialise(self, elem):
        '''Emit the section banner and parse the children of <initialise>.'''
        self._parse_section(elem)

    def parse_variable(self, elem):
        '''Emit "<id> = <repr of value>" for a <variable> element.

        The "value" attribute is converted according to the "type"
        attribute; a missing or unlisted type leaves the value a string.
        Raises KeyError when "value" or "id" is missing.
        '''
        convert = self._value_types.get(elem.attrib.get('type'), str)
        # Convert first so that repr() shows numbers unquoted and
        # strings quoted.
        value = convert(elem.attrib['value'])
        self.out('{0} = {1}'.format(elem.attrib['id'], repr(value)))

    def parse_execute(self, elem):
        '''Emit the section banner and parse the children of <execute>.'''
        self._parse_section(elem)

    def parse_if(self, elem):
        '''Emit an "if <condition>:" line and parse the remaining children.

        The first child must be <condition>; the other children are
        parsed one indentation level deeper.  Raises IndexError when the
        element has no children and AssertionError when the first child
        is not <condition>.
        '''
        first = elem[0]
        self._require(first.tag == 'condition',
                      'first child of <if> must be <condition>, got <{0}>'
                      .format(first.tag))
        condition = self.parse(first)
        self.out('if ' + condition + ':')
        self.indent()
        self.loop(elem[1:])
        self.dedent()

    def parse_condition(self, elem):
        '''Return the text of a <condition> element.

        Raises AssertionError when the element has child elements.
        '''
        self._require(len(elem) == 0, '<condition> must not have children')
        return elem.text

    def parse_then(self, elem):
        '''Parse the children of <then> at the current indentation.'''
        self.loop(elem)

    def parse_else(self, elem):
        '''Emit "else:" one level out, then parse the children.

        parse_if() has already indented for the body, so step back out
        for the "else:" line and re-enter the level for the children;
        parse_if() performs the final dedent.
        '''
        self.dedent()
        self.out('else:')
        self.indent()
        self.loop(elem)

    def parse_error(self, elem):
        '''Emit "error(<errorID>, <fieldID>)" for an <error> element.

        A missing attribute is rendered as None.  Raises AssertionError
        when the element has child elements.
        '''
        self._require(len(elem) == 0, '<error> must not have children')
        errorID = elem.attrib.get('errorID')
        fieldID = elem.attrib.get('fieldID')
        self.out('error({0}, {1})'.format(errorID, fieldID))

    def parse_setNextStage(self, elem):
        '''Emit "setNextStage --> <text>" for a <setNextStage> element.

        Raises AssertionError when the element has child elements.
        '''
        self._require(len(elem) == 0, '<setNextStage> must not have children')
        self.out('setNextStage --> ' + elem.text)
if __name__ == '__main__':
    # Translate the sample script and print the collected output lines.
    parser = Parser()
    parser.parseXMLfile('data.xml')
    for s in parser:
        # The call form of print works on both Python 2 and Python 3;
        # the statement form "print s" is a syntax error on Python 3.
        print(s)
使用粘贴在 http://pastebin.com/vRRxfWiA 的数据运行时,脚本会产生以下输出:
Parsing the stage: stage1
#---------- initialise ----------
taxyear = 2012
taxyearstart = '06/04/2012'
taxyearend = '05/04/2013'
previousemergencytaxcode = '747L'
emergencytaxcode = '810L'
nextemergencytaxcode = '810L'
...
maxLimitAmount = 0
PARSING UNDEFINED for executeMethod
if $maxLimitReached$ == True:
employeepayrecord = 'N'
employeepayrecordstate = '2'
else:
employeepayrecordstate = '1'
gender = ''
genderstate = '1'
title = ''
forename = ''
forename2 = ''
surname = ''
dob = ''
dobinvalid = ''
#---------- execute ----------
if $dobstring$ != "":
validDOBCheck = 'False'
PARSING UNDEFINED for executeMethod
if $validDOBCheck$ == False:
error(224, dob)
else:
minimumDOBDate = ''
PARSING UNDEFINED for executeMethod
validDOBCheck = 'False'
PARSING UNDEFINED for executeMethod
if $validDOBCheck$ == False:
error(3007161, dob)
if $dobstring$ == "01/01/1901":
error(231, dob)
else:
error(231, dob)
Parsing the stage: stage2
#---------- initialise ----------
address1 = ''
...