########################################
# some comment
# other comment
########################################
block1 {
value=data
some_value=some other kind of data
othervalue=032423432
}
block2 {
value=data
some_value=some other kind of data
othervalue=032423432
}
5 回答
最好的方法是使用现有的格式,例如 JSON。
这是您的格式的示例解析器:
from pprint import pprint

from lepl import (AnyBut, Digit, Drop, Eos, Integer, Letter,
                  NON_GREEDY, Regexp, Space, Separator, Word)

# Parser for the block-structured config format, built from lepl matchers.
# The EBNF rule each matcher implements is given in the comment above it.

# EBNF
# name = ( letter | "_" ) , { letter | "_" | digit } ;
name = Word(Letter() | '_',
            Letter() | '_' | Digit())
# words = word , space+ , word , { space+ , word } ;
# two or more space-separated words (non-greedy to allow comment at the end)
words = Word()[2::NON_GREEDY, ~Space()[1:]] > list
# value = integer | word | words ;
value = (Integer() >> int) | Word() | words
# comment = "#" , { all characters - "\n" } , ( "\n" | EOF ) ;
comment = '#' & AnyBut('\n')[:] & ('\n' | Eos())

# The rules below are defined inside the Separator context so that the
# r'\s*' pattern is applied between the matchers they join.
with Separator(~Regexp(r'\s*')):
    # statement = name , "=" , value ;
    statement = name & Drop('=') & value > tuple
    # suite = "{" , { comment | statement } , "}" ;
    suite = Drop('{') & (~comment | statement)[:] & Drop('}') > dict
    # block = name , suite ;
    block = name & suite > tuple
    # config = { comment | block } ;
    config = (~comment | block)[:] & Eos() > dict

# Read the whole file up front and close it deterministically before parsing.
with open('input.cfg') as config_file:
    config_text = config_file.read()

pprint(config.parse(config_text))
输出:
[{'block1': {'othervalue': 32423432,
'some_value': ['some', 'other', 'kind', 'of', 'data'],
'value': 'data'},
'block2': {'othervalue': 32423432,
'some_value': ['some', 'other', 'kind', 'of', 'data'],
'value': 'data'}}]
嗯,数据看起来很规律。所以你可以做这样的事情(未经测试):
class Block(object):
    """A named configuration block.

    Only ``name`` is set here; any further attributes are attached to the
    instance by the code that fills the block in.
    """

    def __init__(self, name):
        """Create a block called *name*."""
        self.name = name

    def __repr__(self):
        """Return a debugging representation that shows the block name."""
        return "{}({!r})".format(type(self).__name__, self.name)
# Parse the config file line by line into a list of Block instances.
current = None  # block currently being filled; None until a header is seen
blocks = []     # blocks whose closing '}' has been read

with open(...) as infile:  # insert filename here
    for line in infile:
        stripped = line.strip()
        if stripped.startswith('#'):
            # Comment line: nothing to record.
            continue
        elif stripped.endswith('{'):
            # Block header such as "block1 {": the first token is the name.
            current = Block(stripped.split()[0])
        elif '=' in stripped:
            # Split on the first '=' only, so values may contain '=' themselves.
            attr, _, value = stripped.partition('=')
            attr = attr.strip()
            value = value.strip()
            try:
                value = int(value)
            except ValueError:
                # Not an integer: keep the value as a string.
                pass
            setattr(current, attr, value)
        elif stripped.endswith('}'):
            # End of the block: store it.
            blocks.append(current)
结果将是 Block 实例的列表,其中 block.name 是块的名称('block1'、'block2' 等),其他属性对应于数据中的键。因此,blocks[0].value 将是 'data',依此类推。请注意,这仅将字符串和整数作为值处理。
(如果您的键中可能出现 name,这里就有一个明显的错误。如果会发生这种情况,您可能希望将 self.name 改为 self._name 或其他名称。)
!
如果您的意思不是真正的解析,而是文本处理,并且输入数据确实很常规,那么请使用 John 的解决方案。如果你真的需要一些解析(比如你得到的数据有一些更复杂的规则),那么根据你需要解析的数据量,我会选择 pyparsing 或simpleparse。我都试过了,但实际上 pyparsing 对我来说太慢了。
你可能会研究类似pyparsing 的东西。
Grako(意为 grammar compiler,即语法编译器)允许将输入格式规范(语法)与其解释(语义)分开。下面是用 Grako 的 EBNF 变体为您的输入格式编写的语法:
(* a file contains zero or more blocks *)
file = {block} $;
(* a named block has at least one assignment statement *)
block = name '{' {assignment}+ '}';
(* an assignment is a name, an equals sign and a value, ended by a newline *)
assignment = name '=' value NEWLINE;
(* a name starts with a lowercase letter, then lowercase letters, digits or underscores *)
name = /[a-z][a-z0-9_]*/;
(* alternatives are listed integer first, then string *)
value = integer | string;
NEWLINE = /\n/;
(* an integer is one or more decimal digits *)
integer = /[0-9]+/;
(* string value is everything until the next newline *)
string = /[^\n]+/;
要安装 grako,请运行 pip install grako。然后从语法生成 PEG 解析器:
$ grako -o config_parser.py Config.ebnf
使用生成的 config_parser 模块将 stdin 转换为 JSON:
#!/usr/bin/env python
import json
import string
import sys
from config_parser import ConfigParser
class Semantics(object):
    """Semantic actions that turn the parsed grammar rules into Python values.

    Each method is named after a grammar rule and receives the AST for that
    rule; its return value replaces that AST.
    """

    def file(self, ast):
        """Return a dict built from the (name, assignments) pairs in *ast*."""
        # file = {block} $
        # all blocks should have unique names within the file
        return dict(ast)

    def block(self, ast):
        """Return a (name, assignments dict) pair for one block."""
        # block = name '{' {assignment}+ '}'
        # all assignment statements should use unique names
        return ast[0], dict(ast[2])

    def assignment(self, ast):
        """Return a (name, value) pair for one assignment statement."""
        # assignment = name '=' value NEWLINE
        # value = integer | string
        return ast[0], ast[2]  # name, value

    def integer(self, ast):
        """Convert the matched text to an int."""
        return int(ast)

    def string(self, ast):
        """Return the matched text without leading/trailing whitespace."""
        return ast.strip()  # remove leading/trailing whitespace
# Build the generated parser: the listed characters count as whitespace and
# anything matching the eol_comments_re pattern is treated as a comment.
parser = ConfigParser(
    whitespace='\t\n\v\f\r ',
    eol_comments_re="#.*?$",
)
# Parse all of stdin starting at the 'file' rule, applying the Semantics actions.
ast = parser.parse(
    sys.stdin.read(),
    rule_name='file',
    semantics=Semantics(),
)
# Write the result to stdout as indented JSON with sorted keys.
json.dump(
    ast,
    sys.stdout,
    indent=2,
    sort_keys=True,
)
输出
{
"block1": {
"othervalue": 32423432,
"some_value": "some other kind of data",
"value": "data"
},
"block2": {
"othervalue": 32423432,
"some_value": "some other kind of data",
"value": "data"
}
}