如果您想跨多个终结符(terminal)进行复杂的反向引用,也就是无法用单个正则表达式完成匹配的情况,就需要使用 PostLexer(最坏的情况下,需要自定义词法分析器)。下面是一个类似 XML 结构的小例子:
<html>
<body>
Hello World
</body>
</html>
它可以用下面的语法配合 PostLexer 来解析(并验证标签是否匹配):
from typing import Iterator
from lark import Lark, Token
# Sample XML-like input: nested <html>/<body> tags around plain text.
# It is the document handed to parser.parse() at the end of this script.
TEXT = r"""
<html>
<body>
Hello World
</body>
</html>
"""
# Lark grammar for a minimal nested-tag language.
# The lexer only knows RAW_OPEN / RAW_CLOSE (defined by pattern below), while
# the rules use OPEN_TAG / CLOSE_TAG, which are merely %declare'd: they have no
# pattern of their own and only come into existence when the MatchTag
# post-lexer retypes the RAW_* tokens.
# TEXT matches runs of non-whitespace characters other than '<' and '>', and
# whitespace is ignored, so "Hello World" is lexed as two TEXT tokens.
GRAMMAR = r"""
start: node
node: OPEN_TAG content* CLOSE_TAG
content: node
| TEXT
TEXT: /[^\s<>]+/
RAW_OPEN: "<" /\w+/ ">"
RAW_CLOSE: "</" /\w+/ ">"
%ignore WS
%import common.WS
%declare OPEN_TAG CLOSE_TAG
"""
class MatchTag:
    """Post-lexer that checks each closing tag against its opening tag.

    RAW_OPEN tokens are pushed on a stack and retyped to OPEN_TAG; each
    RAW_CLOSE token pops the stack, is compared by tag name with the popped
    opening tag, and is retyped to CLOSE_TAG. Every other token passes
    through unchanged.
    """

    # Attribute read by Lark's post-lexer interface: terminals listed here are
    # always accepted by the lexer. No grammar rule references RAW_OPEN or
    # RAW_CLOSE directly, so they have to be listed explicitly.
    always_accept = "RAW_OPEN", "RAW_CLOSE"

    def process(self, stream: Iterator["Token"]) -> Iterator["Token"]:
        """Yield the tokens of *stream*, retyping and validating tag tokens.

        Args:
            stream: Tokens produced by the lexer. Tokens are mutated in place
                (their ``type`` is rewritten) before being yielded.

        Yields:
            The same token objects, with RAW_OPEN -> OPEN_TAG and
            RAW_CLOSE -> CLOSE_TAG.

        Raises:
            ValueError: If a closing tag has no corresponding opening tag, or
                if its name differs from the innermost open tag's name.
        """
        stack = []
        for t in stream:
            if t.type == "RAW_OPEN":
                stack.append(t)
                t.type = "OPEN_TAG"
            elif t.type == "RAW_CLOSE":
                if not stack:
                    # Previously this surfaced as a bare IndexError from pop().
                    raise ValueError(
                        f"Unexpected closing tag {t.value!r} with no open tag"
                    )
                open_tag = stack.pop()
                # "<name>" -> name  versus  "</name>" -> name
                if open_tag.value[1:-1] != t.value[2:-1]:
                    raise ValueError(
                        "Non matching closing tag "
                        f"(expected {open_tag.value!r}, got {t.value!r})"
                    )
                t.type = "CLOSE_TAG"
            yield t
# Build an LALR parser whose token stream is routed through MatchTag, then
# parse the sample document and print the resulting tree.
parser = Lark(GRAMMAR, postlex=MatchTag(), parser="lalr")
tree = parser.parse(TEXT)
print(tree.pretty())
(注意:如果您真的想解析 XML,请不要使用 Lark——XML 中有很多难以处理的陷阱。)