新库:XParsec
这个问题催生了一个 F# 3.0 下的 Parsec 实现:它与流和类型无关、非线性、可扩展——受 FParsec 启发,摆脱了字符流和线性流的限制,并做了简化:http://corsis.github.com/XParsec/
模式
1 = < font=?'Bold' bbox=F'l ..' s > ; < ~s >*
2 = < font=!'Bold' bbox=F'l ..' s=?'(' > | [ 1.l < 2.l ] ; < ~s >*
3 = < font=!'Bold' bbox=F'l ..' s=?')' > | [ 1.l < 3.l ]
其中
element names are left unspecified
font, bbox and s are attributes
V = string, N = string
? :: V -> bool -- value contains string
! :: V -> bool = not . (?) -- value does not contain string
~ :: N -> bool -- value of attribute N is empty or whitespace
F :: V -> [(N, float)] -- extracts a list of named floats from value
RM :: V -> bool -- value matches regular expression
[] :: [bool] -- list of conditions
代码
open System
open System.Collections.Generic
open System.Xml.Linq
/// Infix substring test: `a -?- b` is true when the string `a` contains `b`.
let inline (-?-) (text : string) fragment = text.Contains fragment
/// Prefix blank test: `~~s` is true when `s` is null, empty, or consists
/// only of whitespace (delegates to String.IsNullOrWhiteSpace).
let inline (~~) s = String.IsNullOrWhiteSpace s
/// Generic implicit-conversion operator: `!> x` invokes the static
/// `op_Implicit : ^b -> ^a` member that the target type `^a` is required to
/// provide, converting `x` from `^b` to `^a`. Neither type is annotated here,
/// so both are determined by inference at each (inlined) use site.
let inline (!>) x = ( ^a : (static member op_Implicit : ^b -> ^a) x )
/// Attribute lookup: `x @ n` returns the value of attribute `n` on element `x`,
/// or the empty string when `x` has no such attribute.
/// `n` is converted to the attribute-name type through `!>`.
/// Note: defining this operator shadows the built-in list-append `@`.
let inline (@) (x : XElement) n =
    match x.Attribute(!> n) with
    | null -> String.Empty
    | attr -> attr.Value
/// Attribute assignment: `(@<) x n v` calls `x.SetAttributeValue` with the
/// name obtained from `n` (converted through `!>`) and the value `v`.
let inline (@<) (x : XElement) n v =
    let name : XName = !> n
    x.SetAttributeValue(name, v)
/// Cursor over a sequence of XML elements; the stream type consumed by `speaker`.
type XE = IEnumerator<XElement>
/// Parses the space-separated "bbox" attribute of element `e` into a list of
/// floats, in the order they appear in the attribute.
///
/// Empty fields are dropped before parsing, so a missing or blank attribute
/// (which `@` reports as "") and repeated or trailing spaces yield no entries
/// instead of raising FormatException on `float ""`. An element without a
/// usable bbox therefore produces [], which `left` maps to nan.
/// A field that is not a valid number still raises FormatException.
let inline bbox e =
    let raw : string = e @ "bbox"
    raw.Split([| ' ' |], System.StringSplitOptions.RemoveEmptyEntries)
    |> Seq.map float
    |> Seq.toList
/// Returns the first entry of a bounding-box list, or nan when the list is empty.
let inline left bbox =
    match bbox with
    | [] -> nan
    | first :: _ -> first
/// `mark n` generates one fresh GUID and returns a function that stamps every
/// element of a sequence with the attribute "class-" + n set to that GUID.
/// The GUID is created when `mark n` is applied, so all elements passed to the
/// resulting function share the same identifier.
let mark n =
    let groupId = Guid.NewGuid()
    Seq.iter (fun e -> e @< "class-" + n <| groupId)
/// Tries to recognise a "speaker" entry starting at the current element of `n`
/// and tags the matching spans with a shared "class-speaker" GUID via `mark`.
///
/// Recognised shape (blank spans between the parts are skipped):
///   c1: font contains "Bold";
///   c2: font does not contain "Bold", left edge right of c1, text contains "(";
///   c3: font does not contain "Bold", left edge right of c1, text contains ")".
/// When c2 already contains ")" the pair [c1; c2] is marked. A following c3 is
/// still looked for, and if found all three spans are re-marked with a new
/// GUID, which overwrites the first one.
///
/// `n` must already be positioned on an element: the caller has called
/// MoveNext and it returned true. The enumerator is advanced as a side effect.
/// Running out of elements simply ends the match; `n.Current` is never read
/// after MoveNext has returned false.
let speaker (n : XE) =
    // Advance to the next span whose "s" text is not blank; None at end of stream.
    let rec nextWithText () =
        if not (n.MoveNext()) then None
        elif ~~(n.Current @ "s") then nextWithText ()
        else Some n.Current
    let isPlain (e : XElement) = not ((e @ "font") -?- "Bold")
    let c1 = n.Current
    // The original used an undefined operator `@?` here; this is the same
    // font test that is applied to c2 and c3, without the negation.
    if (c1 @ "font") -?- "Bold" then
        let l1 = c1 |> bbox |> left
        match nextWithText () with
        | None -> ()
        | Some c2 ->
            if isPlain c2 && l1 < (c2 |> bbox |> left) then
                let s2 = c2 @ "s"
                if s2 -?- "(" then
                    // Parenthesis opened and closed inside the same span.
                    if s2 -?- ")" then [c1; c2] |> mark "speaker"
                    // The parenthesised part may continue in a later span.
                    match nextWithText () with
                    | None -> ()
                    | Some c3 ->
                        if isPlain c3 && l1 < (c3 |> bbox |> left) && (c3 @ "s") -?- ")" then
                            [c1; c2; c3] |> mark "speaker"
/// Runs `speaker` once for every <span> descendant of `x` from a fixed start
/// offset onwards, each time with a fresh enumerator positioned on that span.
/// Matching spans are tagged in place on the XML tree.
let test (x : XElement) =
    // NOTE(review): hard-coded start offset kept from the original; documents
    // with fewer than 30 spans are not scanned at all. Confirm whether scanning
    // should start at 0.
    let firstSpan = 29
    let spans = x.Descendants(!> "span") |> Seq.toArray
    for i = firstSpan to spans.Length - 1 do
        // IEnumerator<T> is IDisposable; `use` releases it after each iteration.
        use n = (spans |> Seq.skip i).GetEnumerator()
        // Position the cursor on span i; speaker reads n.Current immediately.
        // i is a valid index, so this MoveNext always succeeds.
        n.MoveNext() |> ignore
        speaker n
输入
<doc>
<block bbox="63.2999 550.846 246.865 561.875">
<line bbox="63.2999 550.846 246.865 561.875">
<span bbox="63.2999 550.846 189.001 561.875" font="TimesNewRoman,Bold" size="9.96" s="Dr. Frank-Walter Steinmeier " />
<span bbox="189 550.846 246.865 561.875" font="TimesNewRoman" size="9.96" s="(SPD) . . . . . ." />
</line>
</block>
<block bbox="63.2999 567.766 246.875 578.796">
<line bbox="63.2999 567.766 246.875 578.796">
<span bbox="63.2999 567.766 136.004 578.796" font="TimesNewRoman,Bold" size="9.96" s="Rainer Brüderle " />
<span bbox="136.02 567.766 246.875 578.796" font="TimesNewRoman" size="9.96" s="(FDP) . . . . . . . . . . . . . . . . ." />
</line>
</block>
<block bbox="63.2999 584.626 250.351 651.456">
<line bbox="63.2999 584.626 246.826 595.656">
<span bbox="63.2999 584.626 152.105 595.656" font="TimesNewRoman,Bold" size="9.96" s="Sahra Wagenknecht " />
<span bbox="152.16 584.626 246.826 595.656" font="TimesNewRoman" size="9.96" s="(DIE LINKE) . . . . . . ." />
</line>
<line bbox="63.2999 600.362 250.351 613.34">
<span bbox="63.2999 601.546 139.327 612.576" font="TimesNewRoman,Bold" size="9.96" s="Siegfried Kauder " />
<span bbox="139.38 601.546 247.762 612.576" font="TimesNewRoman" size="9.96" s="(Villingen-Schwenningen) " />
<span bbox="247.861 600.362 250.351 613.34" font="Symbol" size="9.96" s=" " />
</line>
<line bbox="74.6404 612.526 246.911 623.556">
<span bbox="74.6404 612.526 246.911 623.556" font="TimesNewRoman" size="9.96" s="(CDU/CSU) . . . . . . . . . . . . . . . . . . . . . . . ." />
</line>
<line bbox="63.2999 628.202 191.909 641.18">
<span bbox="63.2999 629.386 126.374 640.416" font="TimesNewRoman,Bold" size="9.96" s="Jürgen Trittin " />
<span bbox="126.419 629.386 189.433 640.416" font="TimesNewRoman" size="9.96" s="(BÜNDNIS 90/" />
<span bbox="189.419 628.202 191.909 641.18" font="Symbol" size="9.96" s=" " />
</line>
<line bbox="74.6394 640.426 246.813 651.456">
<span bbox="74.6394 640.426 246.813 651.456" font="TimesNewRoman" size="9.96" s="DIE GRÜNEN) . . . . . . . . . . . . . . . . . . . . ." />
</line>
</block>
</doc>
输出
<doc>
<block>
<line>
<span font="TimesNewRoman,Bold" size="9.96" s="Dr. Frank-Walter Steinmeier " class-speaker="1f2e4dca-80d5-4c5e-91b6-6bd2e4a8acaf" />
<span font="TimesNewRoman" size="9.96" s="(SPD) . . . . . ." class-speaker="1f2e4dca-80d5-4c5e-91b6-6bd2e4a8acaf" />
</line>
</block>
<block>
<line>
<span font="TimesNewRoman,Bold" size="9.96" s="Rainer Brüderle " class-speaker="eaa75d02-0ac6-4480-bcbe-f17bddfe6e81" />
<span font="TimesNewRoman" size="9.96" s="(FDP) . . . . . . . . . . . . . . . . ." class-speaker="eaa75d02-0ac6-4480-bcbe-f17bddfe6e81" />
</line>
</block>
<block>
<line>
<span font="TimesNewRoman,Bold" size="9.96" s="Sahra Wagenknecht " class-speaker="6b193f23-9b8b-4b37-9118-d8488fba25a2" />
<span font="TimesNewRoman" size="9.96" s="(DIE LINKE) . . . . . . ." class-speaker="6b193f23-9b8b-4b37-9118-d8488fba25a2" />
</line>
<line>
<span font="TimesNewRoman,Bold" size="9.96" s="Siegfried Kauder " class-speaker="a0162e4e-1167-412a-ac11-ac13ef1aa46e" />
<span font="TimesNewRoman" size="9.96" s="(Villingen-Schwenningen) " class-speaker="a0162e4e-1167-412a-ac11-ac13ef1aa46e" />
<span font="Symbol" size="9.96" s=" " />
</line>
<line>
<span font="TimesNewRoman" size="9.96" s="(CDU/CSU) . . . . . . . . . . . . . . . . . . . . . . . ." class-speaker="a0162e4e-1167-412a-ac11-ac13ef1aa46e" />
</line>
<line>
<span font="TimesNewRoman,Bold" size="9.96" s="Jürgen Trittin " class-speaker="81fd6735-c57f-464b-a08f-7e7cb3bccfa8" />
<span font="TimesNewRoman" size="9.96" s="(BÜNDNIS 90/" class-speaker="81fd6735-c57f-464b-a08f-7e7cb3bccfa8" />
<span font="Symbol" size="9.96" s=" " />
</line>
<line>
<span font="TimesNewRoman" size="9.96" s="DIE GRÜNEN) . . . . . . . . . . . . . . . . . . . . ." class-speaker="81fd6735-c57f-464b-a08f-7e7cb3bccfa8" />
</line>
</block>
</doc>
问题
为了自动从简洁的模式声明转到运行代码,我正在考虑执行以下操作:
- 使用 FParsec 将模式声明解析为 AST
- 对 AST 求值
但在我做任何事情之前,我想知道:
- 有没有人能编写一个(applicative 风格的)EDSL(或其中一部分),直接用 F# 函数和组合子来声明这些模式,而无需借助 AST?
- 是否有能够在 XML 上进行类似模式匹配的库?
- 有人对我的方法有任何意见吗?