这很有趣!
我采取了类似于史蒂夫的方法。通过对“开始标签”和“结束标签”中的元素进行排序,然后计算放置它们的位置。
我厚颜无耻地从 Blaisorblade 那里窃取了测试,并添加了一些帮助我开发代码的更多内容。
编辑于 2011-08-14
我对在 test-5 中插入空标签的方式感到不满。然而,这个位置是如何制定 test-3 的结果
- 即使空标签 (c,d) 在 spans 列表中的跨越标签 (a) 和 c,d-tags 具有与 a 的结束标签相同的插入点,c,d 标签也会进入 a。这使得很难在可能有用的跨标签之间放置空标签。
所以,我稍微改变了一些测试并提供了一个替代解决方案。
在替代解决方案中,我以相同的方式启动,但有 3 个单独的列表,开始、空和结束标记。而不仅仅是排序,我还有第三步,将空标签放入标签列表中。
第一个解决方案:
import xml.{XML, Elem, Node}
import annotation.tailrec
object SpanToXml {
def spansToXML(text: String, spans: Seq[(Int, Int, Elem)]): Node = {
// Create a Seq of elements, sorted by where it should be inserted
// differentiate start tags ('s) and empty tags ('e)
val startElms = spans sorted Ordering[Int].on[(Int, _, _)](_._1) map {
case e if e._1 != e._2 => (e._1, e._3, 's)
case e => (e._1, e._3, 'e)
}
//Create a Seq of closing tags ('c), sorted by where they should be inserted
// filter out all empty tags
val endElms = spans.reverse.sorted(Ordering[Int].on[(_, Int, _)](_._2))
.filter(e => e._1 != e._2)
.map(e => (e._2, e._3, 'c))
//Combine the Seq's and sort by insertion point
val elms = startElms ++ endElms sorted Ordering[Int].on[(Int, _, _)](_._1)
//The sorting need to be refined
// - end tag's need to come before start tag's if the insertion point is thesame
val sorted = elms.sortWith((a, b) => a._1 == b._1 && a._3 == 'c && b._3 == 's )
//Adjust the insertion point to what it should be in the final string
// then insert the tags into the text by folding left
// - there are different rules depending on start, empty or close
val txt = adjustInset(sorted).foldLeft(text)((tx, e) => {
val s = tx.splitAt(e._1)
e match {
case (_, elem, 's) => s._1 + "<" + elem.label + elem.attributes + ">" + s._2
case (_, elem, 'e) => s._1 + "<" + elem.label + elem.attributes + "/>" + s._2
case (_, elem, 'c) => s._1 + "</" + elem.label + ">" + s._2
}
})
//Sanity check
//println(txt)
//Convert to XML
XML.loadString(txt)
}
def adjustInset(elems: Seq[(Int, Elem, Symbol)]): Seq[(Int, Elem, Symbol)] = {
@tailrec
def adjIns(elems: Seq[(Int, Elem, Symbol)], tmp: Seq[(Int, Elem, Symbol)]): Seq[(Int, Elem, Symbol)] =
elems match {
case t :: Nil => tmp :+ t
case t :: ts => {
//calculate offset due to current element
val offset = t match {
case (_, e, 's) => e.label.size + e.attributes.toString.size + 2
case (_, e, 'e) => e.label.size + e.attributes.toString.size + 3
case (_, e, 'c) => e.label.size + 3
}
//add offset to all elm's in tail, and recurse
adjIns(ts.map(e => (e._1 + offset, e._2, e._3)), tmp :+ t)
}
}
adjIns(elems, Nil)
}
def test(text: String, spans: Seq[(Int, Int, Elem)], expected: Node) {
val res = spansToXML(text, spans)
print("Text: \"%s\", expected:\n%s\nResult:\n%s\n\n" format (text, expected, res))
assert(expected == res)
}
def test1() =
test(
text = "The dog chased the cat.",
spans = Seq(
(0, 23, <xml/>),
(4, 22, <phrase/>),
(4, 7, <token/>)),
expected = <xml>The <phrase><token>dog</token> chased the cat</phrase>.</xml>
)
def test2() =
test(
text = "aabbccdd",
spans = Seq(
(0, 8, <xml x="1"/>),
(0, 4, <ab y="foo"/>),
(4, 8, <cd z="42>3"/>)),
expected = <xml x="1"><ab y="foo">aabb</ab><cd z="42>3">ccdd</cd></xml>
)
def test3() =
test(
text = " ",
spans = Seq(
(0, 1, <a/>),
(0, 0, <b/>),
(0, 0, <c/>),
(1, 1, <d/>),
(1, 1, <e/>)),
expected = <a><b/><c/> <d/><e/></a>
)
def test4() =
test(
text = "aabbccdd",
spans = Seq((0, 8, <xml/>),
(0, 4, <ab/>),
(4, 8, <cd/>),
(4, 6, <ok/>)),
expected = <xml><ab>aabb</ab><cd><ok>cc</ok>dd</cd></xml>
)
def test5() =
test(
text = "aabbccdd",
spans = Seq((0, 8, <xml/>),
(0, 4, <ab/>),
(2, 4, <b/>),
(4, 4, <empty/>),
(4, 8, <cd/>),
(4, 6, <ok/>)),
expected = <xml><ab>aa<b>bb<empty/></b></ab><cd><ok>cc</ok>dd</cd></xml>
)
def test6() =
test(
text = "aabbccdd",
spans = Seq((0, 8, <xml/>),
(0, 4, <ab/>),
(2, 4, <b/>),
(2, 4, <c/>),
(3, 4, <d/>),
(4, 8, <cd/>),
(4, 6, <ok/>)),
expected = <xml><ab>aa<b><c>b<d>b</d></c></b></ab><cd><ok>cc</ok>dd</cd></xml>
)
def test7() =
test(
text = "aabbccdd",
spans = Seq((0, 8, <xml/>),
(0, 4, <ab a="a" b="b"/>),
(4, 8, <cd c="c" d="d"/>)),
expected = <xml><ab a="a" b="b">aabb</ab><cd c="c" d="d">ccdd</cd></xml>
)
def invalidSpans() = {
val text = "aabbccdd"
val spans = Seq((0, 8, <xml/>),
(0, 4, <ab/>),
(4, 6, <err/>),
(4, 8, <cd/>))
try {
val res = spansToXML(text, spans)
assert(false)
} catch {
case e => {
println("This generate invalid XML:")
println("<xml><ab>aabb</ab><err><cd>cc</err>dd</cd></xml>")
println(e.getMessage)
}
}
}
def main(args: Array[String]) {
test1()
test2()
test3()
test4()
test5()
test6()
test7()
invalidSpans()
}
}
SpanToXml.main(Array())
替代解决方案:
import xml.{XML, Elem, Node}
import annotation.tailrec
object SpanToXmlAlt {
def spansToXML(text: String, spans: Seq[(Int, Int, Elem)]): Node = {
// Create a Seq of start tags, sorted by where it should be inserted
// filter out all empty tags
val startElms = spans.sorted(Ordering[Int].on[(Int, _, _)](_._1))
.filterNot(e => e._1 == e._2)
.map(e => (e._1, e._3, 's))
//Create a Seq of closing tags, sorted by where they should be inserted
// filter out all empty tags
val endElms = spans.reverse.sorted(Ordering[Int].on[(_, Int, _)](_._2))
.filterNot(e => e._1 == e._2)
.map(e => (e._2, e._3, 'c))
//Create a Seq of empty tags, sorted by where they should be inserted
val emptyElms = spans.sorted(Ordering[Int].on[(Int, _, _)](_._1))
.filter(e => e._1 == e._2)
.map(e => (e._1, e._3, 'e))
//Combine the Seq's and sort by insertion point
val elms = startElms ++ endElms sorted Ordering[Int].on[(Int, _, _)](_._1)
//The sorting need to be refined
// - end tag's need to come before start tag's if the insertion point is the same
val sorted = elms.sortWith((a, b) => a._1 == b._1 && a._3 == 'c && b._3 == 's )
//Insert empty tags
val allSorted = insertEmpyt(spans, sorted, emptyElms) sorted Ordering[Int].on[(Int, _, _)](_._1)
//Adjust the insertion point to what it should be in the final string
// then insert the tags into the text by folding left
// - there are different rules depending on start, empty or close
val str = adjustInset(allSorted).foldLeft(text)((tx, e) => {
val s = tx.splitAt(e._1)
e match {
case (_, elem, 's) => s._1 + "<" + elem.label + elem.attributes + ">" + s._2
case (_, elem, 'e) => s._1 + "<" + elem.label + elem.attributes + "/>" + s._2
case (_, elem, 'c) => s._1 + "</" + elem.label + ">" + s._2
}
})
//Sanity check
//println(str)
//Convert to XML
XML.loadString(str)
}
def insertEmpyt(spans: Seq[(Int, Int, Elem)],
sorted: Seq[(Int, Elem, Symbol)],
emptys: Seq[(Int, Elem, Symbol)]): Seq[(Int, Elem, Symbol)] = {
//Find all tags that should be before the empty tag
@tailrec
def afterSpan(empty: (Int, Elem, Symbol),
spans: Seq[(Int, Int, Elem)],
after: Seq[(Int, Elem, Symbol)]): Seq[(Int, Elem, Symbol)] = {
var result = after
spans match {
case t :: _ if t._1 == empty._1 && t._2 == empty._1 && t._3 == empty._2 => after //break
case t :: ts if t._1 == t._2 => afterSpan(empty, ts, after :+ (t._1, t._3, 'e))
case t :: ts => {
if (t._1 <= empty._1) result = result :+ (t._1, t._3, 's)
if (t._2 <= empty._1) result = result :+ (t._2, t._3, 'c)
afterSpan(empty, ts, result)
}
}
}
//For each empty tag, insert it in the sorted list
var result = sorted
emptys.foreach(e => {
val afterSpans = afterSpan(e, spans, Seq[(Int, Elem, Symbol)]())
var emptyInserted = false
result = result.foldLeft(Seq[(Int, Elem, Symbol)]())((res, s) => {
if (afterSpans.contains(s) || emptyInserted) {
res :+ s
} else {
emptyInserted = true
res :+ e :+ s
}
})
})
result
}
def adjustInset(elems: Seq[(Int, Elem, Symbol)]): Seq[(Int, Elem, Symbol)] = {
@tailrec
def adjIns(elems: Seq[(Int, Elem, Symbol)], tmp: Seq[(Int, Elem, Symbol)]): Seq[(Int, Elem, Symbol)] =
elems match {
case t :: Nil => tmp :+ t
case t :: ts => {
//calculate offset due to current element
val offset = t match {
case (_, e, 's) => e.label.size + e.attributes.toString.size + 2
case (_, e, 'e) => e.label.size + e.attributes.toString.size + 3
case (_, e, 'c) => e.label.size + 3
}
//add offset to all elm's in tail, and recurse
adjIns(ts.map(e => (e._1 + offset, e._2, e._3)), tmp :+ t)
}
}
adjIns(elems, Nil)
}
def test(text: String, spans: Seq[(Int, Int, Elem)], expected: Node) {
val res = spansToXML(text, spans)
print("Text: \"%s\", expected:\n%s\nResult:\n%s\n\n" format (text, expected, res))
assert(expected == res)
}
def test1() =
test(
text = "The dog chased the cat.",
spans = Seq(
(0, 23, <xml/>),
(4, 22, <phrase/>),
(4, 7, <token/>)),
expected = <xml>The <phrase><token>dog</token> chased the cat</phrase>.</xml>
)
def test2() =
test(
text = "aabbccdd",
spans = Seq(
(0, 8, <xml x="1"/>),
(0, 4, <ab y="foo"/>),
(4, 8, <cd z="42>3"/>)),
expected = <xml x="1"><ab y="foo">aabb</ab><cd z="42>3">ccdd</cd></xml>
)
def test3alt() =
test(
text = " ",
spans = Seq(
(0, 2, <a/>),
(0, 0, <b/>),
(0, 0, <c/>),
(1, 1, <d/>),
(1, 1, <e/>)),
expected = <a><b/><c/> <d/><e/> </a>
)
def test4() =
test(
text = "aabbccdd",
spans = Seq((0, 8, <xml/>),
(0, 4, <ab/>),
(4, 8, <cd/>),
(4, 6, <ok/>)),
expected = <xml><ab>aabb</ab><cd><ok>cc</ok>dd</cd></xml>
)
def test5alt() =
test(
text = "aabbccdd",
spans = Seq((0, 8, <xml/>),
(0, 4, <ab/>),
(2, 4, <b/>),
(4, 4, <empty/>),
(4, 8, <cd/>),
(4, 6, <ok/>)),
expected = <xml><ab>aa<b>bb</b></ab><empty/><cd><ok>cc</ok>dd</cd></xml>
)
def test5b() =
test(
text = "aabbccdd",
spans = Seq((0, 8, <xml/>),
(0, 4, <ab/>),
(2, 2, <empty1/>),
(4, 4, <empty2/>),
(2, 4, <b/>),
(2, 2, <empty3/>),
(4, 4, <empty4/>),
(4, 8, <cd/>),
(4, 6, <ok/>)),
expected = <xml><ab>aa<empty1/><b><empty3/>bb<empty2/></b></ab><empty4/><cd><ok>cc</ok>dd</cd></xml>
)
def test6() =
test(
text = "aabbccdd",
spans = Seq((0, 8, <xml/>),
(0, 4, <ab/>),
(2, 4, <b/>),
(2, 4, <c/>),
(3, 4, <d/>),
(4, 8, <cd/>),
(4, 6, <ok/>)),
expected = <xml><ab>aa<b><c>b<d>b</d></c></b></ab><cd><ok>cc</ok>dd</cd></xml>
)
def test7() =
test(
text = "aabbccdd",
spans = Seq((0, 8, <xml/>),
(0, 4, <ab a="a" b="b"/>),
(4, 8, <cd c="c" d="d"/>)),
expected = <xml><ab a="a" b="b">aabb</ab><cd c="c" d="d">ccdd</cd></xml>
)
def failedSpans() = {
val text = "aabbccdd"
val spans = Seq((0, 8, <xml/>),
(0, 4, <ab/>),
(4, 6, <err/>),
(4, 8, <cd/>))
try {
val res = spansToXML(text, spans)
assert(false)
} catch {
case e => {
println("This generate invalid XML:")
println("<xml><ab>aabb</ab><err><cd>cc</err>dd</cd></xml>")
println(e.getMessage)
}
}
}
def main(args: Array[String]) {
test1()
test2()
test3alt()
test4()
test5alt()
test5b()
test6()
test7()
failedSpans()
}
}
SpanToXmlAlt.main(Array())