根据 Jim Baldwin 的回答，我创建了一个迭代器，它获取特定层级的节点（而不是特定标签）：
import scala.io.Source
import scala.xml.parsing.FatalError
import scala.xml.pull.{EvElemEnd, EvElemStart, EvEntityRef, EvText, XMLEvent, XMLEventReader}
import scala.xml.{Elem, EntityRef, MetaData, NamespaceBinding, Node, Text, TopScope}
/**
 * Streaming XML parser which yields Scala XML nodes found at one nesting depth.
 *
 * The document is consumed as a stream of pull events, so only the element that is
 * currently being assembled is held in memory. The root element is at level 0, its
 * direct children are at level 1, and so on.
 *
 * Usage:
 *
 * {{{
 * val it = new StreamingXMLParser(pathToXML, 1)
 * }}}
 *
 * will give you all book-nodes of
 *
 * {{{
 * <?xml version="1.0" encoding="UTF-8"?>
 * <books>
 *   <book>
 *     <title>A book title</title>
 *   </book>
 *   <book>
 *     <title>Another book title</title>
 *   </book>
 * </books>
 * }}}
 *
 * The underlying file and event reader are released automatically once the last
 * event has been consumed; call [[close]] to release them earlier.
 *
 * @param filename        path of the XML file to read
 * @param wantedNodeLevel nesting depth of the elements to yield (0 = root element)
 */
class StreamingXMLParser(filename: String, wantedNodeLevel: Int)
    extends Iterator[Node] with AutoCloseable {

  val file = Source.fromFile(filename)
  val it = new XMLEventReader(file)

  /** Depth of the element that the next start event would open. */
  var currentLevel = 0

  // Set once the file and the event reader have been released.
  private var released = false

  // True while `nextEvent` holds an event that has not been handed out yet.
  // This, and not `it.hasNext`, is the exhaustion test: `it.hasNext` already turns
  // false while the very last event is still waiting in `nextEvent`.
  private var eventPending = it.hasNext

  /** One-event lookahead; only meaningful while events are still pending. */
  var nextEvent: XMLEvent = if (eventPending) it.next() else null

  // A source that produced no events at all leaves nothing to iterate over.
  if (!eventPending) close()

  /**
   * Stops the event reader and closes the underlying file. Safe to call more than
   * once; afterwards the iterator reports that it is exhausted.
   */
  def close(): Unit =
    if (!released) {
      released = true
      eventPending = false
      it.stop()
      file.close()
    }

  /**
   * Hands out the peeked event and peeks at the following one.
   *
   * @return the event that was previously stored in `nextEvent`
   * @throws NoSuchElementException if every event has already been handed out
   */
  def getNext(): XMLEvent = {
    if (!eventPending) throw new NoSuchElementException("no more XML events")
    val currentEvent = nextEvent
    if (it.hasNext) nextEvent = it.next()
    else close() // that was the last event: release the file and the reader
    currentEvent
  }

  /**
   * Advances to the next element start at `wantedNodeLevel`, keeping `currentLevel`
   * in sync for every element that is entered or left on the way.
   *
   * The matching start event is left in `nextEvent`, so repeated calls are cheap and
   * a `true` result guarantees that `next()` will succeed.
   */
  def hasNext: Boolean = {
    var found = false
    while (!found && eventPending) {
      nextEvent match {
        case _: EvElemStart if currentLevel == wantedNodeLevel =>
          found = true
        case _: EvElemStart =>
          currentLevel += 1
          getNext()
        case _: EvElemEnd =>
          currentLevel -= 1
          getNext()
        case _ =>
          getNext() // text, comments etc. outside of a wanted element
      }
    }
    found
  }

  /**
   * Returns the next element at `wantedNodeLevel` together with all of its children.
   *
   * @throws NoSuchElementException if there is no such element left
   * @throws FatalError             if the event stream ends inside the element
   */
  def next(): Node = {
    if (!hasNext) {
      throw new NoSuchElementException("no more nodes at level " + wantedNodeLevel)
    }
    getNext() match {
      case EvElemStart(pre, tag, attrs, scope) =>
        currentLevel += 1
        readElem(pre, tag, attrs, scope)
      case other =>
        // hasNext only returns true with an element start in the lookahead slot.
        throw new IllegalStateException("expected an element start but found " + other)
    }
  }

  /**
   * Reads events up to the end tag that closes the current element and returns the
   * element built from them. The start event of the element must already have been
   * consumed and counted in `currentLevel`.
   *
   * This signature carries no namespace information, so the returned element itself
   * has no prefix and uses `TopScope`; nested elements keep their own prefix and scope.
   *
   * @param tag   label of the element being read
   * @param attrs attributes of the element being read
   * @throws FatalError if the event stream ends before the element is closed
   */
  def getElemWithChildren(tag: String, attrs: MetaData): Node =
    readElem(null, tag, attrs, TopScope)

  // Assembles one element, recursing for every nested element start.
  private def readElem(pre: String, tag: String, attrs: MetaData, scope: NamespaceBinding): Node = {
    val children = Vector.newBuilder[Node] // appending to a List would be quadratic
    var elemClosed = false
    while (!elemClosed && eventPending) {
      getNext() match {
        case EvElemStart(childPre, childTag, childAttrs, childScope) =>
          currentLevel += 1
          children += readElem(childPre, childTag, childAttrs, childScope)
        case EvText(text) =>
          children += Text(text)
        case EvEntityRef(entity) =>
          // Entity references arrive as their own event; keep them so that text
          // such as "A &amp; B" does not lose its ampersand.
          children += EntityRef(entity)
        case EvElemEnd(_, _) =>
          currentLevel -= 1
          elemClosed = true
        case _ => // comments and processing instructions are not carried over
      }
    }
    if (!elemClosed) throw new FatalError("Failed to parse XML.")
    // The `true` argument is minimizeEmpty, as in the original construction.
    new Elem(pre, tag, attrs, scope, true, children.result(): _*)
  }
}