Here's a class that I use when processing large XML documents.
Some notes:
- for any element, the xmlns declarations in scope are those made on the element itself plus those inherited from all of its ancestors
- when the element
<elem />
is a sibling of <elem xmlns="x" />
the two elements are in different namespaces (a declaration on one sibling never applies to the other)
- processing XML is generally stack-based
Code:
import java.util.Deque;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import javax.xml.namespace.QName;
import org.springframework.util.xml.SimpleNamespaceContext;
import com.ximpleware.NavException;
import com.ximpleware.VTDNav;
/**
 * Tracks the namespace-qualified path of the element a {@link VTDNav} cursor is positioned on
 * while a document is walked depth-first, element by element.
 * <p>
 * Call {@link #currentXPath(VTDNav)} once for every element visited, in document order. The
 * instance keeps three parallel stacks (full paths, element names, namespace contexts); after
 * processing an element at depth {@code d} each stack holds exactly {@code d + 2} entries:
 * one root entry plus one entry per element on the path from the document element down to
 * the current element.
 * <p>
 * Instances are stateful and are not synchronized.
 */
public class VtdXmlCurrentState
{
    /* Stack of full paths ("/{ns}a/{ns}b/") - the top entry is the path of the current element */
    private final Deque<String> qnames = new LinkedList<String>();

    /*
     * Stack of the elements' QNames - the top entry is the name of the current element.
     * LinkedList is used because the root entry is null, which ArrayDeque would reject.
     */
    private final Deque<QName> names = new LinkedList<QName>();

    // depth of the most recently processed element; -1 until the first element is processed
    private int currentDepth = -1;

    /*
     * Stack of namespace contexts - the top entry is the context in scope for the current element.
     * An element without xmlns declarations shares (by reference) the context of its parent.
     */
    private final Deque<SimpleNamespaceContext> namespaces = new LinkedList<SimpleNamespaceContext>();

    /**
     * Creates a state positioned "above" the document element: path {@code "/"}, no element
     * name and a namespace context without any bindings.
     */
    public VtdXmlCurrentState()
    {
        // first a context without any mapping
        this.namespaces.push(new SimpleNamespaceContext());
        // first path is "/"
        this.qnames.push("/");
        this.names.push(null);
    }

    /**
     * Name of the current element.
     *
     * @return the qualified name of the element passed to the most recent
     *         {@link #currentXPath(VTDNav)} call, or {@code null} if no element was processed yet
     */
    public QName currentElementName()
    {
        return this.names.peek();
    }

    /**
     * Returns the parent path and the current path for the element {@code nav} is positioned on
     * and records that element as the current one.
     *
     * @param nav navigator whose cursor is on an element start tag
     * @return a two-element array: {@code [0]} is the full path of the parent element,
     *         {@code [1]} is the full path of the current element; both end with {@code "/"}
     * @throws NavException if the navigator fails to read a token
     */
    public String[] currentXPath(VTDNav nav) throws NavException
    {
        // we don't check the end - autopilot handles that
        int depth = nav.getCurrentDepth();
        int idx = nav.getCurrentIndex();

        // must run before the path stacks are touched: it relies on the previous currentDepth
        this.handleNamespaces(nav, depth);

        // raw name of the current element; the low 16 bits of the token length are the qname length
        String elName = nav.toRawString(nav.getTokenOffset(idx), nav.getTokenLength(idx) & 0xffff);
        SimpleNamespaceContext context = this.namespaces.peek();
        QName qName;
        int colon = elName.indexOf(':');
        if (colon >= 0) {
            qName = new QName(context.getNamespaceURI(elName.substring(0, colon)), elName.substring(colon + 1));
        } else {
            qName = new QName(context.getNamespaceURI(""), elName);
        }

        // drop the entries of the previous element and of every ancestor of it that is now closed
        for (int i = 0; i <= this.currentDepth - depth; i++) {
            this.qnames.pop();
            this.names.pop();
        }

        String parentName = this.qnames.peek();
        String fullName = parentName + qName.toString() + "/";
        this.qnames.push(fullName);
        this.names.push(qName);
        this.currentDepth = depth;
        return new String[] { parentName, fullName };
    }

    /**
     * Makes the top of the namespace stack the context in scope for the element {@code nav} is
     * positioned on. If the element carries {@code xmlns[:x]} declarations, a new context is
     * derived from the parent's one; otherwise the parent's context is reused.
     *
     * @param nav navigator whose cursor is on an element start tag
     * @param depth depth of that element
     * @throws NavException if the navigator fails to read a token
     */
    private void handleNamespaces(VTDNav nav, int depth) throws NavException
    {
        Map<String, String> declared = this.readNamespaceDeclarations(nav);

        // Remove the contexts of all elements that are closed by now: the previous element and
        // those of its ancestors that are at least as deep as the new element. This also covers
        // a sibling at the same level. Afterwards the top of the stack is always the context of
        // the new element's parent - never the context of a preceding sibling.
        for (int i = 0; i <= this.currentDepth - depth; i++) {
            this.namespaces.pop();
        }
        SimpleNamespaceContext parent = this.namespaces.peek();

        if (declared == null) {
            // no declarations on this element - it uses its parent's context as is
            this.namespaces.push(parent);
            return;
        }

        // this element has its own declarations: start with a copy of the parent's bindings ...
        SimpleNamespaceContext own = new SimpleNamespaceContext();
        for (Iterator<?> prefixes = parent.getBoundPrefixes(); prefixes.hasNext();) {
            String pfx = (String) prefixes.next();
            own.bindNamespaceUri(pfx, parent.getNamespaceURI(pfx));
        }
        // the default namespace is copied explicitly
        own.bindNamespaceUri("", parent.getNamespaceURI(""));
        // ... and add (overwriting!) the new mappings
        own.setBindings(declared);
        this.namespaces.push(own);
    }

    /**
     * Collects the {@code xmlns} / {@code xmlns:prefix} declarations of the element {@code nav}
     * is positioned on by scanning the attribute tokens that follow the start tag token.
     *
     * @param nav navigator whose cursor is on an element start tag
     * @return prefix-to-URI map (the default namespace uses the empty prefix), or {@code null}
     *         if the element declares no namespaces
     * @throws NavException if the navigator fails to read a token
     */
    private Map<String, String> readNamespaceDeclarations(VTDNav nav) throws NavException
    {
        Map<String, String> declared = null;
        int total = nav.getTokenCount();
        int index = nav.getCurrentIndex() + 1;
        // every step re-checks the bound, so an attribute at the very end of the token buffer is safe
        while (index < total) {
            int type = nav.getTokenType(index);
            if (type == VTDNav.TOKEN_ATTR_NAME) {
                // ordinary attribute - skip its name and its value
                index += 2;
            } else if (type == VTDNav.TOKEN_ATTR_NS) {
                // token text is "xmlns" or "xmlns:prefix"
                String prefix = nav.toString(index).substring(5);
                if (prefix.length() > 0) {
                    prefix = prefix.substring(1);
                }
                String namespace = nav.toString(index + 1);
                if (declared == null) {
                    declared = new HashMap<String, String>();
                }
                declared.put(prefix, namespace);
                // skip the declaration and its value
                index += 2;
            } else if (type == VTDNav.TOKEN_ATTR_VAL) {
                index++;
            } else {
                // first token that is not part of the start tag's attribute list
                break;
            }
        }
        return declared;
    }
}
Input XML:
<?xml version="1.0" encoding="UTF-8"?>
<set id="#1" xmlns="urn:test:1.0">
<documents xmlns="urn:test:1.0">
<doc xmlns="urn:test:1.1" />
<doc />
<doc xmlns="urn:test:1.2" />
</documents>
<documents />
<documents xmlns="" />
</set>
Using the class:
// parse the fixture and walk all of its elements in document order
byte[] doc = FileCopyUtils.copyToByteArray(super.createResource("dom03.xml").getInputStream());
VTDGen vtd = new VTDGen();
vtd.setDoc(doc);
vtd.parse(true);
VTDNav nav = vtd.getNav();
VtdXmlCurrentState cxp = new VtdXmlCurrentState();
AutoPilot ap = new AutoPilot();
ap.bind(nav);
ap.selectElementNS("*", "*");
// expected full path of each element of the input XML, in document order
String[] expectedPaths = {
    "/{urn:test:1.0}set/",
    "/{urn:test:1.0}set/{urn:test:1.0}documents/",
    "/{urn:test:1.0}set/{urn:test:1.0}documents/{urn:test:1.1}doc/",
    "/{urn:test:1.0}set/{urn:test:1.0}documents/{urn:test:1.0}doc/",
    "/{urn:test:1.0}set/{urn:test:1.0}documents/{urn:test:1.2}doc/",
    "/{urn:test:1.0}set/{urn:test:1.0}documents/",
    "/{urn:test:1.0}set/documents/"
};
for (String expectedPath : expectedPaths) {
    ap.iterate();
    assertEquals(expectedPath, cxp.currentXPath(nav)[1]);
}
// no elements left
assertFalse(ap.iterate());