1

我有一种情况,我想开始使用 XML 模式来验证直到现在还没有模式定义的文档。因此,我要验证的现有文件中没有任何xmlns声明。

验证包含 xmlns 声明的文档没有任何问题,但我也希望能够验证没有此类声明的文档。我希望能有类似这样的写法:

// Sketch of the desired usage (placeholders, not compilable as written):
// attach the schema to the factory and parse namespace-aware.
DocumentBuilderFactory dbf = ...;
dbf.setSchema(... my schema for namespace "foo:bar"...);
dbf.setValidating(false);
dbf.setNamespaceAware(true);
DocumentBuilder db = dbf.newDocumentBuilder();
// Hypothetical call: DocumentBuilder has no setDefaultNamespace method.
db.setDefaultNamespace("foo:bar");
Document doc = db.parse(input);

但并不存在 DocumentBuilder.setDefaultNamespace 这样的方法,因此在加载这类文档时不会执行模式验证。

如果文档没有设置命名空间,有什么方法可以强制为其指定命名空间?还是说必须先在不使用模式的情况下解析 XML,检查现有的命名空间并加以调整,然后再用模式重新验证文档?

我目前期望解析器在解析期间执行验证,但是我先解析然后再验证没有问题。

更新 2021-01-13

这是我正在尝试做的一个具体示例,作为 JUnit 测试用例。

import java.io.IOException;
import java.io.StringReader;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Source;
import javax.xml.transform.stream.StreamSource;
import javax.xml.validation.Schema;
import javax.xml.validation.SchemaFactory;

import org.junit.Assert;
import org.junit.Test;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

/**
 * JUnit tests that parse small XML documents with a schema attached to the
 * {@link DocumentBuilderFactory}, covering documents that do and do not
 * declare the schema's target namespace.
 */
public class XMLSchemaTest
{
    /** Target namespace of the inline test schema. */
    private static final String XMLNS = "http://www.example.com/schema";

    /**
     * Inline schema: a qualified {@code example} root element whose content
     * is a sequence of exactly one empty {@code test} element.
     */
    private static final String schemaDocument = "<xs:schema xmlns:xs=\"http://www.w3.org/2001/XMLSchema\" targetNamespace=\"" + XMLNS + "\" xmlns:e=\"" + XMLNS + "\" elementFormDefault=\"qualified\"><xs:element name=\"example\" type=\"e:exampleType\" /><xs:complexType name=\"exampleType\"><xs:sequence><xs:element name=\"test\" type=\"e:testType\" /></xs:sequence></xs:complexType><xs:complexType name=\"testType\" /></xs:schema>";

    /**
     * Parses {@code document} with a namespace-aware builder that has the
     * inline schema attached.
     *
     * @param document the XML text to parse
     * @return the parsed DOM document
     * @throws SAXException if the parser reports an error; errors and fatal
     *         errors are rethrown by {@link MyErrorHandler}
     * @throws ParserConfigurationException if no builder can be created
     * @throws IOException if reading the input fails
     */
    private static Document parse(String document) throws SAXException, ParserConfigurationException, IOException {
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        SchemaFactory sf = SchemaFactory.newInstance("http://www.w3.org/2001/XMLSchema");

        Source[] sources = new Source[] {
                new StreamSource(new StringReader(schemaDocument))
        };

        Schema schema = sf.newSchema(sources);

        dbf.setSchema(schema);
        dbf.setNamespaceAware(true);

        DocumentBuilder db = dbf.newDocumentBuilder();
        db.setErrorHandler(new MyErrorHandler());

        return db.parse(new InputSource(new StringReader(document)));
    }

    /**
     * Asserts that the root element of {@code doc} is {@code example} in the
     * {@link #XMLNS} namespace. Only the root element is checked; the
     * document node itself is not asserted on.
     */
    private static void assertExampleRoot(Document doc) {
        Element root = doc.getDocumentElement();
        Assert.assertEquals("Wrong root element XML namespace", XMLNS, root.getNamespaceURI());
        Assert.assertEquals("Wrong element name", "example", root.getLocalName());
        Assert.assertEquals("Wrong element name", "example", root.getTagName());
    }

    /**
     * Asserts that parsing {@code testDocument} is rejected with a
     * {@link SAXException}. Any other exception type propagates and fails
     * the test instead of being mistaken for the expected rejection.
     */
    private static void assertRejected(String testDocument) throws Exception {
        try {
            parse(testDocument);

            // AssertionError is not a SAXException, so it is not caught below.
            Assert.fail("Document should not have parsed properly");
        } catch (SAXException expected) {
            System.out.println(expected);
        }
    }

    @Test
    public void testConformingDocumentWithSchema() throws Exception {
        String testDocument = "<example xmlns=\"" + XMLNS + "\"><test/></example>";

        assertExampleRoot(parse(testDocument));
    }

    @Test
    public void testConformingDocumentWithoutSchema() throws Exception {
        // Same content as above, but without the xmlns declaration.
        String testDocument = "<example><test/></example>";

        assertExampleRoot(parse(testDocument));
    }

    @Test
    public void testNononformingDocumentWithSchema() throws Exception {
        String testDocument = "<example xmlns=\"" + XMLNS + "\"><random/></example>";

        assertRejected(testDocument);
    }

    @Test
    public void testNononformingDocumentWithoutSchema() throws Exception {
        String testDocument = "<example><random/></example>";

        assertRejected(testDocument);
    }

    /**
     * Error handler that logs warnings and turns errors and fatal errors
     * into exceptions so that parsing stops at the first problem.
     */
    public static class MyErrorHandler implements ErrorHandler {

        /** Logs the warning to {@code System.err} and lets parsing continue. */
        @Override
        public void warning(SAXParseException exception) throws SAXException {
            System.err.println("WARNING: " + exception);
        }

        /** Rethrows the reported error. */
        @Override
        public void error(SAXParseException exception) throws SAXException {
            throw exception;
        }

        /**
         * Logs the fatal error and rethrows it, so the failure does not
         * depend on the parser aborting on its own after this method returns.
         */
        @Override
        public void fatalError(SAXParseException exception) throws SAXException {
            System.err.println("FATAL: " + exception);
            throw exception;
        }
    }
}

所有的测试都通过了,除了testConformingDocumentWithoutSchema. 我认为这是意料之中的,因为文档没有声明命名空间。

我在问如何更改测试(但不是文档本身!),以便我可以根据文档实际未声明的模式验证文档。

4

1 回答 1

0

我对此研究了一段时间,最终想出了一个可行的变通办法。也许可以更优雅地做到这一点(这正是我最初的问题),也许还能用更少的代码实现,但这是我目前能想到的办法。

如果您查看问题中的 JUnit 测试用例,把 parse 方法改成下面的内容(并在所有对 parse 的调用中添加 XMLNS 作为第二个参数),所有测试就都能通过:

import java.io.StringWriter;

import org.w3c.dom.ls.DOMImplementationLS;
import org.w3c.dom.ls.LSOutput;
import org.w3c.dom.ls.LSSerializer;

...

    /**
     * Parses {@code document} with schema validation. If validation fails and
     * the root element carries no namespace, the document is retried with
     * {@code namespace} declared as the default namespace on the root.
     *
     * @param document  the XML text to parse
     * @param namespace the default namespace to apply on retry; when
     *                  {@code null} no retry is attempted
     * @return the validated DOM document
     * @throws SAXException if the document cannot be parsed, or fails
     *         validation both as-is and with the fallback namespace; on a
     *         failed retry the first failure is attached as suppressed
     * @throws ParserConfigurationException if no builder can be created
     * @throws IOException if reading or re-serializing the document fails
     */
    private static Document parse(String document, String namespace) throws SAXException, ParserConfigurationException, IOException {
        SchemaFactory sf = SchemaFactory.newInstance("http://www.w3.org/2001/XMLSchema");

        Source[] sources = new Source[] {
                new StreamSource(new StringReader(schemaDocument))
        };

        Schema schema = sf.newSchema(sources);

        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setNamespaceAware(true);

        ErrorHandler errorHandler = new MyErrorHandler();

        try {
            return parseText(newBuilder(dbf, schema, errorHandler), document);
        } catch (SAXParseException spe) {
            // The exception does not say whether a missing namespace was the
            // cause, so inspect the document itself before giving up.
            if (namespace == null) {
                // No fallback namespace to try; report the original failure.
                throw spe;
            }

            // Re-parse without the schema to look at the root element.
            Document doc = parseText(newBuilder(dbf, null, errorHandler), document);

            if (null != doc.getDocumentElement().getNamespaceURI()) {
                // Namespace URI was set; the original failure stands.
                throw spe;
            }

            // Declare the default namespace on the root element. This only
            // takes effect for the elements once the text is parsed again.
            doc.getDocumentElement().setAttribute("xmlns", namespace);

            String converted = serialize(doc);

            try {
                // Start over with the schema re-enabled.
                return parseText(newBuilder(dbf, schema, errorHandler), converted);
            } catch (SAXException retryFailure) {
                // Keep the first failure visible for diagnosis.
                retryFailure.addSuppressed(spe);
                throw retryFailure;
            }
        }
    }

    /**
     * Creates a builder from {@code dbf} with {@code schema} set on the
     * factory ({@code null} removes it) and {@code errorHandler} installed.
     */
    private static DocumentBuilder newBuilder(DocumentBuilderFactory dbf, Schema schema, ErrorHandler errorHandler) throws ParserConfigurationException {
        dbf.setSchema(schema);
        DocumentBuilder db = dbf.newDocumentBuilder();
        db.setErrorHandler(errorHandler);
        return db;
    }

    /** Parses XML held in a string with the given builder. */
    private static Document parseText(DocumentBuilder db, String xml) throws SAXException, IOException {
        return db.parse(new InputSource(new StringReader(xml)));
    }

    /**
     * Serializes {@code doc} to a string through the DOM Load/Save API.
     *
     * @throws IOException if the serializer reports that it did not complete
     *         normally
     */
    private static String serialize(Document doc) throws IOException {
        DOMImplementationLS domImplementation = (DOMImplementationLS) doc.getImplementation();
        LSSerializer lsSerializer = domImplementation.createLSSerializer();
        LSOutput lsOutput = domImplementation.createLSOutput();
        lsOutput.setEncoding("UTF-8");
        StringWriter out = new StringWriter();
        lsOutput.setCharacterStream(out);

        if (!lsSerializer.write(doc, lsOutput)) {
            throw new IOException("Failed to serialize document for re-validation");
        }

        return out.toString();
    }

其工作原理是:捕获 SAXParseException,由于 SAXParseException 不提供任何细节,就假设问题可能是缺少 XML 命名空间声明造成的。然后我在不做模式验证的情况下重新解析文档,把命名空间声明添加到内存中的 Document,再将 Document 序列化为 String,并在重新启用模式验证的情况下重新解析文档。

我曾尝试在设置 XML 命名空间之后使用 Schema.newValidator().validate(new DOMSource(doc)) 来实现,但每次都验证失败。改为先经过序列化器处理再重新解析,问题就解决了。

于 2021-01-13T18:00:41.443 回答