0

我有以下名为 customers.xml 的源 XML 文件:

<?xml version="1.0" encoding="utf-8"?>
<p:CustomerElement xmlns:p="http://www.dog.com/customer" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:schemaLocation="http://www.dog.com/customer Customer.xsd">
  <Customer>
    <Sender>
      <transmitDate>2016-02-21T00:00:00</transmitDate>
      <transmitter>Dog ETL v2.0</transmitter>
      <dealerCode><![CDATA[P020]]></dealerCode>
      <DMSSystem><![CDATA[DBS]]></DMSSystem>
      <DMSReleaseNumber><![CDATA[5.0]]></DMSReleaseNumber>
    </Sender>
    <Identifier>
      <updateInd><![CDATA[A]]></updateInd>
      <dealerCustNumber><![CDATA[AMADOR]]></dealerCustNumber>
      <dealerCustName><![CDATA[AMADOR COMPUTERS]]></dealerCustName>
      <phoneNumber><![CDATA[800 111 4444]]></phoneNumber>
      <faxNumber><![CDATA[780 111 4444]]></faxNumber>
      <email xsi:nil="true" />
      <customerType><![CDATA[R]]></customerType>
      <activeCustomerInd>false</activeCustomerInd>
      <parentCustomerNumber xsi:nil="true" />
      <primaryStoreNumber><![CDATA[00]]></primaryStoreNumber>
      <preferredLanguage><![CDATA[ENG]]></preferredLanguage>
      <dealerDateInSystem>2000-01-11T00:00:00</dealerDateInSystem>
      <dealerLastUpdatedDate>2015-02-05T00:00:00</dealerLastUpdatedDate>
    </Identifier>
    <Location>
      <address2><![CDATA[ACCOUNT FLAGGED FOR DELETION]]></address2>
      <address3><![CDATA[AS PER BILL  FEB AA/15]]></address3>
      <city><![CDATA[CHICAGO]]></city>
      <postalCode><![CDATA[Q5S 1E5]]></postalCode>
      <state><![CDATA[AB]]></state>
      <country><![CDATA[CA]]></country>
      <location><![CDATA[FLAGGED FOR DELETION]]></location>
      <addressType><![CDATA[M]]></addressType>
    </Location>
    <Division>
      <divisionCode><![CDATA[G]]></divisionCode>
      <divisionName><![CDATA[CAR]]></divisionName>
      <IndustryCode>
        <industryCode><![CDATA[AQ99]]></industryCode>
        <primaryIndustryCodeInd>true</primaryIndustryCodeInd>
      </IndustryCode>
      <SalesRep>
        <number><![CDATA[XXX]]></number>
        <name><![CDATA[KILL ACCOUNT IN PROCESS]]></name>
        <type><![CDATA[M]]></type>
        <par>0</par>
        <email xsi:nil="true" />
        <phoneNumber><![CDATA[000 000 0000]]></phoneNumber>
      </SalesRep>
    </Division>
  </Customer>
  <Customer>
    <Sender>
      <transmitDate>2016-02-21T00:00:00</transmitDate>
      <transmitter>Dog ETL v2.0</transmitter>
      <dealerCode><![CDATA[P000]]></dealerCode>
      <DMSSystem><![CDATA[DBS]]></DMSSystem>
      <DMSReleaseNumber><![CDATA[5.0]]></DMSReleaseNumber>
    </Sender>
    <Identifier>
      <updateInd><![CDATA[A]]></updateInd>
      <dealerCustNumber><![CDATA[UU20888]]></dealerCustNumber>
      <dealerCustName><![CDATA[ ADVERTISING AND PR]]></dealerCustName>
      <phoneNumber xsi:nil="true" />
      <faxNumber xsi:nil="true" />
      <email xsi:nil="true" />
      <customerType><![CDATA[I]]></customerType>
      <activeCustomerInd>true</activeCustomerInd>
      <parentCustomerNumber xsi:nil="true" />
      <primaryStoreNumber><![CDATA[M2]]></primaryStoreNumber>
      <preferredLanguage><![CDATA[ENG]]></preferredLanguage>
      <dealerDateInSystem>2015-11-18T00:00:00</dealerDateInSystem>
      <dealerLastUpdatedDate>2015-11-19T00:00:00</dealerLastUpdatedDate>
    </Identifier>
    <Location>
      <address2><![CDATA[EQUIP]]></address2>
      <city><![CDATA[ADER]]></city>
      <country><![CDATA[CA]]></country>
      <addressType><![CDATA[M]]></addressType>
    </Location>
    <Division>
      <divisionCode><![CDATA[A]]></divisionCode>
      <divisionName><![CDATA[AGRO]]></divisionName>
      <IndustryCode>
        <industryCode><![CDATA[EQ00]]></industryCode>
        <primaryIndustryCodeInd>true</primaryIndustryCodeInd>
      </IndustryCode>
    </Division>
  </Customer>
</p:CustomerElement>

我有以下 java 代码,它将customers.xml 解析为单独的“客户”实体,然后尝试将它们中的每一个转换为 AVRO 格式:

package com.dogsoft.data.xmltoavro;

import java.io.*;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.FactoryConfigurationError;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import jdk.nashorn.internal.runtime.regexp.joni.constants.NodeType;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import org.apache.avro.Protocol;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericArray;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.specific.SpecificDatumWriter;
import org.apache.avro.util.Utf8;
import org.w3c.dom.*;
import org.xml.sax.SAXException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;


public class ParseXmlFile {

    private static Protocol protocol;

    public static void xmlToAvro(File xmlFile, File avroFile) throws IOException, SAXException {
        try {
            InputStream stream = new FileInputStream("/tmp/xml.avsc");
            if (stream == null) throw new IllegalStateException("Classpath should include xml.avsc");

            protocol = Protocol.parse(stream);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        Schema schema = protocol.getType("Element");

        Document doc = parse(xmlFile);
        DatumWriter<GenericRecord> datumWriter = new SpecificDatumWriter<>(schema);

        try (DataFileWriter<GenericRecord> fileWriter = new DataFileWriter<>(datumWriter)) {
            fileWriter.create(schema, avroFile);
            Object docElement = doc.getDocumentElement();
            fileWriter.append(wrapElement(doc.getDocumentElement()));

        }
    }

    private static GenericData.Record wrapElement(Element el) {
        GenericData.Record record = new GenericData.Record(protocol.getType("Element"));
        record.put("name", el.getNodeName());

        NamedNodeMap attributeNodes = el.getAttributes();
        List<GenericData.Record> attrRecords = new ArrayList<>();
        for (int i = 0; i < attributeNodes.getLength(); i++) {
            Attr attr = (Attr) attributeNodes.item(i);
            attrRecords.add(wrapAttr(attr));
        }
        record.put("attributes", attrRecords);

        List<Object> childArray = new ArrayList<>();
        NodeList childNodes = el.getChildNodes();
        for (int i = 0; i < childNodes.getLength(); i++) {
            Node node = childNodes.item(i);
            Object nt = node.getNodeType();
            if (node.getNodeType() == Node.ELEMENT_NODE)
                childArray.add(wrapElement((Element) node));

            if (node.getNodeType() == Node.TEXT_NODE)
                childArray.add(node.getTextContent());

        }
        record.put("children", childArray);

        return record;
    }

    private static GenericData.Record wrapAttr(Attr attr) {
        GenericData.Record record = new GenericData.Record(protocol.getType("Attribute"));

        record.put("name", attr.getName());
        record.put("value", attr.getValue());

        return record;
    }

    private static Document parse(File file) throws IOException, SAXException {
        try {
            DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
            return builder.parse(file);
        } catch (ParserConfigurationException e) {
            throw new RuntimeException(e);
        }
    }

    public static void avroToXml(File avroFile, File xmlFile) throws IOException {
        try {
            InputStream stream = new FileInputStream("/tmp/xml.avsc");
            if (stream == null) throw new IllegalStateException("Classpath should include xml.avsc");

            protocol = Protocol.parse(stream);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(protocol.getType("Element"));
        DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(avroFile, datumReader);

        GenericRecord record = dataFileReader.next();

        Document doc;
        try {
            doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
        } catch (ParserConfigurationException e) {
            throw new RuntimeException(e);
        }

        Element el = unwrapElement(record, doc);
        doc.appendChild(el);

        saveDocument(doc, xmlFile);
    }

    private static Element unwrapElement(GenericRecord record, Document doc) {
        String name = "" + record.get("name");
        Element el = doc.createElement(name);

        @SuppressWarnings("unchecked")
        GenericArray<GenericRecord> attrArray = (GenericArray<GenericRecord>) record.get("attributes");
        for (GenericRecord attrRecord : attrArray)
            el.setAttributeNode(unwrapAttr(attrRecord, doc));

        @SuppressWarnings("unchecked")
        GenericArray<Object> childArray = (GenericArray<Object>) record.get("children");
        for (Object childObj : childArray) {
            if (childObj instanceof GenericRecord)
                el.appendChild(unwrapElement((GenericRecord) childObj, doc));

            if (childObj instanceof Utf8)
                el.appendChild(doc.createTextNode("" + childObj));
        }

        return el;
    }

    private static Attr unwrapAttr(GenericRecord record, Document doc) {
        Attr attr = doc.createAttribute("" + record.get("name"));
        attr.setValue("" + record.get("value"));
        return attr;
    }

    private static void saveDocument(Document doc, File file) {
        try {
            Transformer transformer = TransformerFactory.newInstance().newTransformer();

            transformer.transform(new DOMSource(doc), new StreamResult(file));
        } catch (TransformerException e) {
            throw new RuntimeException(e);
        }
    }

    public static void main(String[] args)
    {
        Object nodeObject = null;
        Node myNode = null;
        Transformer transformer = null;

        try
        {
            try {
                transformer =
                        TransformerFactory.newInstance().newTransformer();
            } catch (TransformerConfigurationException e) {
                e.printStackTrace();
            }
            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
            dbf.setNamespaceAware(true);
            DocumentBuilder db = dbf.newDocumentBuilder();
            Document doc = db.parse("/tmp/customers.xml");
            System.out.printf("Version = %s%n", doc.getXmlVersion());
            System.out.printf("Encoding = %s%n", doc.getXmlEncoding());
            System.out.printf("Standalone = %b%n%n", doc.getXmlStandalone());
            if (doc.hasChildNodes())
            {
                int customerNumber = 0;
                NodeList nl = doc.getDocumentElement().getChildNodes();
                for (int i = 0; i < nl.getLength(); i++) {
                    Node node = nl.item(i);
                    if (node.getNodeType() == Node.ELEMENT_NODE) {
                        System.out.println(node.toString());
                        customerNumber++;
                        File avroFile = new File("/tmp/customer" + customerNumber + ".avro");
                        File xmlFile = new File("/tmp/customer" + customerNumber + ".xml");
                        File xmlFile1 = new File("/tmp/customer" + customerNumber + "-foo.xml");
                        try {
                            transformer.transform(
                                    new DOMSource(node), new StreamResult(xmlFile));
                            File outputFile = new File("/tmp/customer" + customerNumber + ".avro");
                            xmlToAvro(xmlFile, outputFile);

                        } catch (TransformerException e) {
                            e.printStackTrace();
                        }
                    }
                }
            }
        }
        catch (IOException ioe)
        {
            System.err.println("IOE: " + ioe);
        }
        catch (SAXException saxe)
        {
            System.err.println("SAXE: " + saxe);
        }
        catch (FactoryConfigurationError fce)
        {
            System.err.println("FCE: " + fce);
        }
        catch (ParserConfigurationException pce)
        {
            System.err.println("PCE: " + pce);
        }
    }
}

此代码总体上有效,但它忽略了包含在

![CDATA[  

标签。碰巧的是,customers.xml 文件中的大部分实际有用数据都包含在这些标记中。

有没有办法修改此代码,使其不忽略 CDATA 内容?

4

1 回答 1

0

您可能希望将问题拆分为两部分,而不是手写解析器代码:首先,将 XML 绑定到 POJO(使用 JAXB 或 Jackson XML 模块);然后将 POJO 编写为 Avro(使用 Apache Avro lib 或 Jackson Avro 模块)。您所需要的只是与 XML 和 Avro 等数据的预期结构相匹配的 POJO 定义。结果应该是更少的代码,并且基本上指定需要发生什么以及现在如何去做。

于 2017-10-24T04:38:15.830 回答