2

我有一堆 XML 文档,其中包含我需要用假数据替换的个人信息。Person 节点包含以下元素:

  • uuid - 必需,不应触碰。
  • firstName(名字)- 可选
  • lastName(姓氏)- 可选
  • address(地址)- 可选
  • personID - 必填

一个人可能出现多次,在这种情况下应该使用相同的假数据,即如果两个 Person 节点具有相同的 personID,那么它们都应该收到相同的假 ID。

我已经实现了一些 Java 代码,它从 XML 字符串构建 DOM 树并在将其写回字符串之前替换节点。这很好用,但由于我有这么多文件,我想知道是否有更快的方法。也许通过正则表达式或 XSLT 之类的?

这是一个示例文档:

<ADocument>
  <Stuff>
    ...
  </Stuff>
  <OtherStuff>
    ...
  </OtherStuff>
  <Person>
    <uuid>11111111-1111-1111-1111-111111111111</uuid>
    <firstName>Some</firstName>
    <lastName>Person</lastName>
    <personID>111111111111</personID>
  </Person>
  <Person>
    <uuid>22222222-2222-2222-2222-222222222222</uuid>
    <firstName>Another Person</firstName>
    <address>Main St. 2</address>
    <personID>222222222222</personID>
  </Person>
  <Person>
    <uuid>33333333-3333-3333-3333-333333333333</uuid>
    <firstName>Some</firstName>
    <lastName>Person</lastName>
    <personID>111111111111</personID>
  </Person>
  <MoreStuff>
    ...
  </MoreStuff>
</ADocument>

这是我目前的实现:

/**
 * Replaces the personal data of every {@code Person} element in the given XML
 * with fake data and returns the resulting document as a string.
 *
 * <p>The fake person is looked up by the element's original {@code personID};
 * per the original author's note, {@code getFakePerson} returns a cached fake
 * person for a known id or creates a new one. {@code firstName},
 * {@code lastName} and {@code address} are only overwritten when present.
 *
 * @param xmlInstance the XML document as a string
 * @return the serialized document after replacement
 */
public String replaceWithFalseData(String xmlInstance) {
    final Document document = toDOM(xmlInstance);

    final XPathExpression allPersons = XPathExpressionFactory.createXPathExpression("//Person");
    final List<Node> persons = allPersons.evaluateAsNodeList(document);

    for (Node person : persons) {
        final Map<String, Node> fields = getChildNodes(person);
        final Node idNode = fields.get("personID");

        // The real id is the lookup key for the replacement person.
        final Person replacement = getFakePerson(idNode.getTextContent());

        setIfExists(fields.get("firstName"), replacement.getFirstName());
        setIfExists(fields.get("lastName"), replacement.getLastName());
        setIfExists(fields.get("address"), replacement.getAddress());
        setIfExists(idNode, replacement.getPersonID());
    }

    return toString(document);
}

/**
 * Indexes the direct children of {@code parent} by their local name.
 *
 * <p>Children whose {@code getLocalName()} is {@code null} (for example text
 * nodes) are skipped. When several children share a local name, the last one
 * in document order wins.
 *
 * @param parent the node whose children are indexed
 * @return a mutable map from local name to child node
 */
public Map<String, Node> getChildNodes(Node parent) {
    final Map<String, Node> byLocalName = new HashMap<String, Node>();
    Node current = parent.getFirstChild();
    while (current != null) {
        final String localName = current.getLocalName();
        if (localName != null) {
            byLocalName.put(localName, current);
        }
        current = current.getNextSibling();
    }
    return byLocalName;
}

/**
 * Overwrites the text content of {@code node} with {@code value}; does nothing
 * when {@code node} is {@code null} (i.e. the optional element is absent).
 *
 * @param node  the node to update, may be {@code null}
 * @param value the new text content
 */
public void setIfExists(Node node, String value) {
    if (node == null) {
        return;
    }
    node.setTextContent(value);
}
4

4 回答 4

2

您正在使用基于 DOM 的 API。使用Streaming API for XML (StAX)可以实现更快的替换,在许多情况下,它可以胜过基于 DOM 的 API: StAX 与 DOM

DOM API 比 StAX 占用更多内存,会降低性能,但比 StAX API 更易于使用。

您的示例的工作解决方案 - 在 150 MB xml 文件上测试,在 10 秒内替换:

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import javax.xml.stream.XMLEventFactory;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.XMLEvent;


/**
 * Streams an XML document with StAX and replaces the personal data
 * ({@code firstName}, {@code lastName}, {@code address}, {@code personID})
 * inside every {@code Person} element with fake data.
 *
 * <p>Because {@code personID} may appear after the fields that have to be
 * replaced, the events of one {@code Person} element are buffered, the id is
 * looked up, and only then the element is written out. Everything outside
 * {@code Person} elements is copied through unchanged.
 *
 * <p>Fake persons are cached per real {@code personID}, so every occurrence of
 * the same person receives the same fake data. The cache is a plain
 * {@link HashMap}; the class is not designed for concurrent use.
 */
public class ReplaceXmlWithFakeUser
{
  /** Input file used when no path is passed on the command line. */
  private static final String DEFAULT_INPUT_PATH = "c:\\temp\\persons.xml";

  /** Output file used when no path is passed on the command line. */
  private static final String DEFAULT_OUTPUT_PATH = "c:\\temp\\fakePersons.xml";

  /** Fake persons created so far, keyed by the real personID. */
  private static final Map<String, Person> FAKE_PERSONS = new HashMap<String, Person>();

  /**
   * Anonymizes one XML file.
   *
   * @param args optional: {@code args[0]} is the input path, {@code args[1]} the
   *             output path; missing arguments fall back to the default paths
   * @throws XMLStreamException if the input is not well-formed, or a Person
   *                            element has personal fields but no personID
   * @throws IOException        if a file cannot be opened or closed
   */
  public static void main(String[] args) throws XMLStreamException, IOException
  {
    String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;
    String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_PATH;

    // The StAX reader/writer do not close the underlying streams, so close them here.
    InputStream in = new BufferedInputStream(new FileInputStream(inputPath));
    try
    {
      OutputStream out = new BufferedOutputStream(new FileOutputStream(outputPath));
      try
      {
        replacePersons(in, out);
      }
      finally
      {
        out.close();
      }
    }
    finally
    {
      in.close();
    }
  }

  /** Copies the document from {@code in} to {@code out}, rewriting every Person element. */
  private static void replacePersons(InputStream in, OutputStream out) throws XMLStreamException
  {
    XMLEventReader eventReader = XMLInputFactory.newInstance().createXMLEventReader(in);
    try
    {
      XMLEventWriter writer = XMLOutputFactory.newInstance().createXMLEventWriter(out);
      try
      {
        XMLEventFactory eventFactory = XMLEventFactory.newInstance();
        while (eventReader.hasNext())
        {
          XMLEvent event = eventReader.nextEvent();
          writer.add(event);

          if (isStartOf(event, "Person"))
          {
            // STEP 1: buffer the whole element, personID may come last.
            List<XMLEvent> personEvents = readThroughEndOf(eventReader, "Person");
            // STEP 2: write the buffered events with fake values substituted.
            writeWithFakeData(personEvents, writer, eventFactory);
          }
        }
        writer.flush();
      }
      finally
      {
        writer.close();
      }
    }
    finally
    {
      eventReader.close();
    }
  }

  /** Reads events up to and including the end element named {@code localName}. */
  private static List<XMLEvent> readThroughEndOf(XMLEventReader eventReader, String localName)
      throws XMLStreamException
  {
    List<XMLEvent> events = new ArrayList<XMLEvent>();
    while (eventReader.hasNext())
    {
      XMLEvent event = eventReader.nextEvent();
      events.add(event);
      if (isEndOf(event, localName))
      {
        break;
      }
    }
    return events;
  }

  /** Writes the buffered Person events, replacing the content of every personal field. */
  private static void writeWithFakeData(List<XMLEvent> personEvents, XMLEventWriter writer,
      XMLEventFactory eventFactory) throws XMLStreamException
  {
    String personId = findPersonId(personEvents);
    Person fakePerson = (personId == null) ? null : getFakePerson(personId);

    for (int i = 0; i < personEvents.size(); i++)
    {
      XMLEvent event = personEvents.get(i);
      writer.add(event);

      if (!event.isStartElement())
      {
        continue;
      }
      String name = event.asStartElement().getName().getLocalPart();
      if (!isPersonalField(name))
      {
        continue;
      }
      if (fakePerson == null)
      {
        // Without an id there is no stable fake identity; refuse to guess.
        throw new XMLStreamException(
            "Person element contains <" + name + "> but no personID", event.getLocation());
      }

      writer.add(eventFactory.createCharacters(fakeValueFor(name, fakePerson)));

      // Drop the real content. It may be zero events (empty element) or several
      // Characters events, so skip until the matching end element instead of
      // assuming exactly one event.
      i++;
      while (i < personEvents.size() && !isEndOf(personEvents.get(i), name))
      {
        i++;
      }
      if (i < personEvents.size())
      {
        writer.add(personEvents.get(i));
      }
    }
  }

  /** Returns the text of the first personID element, or {@code null} if there is none. */
  private static String findPersonId(List<XMLEvent> personEvents)
  {
    for (int i = 0; i < personEvents.size(); i++)
    {
      if (isStartOf(personEvents.get(i), "personID"))
      {
        StringBuilder text = new StringBuilder();
        for (int j = i + 1; j < personEvents.size() && !personEvents.get(j).isEndElement(); j++)
        {
          if (personEvents.get(j).isCharacters())
          {
            text.append(personEvents.get(j).asCharacters().getData());
          }
        }
        return text.toString();
      }
    }
    return null;
  }

  /** Tells whether {@code event} is a start element with the given local name. */
  private static boolean isStartOf(XMLEvent event, String localName)
  {
    return event.isStartElement()
        && event.asStartElement().getName().getLocalPart().equals(localName);
  }

  /** Tells whether {@code event} is an end element with the given local name. */
  private static boolean isEndOf(XMLEvent event, String localName)
  {
    return event.isEndElement()
        && event.asEndElement().getName().getLocalPart().equals(localName);
  }

  /** Tells whether the element with this name carries data that must be replaced. */
  private static boolean isPersonalField(String elementName)
  {
    return "personID".equals(elementName)
        || "firstName".equals(elementName)
        || "lastName".equals(elementName)
        || "address".equals(elementName);
  }

  /** Returns the fake value for a personal field; call only if {@link #isPersonalField} is true. */
  private static String fakeValueFor(String elementName, Person fakePerson)
  {
    if ("personID".equals(elementName))
    {
      return fakePerson.personId;
    }
    if ("firstName".equals(elementName))
    {
      return fakePerson.firstName;
    }
    if ("lastName".equals(elementName))
    {
      return fakePerson.lastName;
    }
    return fakePerson.address;
  }

  /**
   * Returns the fake person for a real personID, creating and caching it on
   * first use so that repeated ids map to identical fake data.
   *
   * <p>NOTE: this simple stub keeps the real personID as the "fake" id; a real
   * implementation has to generate a substitute id here as well.
   */
  private static Person getFakePerson(String personId)
  {
    Person fakePerson = FAKE_PERSONS.get(personId);
    if (fakePerson == null)
    {
      //create simple fake user...
      fakePerson = new Person();
      fakePerson.personId = personId;
      fakePerson.firstName = "fake first name: " + Math.random();
      fakePerson.lastName = "fake last name: " + Math.random();
      fakePerson.address = "fake address: " + Math.random();
      FAKE_PERSONS.put(personId, fakePerson);
    }
    return fakePerson;
  }

  /** Plain holder for the fake values of one person. */
  static class Person
  {
    String personId;
    String firstName;
    String lastName;
    String address;

  }
}

用作persons.xml输入:

<ADocument>
    <Stuff>
        <StuffA></StuffA>
    </Stuff>
    <OtherStuff>
        <OtherStuff>
            <ABC>yada yada</ABC>
        </OtherStuff>
    </OtherStuff>

    <Person>
        <uuid>11111111-1111-1111-1111-111111111111</uuid>
        <firstName>Some</firstName>
        <lastName>Person</lastName>
        <personID>111111111111</personID>
    </Person>
    <Person>
        <uuid>22222222-2222-2222-2222-222222222222</uuid>
        <firstName>Another Person</firstName>
        <address>Main St. 2</address>
        <personID>222222222222</personID>
    </Person>
    <Person>
        <uuid>33333333-3333-3333-3333-333333333333</uuid>
        <firstName>Some</firstName>
        <lastName>Person</lastName>
        <personID>111111111111</personID>
    </Person>

    <MoreStuff>
        <foo></foo>
        <foo>fooo</foo>
        <foo><bar></bar></foo>
        <foo>
            <bar></bar>
            <bar/>
            <bar>bb</bar>
        </foo>
        <bar/>
    </MoreStuff>

</ADocument>

产生这个fakePersons.xml结果:

<?xml version="1.0" encoding="UTF-8"?><ADocument>
    <Stuff>
        <StuffA></StuffA>
    </Stuff>
    <OtherStuff>
        <OtherStuff>
            <ABC>yada yada</ABC>
        </OtherStuff>
    </OtherStuff>

    <Person>
        <uuid>11111111-1111-1111-1111-111111111111</uuid>
        <firstName>fake first name: 0.9518514637129984</firstName>
        <lastName>fake last name: 0.3495378044884426</lastName>
        <personID>111111111111</personID>
    </Person>
    <Person>
        <uuid>22222222-2222-2222-2222-222222222222</uuid>
        <firstName>fake first name: 0.8945739434355868</firstName>
        <address>fake address: 0.40784763231471777</address>
        <personID>222222222222</personID>
    </Person>
    <Person>
        <uuid>33333333-3333-3333-3333-333333333333</uuid>
        <firstName>fake first name: 0.7863207851479257</firstName>
        <lastName>fake last name: 0.09918620445731652</lastName>
        <personID>111111111111</personID>
    </Person>

    <MoreStuff>
        <foo></foo>
        <foo>fooo</foo>
        <foo><bar></bar></foo>
        <foo>
            <bar></bar>
            <bar></bar>
            <bar>bb</bar>
        </foo>
        <bar></bar>
    </MoreStuff>

</ADocument>
于 2013-09-04T13:52:50.793 回答
0

我不确定 XSLT 在这里是否能帮上忙。也许我对 XSLT 的了解不够深入,但 XSLT 通常用于根据现有 XML 的数据创建新的 XML 结构。而您在这里似乎想做相反的事情:保持结构不变,只根据动态值更新数据。要写出这样的 XSLT 可能会比较困难。如何优化取决于相当多的参数:每个 XML 中 Person 元素的数量、同一 XML 中相同 PersonID 的数量、要处理的 XML 文件数量……如果您处理的是大文件,可以考虑改用 SAX 实现来降低内存消耗。如果同一个 XML 中有大量重复的 PersonID,可以在用于替换的假数据之上构建某种缓存结构,以减少对 DOM 的访问次数(例如直接用缓存的节点替换整个节点,再把原始节点的 uuid 写回去)。如果您有许多包含相似 PersonID 的小文件,并且可以接受在多个 XML 文件之间使用相同的假数据,那么可以使用跨 XML 文件的缓存。

另外,我相信您可以在 PersonID 上删除“setIfExists”,因为它被声明为必填字段。

于 2013-09-04T13:53:10.293 回答
0

我无法评论相对性能,但这里有一个 XSLT 解决方案来解决您的问题。

以下 XSLT 样式表:

<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:output method="xml" indent="yes" omit-xml-declaration="yes"/>

  <!-- Emits a zero-based index for the person the context node belongs to.
       The context node must be a child element of a Person (it is addressed via
       "../personID"). Among the Person elements under the same grandparent, the
       first one with the same personID is located and its preceding Person
       siblings are counted. Every Person sharing a personID therefore yields
       the same number, which serves as that person's identifier. -->
  <xsl:template name="get-position-id">
    <xsl:value-of select="count(../../Person[personID=current()/../personID][1]/preceding-sibling::Person)"/>
  </xsl:template>

  <!-- personID elements: keep the element, replace its text with the 12-digit
       number derived from the position id of the owning person. -->
  <xsl:template match="personID">
    <xsl:copy>
      <xsl:variable name="position-id">
        <xsl:call-template name="get-position-id"/>
      </xsl:variable>
      <xsl:call-template name="create-person-id">
        <xsl:with-param name="input" select="$position-id"/>
      </xsl:call-template>
    </xsl:copy>
  </xsl:template>

  <!-- firstName, lastName and address elements: keep the element, replace its
       text with a placeholder string ending in the position id, which ties the
       value to a particular personID. -->
  <xsl:template match="firstName|lastName|address">
    <xsl:copy>
      <xsl:variable name="position-id">
        <xsl:call-template name="get-position-id"/>
      </xsl:variable>
      <xsl:call-template name="create-fake-string">
        <xsl:with-param name="input" select="$position-id"/>
      </xsl:call-template>
    </xsl:copy>    
  </xsl:template>

  <!-- The identity transform: copies every other node and attribute unchanged. -->
  <xsl:template match="@*|node()">
    <xsl:copy>
      <xsl:apply-templates select="@*|node()"/>
    </xsl:copy>
  </xsl:template>

  <!-- Formats $input as the replacement value for personID. -->
  <xsl:template name="create-person-id">
    <xsl:param name="input"/>
    <!-- Zero-pad the input to at least 12 digits. -->
    <xsl:value-of select="format-number($input, '000000000000') "/>
  </xsl:template>

  <!-- Builds the replacement text for name and address fields. -->
  <xsl:template name="create-fake-string">
    <xsl:param name="input"/>
    <!-- Fixed prefix followed by the input, e.g. FAKEDATA0. -->
    <xsl:text>FAKEDATA</xsl:text>
    <xsl:value-of select="$input"/>
  </xsl:template>

</xsl:stylesheet>

应用于您的示例文档时会生成以下 XML:

<ADocument>
  <Stuff>
    ...
  </Stuff>
  <OtherStuff>
    ...
  </OtherStuff>
  <Person>
    <uuid>11111111-1111-1111-1111-111111111111</uuid>
    <firstName>FAKEDATA0</firstName>
    <lastName>FAKEDATA0</lastName>
    <personID>000000000000</personID>
  </Person>
  <Person>
    <uuid>22222222-2222-2222-2222-222222222222</uuid>
    <firstName>FAKEDATA1</firstName>
    <address>FAKEDATA1</address>
    <personID>000000000001</personID>
  </Person>
  <Person>
    <uuid>33333333-3333-3333-3333-333333333333</uuid>
    <firstName>FAKEDATA0</firstName>
    <lastName>FAKEDATA0</lastName>
    <personID>000000000000</personID>
  </Person>
  <MoreStuff>
    ...
  </MoreStuff>
</ADocument>
于 2013-09-05T13:20:24.903 回答
0

感谢所有贡献的人!我使用我的 DOM 实现、Sergej 的 StAX 实现和 Ben 的 XSLT 实现以及我自己的另一个实现,使用正则表达式对一组 2000 个 XML 文档进行了性能测试。结果如下:

  • DOM:23.93 秒
  • StAX:20.37 秒
  • XSLT:83.52 秒
  • 正则表达式:7.83 秒

这是赢家:

// Compiled once; DOTALL plus reluctant quantifiers so that values spanning
// several lines are matched and a match never runs past its own end tag.
private static final Pattern PERSON_PATTERN =
        Pattern.compile("<Person>.*?</Person>", Pattern.DOTALL);
private static final Pattern PERSON_ID_PATTERN =
        Pattern.compile("<personID>(.*?)</personID>", Pattern.DOTALL);
private static final Pattern FIRST_NAME_PATTERN =
        Pattern.compile("<firstName>.*?</firstName>", Pattern.DOTALL);
private static final Pattern LAST_NAME_PATTERN =
        Pattern.compile("<lastName>.*?</lastName>", Pattern.DOTALL);
private static final Pattern ADDRESS_PATTERN =
        Pattern.compile("<address>.*?</address>", Pattern.DOTALL);

/**
 * Replaces the personal data of every {@code <Person>...</Person>} section in
 * the given XML text with fake data, using regular expressions only.
 *
 * <p>The fake person is looked up via {@code getFakePerson} with the section's
 * original {@code personID}. {@code firstName}, {@code lastName} and
 * {@code address} are replaced when present. Only start tags written exactly
 * as {@code <Person>}, {@code <firstName>} etc. (no attributes, no namespace
 * prefix) are recognized.
 *
 * @param xmlInstance the XML document as a string
 * @return the document text with all recognized personal data replaced
 * @throws IllegalStateException if a Person section contains no personID element
 */
public String replaceWithFalseData(String xmlInstance) {
    Matcher personMatcher = PERSON_PATTERN.matcher(xmlInstance);
    StringBuffer xmlBuffer = new StringBuffer(xmlInstance.length());

    while (personMatcher.find()) {
        String personXml = personMatcher.group();

        Matcher idMatcher = PERSON_ID_PATTERN.matcher(personXml);
        if (!idMatcher.find()) {
            // Same exception type as the unchecked group() call threw before,
            // but with a message that names the actual problem.
            throw new IllegalStateException("Person element without personID at offset "
                    + personMatcher.start());
        }
        Person fakePerson = getFakePerson(idMatcher.group(1));

        personXml = replaceElement(FIRST_NAME_PATTERN, personXml, "firstName", fakePerson.getFirstName());
        personXml = replaceElement(LAST_NAME_PATTERN, personXml, "lastName", fakePerson.getLastName());
        personXml = replaceElement(ADDRESS_PATTERN, personXml, "address", fakePerson.getAddress());
        personXml = replaceElement(PERSON_ID_PATTERN, personXml, "personID", fakePerson.getPersonID());

        // personXml is literal text: quote it so '$' and '\' are not treated
        // as group references by appendReplacement.
        personMatcher.appendReplacement(xmlBuffer, Matcher.quoteReplacement(personXml));
    }

    personMatcher.appendTail(xmlBuffer);
    return xmlBuffer.toString();
}

/** Replaces the first element matched by {@code pattern} with {@code <tag>value</tag>}, taken literally. */
private String replaceElement(Pattern pattern, String xml, String tag, String value) {
    String element = "<" + tag + ">" + value + "</" + tag + ">";
    return pattern.matcher(xml).replaceFirst(Matcher.quoteReplacement(element));
}
于 2013-09-06T13:24:53.170 回答