0

我正在解析 Wikipedia 的 XML 数据转储,想把其中的每个页面提取出来,并写入一个新的 XML 文档,其中只包含页面的精简版本。例如,对于每个页面,我只对标题、id、时间戳、用户名和文本感兴趣。

这是一个完整的维基百科页面:

<page>
<title>AccessibleComputing</title>
<ns>0</ns>
<id>10</id>
<redirect title="Computer accessibility" />
<revision>
  <id>381202555</id>
  <timestamp>2010-08-26T22:38:36Z</timestamp>
  <contributor>
    <username>OlEnglish</username>
    <id>7181920</id>
  </contributor>
  <minor />
  <comment>[[Help:Reverting|Reverted]] edits by [[Special:Contributions/76.28.186.133|76.28.186.133]] ([[User talk:76.28.186.133|talk]]) to last version by Gurch</comment>
  <text xml:space="preserve">#REDIRECT [[Computer accessibility]] {{R from CamelCase}}</text>
  <sha1 />
  </revision>
</page>

剥离完成后我想得到的结果是这样的:

<page>
  <title>AccessibleComputing</title>
  <id>10</id>
  <revision>
    <timestamp>2010-08-26T22:38:36Z</timestamp>
    <contributor>
      <username>OlEnglish</username>
    </contributor>
    <text xml:space="preserve">#REDIRECT [[Computer accessibility]] {{R from CamelCase}}</text>
  </revision>
</page>

由于这些文档的大小,我知道我不能使用 DOM 来处理这个问题。我知道如何设置 SAX 解析器,但是在解析文档时构建新 XML 文件的最佳方法是什么?

谢谢

4

2 回答 2

3

您可以使用 XMLFilterImpl 并只保留您需要的内容。这里的想法是,输入和输出都是流,因此它可以处理任何大小的 XML。下面的示例只转发 page 元素的开始和结束标签,并丢弃所有文本;要保留其他元素,请相应地扩展其中的条件:

    // Streaming filter skeleton: SAX events flow from the parser through this filter into
    // the transformer, so the document is never held in memory as a whole.
    // As written, only the <page> start/end tags are forwarded; extend the conditions below
    // to let further elements (and their text) through.
    XMLReader pageOnlyFilter = new XMLFilterImpl(XMLReaderFactory.createXMLReader()) {
        /** Forwards the start tag of {@code page} elements and drops every other start tag. */
        @Override
        public void startElement(String uri, String localName, String qName, Attributes atts)
                throws SAXException {
            if (!qName.equals("page")) {
                return;
            }
            super.startElement(uri, localName, qName, atts);
        }

        /** Forwards the end tag of {@code page} elements and drops every other end tag. */
        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            if (!qName.equals("page")) {
                return;
            }
            super.endElement(uri, localName, qName);
        }

        /** Drops all character data by not delegating to {@code super.characters(...)}. */
        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            // Intentionally empty: call super.characters(ch, start, length) here to keep text.
        }
    };

    // Read "1.xml" through the filter and serialize whatever events survive to standard output.
    InputSource input = new InputSource("1.xml");
    Source filteredSource = new SAXSource(pageOnlyFilter, input);
    Result stdoutResult = new StreamResult(System.out);
    TransformerFactory.newInstance().newTransformer().transform(filteredSource, stdoutResult);
于 2013-07-02T19:59:11.963 回答
0

在这里,我使用 SAX 解析器实现了解析,它会提取维基百科转储文件中的 title 元素的文本,以及 redirect 元素的 title 属性。

package parser;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;



 /**
  * SAX handler that extracts page titles and redirect targets from a Wikipedia XML dump.
  *
  * <p>Two kinds of entries are collected: the text content of every {@code title} element and
  * the {@code title} attribute of every {@code redirect} element. Entries are buffered in
  * {@link #list}; each time {@link #MAX_SIZE} entries have accumulated, the batch is sorted and
  * written to a numbered text file and the buffer is cleared, so memory use stays bounded
  * regardless of the size of the dump.
  */
 public class SAXHandler extends DefaultHandler {

    /** Location of the dump that {@link #parseDoc()} reads. */
    private static final String INPUT_FILE =
            "D:\\XMLParsing_Files\\enwiki-20120902-pages-articles-multistream.xml";

    /** Path prefix of the batch files; the batch number and ".txt" are appended. */
    private static final String OUTPUT_PREFIX =
            "D:\\XMLParsing_Files\\Extracted_Data\\Extracted_Sorted_Data_";

    /** Line written after the last entry of every batch file. */
    private static final String BATCH_END_MARKER =
            "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz";

    /** Entries collected since the last batch was written. */
    List<String> list = new ArrayList<String>();
    /** {@code count}: entries in the current batch; {@code counter}: suffix of the last file. */
    int count = 0, counter = 0;
    /** Number of entries that triggers writing a batch file. */
    int MAX_SIZE = 100000;
    /** Text of the most recently completed {@code title} element. */
    String temp = "";
    /** Number of batch files written so far (used for progress output). */
    int counterz = 0;

    /** Accumulates the text of the {@code title} element currently being parsed. */
    private final StringBuilder titleText = new StringBuilder();
    /** True between the start and end tag of a {@code title} element. */
    private boolean insideTitle = false;

    /**
     * Parses the dump, writes the batch files and prints the elapsed time.
     *
     * @param args ignored
     * @throws ParserConfigurationException if no SAX parser can be created
     * @throws SAXException if the dump cannot be parsed or a batch cannot be written
     * @throws IOException if the dump cannot be read or the final batch cannot be written
     */
    public static void main(String[] args)
            throws ParserConfigurationException, SAXException, IOException {
        long start = System.currentTimeMillis();

        SAXHandler saxhandler = new SAXHandler();
        saxhandler.assign();
        saxhandler.parseDoc();

        long end = System.currentTimeMillis();
        System.out.println("Time taken to write is " + (end - start) + "msecs");
    }

    /** Resets the entry buffer to a fresh, empty list. */
    void assign() {
        list = new ArrayList<String>();
    }

    /**
     * Parses the dump with this object as the content handler and then writes the entries
     * that remain in the buffer as a final batch.
     *
     * @throws ParserConfigurationException if no SAX parser can be created
     * @throws SAXException if the dump cannot be parsed or a batch cannot be written
     * @throws IOException if the dump cannot be read or the final batch cannot be written
     */
    void parseDoc() throws ParserConfigurationException, SAXException, IOException {
        SAXParserFactory spf = SAXParserFactory.newInstance();
        SAXParser sp = spf.newSAXParser();
        sp.parse(INPUT_FILE, this);
        writeToFile(list); // flush the entries left over after the last full batch
    }

    @Override
    public void startDocument() throws SAXException {
        // nothing to do
    }

    @Override
    public void endDocument() throws SAXException {
        // nothing to do
    }

    /**
     * Starts capturing text when a {@code title} element opens and records the {@code title}
     * attribute of a {@code redirect} element.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes)
            throws SAXException {
        if (qName.equalsIgnoreCase("title")) {
            insideTitle = true;
            titleText.setLength(0);
        } else if (qName.equalsIgnoreCase("redirect")) {
            String target = attributes.getValue("title");
            // A redirect without the attribute would put null into the list and make the
            // later sort fail, so it is skipped.
            if (target != null) {
                addEntry(target);
            }
        }
    }

    /** Records the accumulated text when a {@code title} element closes. */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if (qName.equalsIgnoreCase("title")) {
            insideTitle = false;
            temp = titleText.toString();
            addEntry(temp);
        }
    }

    /**
     * Appends character data to the current title.
     *
     * <p>A parser may report one run of text in several calls, so the chunks are accumulated
     * rather than overwritten. Text outside {@code title} elements is ignored.
     */
    @Override
    public void characters(char ch[], int start, int length) throws SAXException {
        if (insideTitle) {
            titleText.append(ch, start, length);
        }
    }

    /**
     * Adds one entry to the buffer and writes a batch file once the buffer is full.
     *
     * @throws SAXException wrapping the {@link IOException} if the batch cannot be written;
     *         parsing is aborted rather than silently dropping the batch
     */
    private void addEntry(String entry) throws SAXException {
        list.add(entry);
        count++;
        if (count >= MAX_SIZE) {
            try {
                writeToFile(list);
            } catch (IOException e) {
                throw new SAXException("Failed to write batch of extracted entries", e);
            }
            list.clear();
            count = 0;
        }
    }

    /**
     * Sorts the given entries in place and writes them, one per line and followed by an end
     * marker line, to the next numbered batch file encoded as UTF-8.
     *
     * @param list the entries to write; sorted as a side effect
     * @throws IOException if the file cannot be created or written
     */
    void writeToFile(List<String> list) throws IOException {
        Collections.sort(list);
        File file = new File(OUTPUT_PREFIX + getSuffix() + ".txt");

        // Explicit UTF-8 so that non-ASCII titles do not depend on the platform charset.
        PrintWriter pw = new PrintWriter(file, "UTF-8");
        try {
            for (String entry : list) {
                pw.println(entry);
            }
            pw.println(BATCH_END_MARKER);
            // PrintWriter swallows I/O errors; checkError() flushes and reports them.
            if (pw.checkError()) {
                throw new IOException("Failed to write " + file.getAbsolutePath());
            }
        } finally {
            pw.close();
        }
        System.out.println(++counterz + "Done");
    }

    /** Returns the next batch file number, starting at 1. */
    int getSuffix() {
        return ++counter;
    }

 }
于 2013-10-03T21:35:50.267 回答