java - Java DOM在xml节点中解析html

Question

我这里有一个解析器：

package lt.prasom.functions;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.util.Properties;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.w3c.dom.CharacterData;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import android.annotation.TargetApi;
import android.media.MediaRecorder.OutputFormat;
import android.util.Log;

/**
 * Helper for downloading an XML document over HTTP, parsing it into a DOM
 * tree and reading element values out of that tree.
 *
 * <p>All failures are logged and reported to the caller through a
 * {@code null} / empty-string return value; no checked exception escapes.
 */
public class XMLParser {

    /** Tag used for every log line written by this class. */
    private static final String LOG_TAG = "Error: ";

    /** Charset assumed when the HTTP response does not declare one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    // constructor
    public XMLParser() {

    }

    /**
     * Fetches the body of {@code url} with an HTTP GET request.
     *
     * @param url address of the XML resource
     * @return the response body as a string, or {@code null} when the request
     *         failed or the response carried no body
     */
    public String getXmlFromUrl(String url) {
        String xml = null;
        DefaultHttpClient httpClient = new DefaultHttpClient();

        try {
            HttpGet httpGet = new HttpGet(url);

            HttpResponse httpResponse = httpClient.execute(httpGet);
            HttpEntity httpEntity = httpResponse.getEntity();
            // A response without a body has no entity; EntityUtils rejects null.
            if (httpEntity != null) {
                // Without an explicit default, EntityUtils falls back to
                // ISO-8859-1 when the server omits the charset, which mangles
                // the UTF-8 that XML documents normally use.
                xml = EntityUtils.toString(httpEntity, DEFAULT_CHARSET);
            }
        } catch (IOException e) {
            // Also covers UnsupportedEncodingException and
            // ClientProtocolException, which are IOException subclasses.
            Log.e(LOG_TAG, "HTTP request failed for " + url, e);
        } finally {
            // Release the connections held by this one-shot client.
            httpClient.getConnectionManager().shutdown();
        }
        // return XML
        return xml;
    }

    /**
     * Parses an XML string into a DOM document.
     *
     * @param xml the XML text; may be {@code null} (for example the result of
     *            a failed {@link #getXmlFromUrl(String)} call)
     * @return the parsed document, or {@code null} when {@code xml} is
     *         {@code null} or cannot be parsed
     */
    public Document getDomElement(String xml) {
        if (xml == null) {
            // new StringReader(null) would throw a NullPointerException.
            Log.e(LOG_TAG, "No XML to parse (input was null)");
            return null;
        }

        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setValidating(false);

        try {
            DocumentBuilder db = dbf.newDocumentBuilder();

            InputSource is = new InputSource();
            is.setCharacterStream(new StringReader(xml));
            return db.parse(is);
        } catch (ParserConfigurationException e) {
            // Pass the throwable itself: getMessage() may be null and the
            // stack trace would otherwise be lost.
            Log.e(LOG_TAG, "Could not create the XML parser", e);
        } catch (SAXException e) {
            Log.e(LOG_TAG, "Malformed XML", e);
        } catch (IOException e) {
            Log.e(LOG_TAG, "Could not read the XML input", e);
        }
        return null;
    }

    /**
     * Returns the content of an element.
     *
     * @param elem the element to read; may be {@code null}
     * @param html when {@code false}, only the value of the first text or
     *             CDATA child is returned (nested tags and whatever follows
     *             them are ignored); when {@code true}, all children are
     *             serialized back to markup, so
     *             {@code <description>yes <b>no</b></description>} yields
     *             {@code yes <b>no</b>}
     * @return the content, or an empty string when {@code elem} is
     *         {@code null}, has no children, or has no text/CDATA child
     */
    @TargetApi(8)
    public final String getElementValue(Node elem, boolean html) {
        if (elem == null || !elem.hasChildNodes()) {
            return "";
        }
        if (html) {
            return serializeChildren(elem);
        }
        for (Node child = elem.getFirstChild(); child != null; child = child.getNextSibling()) {
            short type = child.getNodeType();
            // CDATA sections are reported as their own node type, not TEXT_NODE.
            if (type == Node.TEXT_NODE || type == Node.CDATA_SECTION_NODE) {
                return child.getNodeValue();
            }
        }
        return "";
    }

    /**
     * Serializes every child of {@code parent} to XML text, in document order.
     *
     * @param parent node whose children are written out; must not be null
     * @return the concatenated markup, or an empty string on failure
     */
    @TargetApi(8)
    private String serializeChildren(Node parent) {
        StringWriter out = new StringWriter();
        try {
            Transformer transformer = TransformerFactory.newInstance().newTransformer();
            // Each child is transformed separately; suppress the
            // <?xml ...?> prolog that would otherwise precede every one.
            transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
            for (Node child = parent.getFirstChild(); child != null; child = child.getNextSibling()) {
                transformer.transform(new DOMSource(child), new StreamResult(out));
            }
        } catch (javax.xml.transform.TransformerException e) {
            Log.e(LOG_TAG, "Could not serialize element content", e);
            return "";
        }
        return out.toString();
    }

    /**
     * Returns the text value of the first descendant of {@code item} whose
     * tag name is {@code str}.
     *
     * @param item element to search in; may be {@code null}
     * @param str  tag name to look for
     * @return the text value, or an empty string when {@code item} is
     *         {@code null} or no such descendant exists
     */
    public String getValue(Element item, String str) {
        if (item == null) {
            return "";
        }
        NodeList n = item.getElementsByTagName(str);

        // item(0) is null when nothing matched; getElementValue handles that.
        return this.getElementValue(n.item(0), false);
    }

}

还有我的示例 xml：

<items>
<item>
<name>test</name>
<description>yes <b>no</b></description>
</item>
</items>

当我解析 description 时，只能得到第一个标签之前的内容（“yes ”），`<b>` 标签及其后面的部分都丢失了。我希望拿到 description 标签内的原始内容（包括其中的 HTML 标记）。我试过用 CDATA 包裹，但没有效果。有没有办法在不对 XML 内容进行编码（转义）的情况下做到这一点？

谢谢！

score 1 · Accepted Answer

我同意评论中的看法：这个问题不够完整，或者说不够具体，无法直接回答（例如直接修改你的源代码使其能够工作）。不过我曾经做过（我认为）类似的事情，可以补充以下内容，也许会有所帮助。

因此，如果“描述”元素的内容本身就是有效的 XML，那么文档实际上看起来像这样：

<items>
  <item>
   <name>test</name>
   <description><span>yes <b>no</b></span></description>
  </item>
</items>

然后您可以将“description”元素的内容提取出来，作为一个新的 XML 文档，再得到它的 XML 文本形式，如下所示：

<span>yes <b>no</b></span>

所以一个类似的方法：

/**
 * Builds a new, independent XML document from the content of the
 * {@code /items/item/description} element of {@code sourceDocument}.
 *
 * <p>The first element child of the description (the wrapping
 * {@code <span>} in {@code <description><span>yes <b>no</b></span></description>})
 * becomes the root of the new document. The source document is left
 * untouched: the content is deep-copied, not moved.
 *
 * @param sourceDocument the document that contains the description element
 * @return a new document whose root is a copy of the description's first
 *         element child, or {@code null} when no such element exists or an
 *         error occurred (the problem is logged in both cases)
 */
public Document retrieveDescriptionAsDocument(Document sourceDocument) {

final String descriptionXPath = "/items/item/description";
Document document2 = null;

try {
    // get the description node, I am just using XPath here as it is easy
    // to read, you already have a reference to the node so just continue as you
    // were doing for that, bottom line is to get a reference to the node
    Node descriptionNode = (Node) javax.xml.xpath.XPathFactory.newInstance().newXPath()
            .evaluate(descriptionXPath, sourceDocument, javax.xml.xpath.XPathConstants.NODE);

    // A document can have only one root element, so pick the first element
    // child of the description; surrounding whitespace text is skipped.
    Node contentRoot = null;
    if (descriptionNode != null) {
        for (Node child = descriptionNode.getFirstChild(); child != null; child = child.getNextSibling()) {
            if (child.getNodeType() == Node.ELEMENT_NODE) {
                contentRoot = child;
                break;
            }
        }
    }

    if (contentRoot != null) {
        // create a new empty document
        document2 = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
        // importNode deep-copies the subtree into the new document's
        // ownership without detaching it from the source document
        Node importNode = document2.importNode(contentRoot, true);
        // the copy becomes the root element of the new document
        document2.appendChild(importNode);
    }
    else {
        // LOG WARNING
        yourLoggerOrWhatever.warn("retrieveContainedDocument: No data found for XPath:" + descriptionXPath);
    }

    } catch (Exception e) {
        // Deliberately broad: this method reports every failure by logging
        // and returning null rather than throwing.
        // LOG ERROR
        yourLoggerOrWhatever.error("Exception caught getting contained document:",e);
        document2 = null;
    }

    // return the new doc, and the caller can then output that new document, that will now just contain "<span>yes <b>no</b></span>" as text, apply an XSL or whatever
    return document2;
}

java - Java DOM在xml节点中解析html

1 回答 1

Related

Reference