1

有以下 HTML,请告诉我如何使用 JSoup 获取从 "<html" 到 "<a id="summary"></a>" 之间的文本。我尝试了以下正则表达式,但它返回空字符串。

doc.select("*:matches(^[<html]*[a>]$)")

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>TestNG:  Unit Test</title>
 </head>
<body>
<a id="summary"></a>

<table cellspacing=0 cellpadding=0 class="param" style="float: left; width:630px;">
<tr><th>Test</th><th class="numi">Methods<br/>Passed</th><th class="numi">Scenarios<br/>Passed</th><th class="numi"># skipped</th><th class="numi"># failed</th><th class="numi">Total<br/>Time</th><th class="numi">Included<br/>Groups</th><th class="numi">Excluded<br/>Groups</th></tr>   
</table>


</body></html>
4

2 回答 2

0

这有点棘手,因为您必须对 DOM 进行深度优先遍历。NodeTraversor 允许您这样做。

这是一个例子:

package stuff;

import java.util.ArrayDeque;
import java.util.Deque;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;

/**
 * Demonstrates extracting the part of an HTML document that precedes the
 * {@code <a id="summary">} anchor by walking the parsed DOM depth-first with
 * jsoup's {@link NodeTraversor}.
 */
public class A {

    /**
     * Runs {@link #parse(String)} on two sample documents and prints each result.
     * The first document has the summary anchor as the first child of the body;
     * the second has another anchor in front of it.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String html = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" +
                "<html xmlns=\"http://www.w3.org/1999/xhtml\">" +
                "<head>" +
                "<title>TestNG:  Unit Test</title>" +
                "</head>" +
                "<body>" +
                "<a id=\"summary\"></a>" +
                "<table cellspacing=0 cellpadding=0 class=\"param\" style=\"float: left; width:630px;\">" +
                "<tr><th>Test</th><th class=\"numi\">Methods<br/>Passed</th><th class=\"numi\">Scenarios<br/>Passed</th><th class=\"numi\"># skipped</th><th class=\"numi\"># failed</th><th class=\"numi\">Total<br/>Time</th><th class=\"numi\">Included<br/>Groups</th><th class=\"numi\">Excluded<br/>Groups</th></tr>" +
                "</table>" +
                "</body>" +
                "</html>";
        System.out.println(parse(html));
        String html2 = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" +
                "<html xmlns=\"http://www.w3.org/1999/xhtml\">" +
                "<head>" +
                "<title>TestNG:  Unit Test</title>" +
                "</head>" +
                "<body>" +
                "<a id=\"something_else\"></a>" +
                "<a id=\"summary\"></a>" +
                "<table cellspacing=0 cellpadding=0 class=\"param\" style=\"float: left; width:630px;\">" +
                "<tr><th>Test</th><th class=\"numi\">Methods<br/>Passed</th><th class=\"numi\">Scenarios<br/>Passed</th><th class=\"numi\"># skipped</th><th class=\"numi\"># failed</th><th class=\"numi\">Total<br/>Time</th><th class=\"numi\">Included<br/>Groups</th><th class=\"numi\">Excluded<br/>Groups</th></tr>" +
                "</table>" +
                "</body>" +
                "</html>";
        System.out.println(parse(html2));
    }

    /**
     * Returns the inner HTML of the document's head, followed by a literal
     * {@code <body>} opening tag, followed by the markup of every body node that
     * lies completely before the first {@code <a id="summary">} element in
     * document order.
     *
     * <p>Each completed subtree is emitted exactly once. Elements that merely
     * <em>contain</em> the anchor are not emitted themselves (their opening tag
     * is omitted); only their children that precede the anchor are. If the
     * document has no such anchor, all children of the body are emitted. The
     * {@code <body>} tag is intentionally left unclosed, because the result is a
     * prefix of the document.
     *
     * @param html the HTML source to parse
     * @return the head content plus the body markup preceding the summary anchor
     */
    public static String parse(String html) {
        Document document = Jsoup.parse(html);
        final StringBuilder buffer = new StringBuilder();
        buffer.append(document.head().html());
        buffer.append("<body>");

        // For every node that is currently "open" (head seen, tail not yet seen)
        // this holds the buffer length at the moment the node was entered.
        final Deque<Integer> startOffsets = new ArrayDeque<Integer>();

        // NOTE(review): newer jsoup releases offer a static
        // NodeTraversor.traverse(visitor, root); the instance form is kept to match
        // the API this file was written against - confirm the project's version.
        NodeTraversor nd = new NodeTraversor(new NodeVisitor() {

            private boolean finished = false;

            @Override
            public void head(Node node, int depth) {
                if (finished) {
                    return;
                }
                // Detect the anchor on entry so that nothing inside it is emitted.
                if (isSummaryAnchor(node)) {
                    finished = true;
                    return;
                }
                startOffsets.push(buffer.length());
            }

            @Override
            public void tail(Node node, int depth) {
                if (finished) {
                    return;
                }
                // While not finished, every tail() pairs with a push made in head().
                int start = startOffsets.pop();
                // Depth 0 is the traversal root (the body); its opening tag was
                // already written by hand, so only its children are kept.
                if (depth > 0) {
                    // The whole subtree precedes the anchor: drop whatever its
                    // descendants wrote and emit the subtree once, as a unit.
                    buffer.setLength(start);
                    buffer.append(node.outerHtml());
                }
            }
        });
        nd.traverse(document.body());
        return buffer.toString();
    }

    /** Tells whether {@code node} is an {@code <a>} element whose id is {@code summary}. */
    private static boolean isSummaryAnchor(Node node) {
        if (!(node instanceof Element)) {
            return false;
        }
        Element element = (Element) node;
        // attr() yields an empty string for a missing attribute, so no hasAttr() check is needed.
        return "a".equals(element.tagName()) && "summary".equals(element.attr("id"));
    }
}

这种写法不算特别优雅(尤其是 buffer.append("<body>"); 这一步)……但胜在简单快捷 :)

另请参阅此答案以获取相关示例。

于 2012-10-19T10:52:51.207 回答
0

我不确定,但你可以试试这个。循环会依次输出各个元素,一旦遇到 "a" 标签的元素,就在这个边界处停止输出。

// Print the selected elements in order, up to and including the first <a>
// element; nothing after that boundary is printed.
Elements doc = select("what u want");
for (Element current : doc) {
    System.out.println(current);
    // The first anchor marks the boundary, so the loop can end here.
    if ("a".equals(current.tagName())) {
        break;
    }
}
于 2015-10-07T04:17:36.707 回答