0

I have a webpage whose source contains several similar structures like the one below:

<!-- One sample table row: the seventh cell holds the text to match ("STRING TO CAPTURE"),
     and the eighth (last) cell holds the link to extract (href="AnotherLink"). -->
<tr>
<td width="10%" bgcolor="#FFFFFF"><font class="bodytext9">1-Jun-2013</font></td>
<td width="4%" bgcolor="#FFFFFF" align=center><font class="bodytext9">Sat</font></td>
<td width="5%" bgcolor="#FFFFFF" align="center"></td>
<td width="5%" bgcolor="#FFFFFF" align="center"><font class="bodytext9">Another Text</font></td>
<td width="5%" bgcolor="#FFFFFF" align="center"><font class="bodytext9"><img src="img/colors/white.gif"></font></td>
<td width="15%" bgcolor="#FFFFFF" align="center"><a class="black_9" href="link2">Here is also Text</a></td>
<td width="15%" bgcolor="#FFFFFF" align="center"><a href="LINKtoWeb" class=list><u>STRING TO CAPTURE</u></a></td>
<td width="4%" bgcolor="#FFFFFF" align="center"><a target="_new" href="AnotherLink"><img src="img/img2.gif" border="0"></a></td>
</tr>

This kind of structure is repeated many times with different text inside, but I only want to extract this set, because it is where the text "STRING TO CAPTURE" appears for the FIRST TIME. So how do I use Jsoup to extract only this set and the visible text inside it, as well as the URL

AnotherLink

in the row where the text "STRING TO CAPTURE" appears? I am new to Jsoup, so I have only tried this:

  // Fetch the page over HTTP and parse it into a DOM.
  Document doc = Jsoup.connect("http://www.website.com").get();

// Only the very first <a> in the whole document is picked, not the one next to the wanted text.
Element link = doc.select("a").first();
// The href value exactly as written in the markup.
String relHref = link.attr("href"); 
// The "abs:" prefix asks jsoup to resolve the href against the document's base URI.
String absHref = link.attr("abs:href"); 
// Text of the entire <body>, i.e. every row of the page rather than a single row.
String text = doc.body().text();
// Same attribute as relHref, read a second time.
String linkHref = link.attr("href"); 
// Text inside that first link.
String linkText = link.text(); 

  // Prints the first link element and the whole-page text; the other locals are not used in this snippet.
  System.out.println("link:" + link);
  System.out.println("text:" + text);

but I can't take it any further for this purpose. Please give me some advice! Thank you!

4

1 回答 1

1

使用此测试输入:

String test = "<html><body><table>";
test += "<tr>";
test += "<td width=\"10%\" bgcolor=\"#FFFFFF\"><font class=\"bodytext9\">1-Jun-2013</font></td>";
test += "<td width=\"4%\" bgcolor=\"#FFFFFF\" align=center><font class=\"bodytext9\">Sat</font></td>";
test += "<td width=\"5%\" bgcolor=\"#FFFFFF\" align=\"center\"></td>";
test += "<td width=\"5%\" bgcolor=\"#FFFFFF\" align=\"center\"><font class=\"bodytext9\">Another Text</font></td>";
test += "<td width=\"5%\" bgcolor=\"#FFFFFF\" align=\"center\"><font class=\"bodytext9\"><img src=\"img/colors/white.gif\"></font></td>";
test += "<td width=\"15%\" bgcolor=\"#FFFFFF\" align=\"center\"><a class=\"black_9\" href=\"link2\">Here is also Text</a></td>";
test += "<td width=\"15%\" bgcolor=\"#FFFFFF\" align=\"center\"><a href=\"LINKtoWeb\" class=list><u>TEXT THAT DOESN'T MATCH</u></a></td>";
test += "<td width=\"4%\" bgcolor=\"#FFFFFF\" align=\"center\"><a target=\"_new\" href=\"NotMatchLink\"><img src=\"img/img2.gif\" border=\"0\"></a></td>";
test += "</tr>";
test += "<tr>";
test += "<td width=\"10%\" bgcolor=\"#FFFFFF\"><font class=\"bodytext9\">1-Jun-2013</font></td>";
test += "<td width=\"4%\" bgcolor=\"#FFFFFF\" align=center><font class=\"bodytext9\">Sat</font></td>";
test += "<td width=\"5%\" bgcolor=\"#FFFFFF\" align=\"center\"></td>";
test += "<td width=\"5%\" bgcolor=\"#FFFFFF\" align=\"center\"><font class=\"bodytext9\">Another Text</font></td>";
test += "<td width=\"5%\" bgcolor=\"#FFFFFF\" align=\"center\"><font class=\"bodytext9\"><img src=\"img/colors/white.gif\"></font></td>";
test += "<td width=\"15%\" bgcolor=\"#FFFFFF\" align=\"center\"><a class=\"black_9\" href=\"link2\">Here is also Text</a></td>";
test += "<td width=\"15%\" bgcolor=\"#FFFFFF\" align=\"center\"><a href=\"LINKtoWeb\" class=list><u>STRING TO CAPTURE</u></a></td>";
test += "<td width=\"4%\" bgcolor=\"#FFFFFF\" align=\"center\"><a target=\"_new\" href=\"AnotherLink\"><img src=\"img/img2.gif\" border=\"0\"></a></td>";
test += "</tr>";
test += "<tr>";
test += "<td width=\"10%\" bgcolor=\"#FFFFFF\"><font class=\"bodytext9\">1-Jun-2013</font></td>";
test += "<td width=\"4%\" bgcolor=\"#FFFFFF\" align=center><font class=\"bodytext9\">Sat</font></td>";
test += "<td width=\"5%\" bgcolor=\"#FFFFFF\" align=\"center\"></td>";
test += "<td width=\"5%\" bgcolor=\"#FFFFFF\" align=\"center\"><font class=\"bodytext9\">Another Text</font></td>";
test += "<td width=\"5%\" bgcolor=\"#FFFFFF\" align=\"center\"><font class=\"bodytext9\"><img src=\"img/colors/white.gif\"></font></td>";
test += "<td width=\"15%\" bgcolor=\"#FFFFFF\" align=\"center\"><a class=\"black_9\" href=\"link2\">Here is also Text</a></td>";
test += "<td width=\"15%\" bgcolor=\"#FFFFFF\" align=\"center\"><a href=\"LINKtoWeb\" class=list><u>MORE TEXT THAT DOESN'T MATCH</u></a></td>";
test += "<td width=\"4%\" bgcolor=\"#FFFFFF\" align=\"center\"><a target=\"_new\" href=\"NotMatchLink\"><img src=\"img/img2.gif\" border=\"0\"></a></td>";
test += "</tr>";
test += "<tr>";
test += "<td width=\"10%\" bgcolor=\"#FFFFFF\"><font class=\"bodytext9\">1-Jun-2013</font></td>";
test += "<td width=\"4%\" bgcolor=\"#FFFFFF\" align=center><font class=\"bodytext9\">Sat</font></td>";
test += "<td width=\"5%\" bgcolor=\"#FFFFFF\" align=\"center\"></td>";
test += "<td width=\"5%\" bgcolor=\"#FFFFFF\" align=\"center\"><font class=\"bodytext9\">Another Text</font></td>";
test += "<td width=\"5%\" bgcolor=\"#FFFFFF\" align=\"center\"><font class=\"bodytext9\"><img src=\"img/colors/white.gif\"></font></td>";
test += "<td width=\"15%\" bgcolor=\"#FFFFFF\" align=\"center\"><a class=\"black_9\" href=\"link2\">Here is also Text</a></td>";
test += "<td width=\"15%\" bgcolor=\"#FFFFFF\" align=\"center\"><a href=\"LINKtoWeb\" class=list><u>STILL MORE TEXT THAT DOESN'T MATCH</u></a></td>";
test += "<td width=\"4%\" bgcolor=\"#FFFFFF\" align=\"center\"><a target=\"_new\" href=\"NotMatchLink\"><img src=\"img/img2.gif\" border=\"0\"></a></td>";
test += "</tr>";
test += "</table></body></html>";
test += "<td width=\"10%\" bgcolor=\"#FFFFFF\"><font class=\"bodytext9\">Second 1-Jun-2013</font></td>";
test += "<td width=\"4%\" bgcolor=\"#FFFFFF\" align=center><font class=\"bodytext9\">Second Sat</font></td>";
test += "<td width=\"5%\" bgcolor=\"#FFFFFF\" align=\"center\"></td>";
test += "<td width=\"5%\" bgcolor=\"#FFFFFF\" align=\"center\"><font class=\"bodytext9\">Second Another Text</font></td>";
test += "<td width=\"5%\" bgcolor=\"#FFFFFF\" align=\"center\"><font class=\"bodytext9\"><img src=\"img/colors/white.gif\"></font></td>";
test += "<td width=\"15%\" bgcolor=\"#FFFFFF\" align=\"center\"><a class=\"black_9\" href=\"link2\">Second Here is also Text</a></td>";
test += "<td width=\"15%\" bgcolor=\"#FFFFFF\" align=\"center\"><a href=\"LINKtoWeb\" class=list><u>STRING TO CAPTURE</u></a></td>";
test += "<td width=\"4%\" bgcolor=\"#FFFFFF\" align=\"center\"><a target=\"_new\" href=\"SecondAnotherLink\"><img src=\"img/img2.gif\" border=\"0\"></a></td>";
test += "</tr>";

而这段代码:

// Parse the test markup and keep the first table row whose text contains the marker.
final Document parsed = Jsoup.parse(test);
final Elements matchingRows = parsed.select("tr:contains(STRING TO CAPTURE)");
final Element firstMatch = matchingRows.get(0);

// Print the visible text of every cell of that row, in document order.
final Elements cells = firstMatch.select("td");
for (int i = 0; i < cells.size(); i++) {
    System.out.println("Column text is: " + cells.get(i).text());
}

// The wanted link is a direct child of the cell immediately following the marker cell.
final String targetHref = firstMatch
        .select("td:contains(STRING TO CAPTURE) + td > a[href]")
        .attr("href");
System.out.println("Target link is: " + targetHref);

它输出:

Column text is: 1-Jun-2013
Column text is: Sat
Column text is: 
Column text is: Another Text
Column text is: 
Column text is: Here is also Text
Column text is: STRING TO CAPTURE
Column text is: 
Target link is: AnotherLink
于 2013-05-30T12:55:08.910 回答