我是第一次做数据抓取。我的任务是从包含多个链接(例如“帮助”“单击此处”等)的网页中获取特定的 URL。如何只获取特定的网址而忽略其他无关的链接?在此链接的页面中,我只想获取标题为“美国证券交易委员会通过对豁免发行框架的更改”的那一个链接,而忽略其他链接。我如何在 Java 中做到这一点?我已经能够提取所有 URL,但不确定如何只获取特定的 URL。下面是我的代码:
while (rs.next()) {
String Content = rs.getString("Content");
doc = Jsoup.parse(Content);
//email extract
Pattern p = Pattern.compile("[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+");
Matcher matcher = p.matcher(doc.text());
Set<String> emails = new HashSet<String>();
while (matcher.find()) {
emails.add(matcher.group());
}
System.out.println(emails);
//title extract
String title = doc.title();
System.out.println("Title: " + title);
}
Elements links = doc.select("a");
for(Element link: links) {
String url = link.attr("href");
System.out.println("\nlink :"+ url);
System.out.println("text: " + link.text());
}
System.out.println("Getting all the images");
Elements image = doc.getElementsByTag("img");
for(Element src:image) {
System.out.println("src "+ src.attr("abs:src"));
}