1 .Fetch all contents from a Webpage
2. fetch hyperlinks from the webpage.
3. Repeat the 1 & 2 from the fetched hyperlink
4. repeat the process untill 200 hyperlinks regietered or no more hyperlink to fetch.
我写了一个示例程序,但由于对递归的理解不足,我的循环变成了无限循环。建议我解决符合预期的代码。
import java.net.URL;
import java.net.URLConnection;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class Content
{
private static final String HTML_A_HREF_TAG_PATTERN =
"\\s*(?i)href\\s*=\\s*(\"([^\"]*\")|'[^']*'|([^'\">\\s]+))";
Pattern pattern;
public Content ()
{
pattern = Pattern.compile(HTML_A_HREF_TAG_PATTERN);
}
private void fetchContentFromURL(String strLink) {
String content = null;
URLConnection connection = null;
try {
connection = new URL(strLink).openConnection();
Scanner scanner = new Scanner(connection.getInputStream());
scanner.useDelimiter("\\Z");
content = scanner.next();
}catch ( Exception ex ) {
ex.printStackTrace();
return;
}
fetchURL(content);
}
private void fetchURL ( String content )
{
Matcher matcher = pattern.matcher( content );
while(matcher.find()) {
String group = matcher.group();
if(group.toLowerCase().contains( "http" ) || group.toLowerCase().contains( "https" )) {
group = group.substring( group.indexOf( "=" )+1 );
group = group.replaceAll( "'", "" );
group = group.replaceAll( "\"", "" );
System.out.println("lINK "+group);
fetchContentFromURL(group);
}
}
System.out.println("DONE");
}
/**
* @param args
*/
public static void main ( String[] args )
{
new Content().fetchContentFromURL( "http://www.google.co.in" );
}
}
我也对任何其他解决方案持开放态度,但只想坚持使用核心 java Api 而不是第 3 方。