You need to write two classes: a controller and a crawler.

Here is the Controller.java file:
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class Controller {
    public static void main(String[] args) throws Exception {
        String crawlStorageFolder = "/crawler/testdata"; // intermediate crawl data is stored here
        int numberOfCrawlers = 4;                        // number of concurrent crawler threads

        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder(crawlStorageFolder);

        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        // Optional debug output: the default robots.txt cache size and user-agent name
        System.out.println(robotstxtConfig.getCacheSize());
        System.out.println(robotstxtConfig.getUserAgentName());
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);

        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        // Seed URL(s) where the crawl starts
        controller.addSeed("http://cyesilkaya.wordpress.com/");

        // Blocks until the crawl is finished
        controller.start(Crawler.class, numberOfCrawlers);
    }
}
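If you want to bound the crawl, CrawlConfig has a few tuning methods you can call before creating the PageFetcher. A minimal sketch (the values below are arbitrary examples, not requirements):

// Optional tuning, placed right after config.setCrawlStorageFolder(...):
config.setPolitenessDelay(1000);   // wait 1000 ms between requests to the same host
config.setMaxDepthOfCrawling(2);   // follow links at most 2 hops from the seed
config.setMaxPagesToFetch(1000);   // stop after fetching 1000 pages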
And here is the Crawler.java file:
import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

public class Crawler extends WebCrawler {

    @Override
    public boolean shouldVisit(WebURL url) {
        // You can write your own filter here to decide whether or not to
        // crawl the incoming URL (a sample filter is sketched after this class).
        return true;
    }

    @Override
    public void visit(Page page) {
        String url = page.getWebURL().getURL();
        try {
            // Do whatever you want with the crawled page. As one example
            // (and the reason for the jsoup imports), re-fetch and parse it:
            Document doc = Jsoup.connect(url).get();
            System.out.println(url + " -> " + doc.title());
            for (Element link : doc.select("a[href]")) {
                System.out.println("  link: " + link.attr("abs:href"));
            }
        } catch (IOException e) {
            // Jsoup.connect(...).get() throws IOException on network errors
        }
    }
}
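If you don't want to visit every URL, a typical shouldVisit filter (modeled on crawler4j's own BasicCrawler example) skips static resources and stays on the seed's domain. The domain below is just this answer's seed, so adjust it to yours:

// Inside Crawler; also requires: import java.util.regex.Pattern;
private static final Pattern FILTERS =
        Pattern.compile(".*(\\.(css|js|gif|jpg|png|mp3|zip|gz))$");

@Override
public boolean shouldVisit(WebURL url) {
    String href = url.getURL().toLowerCase();
    // Skip common binary/static resources and stay on the seed domain
    return !FILTERS.matcher(href).find()
            && href.startsWith("http://cyesilkaya.wordpress.com/");
}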
Then just run Controller.class.