Why not just strip everything related to Robotstxt out of crawler4j? I needed to crawl a site and ignore robots.txt, and this worked for me.
I changed CrawlController and WebCrawler in the .crawler package like this:
WebCrawler.java:
Remove
private RobotstxtServer robotstxtServer;
Remove
this.robotstxtServer = crawlController.getRobotstxtServer();
Edit
if ((shouldVisit(webURL)) && (this.robotstxtServer.allows(webURL)))
-->
if ((shouldVisit(webURL)))
Edit
if (((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) &&
(shouldVisit(webURL)) && (this.robotstxtServer.allows(webURL)))
-->
if (((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) &&
(shouldVisit(webURL)))
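With the robots.txt check gone, shouldVisit is the only gate that decides which discovered links get scheduled. As a rough sketch (assuming the older single-argument shouldVisit(WebURL) signature these snippets use; the class name and domain filter are just placeholders), a crawler subclass might look like this:

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

public class MyCrawler extends WebCrawler {

    // Placeholder filter: only follow links on one host.
    @Override
    public boolean shouldVisit(WebURL url) {
        return url.getURL().startsWith("http://www.example.com/");
    }

    @Override
    public void visit(Page page) {
        // Do something with the fetched page; here we only print its URL.
        System.out.println("Visited: " + page.getWebURL().getURL());
    }
}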
CrawlController.java:
Remove
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
Remove
protected RobotstxtServer robotstxtServer;
Edit
public CrawlController(CrawlConfig config, PageFetcher pageFetcher, RobotstxtServer robotstxtServer) throws Exception
-->
public CrawlController(CrawlConfig config, PageFetcher pageFetcher) throws Exception
Remove
this.robotstxtServer = robotstxtServer;
Edit
if (!this.robotstxtServer.allows(webUrl))
{
logger.info("Robots.txt does not allow this seed: " + pageUrl);
}
else
{
this.frontier.schedule(webUrl);
}
-->
this.frontier.schedule(webUrl);
Remove
public RobotstxtServer getRobotstxtServer()
{
return this.robotstxtServer;
}
public void setRobotstxtServer(RobotstxtServer robotstxtServer)
{
this.robotstxtServer = robotstxtServer;
}
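After these changes the controller no longer needs (or accepts) a RobotstxtServer. A minimal sketch of the call site using the patched two-argument constructor (storage folder, seed URL and thread count are placeholders):

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;

public class Controller {
    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/tmp/crawler4j"); // placeholder storage folder

        PageFetcher pageFetcher = new PageFetcher(config);

        // Patched constructor: no RobotstxtServer is created or passed in,
        // so robots.txt is never consulted when scheduling seeds or links.
        CrawlController controller = new CrawlController(config, pageFetcher);

        controller.addSeed("http://www.example.com/"); // placeholder seed
        controller.start(MyCrawler.class, 5);          // 5 crawler threads (arbitrary)
    }
}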
Hope this is what you were looking for.