我正在 Grails 上的 Groovy 中制作一个爬虫应用程序。我正在使用 Crawler4j 并遵循本教程。
- 我创建了一个新的 grails 项目
- 将 BasicCrawlController.groovy 文件放入 controllers->package
- 没有创建任何视图,因为我预计在运行应用程序时,我的爬取数据会出现在我的crawlStorageFolder中(如果我的理解有缺陷,请纠正我)
之后我只是通过做运行应用程序,run-app
但我没有在任何地方看到任何爬网数据。
- 我是否期望在我提供为 C:/crawl/crawler4jStorage 的 crawlStorageFolder 位置创建一些文件?
- 我需要为此创建任何视图吗?
- 如果我想在单击表单的提交按钮时从其他视图调用这个爬虫控制器,我可以写
<g:form name="submitWebsite" url="[controller:'BasicCrawlController ']">
吗?
我问这个是因为我在这个控制器中没有任何方法,那么它是调用这个控制器的正确方法吗?
我的代码如下:
//All necessary imports
public class BasicCrawlController {
static main(args) throws Exception {
String crawlStorageFolder = "C:/crawl/crawler4jStorage";
int numberOfCrawlers = 1;
//int maxDepthOfCrawling = -1; default
CrawlConfig config = new CrawlConfig();
config.setCrawlStorageFolder(crawlStorageFolder);
config.setPolitenessDelay(1000);
config.setMaxPagesToFetch(100);
config.setResumableCrawling(false);
PageFetcher pageFetcher = new PageFetcher(config);
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
controller.addSeed("http://en.wikipedia.org/wiki/Web_crawler")
controller.start(BasicCrawler.class, 1);
}
}
class BasicCrawler extends WebCrawler {
final static Pattern FILTERS = Pattern
.compile(".*(\\.(css|js|bmp|gif|jpe?g"+ "|png|tiff?|mid|mp2|mp3|mp4" +
"|wav|avi|mov|mpeg|ram|m4v|pdf" +"|rm|smil|wmv|swf|wma|zip|rar|gz))\$")
/**
* You should implement this function to specify whether the given url
* should be crawled or not (based on your crawling logic).
*/
@Override
boolean shouldVisit(WebURL url) {
String href = url.getURL().toLowerCase()
!FILTERS.matcher(href).matches() && href.startsWith("http://en.wikipedia.org/wiki/Web_crawler/")
}
/**
* This function is called when a page is fetched and ready to be processed
* by your program.
*/
@Override
void visit(Page page) {
int docid = page.getWebURL().getDocid()
String url = page.getWebURL().getURL()
String domain = page.getWebURL().getDomain()
String path = page.getWebURL().getPath()
String subDomain = page.getWebURL().getSubDomain()
String parentUrl = page.getWebURL().getParentUrl()
String anchor = page.getWebURL().getAnchor()
println("Docid: ${docid} ")
println("URL: ${url} ")
println("Domain: '${domain}'")
println("Sub-domain: ' ${subDomain}'")
println("Path: '${path}'")
println("Parent page:${parentUrl} ")
println("Anchor text: ${anchor} " )
if (page.getParseData() instanceof HtmlParseData) {
HtmlParseData htmlParseData = (HtmlParseData) page.getParseData()
String text = htmlParseData.getText()
String html = htmlParseData.getHtml()
List<WebURL> links = htmlParseData.getOutgoingUrls()
println("Text length: " + text.length())
println("Html length: " + html.length())
println("Number of outgoing links: " + links.size())
}
Header[] responseHeaders = page.getFetchResponseHeaders()
if (responseHeaders != null) {
println("Response headers:")
for (Header header : responseHeaders) {
println("\t ${header.getName()} : ${header.getValue()}")
}
}
println("=============")
}
}