
I am building a crawler application in Groovy on Grails. I am using Crawler4j and following this tutorial.

  1. I created a new Grails project.
  2. Put the BasicCrawlController.groovy file into controllers -> package.
  3. Did not create any view, because I expected my crawled data to show up in my crawlStorageFolder when I ran the application (please correct me if my understanding is flawed).

After that I simply ran the application with run-app, but I did not see any crawled data anywhere.

  1. Am I right to expect some files to be created at the crawlStorageFolder location, which I set to C:/crawl/crawler4jStorage?
  2. Do I need to create any view for this?
  3. If I want to invoke this crawler controller from another view when a form's submit button is clicked, can I write <g:form name="submitWebsite" url="[controller:'BasicCrawlController ']">?

I am asking because I don't have any action method in this controller, so is that the right way to call it?

My code is as follows:

    //All necessary imports



    public class BasicCrawlController {
        static main(args) throws Exception {
            String crawlStorageFolder = "C:/crawl/crawler4jStorage";
            int numberOfCrawlers = 1;
            //int maxDepthOfCrawling = -1;    default
            CrawlConfig config = new CrawlConfig();
            config.setCrawlStorageFolder(crawlStorageFolder);
            config.setPolitenessDelay(1000);
            config.setMaxPagesToFetch(100);
            config.setResumableCrawling(false);
            PageFetcher pageFetcher = new PageFetcher(config);
            RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
            RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
            CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
            controller.addSeed("http://en.wikipedia.org/wiki/Web_crawler")
            controller.start(BasicCrawler.class, 1);

        }
    }


    class BasicCrawler extends WebCrawler {

    final static Pattern FILTERS = Pattern
    .compile(".*(\\.(css|js|bmp|gif|jpe?g"+ "|png|tiff?|mid|mp2|mp3|mp4" +
             "|wav|avi|mov|mpeg|ram|m4v|pdf" +"|rm|smil|wmv|swf|wma|zip|rar|gz))\$")

    /**
     * You should implement this function to specify whether the given url
     * should be crawled or not (based on your crawling logic).
     */
    @Override
    boolean shouldVisit(WebURL url) {
        String href = url.getURL().toLowerCase()
        !FILTERS.matcher(href).matches() && href.startsWith("http://en.wikipedia.org/wiki/Web_crawler/")
    }

    /**
     * This function is called when a page is fetched and ready to be processed
     * by your program.
     */
    @Override
    void visit(Page page) {
        int docid = page.getWebURL().getDocid()
        String url = page.getWebURL().getURL()
        String domain = page.getWebURL().getDomain()
        String path = page.getWebURL().getPath()
        String subDomain = page.getWebURL().getSubDomain()
        String parentUrl = page.getWebURL().getParentUrl()
        String anchor = page.getWebURL().getAnchor()

        println("Docid: ${docid} ")
        println("URL: ${url}  ")
        println("Domain: '${domain}'")
        println("Sub-domain: ' ${subDomain}'")
        println("Path: '${path}'")
        println("Parent page:${parentUrl}  ")
        println("Anchor text: ${anchor} " )

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData()
            String text = htmlParseData.getText()
            String html = htmlParseData.getHtml()
            List<WebURL> links = htmlParseData.getOutgoingUrls()

            println("Text length: " + text.length())
            println("Html length: " + html.length())
            println("Number of outgoing links: " + links.size())
        }
        Header[] responseHeaders = page.getFetchResponseHeaders()
        if (responseHeaders != null) {
            println("Response headers:")
            for (Header header : responseHeaders) {
                println("\t ${header.getName()} : ${header.getValue()}")
            }
        }
        println("=============")
    }
}

1 Answer


I'll try to translate your code into the Grails standard.

Use this under grails-app/controllers:

    import edu.uci.ics.crawler4j.crawler.CrawlConfig
    import edu.uci.ics.crawler4j.crawler.CrawlController
    import edu.uci.ics.crawler4j.fetcher.PageFetcher
    import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig
    import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer

    class BasicCrawlController {

        def index() {
            String crawlStorageFolder = "C:/crawl/crawler4jStorage"
            int numberOfCrawlers = 1
            //int maxDepthOfCrawling = -1    // default
            CrawlConfig crawlConfig = new CrawlConfig()
            crawlConfig.setCrawlStorageFolder(crawlStorageFolder)
            crawlConfig.setPolitenessDelay(1000)
            crawlConfig.setMaxPagesToFetch(100)
            crawlConfig.setResumableCrawling(false)
            PageFetcher pageFetcher = new PageFetcher(crawlConfig)
            RobotstxtConfig robotstxtConfig = new RobotstxtConfig()
            RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher)
            CrawlController controller = new CrawlController(crawlConfig, pageFetcher, robotstxtServer)
            controller.addSeed("http://en.wikipedia.org/wiki/Web_crawler")
            controller.start(BasicCrawler.class, numberOfCrawlers)

            render "done crawling"
        }
    }
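For the crawler4j classes to resolve, the library has to be on the project's classpath. In a Grails 2 project that usually means a dependency entry in grails-app/conf/BuildConfig.groovy, roughly like the sketch below (the version number is only an example; use whichever crawler4j release you actually have):

    // grails-app/conf/BuildConfig.groovy (excerpt)
    grails.project.dependency.resolution = {
        // ... repositories, plugins, etc. ...
        dependencies {
            // crawler4j and its transitive dependencies; the version here is illustrative
            compile 'edu.uci.ics:crawler4j:3.5'
        }
    }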

And use this under src/groovy:

    import edu.uci.ics.crawler4j.crawler.Page
    import edu.uci.ics.crawler4j.crawler.WebCrawler
    import edu.uci.ics.crawler4j.parser.HtmlParseData
    import edu.uci.ics.crawler4j.url.WebURL
    import org.apache.http.Header

    import java.util.regex.Pattern

    class BasicCrawler extends WebCrawler {

        final static Pattern FILTERS = Pattern.compile(
            ".*(\\.(css|js|bmp|gif|jpe?g|png|tiff?|mid|mp2|mp3|mp4" +
            "|wav|avi|mov|mpeg|ram|m4v|pdf|rm|smil|wmv|swf|wma|zip|rar|gz))\$")

        /**
         * You should implement this function to specify whether the given url
         * should be crawled or not (based on your crawling logic).
         */
        @Override
        boolean shouldVisit(WebURL url) {
            String href = url.getURL().toLowerCase()
            !FILTERS.matcher(href).matches() && href.startsWith("http://en.wikipedia.org/wiki/Web_crawler/")
        }

        /**
         * This function is called when a page is fetched and ready to be processed
         * by your program.
         */
        @Override
        void visit(Page page) {
            int docid = page.getWebURL().getDocid()
            String url = page.getWebURL().getURL()
            String domain = page.getWebURL().getDomain()
            String path = page.getWebURL().getPath()
            String subDomain = page.getWebURL().getSubDomain()
            String parentUrl = page.getWebURL().getParentUrl()
            String anchor = page.getWebURL().getAnchor()

            println("Docid: ${docid}")
            println("URL: ${url}")
            println("Domain: '${domain}'")
            println("Sub-domain: '${subDomain}'")
            println("Path: '${path}'")
            println("Parent page: ${parentUrl}")
            println("Anchor text: ${anchor}")

            if (page.getParseData() instanceof HtmlParseData) {
                HtmlParseData htmlParseData = (HtmlParseData) page.getParseData()
                String text = htmlParseData.getText()
                String html = htmlParseData.getHtml()
                List<WebURL> links = htmlParseData.getOutgoingUrls()

                println("Text length: " + text.length())
                println("Html length: " + html.length())
                println("Number of outgoing links: " + links.size())
            }
            Header[] responseHeaders = page.getFetchResponseHeaders()
            if (responseHeaders != null) {
                println("Response headers:")
                for (Header header : responseHeaders) {
                    println("\t${header.getName()} : ${header.getValue()}")
                }
            }
            println("=============")
        }
    }
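As for your third question: with the index action above in place, you don't need the url map pointing at 'BasicCrawlController '. A GSP form (or even a plain g:link) can target the controller by its logical name, basicCrawl, and the default index action. A minimal sketch, with the view and field names made up purely for illustration:

    <%-- somewhere in one of your GSP views; names are only for illustration --%>
    <g:form name="submitWebsite" controller="basicCrawl" action="index">
        <g:submitButton name="start" value="Start crawling"/>
    </g:form>

Submitting the form invokes index(), the crawl runs synchronously, and the browser receives the "done crawling" response once controller.start() returns.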
Answered on 2014-06-27 at 13:07.