在我的项目中,stormcrawler maven 原型似乎与 warc 模块不兼容。目前它只创建名称为“crawl-20180802121925-00000.warc.gz”的空 0 字节文件。我在这里错过了什么吗?
我尝试先像下面这样创建一个默认项目,然后启用 WARC 文件写入:
mvn archetype:generate -DarchetypeGroupId=com.digitalpebble.stormcrawler -DarchetypeArtifactId=storm-crawler-archetype -DarchetypeVersion=1.10
然后像这样在 pom.xml 中添加对 warc 模块的依赖:
    <dependency>
        <groupId>com.digitalpebble.stormcrawler</groupId>
        <artifactId>storm-crawler-warc</artifactId>
        <version>1.10</version>
    </dependency>
然后我在拓扑中添加了 WARCHdfsBolt,让它通过 localOrShuffleGrouping 订阅 fetch 的输出,并尝试把结果写入本地文件系统的目录:
/**
 * Small demo crawl topology.
 *
 * <p>A fixed list of seed URLs is partitioned and fetched; the output of the
 * {@code fetch} bolt is consumed both by a WARC writer and by the
 * sitemap -> feeds -> parse -> index chain. Every processing bolt also feeds
 * the status stream into a single {@code status} bolt.
 */
public class CrawlTopology extends ConfigurableTopology {

    /** Directory passed to the WARC file name format. */
    private static final String WARC_DIR = "/Users/user/Documents/workspace/test/warc";

    /**
     * Entry point: hands a new topology instance and the command-line
     * arguments to {@link ConfigurableTopology#start}.
     *
     * @param args command-line arguments, passed through unchanged
     * @throws Exception propagated from {@code ConfigurableTopology.start}
     */
    public static void main(String[] args) throws Exception {
        CrawlTopology topology = new CrawlTopology();
        ConfigurableTopology.start(topology, args);
    }

    /**
     * Wires the spout and bolts together and submits the result under the
     * name {@code "crawl"}.
     *
     * @param args command-line arguments (not read by this method)
     * @return whatever {@code submit} returns
     */
    @Override
    protected int run(String[] args) {
        String[] seeds = {
            "http://www.lequipe.fr/",
            "http://www.lemonde.fr/",
            "http://www.bbc.co.uk/",
            "http://storm.apache.org/",
            "http://digitalpebble.com/"
        };
        Fields byKey = new Fields("key");
        Fields byUrl = new Fields("url");

        TopologyBuilder topology = new TopologyBuilder();

        // Source and fetching stage.
        topology.setSpout("spout", new MemorySpout(seeds));
        topology.setBolt("partitioner", new URLPartitionerBolt()).shuffleGrouping("spout");
        topology.setBolt("fetch", new FetcherBolt()).fieldsGrouping("partitioner", byKey);

        // The WARC writer listens to the fetcher directly, next to the sitemap bolt.
        topology.setBolt("warc", getWarcBolt()).localOrShuffleGrouping("fetch");

        // Parsing chain: fetch -> sitemap -> feeds -> parse -> index.
        topology.setBolt("sitemap", new SiteMapParserBolt()).localOrShuffleGrouping("fetch");
        topology.setBolt("feeds", new FeedParserBolt()).localOrShuffleGrouping("sitemap");
        topology.setBolt("parse", new JSoupParserBolt()).localOrShuffleGrouping("feeds");
        topology.setBolt("index", new StdOutIndexer()).localOrShuffleGrouping("parse");

        // Status updates from every stage, grouped by URL.
        // MemoryStatusUpdater is an alternative for simple recursive crawls.
        topology.setBolt("status", new StdOutStatusUpdater())
                .fieldsGrouping("fetch", Constants.StatusStreamName, byUrl)
                .fieldsGrouping("sitemap", Constants.StatusStreamName, byUrl)
                .fieldsGrouping("feeds", Constants.StatusStreamName, byUrl)
                .fieldsGrouping("parse", Constants.StatusStreamName, byUrl)
                .fieldsGrouping("index", Constants.StatusStreamName, byUrl);

        return submit("crawl", conf, topology);
    }

    /**
     * Builds the WARC-writing bolt: file name format rooted at
     * {@link #WARC_DIR}, two header fields, and a 50 MB size-based rotation
     * policy.
     *
     * <p>NOTE(review): no sync policy is configured here, so when data is
     * flushed to disk depends entirely on the bolt's defaults. With only a
     * handful of seed URLs this may be related to the empty output files;
     * confirm against the WARCHdfsBolt / storm-hdfs documentation.
     *
     * @return the configured bolt
     */
    private WARCHdfsBolt getWarcBolt() {
        Map<String, String> headerFields = new HashMap<>();
        headerFields.put("software:", "StormCrawler 1.0 http://stormcrawler.net/");
        headerFields.put("conformsTo:", "http://www.archive.org/documents/WarcFileFormat-1.0.html");

        FileNameFormat nameFormat = new WARCFileNameFormat().withPath(WARC_DIR);

        // Size-based rotation; per the original author's note, 1 GB is used
        // when no custom maximum is given.
        FileSizeRotationPolicy rotateAt50Mb =
                new FileSizeRotationPolicy(50.0f, FileSizeRotationPolicy.Units.MB);

        WARCHdfsBolt bolt = (WARCHdfsBolt) new WARCHdfsBolt().withFileNameFormat(nameFormat);
        bolt.withHeader(headerFields);

        // No filesystem URL is set, so (per the original author's note) the
        // local filesystem is used. To target HDFS instead, call e.g.
        // bolt.withFsUrl("hdfs://localhost:9000").
        bolt.withRotationPolicy(rotateAt50Mb);

        return bolt;
    }
}
无论是否使用 Flux,在本地运行的结果似乎都没有区别。您可以在这里查看演示仓库:https://github.com/keyboardsamurai/storm-test-warc