web-crawler - Stormcrawler：用于解析 PDF 属性的 Apache Tika

Question

我把 Tika 作为依赖添加到了我的 StormCrawler 实现中，这样在爬取时就可以抓取 PDF 文档。但是，Title、Authors 等属性并没有被解析出来。我尝试了 `indexer.md.mapping` 的各种组合，并把相应的属性添加到了 ES_IndexInit 中，但 Kibana（索引）里 PDF 文档的内容字段始终为空。HTML 页面则一切正常。是我遗漏了什么吗？如果有可以参考的示例，能否给一些指点？

es-crawler.flux：

# Topology name.
name: 'crawler'

# Configuration files merged into the topology configuration, in the order
# listed. All three entries belong to the same `includes` list.
includes:
  - resource: true
    file: '/crawler-default.yaml'
    override: false

  - resource: false
    file: 'crawler-conf.yaml'
    override: true

  - resource: false
    file: 'es-conf.yaml'
    override: true


# The only spout of the crawl topology, declared with a parallelism of 10.
spouts:
  - id: 'spout'
    className: 'com.digitalpebble.stormcrawler.elasticsearch.persistence.AggregationSpout'
    parallelism: 10

# Bolts of the crawl topology. The ids declared here are the names used by
# the `from:` / `to:` fields of the `streams` section.
bolts:
  # Core StormCrawler bolts.
  - id: 'partitioner'
    className: 'com.digitalpebble.stormcrawler.bolt.URLPartitionerBolt'
    parallelism: 1

  - id: 'fetcher'
    className: 'com.digitalpebble.stormcrawler.bolt.FetcherBolt'
    parallelism: 1

  - id: 'sitemap'
    className: 'com.digitalpebble.stormcrawler.bolt.SiteMapParserBolt'
    parallelism: 1

  - id: 'parse'
    className: 'com.digitalpebble.stormcrawler.bolt.JSoupParserBolt'
    parallelism: 5

  # Bolts from the Elasticsearch module.
  - id: 'index'
    className: 'com.digitalpebble.stormcrawler.elasticsearch.bolt.IndexerBolt'
    parallelism: 1

  - id: 'status'
    className: 'com.digitalpebble.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt'
    parallelism: 1

  - id: 'status_metrics'
    className: 'com.digitalpebble.stormcrawler.elasticsearch.metrics.StatusMetricsBolt'
    parallelism: 4

  # Bolts from the Tika module.
  - id: 'redirection_bolt'
    className: 'com.digitalpebble.stormcrawler.tika.RedirectionBolt'
    parallelism: 1

  - id: 'parser_bolt'
    className: 'com.digitalpebble.stormcrawler.tika.ParserBolt'
    parallelism: 1

# Wiring between the spout and the bolts. Every entry is one item of the
# `streams` list; `from` / `to` refer to the component ids declared above.
streams:
  # Main pipeline: spout -> partitioner -> fetcher -> sitemap -> parse -> index.
  - from: 'spout'
    to: 'partitioner'
    grouping:
      type: SHUFFLE

  - from: 'spout'
    to: 'status_metrics'
    grouping:
      type: SHUFFLE

  - from: 'partitioner'
    to: 'fetcher'
    grouping:
      type: FIELDS
      args: ['key']

  - from: 'fetcher'
    to: 'sitemap'
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: 'sitemap'
    to: 'parse'
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: 'parse'
    to: 'index'
    grouping:
      type: LOCAL_OR_SHUFFLE

  # The "status" stream of each processing bolt goes to the status updater,
  # grouped on the "url" field.
  - from: 'fetcher'
    to: 'status'
    grouping:
      type: FIELDS
      args: ['url']
      streamId: 'status'

  - from: 'sitemap'
    to: 'status'
    grouping:
      type: FIELDS
      args: ['url']
      streamId: 'status'

  - from: 'parse'
    to: 'status'
    grouping:
      type: FIELDS
      args: ['url']
      streamId: 'status'

  - from: 'index'
    to: 'status'
    grouping:
      type: FIELDS
      args: ['url']
      streamId: 'status'

  # Tika branch.
  - from: 'parse'
    to: 'redirection_bolt'
    grouping:
      type: LOCAL_OR_SHUFFLE

  # NOTE(review): the accepted answer further down states that this stream
  # must also set `streamId: "tika"`. It is kept exactly as posted here
  # because it is the configuration the question is asking about.
  - from: 'redirection_bolt'
    to: 'parser_bolt'
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: 'redirection_bolt'
    to: 'index'
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: 'parser_bolt'
    to: 'index'
    grouping:
      type: LOCAL_OR_SHUFFLE

es-injector.flux：

# Topology name.
name: 'injector'

# Configuration files merged into the topology configuration, in the order
# listed. All four entries belong to the same `includes` list.
includes:
  - resource: true
    file: '/crawler-default.yaml'
    override: false

  - resource: false
    file: 'crawler-conf.yaml'
    override: true

  - resource: false
    file: 'es-conf.yaml'
    override: true

  - resource: false
    file: 'injection-conf.yaml'
    override: true


# Reusable component; the spout refers to it through `ref: "scheme"`.
components:
  - id: 'scheme'
    className: 'com.digitalpebble.stormcrawler.util.StringTabScheme'
    # Single constructor argument.
    constructorArgs:
      - DISCOVERED

# Spout of the injection topology.
spouts:
  - id: 'spout'
    className: 'com.digitalpebble.stormcrawler.spout.FileSpout'
    parallelism: 1
    # Constructor arguments, in order: ".", "seeds.txt" and the "scheme"
    # component declared under `components`.
    constructorArgs:
      - '.'
      - 'seeds.txt'
      - ref: 'scheme'

# Bolts of the injection topology.
bolts:
  - id: 'status'
    className: 'com.digitalpebble.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt'
    parallelism: 1

  # NOTE(review): no stream in this topology references "parser_bolt".
  - id: 'parser_bolt'
    className: 'com.digitalpebble.stormcrawler.tika.ParserBolt'
    parallelism: 1

# Single stream: the spout feeds the status updater, grouped on the "url" field.
streams:
  - from: 'spout'
    to: 'status'
    grouping:
      type: FIELDS
      args: ['url']

pom.xml：

<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">

    <!-- Project coordinates -->
    <modelVersion>4.0.0</modelVersion>
    <groupId>xyz.com</groupId>
    <artifactId>search</artifactId>
    <version>search1.0</version>
    <packaging>jar</packaging>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <build>
        <plugins>
            <!-- Compile for Java 1.8 -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.2</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.codehaus.mojo</groupId>
                <artifactId>exec-maven-plugin</artifactId>
                <version>1.3.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>exec</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <executable>java</executable>
                    <includeProjectDependencies>true</includeProjectDependencies>
                    <includePluginDependencies>false</includePluginDependencies>
                    <classpathScope>compile</classpathScope>
                </configuration>
            </plugin>
            <!-- Build a shaded jar whose main class is the Flux launcher -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>1.3.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <createDependencyReducedPom>false</createDependencyReducedPom>
                            <transformers>
                                <transformer
                                    implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
                                <transformer
                                    implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                  <mainClass>org.apache.storm.flux.Flux</mainClass>
                                  <manifestEntries>
                                    <Change></Change>
                                    <Build-Date></Build-Date>
                                  </manifestEntries>
                                </transformer>
                            </transformers>
                            <!-- The filters below are necessary if you want to include the Tika
                                module -->
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <!-- Storm itself is expected to be supplied by the runtime (scope: provided) -->
        <dependency>
            <groupId>org.apache.storm</groupId>
            <artifactId>storm-core</artifactId>
            <version>1.1.1</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.storm</groupId>
            <artifactId>flux-core</artifactId>
            <version>1.0.2</version>
        </dependency>
        <!-- StormCrawler core plus the Elasticsearch and Tika modules, all at 1.7 -->
        <dependency>
            <groupId>com.digitalpebble.stormcrawler</groupId>
            <artifactId>storm-crawler-core</artifactId>
            <version>1.7</version>
        </dependency>
        <dependency>
            <groupId>com.digitalpebble.stormcrawler</groupId>
            <artifactId>storm-crawler-elasticsearch</artifactId>
            <version>1.7</version>
        </dependency>
        <dependency>
            <groupId>com.digitalpebble.stormcrawler</groupId>
            <artifactId>storm-crawler-tika</artifactId>
            <version>1.7</version>
        </dependency>
    </dependencies>

</project>

score 2 · Accepted Answer

你的 pom 和 Flux 文件看起来没有问题。为了简单起见，你可以把注入（injection）部分并入主 Flux 拓扑文件中。

crawler-conf.yaml 中有什么？您是否在字段名称前加上“parse.”？

这是从您在上面发布的 URL 中提取的元数据

parse.dcterms:modified: 2004-09-29T20:21:18Z
parse.pdf:PDFVersion: 1.4
parse.access_permission:can_print: true
parse.pdf:docinfo:subject: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation). 
parse.pdf:docinfo:modified: 2004-09-29T20:21:18Z
parse.access_permission:extract_for_accessibility: true
parse.created: Fri Sep 24 15:56:30 BST 2004
parse.pdf:docinfo:created: 2004-09-24T14:56:30Z
parse.xmpTPg:NPages: 7
parse.access_permission:fill_in_form: true
parse.producer: Adobe PDF Library 6.0
parse.pdf:docinfo:title: About Metadata
parse.pdf:docinfo:producer: Adobe PDF Library 6.0
parse.dc:format: application/pdf; version=1.4
parse.access_permission:assemble_document: true
parse.access_permission:modify_annotations: true
parse.dc:title: About Metadata
parse.access_permission:can_print_degraded: true
parse.xmpMM:DocumentID: adobe:docid:indd:de7d50b0-0fc1-11d9-b0d4-cd42e793ca90
parse.xmpMM:DerivedFrom:DocumentID: adobe:docid:indd:a04d199f-0f11-11d9-b74d-bb0abf4f1ab0
parse.title: About Metadata
parse.Creation-Date: 2004-09-24T14:56:30Z
parse.modified: 2004-09-29T20:21:18Z
parse.resourceName: /digitalimag/pdfs/about_metadata.pdf
parse.dc:description: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation). 
parse.Last-Save-Date: 2004-09-29T20:21:18Z
parse.creator: Adobe Systems Incorporated
parse.pdf:encrypted: false
parse.trapped: False
parse.pdf:docinfo:creator: Adobe Systems Incorporated
parse.date: 2004-09-29T20:21:18Z
parse.meta:save-date: 2004-09-29T20:21:18Z
parse.Author: Adobe Systems Incorporated
parse.X-Parsed-By: org.apache.tika.parser.DefaultParser
parse.X-Parsed-By: org.apache.tika.parser.pdf.PDFParser
parse.pdf:docinfo:creator_tool: Adobe InDesign CS (3.0.1)
parse.dcterms:created: 2004-09-24T14:56:30Z
parse.access_permission:can_modify: true
parse.subject: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation). 
parse.meta:author: Adobe Systems Incorporated
parse.access_permission:extract_content: true
parse.xmp:CreatorTool: Adobe InDesign CS (3.0.1)
parse.dc:creator: Adobe Systems Incorporated
parse.cp:subject: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation). 
parse.pdf:docinfo:trapped: False
parse.meta:creation-date: 2004-09-24T14:56:30Z
parse.xmpMM:DerivedFrom:InstanceID: de7d50af-0fc1-11d9-b0d4-cd42e793ca90
parse.Last-Modified: 2004-09-29T20:21:18Z
parse.Content-Type: application/pdf
parse.description: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation).

你的 conf 应该包含类似下面的内容：

  # Each entry has the form <metadata key>=<index field>; the metadata keys
  # carry the "parse." prefix shown in the extracted metadata listing.
  indexer.md.mapping:
    - 'parse.title=title'
    - 'parse.Author=author'

从下面测试用例的代码可以看出，需要把文件添加到 external/tika/src/test/resources/ 目录中，并在测试代码中引用该文件名，例如下面示例中的 about_metadata.pdf：

 @Test
public void testMetadata() throws IOException {
    // Substring expected in the "parse.pdf:docinfo:subject" metadata value.
    final String expectedSubject =
            "By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient.";

    // Prepare the bolt with an empty configuration, a mocked topology
    // context and a collector built around `output`.
    bolt.prepare(new HashMap(), TestUtil.getMockedTopologyContext(),
            new OutputCollector(output));

    // Feed the bolt the test resource under the document's URL.
    parse("https://www.adobe.com/digitalimag/pdfs/about_metadata.pdf",
            "about_metadata.pdf");

    final List<List<Object>> emitted = output.getEmitted();

    // Exactly one tuple is expected for the single document.
    Assert.assertEquals(1, emitted.size());

    // The element at index 2 of the tuple is read as the document metadata.
    final Metadata metadata = (Metadata) emitted.get(0).get(2);
    final String subject = metadata.getFirstValue("parse.pdf:docinfo:subject");
    Assert.assertTrue(subject.contains(expectedSubject));

}

更新

仔细检查后发现，问题出在你的 Flux 文件上。RedirectionBolt 通过名为“tika”的自定义流把元组发送给 Tika 的 ParserBolt，因此该流的定义应该是：

# Entry for the `streams:` list: tuples leave the redirection bolt on the
# custom "tika" stream, so the stream id must be named explicitly.
- from: 'redirection_bolt'
  to: 'parser_bolt'
  grouping:
    type: LOCAL_OR_SHUFFLE
    streamId: 'tika'

web-crawler - Stormcrawler：用于解析 PDF 属性的 Apache Tika

1 回答 1

Related

Reference