1

我已经安装了 Nutch 1.7 和 Solr 3.6.2,并且能够索引和搜索 xls、doc、pdf 和 zip 文件。现在我还想索引 .avi、.mov 等视频文件。

我已经编辑了 regex-urlfilter.txt,从排除规则中删除了这些扩展名,但唯一能够被索引的仍然只有 .flv 文件。我知道 Tika 声称支持的正是这种格式,但我并不需要索引视频文件的元数据,只希望对文件名进行索引。

我该如何启用它?

regex-urlfilter.txt

# skip image and other suffixes we can't yet parse
# The leading '-' marks this as an exclusion rule: a URL whose end matches one
# of the listed extensions is skipped. The alternatives are spelled out in both
# lower and upper case for some extensions only (gif, jpg, png, ico, jpeg, bmp).
# Video extensions such as avi, mov and flv do not appear in this list, so this
# particular rule does not reject them.
-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|gz|rpm|tgz|exe|jpeg|JPEG|bmp|BMP)$

nutch-site.xml

<configuration>
  <!-- Site-level property overrides (nutch-site.xml) as posted with the question. -->

  <!-- Crawler identification -->
  <property>
    <name>http.agent.name</name>
    <value>crawler</value>
  </property>

  <property>
    <name>http.robots.agents</name>
    <value>crawler,*</value>
  </property>

  <!-- Request headers and text decoding -->
  <property>
    <name>http.accept.language</name>
    <value>zh-cn, ja-jp, en-us,en-gb,en;q=0.7,*;q=0.3</value>
    <description>
      Value of the "Accept-Language" request header field.
      This allows selecting a non-English language as the default one to retrieve.
      It is a useful setting for search engines built for a certain national group.
    </description>
  </property>

  <property>
    <name>parser.character.encoding.default</name>
    <value>utf-8</value>
    <description>
      The character encoding to fall back to when no other information is available.
    </description>
  </property>

  <!-- Content size limits: both are set to 10000000 bytes. -->
  <property>
    <name>http.content.limit</name>
    <value>10000000</value>
    <description>
      The length limit for downloaded content, in bytes.
      If this value is nonnegative (>=0), content longer than it will be truncated;
      otherwise, no truncation at all.
    </description>
  </property>

  <!-- NOTE(review): plugin.includes below lists only protocol-http, so confirm
       whether this file.* limit is actually consulted in this setup. -->
  <property>
    <name>file.content.limit</name>
    <value>10000000</value>
    <description>
      The length limit for downloaded content, in bytes.
      If this value is nonnegative (>=0), content longer than it will be truncated;
      otherwise, no truncation at all.
    </description>
  </property>

  <!-- Active plugins: one regular expression matched against plugin ids. -->
  <property>
    <name>plugin.includes</name>
    <value>protocol-http|urlfilter-regex|parse-(html|tika|metatags|zip)|index-(basic|anchor|metadata)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
  </property>

  <!-- Metatag extraction and indexing -->
  <!-- NOTE(review): the description below gives a ';'-separated example for
       index.parse.md, whereas the index.parse.md description in this file says
       the list is comma-separated. Confirm the separator for the Nutch version
       in use. -->
  <property>
    <name>metatags.names</name>
    <value>*</value>
    <description>
      Names of the metatags to extract, separated by;.
      Use '*' to extract all metatags. Prefixes the names with 'metatag.'
      in the parse-metadata. For instance to index description and keywords,
      you need to activate the plugin index-metadata and set the value of the
      parameter 'index.parse.md' to 'metatag.description;metatag.keywords'.
    </description>
  </property>

  <property>
    <name>index.parse.md</name>
    <value>metatag.description,metatag.keywords</value>
    <description>
      Comma-separated list of keys to be taken from the parse metadata to
      generate fields. Can be used e.g. for 'description' or 'keywords'
      provided that these values are generated by a parser (see the
      parse-metatags plugin).
    </description>
  </property>

</configuration>
4

0 回答 0