与 Solr 集成的 Tika 会为文档内容和元数据分别生成不同的字段。
如果您使用标准的 ExtractingRequestHandler,其配置如下:
<!-- Extraction endpoint at /update/extract; startup="lazy" defers loading of
     the handler. -->
<requestHandler class="solr.extraction.ExtractingRequestHandler"
                name="/update/extract"
                startup="lazy">
  <lst name="defaults">
    <!-- The extracted body text is mapped to the "text" field. Make that
         field stored if the text has to be returned or highlighted. -->
    <str name="fmap.content">text</str>
    <!-- Metadata names are turned into lowercase field names. -->
    <str name="lowernames">true</str>
    <!-- Prefix given to extracted fields that the schema does not declare. -->
    <str name="uprefix">ignored_</str>
    <!-- Attribute capture is enabled: anchor values are kept in "links",
         while div values are routed to "ignored_". -->
    <str name="captureAttr">true</str>
    <str name="fmap.a">links</str>
    <str name="fmap.div">ignored_</str>
  </lst>
</requestHandler>
其中 fmap.content 把提取出的正文映射到 text 字段,该字段应当只包含 PDF 的正文内容。
通过修改 schema.xml,可以方便地查看其他元数据字段。
将 ignored 字段类型的 stored 属性设为 true:
<!-- Type used for the catch-all fields: values are stored (so they come back
     in responses) but not indexed, and a field may carry several values.
     The element is spelled "fieldType", matching the camelCase used by
     "dynamicField". -->
<fieldType name="ignored" class="solr.StrField" indexed="false" stored="true" multiValued="true" />
再用通配的动态字段捕获所有字段:
<!-- Catch-all pattern: every field name matches "*" and is handled as a
     multi-valued field of type "ignored". -->
<dynamicField name="*"
              type="ignored"
              multiValued="true"/>
Tika 会为元数据添加许多字段,而正文内容则单独存放在一个字段中。例如,下面是把一个 ppt 文件提交给提取处理程序后得到的响应:
<!-- Sample document for a PowerPoint file (content_type
     application/vnd.ms-powerpoint). Each metadata item appears as its own
     lowercase field; the extracted body text is in the "text" field near
     the end. -->
<doc>
<arr name="application_name">
<str>Microsoft PowerPoint</str>
</arr>
<str name="category">POT - US</str>
<str name="comments">version 1.1</str>
<arr name="company">
<str>
</str>
</arr>
<arr name="content_type">
<str>application/vnd.ms-powerpoint</str>
</arr>
<arr name="creation_date">
<str>2000-03-15T16:57:27Z</str>
</arr>
<!-- The custom_* fields correspond to the "custom:..." entries listed in
     the "meta" array below, with names lowercased and ':' / ' ' replaced
     by '_'. -->
<arr name="custom_delivery_date">
<str>
</str>
</arr>
<arr name="custom_docid">
<str>
</str>
</arr>
<arr name="custom_docidinslide">
<str>true</str>
</arr>
<arr name="custom_docidintitle">
<str>true</str>
</arr>
<arr name="custom_docidposition">
<str>0</str>
</arr>
<arr name="custom_event">
<str>
</str>
</arr>
<arr name="custom_final">
<str>false</str>
</arr>
<arr name="custom_mckpapersize">
<str>US</str>
</arr>
<arr name="custom_notespagelayout">
<str>Lower</str>
</arr>
<arr name="custom_title">
<str>Lower Universal Template US</str>
</arr>
<arr name="custom_universal_objects">
<str>true</str>
</arr>
<arr name="edit_time">
<str>284587970000</str>
</arr>
<str name="id">101</str>
<!-- Presumably the div attribute values sent here by the handler's
     fmap.div=ignored_ mapping; confirm against the handler defaults. -->
<arr name="ignored_">
<str>slideShow</str>
<str>slide</str>
<str>slide</str>
<str>slideNotes</str>
</arr>
<str name="keywords">test</str>
<arr name="last_author">
<str>Corporate</str>
</arr>
<arr name="last_printed">
<str>2000-03-17T20:28:57Z</str>
</arr>
<arr name="last_save_date">
<str>2009-03-24T16:52:26Z</str>
</arr>
<arr name="manager">
<str>
</str>
</arr>
<!-- Flat list that alternates an original (non-lowercased) metadata name
     with its value.
     NOTE(review): the stream_source_info path and the custom:Title value
     here differ from the stream_source_info and custom_title fields of this
     same document; the sample looks partly anonymised, confirm before
     relying on it. -->
<arr name="meta">
<str>stream_source_info</str>
<str>file:/C:/temp/nuggets/100000.ppt</str>
<str>Last-Author</str>
<str>Corporate</str>
<str>Slide-Count</str>
<str>2</str>
<str>custom:DocIDPosition</str>
<str>0</str>
<str>Application-Name</str>
<str>Microsoft PowerPoint</str>
<str>custom:Delivery Date</str>
<str>
</str>
<str>custom:Event</str>
<str>
</str>
<str>Edit-Time</str>
<str>284587970000</str>
<str>Word-Count</str>
<str>120</str>
<str>Creation-Date</str>
<str>2000-03-15T16:57:27Z</str>
<str>stream_size</str>
<str>181248</str>
<str>Manager</str>
<str>
</str>
<str>stream_name</str>
<str>100000.ppt</str>
<str>Company</str>
<str>
</str>
<str>Keywords</str>
<str>test</str>
<str>Last-Save-Date</str>
<str>2009-03-24T16:52:26Z</str>
<str>Revision-Number</str>
<str>91</str>
<str>Last-Printed</str>
<str>2000-03-17T20:28:57Z</str>
<str>Comments</str>
<str>version 1.1</str>
<str>Template</str>
<str>
</str>
<str>custom:PaperSize</str>
<str>US</str>
<str>custom:DocID</str>
<str>
</str>
<str>xmpTPg:NPages</str>
<str>2</str>
<str>custom:NotesPageLayout</str>
<str>Lower</str>
<str>custom:DocIDinSlide</str>
<str>true</str>
<str>Category</str>
<str>POT - US</str>
<str>custom:Universal Objects</str>
<str>true</str>
<str>custom:Final</str>
<str>false</str>
<str>custom:DocIDinTitle</str>
<str>true</str>
<str>Content-Type</str>
<str>application/vnd.ms-powerpoint</str>
<str>custom:Title</str>
<str>test</str>
</arr>
<!-- Presumably attribute values captured from <p> elements because
     captureAttr is enabled; confirm. -->
<arr name="p">
<str>slide-content</str>
<str>slide-content</str>
</arr>
<arr name="revision_number">
<str>91</str>
</arr>
<arr name="slide_count">
<str>2</str>
</arr>
<arr name="stream_name">
<str>100000.ppt</str>
</arr>
<arr name="stream_size">
<str>181248</str>
</arr>
<arr name="stream_source_info">
<str>file:/C:/temp/test/100000.ppt</str>
</arr>
<arr name="template">
<str>
</str>
</arr>
<!-- Content field: the extracted body text, mapped here by fmap.content. -->
<arr name="text">
<str>test Test test test test tes t</str>
</arr>
<arr name="title">
<str>test</str>
</arr>
<arr name="word_count">
<str>120</str>
</arr>
<arr name="xmptpg_npages">
<str>2</str>
</arr>
</doc>