0

问候朋友,

开门见山。我在 MySQL 数据库中存储了许多 BLOB,其中主要是 PDF(约 80%)和 .doc 文件。数据库中也有纯文本内容。到目前为止,我已经为文本建立了索引并且可以查询,但无法为 BLOB 建立索引。我正尝试把它们放进同一个集合(document)中,但效果很差。有没有人知道该怎么做?

data-config.xml 的一部分:

<?xml version="1.0" encoding="utf-8"?>

<dataConfig>

<!-- JDBC connection to the MySQL database "ktimatologio".
     Entities select this source with dataSource="db". -->
<dataSource name="db"
            type="JdbcDataSource"
            driver="com.mysql.jdbc.Driver"
            url="jdbc:mysql://127.0.0.1:3306/ktimatologio"
            user="root"
            password="********"
            autoCommit="true"
            batchSize="-1"
            convertType="false"/>

<!-- Second data source, selected by the nested TikaEntityProcessor entity
     through dataSource="fieldReader". -->
<dataSource type="FieldStreamDataSource" name="fieldReader"/>

  <document> 

    <!-- Imports the rows of aitiologikes_ektheseis whose type is 'text'.
         pk names the key column: deltaQuery returns only that column for rows
         changed since the last index time, and deltaImportQuery re-reads each
         changed row through ${dataimporter.delta.id}. -->
    <entity name="aitiologikes_ektheseis"
            pk="id"
            dataSource="db"
            transformer="HTMLStripTransformer"
            query="select id, title, title AS grid_title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, CONCAT( body,' ',title)  AS content from aitiologikes_ektheseis where type = 'text'"
            deltaQuery="select id from aitiologikes_ektheseis where type = 'text' and last_modified &gt; '${dataimporter.last_index_time}'"
            deltaImportQuery="select id, title, title AS grid_title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, CONCAT( body,' ',title)  AS content from aitiologikes_ektheseis where type = 'text' and id='${dataimporter.delta.id}'">
      <!-- The database id is indexed under the Solr field name "ida". -->
      <field column="id" name="ida" />
      <field column="solr_id" name="solr_id" />
      <field column="title" name="title" stripHTML="true" />
      <field column="grid_title" name="grid_title" stripHTML="true" />
      <field column="model" name="model" stripHTML="true" />
      <field column="type" name="type" stripHTML="true" />
      <field column="url" name="url" stripHTML="true" />
      <field column="last_modified" name="last_modified" stripHTML="true" />
      <field column="search_tag" name="search_tag" stripHTML="true" />
      <field column="content" name="content" stripHTML="true" />
    </entity>

    <!-- Imports the rows of aitiologikes_ektheseis whose type is 'bin'.
         The BLOB column bin_con is exposed as "text" and handed to the nested
         Tika entity. pk names the key column; deltaQuery returns only that
         column so the BLOB is not fetched merely to detect changed rows, and
         deltaImportQuery re-reads each changed row through
         ${dataimporter.delta.id}. -->
    <entity name="aitiologikes_ektheseis_bin"
            pk="id"
            dataSource="db"
            transformer="TemplateTransformer"
            query="select id, title, title AS grid_title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, bin_con AS text from aitiologikes_ektheseis where type = 'bin'"
            deltaQuery="select id from aitiologikes_ektheseis where type = 'bin' and last_modified &gt; '${dataimporter.last_index_time}'"
            deltaImportQuery="select id, title, title AS grid_title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, bin_con AS text from aitiologikes_ektheseis where type = 'bin' and id='${dataimporter.delta.id}'">
      <!-- NOTE(review): stripHTML is an HTMLStripTransformer attribute; with only
           TemplateTransformer configured on this entity it presumably has no
           effect on the fields below. Confirm the intended transformer. -->
      <field column="id" name="ida" />
      <field column="solr_id" name="solr_id" />
      <field column="title" name="title" stripHTML="true" />
      <field column="grid_title" name="grid_title" stripHTML="true" />
      <field column="model" name="model" stripHTML="true" />
      <field column="type" name="type" stripHTML="true" />
      <field column="url" name="url" stripHTML="true" />
      <field column="last_modified" name="last_modified" stripHTML="true" />
      <field column="search_tag" name="search_tag" stripHTML="true" />

      <!-- Reads the parent row's "text" column through the "fieldReader" data
           source and maps the extracted "text" column to the contentbin field.
           The entity is given an explicit name so it can be identified. -->
      <entity name="aitiologikes_ektheseis_bin_tika"
              dataSource="fieldReader"
              processor="TikaEntityProcessor"
              dataField="aitiologikes_ektheseis_bin.text"
              format="text">
        <field column="text" name="contentbin" stripHTML="true" />
      </entity>

    </entity>

       ...
       ...
  </document> 

</dataConfig>

schema.xml 的一部分(fieldTypes 和字段定义):

<!-- Text field type whose analyzers combine English and Greek stop-word removal,
     lowercasing and stemming. The filter order of each chain is significant. -->
<fieldType name="text_ktimatologio" class="solr.TextField" positionIncrementGap="100">

  <!-- Index-time chain. -->
  <analyzer type="index">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_en.txt"
            enablePositionIncrements="true"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.EnglishPossessiveFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_el.txt"
            enablePositionIncrements="true"/>
    <filter class="solr.GreekLowerCaseFilterFactory"/>
    <filter class="solr.GreekStemFilterFactory"/>
    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
    <filter class="solr.PorterStemFilterFactory"/>
  </analyzer>

  <!-- Query-time chain: additionally expands synonyms from synonyms.txt
       directly after tokenization. -->
  <analyzer type="query">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.SynonymFilterFactory"
            synonyms="synonyms.txt"
            ignoreCase="true"
            expand="true"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_en.txt"
            enablePositionIncrements="true"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_el.txt"
            enablePositionIncrements="true"/>
    <filter class="solr.GreekLowerCaseFilterFactory"/>
    <filter class="solr.GreekStemFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.EnglishPossessiveFilterFactory"/>
    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
    <filter class="solr.PorterStemFilterFactory"/>
  </analyzer>

</fieldType>

<!-- Text field type for Greek content: strips HTML markup before tokenizing,
     lowercases, removes Greek stop words, then stems.
     The index and query chains are identical; the query chain previously listed
     LowerCaseFilterFactory twice in a row, and the redundant copy is removed. -->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">

  <analyzer type="index">
    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StandardFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
    <filter class="solr.GreekLowerCaseFilterFactory"/>
    <filter class="solr.GreekStemFilterFactory"/>
    <filter class="solr.HunspellStemFilterFactory" dictionary="dictionaries/el_GR.dic" affix="dictionaries/el_GR.aff" ignoreCase="true"/>
  </analyzer>

  <analyzer type="query">
    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StandardFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
    <filter class="solr.GreekLowerCaseFilterFactory"/>
    <filter class="solr.GreekStemFilterFactory"/>
    <filter class="solr.HunspellStemFilterFactory" dictionary="dictionaries/el_GR.dic" affix="dictionaries/el_GR.aff" ignoreCase="true"/>
  </analyzer>

</fieldType>

<!-- Field definitions for the documents produced by the import entities. -->
<fields>
  <!-- Identifiers -->
  <field name="ida" type="string" indexed="true" stored="true" multiValued="false"/>
  <field name="solr_id" type="string" indexed="true" stored="true" multiValued="false"/>

  <!-- Titles, analyzed with text_ktimatologio -->
  <field name="title" type="text_ktimatologio" indexed="true" stored="true"/>
  <field name="grid_title" type="text_ktimatologio" indexed="true" stored="true"/>

  <!-- Metadata kept as plain strings -->
  <field name="model" type="string" indexed="true" stored="true" multiValued="false"/>
  <field name="type" type="string" indexed="true" stored="true"/>
  <field name="url" type="string" indexed="true" stored="true"/>
  <field name="last_modified" type="string" indexed="true" stored="true"/>
  <field name="search_tag" type="string" indexed="true" stored="true"/>

  <!-- Searchable bodies: contentbin is the target of the Tika entity's "text"
       column, content is the target of the text entity's "content" column -->
  <field name="contentbin" type="text" indexed="true" stored="true" multiValued="true"/>
  <field name="content" type="text_ktimatologio" indexed="true" stored="true" multiValued="true"/>
</fields>

我真的需要帮助!

尊重地,

汤姆

希腊

4

1 回答 1

0

您是想“索引”BLOB 吗?也就是说,您希望最终能够搜索它的内容?我不确定自己是否正确理解了您的问题。

我猜您可能需要先在 Solr 中使用 Apache Tika 之类的工具把 PDF 或 .doc 转换成文本,然后再让 Solr 为其建立索引。此外,如果您想让用户访问 PDF 或 .doc 原文件,最好还是把它们保存在数据库中,并从那里检索。

于 2012-08-23T21:36:07.757 回答