mysql - 索引和查询存储在 Mysql 中的 BLOBS

Question

问候朋友，

开门见山。我在 Mysql DB 中存储了许多 BLOBS。这些主要是PDF（80%）和.doc。我在数据库中也有文本。到目前为止，我已经建立了索引并且可以查询文本，但我无法索引 BLOBS。我正在尝试制作一个集合（文档）-但很烂。有没有关于如何做这件事的秘诀？

data-config.xml 的一部分：

<?xml version="1.0" encoding="utf-8"?>

<dataConfig>

<!-- JDBC connection to the MySQL database "ktimatologio"; entities select it with dataSource="db". -->
<dataSource name="db"
            type="JdbcDataSource"
            driver="com.mysql.jdbc.Driver"
            url="jdbc:mysql://127.0.0.1:3306/ktimatologio"
            user="root"
            password="********"
            autoCommit="true"
            batchSize="-1"
            convertType="false"/>

<!-- Stream source named "fieldReader"; the nested TikaEntityProcessor entity selects it with dataSource="fieldReader". -->
<dataSource name="fieldReader" type="FieldStreamDataSource"/>

  <document> 

    <!-- Text rows of aitiologikes_ektheseis (type = 'text'); body and title are concatenated into "content".
         pk="id" declares the key column used by delta imports. deltaQuery returns only that key;
         deltaImportQuery then re-reads the full row for each changed id via ${dataimporter.delta.id}. -->
    <entity name="aitiologikes_ektheseis"
    dataSource="db"
    pk="id"
    transformer="HTMLStripTransformer"
    query="select id, title, title AS grid_title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, CONCAT( body,' ',title)  AS content from aitiologikes_ektheseis where type = 'text'"
    deltaImportQuery="select id, title, title AS grid_title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, CONCAT( body,' ',title)  AS content from aitiologikes_ektheseis where type = 'text' and id='${dataimporter.delta.id}'"
    deltaQuery="select id from aitiologikes_ektheseis where type = 'text' and last_modified &gt; '${dataimporter.last_index_time}'">
      <!-- The database column "id" is indexed under the Solr field "ida". -->
      <field column="id" name="ida" />
      <field column="solr_id" name="solr_id" />
      <field column="title" name="title" stripHTML="true" />
      <field column="grid_title" name="grid_title" stripHTML="true" />
      <field column="model" name="model" stripHTML="true" />
      <field column="type" name="type" stripHTML="true" />
      <field column="url" name="url" stripHTML="true" />
      <field column="last_modified" name="last_modified" stripHTML="true"  />
      <field column="search_tag" name="search_tag" stripHTML="true" />
      <field column="content" name="content" stripHTML="true" />
    </entity>

    <!-- Binary rows of aitiologikes_ektheseis (type = 'bin'); the BLOB column bin_con is exposed as "text"
         and handed to the nested Tika entity for text extraction.
         pk="id" declares the key column used by delta imports. deltaQuery returns only that key, so
         detecting changed rows does not fetch the BLOBs; deltaImportQuery re-reads each changed row
         via ${dataimporter.delta.id}.
         NOTE(review): the stripHTML attributes below are only acted on by HTMLStripTransformer, which this
         entity does not list in transformer="..." (confirm whether that is intended). -->
    <entity name="aitiologikes_ektheseis_bin"
    pk="id"
    query="select id, title, title AS grid_title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, bin_con AS text from aitiologikes_ektheseis where type = 'bin'"
    deltaImportQuery="select id, title, title AS grid_title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, bin_con AS text from aitiologikes_ektheseis where type = 'bin' and id='${dataimporter.delta.id}'"
    deltaQuery="select id from aitiologikes_ektheseis where type = 'bin' and last_modified &gt; '${dataimporter.last_index_time}'"
    transformer="TemplateTransformer"
    dataSource="db">
      <!-- The database column "id" is indexed under the Solr field "ida". -->
      <field column="id" name="ida" />
      <field column="solr_id" name="solr_id" />
      <field column="title" name="title" stripHTML="true" />
      <field column="grid_title" name="grid_title" stripHTML="true" />
      <field column="model" name="model" stripHTML="true" />
      <field column="type" name="type" stripHTML="true" />
      <field column="url" name="url" stripHTML="true" />
      <field column="last_modified" name="last_modified" stripHTML="true"  />
      <field column="search_tag" name="search_tag" stripHTML="true" />

      <!-- Reads the parent row's "text" column (the BLOB) through the "fieldReader" stream source and
           maps the "text" column produced by Tika to the Solr field "contentbin".
           The entity is given an explicit name so it can be identified in the configuration and logs. -->
      <entity name="aitiologikes_ektheseis_bin_tika" dataSource="fieldReader" processor="TikaEntityProcessor" dataField="aitiologikes_ektheseis_bin.text" format="text">
          <field column="text" name="contentbin" stripHTML="true" />
      </entity>

    </entity>

       ...
       ...
  </document> 

</dataConfig>

schema.xml 的一部分（fieldTypes 和字段定义）：

<!-- Mixed English/Greek text type: stop-word removal, lower-casing and stemming for both languages.
     The filter order of each chain is kept exactly as written, because filters run in document order. -->
<fieldType name="text_ktimatologio" class="solr.TextField" positionIncrementGap="100">

  <!-- Index-time chain. -->
  <analyzer type="index">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt" enablePositionIncrements="true"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.EnglishPossessiveFilterFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
    <filter class="solr.GreekLowerCaseFilterFactory"/>
    <filter class="solr.GreekStemFilterFactory"/>
    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
    <filter class="solr.PorterStemFilterFactory"/>
  </analyzer>

  <!-- Query-time chain: adds synonym expansion and applies the filters in a different order than the index chain. -->
  <analyzer type="query">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt" enablePositionIncrements="true"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
    <filter class="solr.GreekLowerCaseFilterFactory"/>
    <filter class="solr.GreekStemFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.EnglishPossessiveFilterFactory"/>
    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
    <filter class="solr.PorterStemFilterFactory"/>
  </analyzer>

</fieldType>

<!-- Greek text type used by "contentbin": HTML is stripped before tokenizing, then tokens are
     lower-cased, filtered against the Greek stop-word list and stemmed.
     The index and query chains are identical; LowerCaseFilterFactory appears once in each. -->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">

  <analyzer type="index">
    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StandardFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
    <filter class="solr.GreekLowerCaseFilterFactory"/>
    <filter class="solr.GreekStemFilterFactory"/>
    <filter class="solr.HunspellStemFilterFactory" dictionary="dictionaries/el_GR.dic" affix="dictionaries/el_GR.aff" ignoreCase="true" />
  </analyzer>

  <analyzer type="query">
    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StandardFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
    <filter class="solr.GreekLowerCaseFilterFactory"/>
    <filter class="solr.GreekStemFilterFactory"/>
    <filter class="solr.HunspellStemFilterFactory" dictionary="dictionaries/el_GR.dic" affix="dictionaries/el_GR.aff" ignoreCase="true" />
  </analyzer>

</fieldType>

<!-- Solr fields populated by the DataImportHandler entities. "content" holds the concatenated
     body/title of text rows; "contentbin" holds the text that Tika extracts from BLOB rows. -->
<fields>
  <field name="ida" type="string" indexed="true" stored="true" multiValued="false"/>
  <field name="solr_id" type="string" indexed="true" stored="true" multiValued="false"/>
  <field name="title" type="text_ktimatologio" indexed="true" stored="true"/>
  <field name="grid_title" type="text_ktimatologio" indexed="true" stored="true"/>
  <field name="model" type="string" indexed="true" stored="true" multiValued="false"/>
  <field name="type" type="string" indexed="true" stored="true"/>
  <field name="url" type="string" indexed="true" stored="true"/>
  <field name="last_modified" type="string" indexed="true" stored="true"/>
  <field name="search_tag" type="string" indexed="true" stored="true"/>
  <field name="contentbin" type="text" indexed="true" stored="true" multiValued="true"/>
  <field name="content" type="text_ktimatologio" indexed="true" stored="true" multiValued="true"/>
</fields>

我真的需要帮助！

尊重地，

汤姆

希腊

score 0 · Accepted Answer

你想“索引”一个 BLOB 吗？意味着您希望最终能够搜索它？我不确定我是否正确理解了您的问题。

我猜您可能想先在 Solr 中使用 Apache Tika 之类的东西转换您的 PDF 或 .doc ，然后让 Solr 为您编制索引。此外，如果您想让您的用户访问 PDF 或 doc，最好的地方是数据库并从那里检索它？

mysql - 索引和查询存储在 Mysql 中的 BLOBS

1 回答 1

Related

Reference