1

我正在尝试将 xml 文件从文件夹批量导入到 solr。我的 DIH 配置看起来像这样。

<!-- DataImportHandler configuration: list the XML files under a base directory
     and, for each file found, extract an id value with XPath. -->
<dataConfig>
  <!-- File-based data source declared without a name. The nested entity below
       names no dataSource of its own, so it presumably reads through this one
       (TODO confirm against the DIH documentation). -->
  <dataSource type="FileDataSource"/>
  <document>
    <!-- this outer processor generates a list of files satisfying the conditions
         specified in the attributes:
           fileName   : pattern that a file name has to match to be listed
           recursive  : "true", so subdirectories of baseDir are presumably
                        searched as well
           rootEntity : "false", so this file list is presumably not what
                        defines a Solr document; the nested entity is
                        (TODO confirm against the DIH documentation)
           dataSource : "null", the listing itself is not bound to a data source
           baseDir    : directory in which the search starts -->
    <entity name="xmlImport" processor="FileListEntityProcessor"
            fileName=".*xml"
            recursive="true"
            rootEntity="false"
            dataSource="null"
            baseDir="/home/rsp/shellscript/output"
          >

      <!-- this processor extracts content using Xpath from each file found -->
      <!-- url is filled from the fileAbsolutePath value of the enclosing
           xmlImport entity, and forEach="/root" selects the /root element as
           the record to read fields from.
           NOTE(review): DateFormatTransformer and TemplateTransformer are
           declared, but no field in this entity carries a date format or a
           template attribute, so they appear to be unused here. -->

      <entity name="nested" processor="XPathEntityProcessor" transformer="DateFormatTransformer,TemplateTransformer"
              forEach="/root" url="${xmlImport.fileAbsolutePath}">


        <!-- Maps the value found at /root/sub1/sub2/id to the column "id". -->
        <field column="id" xpath="/root/sub1/sub2/id"/>

      </entity>
    </entity>
  </document>
</dataConfig>

当我尝试运行完全导入时,作业失败并留下以下日志。

Full Import failed:java.lang.RuntimeException: java.lang.RuntimeException: org.apache.solr.handler.dataimport.DataImportHandlerException: Parsing failed for xml, url:/home/rsp/shellscript/output/file1.xml rows processed:0 Processing Document # 1
    at org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:271)
    at org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:417)
    at org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:481)
    at org.apache.solr.handler.dataimport.DataImporter$1.run(DataImporter.java:462)
Caused by: java.lang.RuntimeException: org.apache.solr.handler.dataimport.DataImportHandlerException: Parsing failed for xml, url:/home/rsp/shellscript/output/file1.xml rows processed:0 Processing Document # 1
    at org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:417)
    at org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:330)
    at org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:233)
    ... 3 more
Caused by: org.apache.solr.handler.dataimport.DataImportHandlerException: Parsing failed for xml, url:/home/rsp/shellscript/output/file1.xml rows processed:0 Processing Document # 1
    at org.apache.solr.handler.dataimport.DataImportHandlerException.wrapAndThrow(DataImportHandlerException.java:70)
    at org.apache.solr.handler.dataimport.XPathEntityProcessor.initQuery(XPathEntityProcessor.java:330)
    at org.apache.solr.handler.dataimport.XPathEntityProcessor.fetchNextRow(XPathEntityProcessor.java:225)
    at org.apache.solr.handler.dataimport.XPathEntityProcessor.nextRow(XPathEntityProcessor.java:205)
    at org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:244)
    at org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:476)
    at org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:515)
    at org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:415)
    ... 5 more
Caused by: java.lang.RuntimeException: com.ctc.wstx.exc.WstxParsingException: Undeclared general entity "ldquo"
 at [row,col {unknown-source}]: [147,57]
    at org.apache.solr.handler.dataimport.XPathRecordReader.streamRecords(XPathRecordReader.java:188)
    at org.apache.solr.handler.dataimport.XPathEntityProcessor.initQuery(XPathEntityProcessor.java:319)
    ... 11 more
Caused by: com.ctc.wstx.exc.WstxParsingException: Undeclared general entity "ldquo"
 at [row,col {unknown-source}]: [147,57]
    at com.ctc.wstx.sr.StreamScanner.constructWfcException(StreamScanner.java:614)
    at com.ctc.wstx.sr.StreamScanner.throwParseError(StreamScanner.java:487)
    at com.ctc.wstx.sr.BasicStreamReader.handleUndeclaredEntity(BasicStreamReader.java:5470)
    at com.ctc.wstx.sr.StreamScanner.expandUnresolvedEntity(StreamScanner.java:1742)
    at com.ctc.wstx.sr.StreamScanner.expandEntity(StreamScanner.java:1626)
    at com.ctc.wstx.sr.StreamScanner.fullyResolveEntity(StreamScanner.java:1564)
    at com.ctc.wstx.sr.BasicStreamReader.skipTokenText(BasicStreamReader.java:3604)
    at com.ctc.wstx.sr.BasicStreamReader.skipToken(BasicStreamReader.java:3369)
    at com.ctc.wstx.sr.BasicStreamReader.nextFromTree(BasicStreamReader.java:2629)
    at com.ctc.wstx.sr.BasicStreamReader.next(BasicStreamReader.java:1073)
    at org.apache.solr.handler.dataimport.XPathRecordReader$Node.handleStartElement(XPathRecordReader.java:377)
    at org.apache.solr.handler.dataimport.XPathRecordReader$Node.parse(XPathRecordReader.java:311)
    at org.apache.solr.handler.dataimport.XPathRecordReader$Node.handleStartElement(XPathRecordReader.java:347)
    at org.apache.solr.handler.dataimport.XPathRecordReader$Node.parse(XPathRecordReader.java:311)
    at org.apache.solr.handler.dataimport.XPathRecordReader$Node.handleStartElement(XPathRecordReader.java:347)
    at org.apache.solr.handler.dataimport.XPathRecordReader$Node.parse(XPathRecordReader.java:311)
    at org.apache.solr.handler.dataimport.XPathRecordReader$Node.handleStartElement(XPathRecordReader.java:347)
    at org.apache.solr.handler.dataimport.XPathRecordReader$Node.parse(XPathRecordReader.java:311)
    at org.apache.solr.handler.dataimport.XPathRecordReader$Node.access$200(XPathRecordReader.java:203)
    at org.apache.solr.handler.dataimport.XPathRecordReader.streamRecords(XPathRecordReader.java:185)
    ... 12 more

日志中最底层的异常指出了根本原因:XML 中存在未声明的通用实体 “ldquo”(Undeclared general entity "ldquo")。

我的 xml 中含有 &ldquo;、&reg; 这类 XHTML 实体,而这些实体对索引来说并不是必需的。我该如何忽略它们,并照常从 xml 中取出字段值?我使用的是 Solr 5.4.1 实例,Java 版本为 “1.7.0_45”。

4

1 回答 1

-1

对于批量插入,我使用更新命令而不是从文件夹导入。我认为性能是一样的。

例如,要导入 csv 文件,可以使用以下命令(将 <core_name> 和 <file_path> 替换为实际的核心名称和文件路径): http://localhost:8983/solr/<core_name>/update/csv?stream.file=<file_path>&stream.contentType=text/plain;charset=utf-8

于 2016-01-28T08:43:13.050 回答