1

我希望 Tika 只解析 zip 文件和 pdf 文件。

使用如下内容的 tika_config.xml:

<?xml version="1.0" encoding="UTF-8"?>
<properties>
  <parsers>
    <parser class="org.apache.tika.parser.pkg.PackageParser"/>
    <parser class="org.apache.tika.parser.pdf.PDFParser"/>
  </parsers>
</properties>

启动 tika-server 1.17:

java -jar tika-1.17-src/tika-1.17/tika-server/target/tika-server-1.17.jar --config tika_config.xml -enableUnsecureFeatures -enableFileUrl

提交一个由 pdf 和 txt 文件组成的 zip 文件:

curl -H "fileUrl:file:///home/[...]/mixed.zip" -X PUT http://localhost:9998/rmeta/text --header "Accept: application/json" > output.txt

我得到的输出如下:

[
  {
    "Content-Type": "application/zip",
    "X-Parsed-By": [
      "org.apache.tika.parser.CompositeParser",
      "org.apache.tika.parser.pkg.PackageParser"
    ],
    "X-TIKA:content": "\n\n\n\n\n\n\n\nmixed-1.pdf\n\n\nmixed-2.txt\n\n\nmixed-3.pdf\n\n\nmixed-4.txt\n\n",
    "X-TIKA:parse_time_millis": "16"
  },
  {
    "Content-Length": "-1",
    "Content-Type": "application/pdf",
    "Creation-Date": "2018-01-23T21:07:49Z",
    "Last-Modified": "2018-01-23T21:07:50Z",
    "Last-Save-Date": "2018-01-23T21:07:50Z",
    "X-Parsed-By": [
      "org.apache.tika.parser.CompositeParser",
      "org.apache.tika.parser.pdf.PDFParser"
    ],
    "X-TIKA:content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nplop\n\n\n",
    "X-TIKA:embedded_resource_path": "/mixed-1.pdf",
    "X-TIKA:parse_time_millis": "6",
    "access_permission:assemble_document": "true",
    "access_permission:can_modify": "true",
    "access_permission:can_print": "true",
    "access_permission:can_print_degraded": "true",
    "access_permission:extract_content": "true",
    "access_permission:extract_for_accessibility": "true",
    "access_permission:fill_in_form": "true",
    "access_permission:modify_annotations": "true",
    "created": "Tue Jan 23 22:07:49 CET 2018",
    "date": "2018-01-23T21:07:50Z",
    "dc:format": "application/pdf; version=1.4",
    "dcterms:created": "2018-01-23T21:07:49Z",
    "dcterms:modified": "2018-01-23T21:07:50Z",
    "embeddedRelationshipId": "mixed-1.pdf",
    "meta:creation-date": "2018-01-23T21:07:49Z",
    "meta:save-date": "2018-01-23T21:07:50Z",
    "modified": "2018-01-23T21:07:50Z",
    "pdf:PDFVersion": "1.4",
    "pdf:docinfo:created": "2018-01-23T21:07:49Z",
    "pdf:docinfo:creator_tool": "Writer",
    "pdf:docinfo:producer": "LibreOffice 5.4",
    "pdf:encrypted": "false",
    "producer": "LibreOffice 5.4",
    "resourceName": "mixed-1.pdf",
    "xmp:CreatorTool": "Writer",
    "xmpTPg:NPages": "1"
  },
  {
    "Content-Length": "-1",
    "Content-Type": "text/plain",
    "Last-Modified": "2018-01-23T21:08:30Z",
    "Last-Save-Date": "2018-01-23T21:08:30Z",
    "X-Parsed-By": "org.apache.tika.server.resource.TikaResource$1",
    "X-TIKA:EXCEPTION:embedded_exception": "org.apache.tika.exception.TikaException: Unexpected RuntimeException from org.apache.tika.server.resource.TikaResource$1@22c07473\n\tat org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:282)\n\tat org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:143)\n\tat org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:188)\n\tat org.apache.tika.parser.RecursiveParserWrapper$EmbeddedParserDecorator.parse(RecursiveParserWrapper.java:317)\n\tat org.apache.tika.parser.DelegatingParser.parse(DelegatingParser.java:72)\n\tat org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor.parseEmbedded(ParsingEmbeddedDocumentExtractor.java:102)\n\tat org.apache.tika.parser.pkg.PackageParser.parseEntry(PackageParser.java:346)\n\tat org.apache.tika.parser.pkg.PackageParser.parse(PackageParser.java:283)\n\tat org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)\n\tat org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:188)\n\tat org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)\n\tat org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:143)\n\tat org.apache.tika.parser.RecursiveParserWrapper.parse(RecursiveParserWrapper.java:158)\n\tat org.apache.tika.server.resource.TikaResource.parse(TikaResource.java:322)\n\tat org.apache.tika.server.resource.RecursiveMetadataResource.parseMetadata(RecursiveMetadataResource.java:139)\n\tat org.apache.tika.server.resource.RecursiveMetadataResource.getMetadata(RecursiveMetadataResource.java:120)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat org.apache.cxf.service.invoker.AbstractInvoker.performInvocation(AbstractInvoker.java:181)\n\tat 
org.apache.cxf.service.invoker.AbstractInvoker.invoke(AbstractInvoker.java:97)\n\tat org.apache.cxf.jaxrs.JAXRSInvoker.invoke(JAXRSInvoker.java:202)\n\tat org.apache.cxf.jaxrs.JAXRSInvoker.invoke(JAXRSInvoker.java:101)\n\tat org.apache.cxf.interceptor.ServiceInvokerInterceptor$1.run(ServiceInvokerInterceptor.java:59)\n\tat org.apache.cxf.interceptor.ServiceInvokerInterceptor.handleMessage(ServiceInvokerInterceptor.java:96)\n\tat org.apache.cxf.phase.PhaseInterceptorChain.doIntercept(PhaseInterceptorChain.java:307)\n\tat org.apache.cxf.transport.ChainInitiationObserver.onMessage(ChainInitiationObserver.java:121)\n\tat org.apache.cxf.transport.http.AbstractHTTPDestination.invoke(AbstractHTTPDestination.java:274)\n\tat org.apache.cxf.transport.http_jetty.JettyHTTPDestination.doService(JettyHTTPDestination.java:261)\n\tat org.apache.cxf.transport.http_jetty.JettyHTTPHandler.handle(JettyHTTPHandler.java:76)\n\tat org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1088)\n\tat org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1024)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)\n\tat org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)\n\tat org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)\n\tat org.eclipse.jetty.server.Server.handle(Server.java:370)\n\tat org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:494)\n\tat org.eclipse.jetty.server.AbstractHttpConnection.headerComplete(AbstractHttpConnection.java:973)\n\tat org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.headerComplete(AbstractHttpConnection.java:1035)\n\tat org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:641)\n\tat org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:231)\n\tat org.eclipse.jetty.server.AsyncHttpConnection.handle(AsyncHttpConnection.java:82)\n\tat 
org.eclipse.jetty.io.nio.SelectChannelEndPoint.handle(SelectChannelEndPoint.java:696)\n\tat org.eclipse.jetty.io.nio.SelectChannelEndPoint$1.run(SelectChannelEndPoint.java:53)\n\tat org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)\n\tat org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)\n\tat java.lang.Thread.run(Thread.java:748)\nCaused by: javax.ws.rs.WebApplicationException: HTTP 415 Unsupported Media Type\n\tat org.apache.tika.server.resource.TikaResource$1.parse(TikaResource.java:120)\n\tat org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)\n\t... 47 more\n",
    "X-TIKA:embedded_resource_path": "/mixed-2.txt",
    "X-TIKA:parse_time_millis": "1",
    "date": "2018-01-23T21:08:30Z",
    "dcterms:modified": "2018-01-23T21:08:30Z",
    "embeddedRelationshipId": "mixed-2.txt",
    "meta:save-date": "2018-01-23T21:08:30Z",
    "modified": "2018-01-23T21:08:30Z",
    "resourceName": "mixed-2.txt"
  },
  {
    "Content-Length": "-1",
    "Content-Type": "application/pdf",
    "Creation-Date": "2018-01-23T21:07:49Z",
    "Last-Modified": "2018-01-23T21:07:50Z",
    "Last-Save-Date": "2018-01-23T21:07:50Z",
    "X-Parsed-By": [
      "org.apache.tika.parser.CompositeParser",
      "org.apache.tika.parser.pdf.PDFParser"
    ],
    "X-TIKA:content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nplop\n\n\n",
    "X-TIKA:embedded_resource_path": "/mixed-3.pdf",
    "X-TIKA:parse_time_millis": "5",
    "access_permission:assemble_document": "true",
    "access_permission:can_modify": "true",
    "access_permission:can_print": "true",
    "access_permission:can_print_degraded": "true",
    "access_permission:extract_content": "true",
    "access_permission:extract_for_accessibility": "true",
    "access_permission:fill_in_form": "true",
    "access_permission:modify_annotations": "true",
    "created": "Tue Jan 23 22:07:49 CET 2018",
    "date": "2018-01-23T21:07:50Z",
    "dc:format": "application/pdf; version=1.4",
    "dcterms:created": "2018-01-23T21:07:49Z",
    "dcterms:modified": "2018-01-23T21:07:50Z",
    "embeddedRelationshipId": "mixed-3.pdf",
    "meta:creation-date": "2018-01-23T21:07:49Z",
    "meta:save-date": "2018-01-23T21:07:50Z",
    "modified": "2018-01-23T21:07:50Z",
    "pdf:PDFVersion": "1.4",
    "pdf:docinfo:created": "2018-01-23T21:07:49Z",
    "pdf:docinfo:creator_tool": "Writer",
    "pdf:docinfo:producer": "LibreOffice 5.4",
    "pdf:encrypted": "false",
    "producer": "LibreOffice 5.4",
    "resourceName": "mixed-3.pdf",
    "xmp:CreatorTool": "Writer",
    "xmpTPg:NPages": "1"
  },
  {
    "Content-Length": "-1",
    "Content-Type": "text/plain",
    "Last-Modified": "2018-01-23T21:08:30Z",
    "Last-Save-Date": "2018-01-23T21:08:30Z",
    "X-Parsed-By": "org.apache.tika.server.resource.TikaResource$1",
    "X-TIKA:EXCEPTION:embedded_exception": "org.apache.tika.exception.TikaException: Unexpected RuntimeException from org.apache.tika.server.resource.TikaResource$1@22c07473\n\tat org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:282)\n\tat org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:143)\n\tat org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:188)\n\tat org.apache.tika.parser.RecursiveParserWrapper$EmbeddedParserDecorator.parse(RecursiveParserWrapper.java:317)\n\tat org.apache.tika.parser.DelegatingParser.parse(DelegatingParser.java:72)\n\tat org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor.parseEmbedded(ParsingEmbeddedDocumentExtractor.java:102)\n\tat org.apache.tika.parser.pkg.PackageParser.parseEntry(PackageParser.java:346)\n\tat org.apache.tika.parser.pkg.PackageParser.parse(PackageParser.java:283)\n\tat org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)\n\tat org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:188)\n\tat org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)\n\tat org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:143)\n\tat org.apache.tika.parser.RecursiveParserWrapper.parse(RecursiveParserWrapper.java:158)\n\tat org.apache.tika.server.resource.TikaResource.parse(TikaResource.java:322)\n\tat org.apache.tika.server.resource.RecursiveMetadataResource.parseMetadata(RecursiveMetadataResource.java:139)\n\tat org.apache.tika.server.resource.RecursiveMetadataResource.getMetadata(RecursiveMetadataResource.java:120)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat org.apache.cxf.service.invoker.AbstractInvoker.performInvocation(AbstractInvoker.java:181)\n\tat 
org.apache.cxf.service.invoker.AbstractInvoker.invoke(AbstractInvoker.java:97)\n\tat org.apache.cxf.jaxrs.JAXRSInvoker.invoke(JAXRSInvoker.java:202)\n\tat org.apache.cxf.jaxrs.JAXRSInvoker.invoke(JAXRSInvoker.java:101)\n\tat org.apache.cxf.interceptor.ServiceInvokerInterceptor$1.run(ServiceInvokerInterceptor.java:59)\n\tat org.apache.cxf.interceptor.ServiceInvokerInterceptor.handleMessage(ServiceInvokerInterceptor.java:96)\n\tat org.apache.cxf.phase.PhaseInterceptorChain.doIntercept(PhaseInterceptorChain.java:307)\n\tat org.apache.cxf.transport.ChainInitiationObserver.onMessage(ChainInitiationObserver.java:121)\n\tat org.apache.cxf.transport.http.AbstractHTTPDestination.invoke(AbstractHTTPDestination.java:274)\n\tat org.apache.cxf.transport.http_jetty.JettyHTTPDestination.doService(JettyHTTPDestination.java:261)\n\tat org.apache.cxf.transport.http_jetty.JettyHTTPHandler.handle(JettyHTTPHandler.java:76)\n\tat org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1088)\n\tat org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1024)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)\n\tat org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)\n\tat org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)\n\tat org.eclipse.jetty.server.Server.handle(Server.java:370)\n\tat org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:494)\n\tat org.eclipse.jetty.server.AbstractHttpConnection.headerComplete(AbstractHttpConnection.java:973)\n\tat org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.headerComplete(AbstractHttpConnection.java:1035)\n\tat org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:641)\n\tat org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:231)\n\tat org.eclipse.jetty.server.AsyncHttpConnection.handle(AsyncHttpConnection.java:82)\n\tat 
org.eclipse.jetty.io.nio.SelectChannelEndPoint.handle(SelectChannelEndPoint.java:696)\n\tat org.eclipse.jetty.io.nio.SelectChannelEndPoint$1.run(SelectChannelEndPoint.java:53)\n\tat org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)\n\tat org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)\n\tat java.lang.Thread.run(Thread.java:748)\nCaused by: javax.ws.rs.WebApplicationException: HTTP 415 Unsupported Media Type\n\tat org.apache.tika.server.resource.TikaResource$1.parse(TikaResource.java:120)\n\tat org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)\n\t... 47 more\n",
    "X-TIKA:embedded_resource_path": "/mixed-4.txt",
    "X-TIKA:parse_time_millis": "0",
    "date": "2018-01-23T21:08:30Z",
    "dcterms:modified": "2018-01-23T21:08:30Z",
    "embeddedRelationshipId": "mixed-4.txt",
    "meta:save-date": "2018-01-23T21:08:30Z",
    "modified": "2018-01-23T21:08:30Z",
    "resourceName": "mixed-4.txt"
  }
]

对于不是预期类型的文件,在元数据字段 X-TIKA:EXCEPTION:embedded_exception 中得到异常堆栈跟踪是否正常?

有没有办法在不引发异常的情况下忽略这些文件,例如将 EmptyParser 与所有可能的类型显式关联?

更新:已将问题 1 单独拆分到另一个问题“为 Tika 定义 .TXT 文件的 MIME 类型”中;本问题澄清了问题 2,并为其补充了日志。

4

0 回答 0