我正在尝试使用 Apache Tika 解析一个 .doc 文件,其中包含 alpha、beta、gamma 等希腊字符,并且 tika 的结果与我的预期完全不同,我正在使用下面的代码来解析 .doc 文件
// Open the .doc file. Bug fix: the original called new FileInputStream() with
// no argument, which does not compile — FileInputStream requires a File or path.
FileInputStream fileInputStream = new FileInputStream(new File("input.doc"));
Parser parser = new AutoDetectParser();
// -1 disables BodyContentHandler's default 100,000-character write limit.
BodyContentHandler handler = new BodyContentHandler(-1);
ParseContext parseContext = new ParseContext();
parseContext.set(Parser.class, parser);
Metadata metadatafromtika = new Metadata();
// NOTE(review): CONTENT_ENCODING is only a hint for text-like inputs; a binary
// .doc file carries its own internal encoding, so the Word parser ignores it.
metadatafromtika.add(Metadata.CONTENT_ENCODING,"UTF-8");
parser.parse(fileInputStream, handler, metadatafromtika, parseContext);
String text = handler.toString();
我在该行中使用 UTF-8 编码
metadatafromtika.add(Metadata.CONTENT_ENCODING,"UTF-8");
以下是我正在使用的依赖项
<dependencies>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.18</version>
</dependency>
<dependency>
<groupId>commons-collections</groupId>
<artifactId>commons-collections</artifactId>
<version>3.2.1</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.9.1</version>
</dependency>
<dependency>
<groupId>org.antlr</groupId>
<artifactId>ST4</artifactId>
<version>4.0.8</version>
</dependency>
<dependency>
<groupId>org.postgresql</groupId>
<artifactId>postgresql</artifactId>
<version>42.1.4</version>
</dependency>
<dependency>
<groupId>com.zaxxer</groupId>
<artifactId>HikariCP</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>commons-dbutils</groupId>
<artifactId>commons-dbutils</artifactId>
<version>1.6</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.5</version>
</dependency>
<dependency>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
<version>20171018</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>1.1.0-cdh5.10.1</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.6.0-cdh5.10.1</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.6.0-cdh5.10.1</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>2.6.0-cdh5.10.1</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-tools</artifactId>
<version>2.6.0-mr1-cdh5.10.1</version>
</dependency>
<dependency>
<groupId>org.apache.htrace</groupId>
<artifactId>htrace-core4</artifactId>
<version>4.0.1-incubating</version>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.8.1</version>
</dependency>
<dependency>
<groupId>com.levigo.jbig2</groupId>
<artifactId>levigo-jbig2-imageio</artifactId>
<version>1.6.5</version>
</dependency>
<dependency>
<groupId>com.github.jai-imageio</groupId>
<artifactId>jai-imageio-core</artifactId>
<version>1.3.1</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>2.9.5</version>
</dependency>
</dependencies>
word文档中的内容是
当我使用上面的 tika 代码时得到的输出是
UTF-8 编码是否不适合使用 Apache Tika 解析希腊字符?还是我在代码中遗漏了什么?
提前致谢
编辑:这是我正在使用的完整 java 代码
import org.apache.commons.io.FileUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import java.io.File;
import java.io.FileInputStream;
import java.nio.charset.StandardCharsets;
public class Tika {

    /**
     * Parses the document at {@code args[0]} with Tika's AutoDetectParser and
     * writes the extracted text, UTF-8 encoded, to
     * {@code args[1]/<inputFileName>_content.txt}.
     *
     * @param args args[0] = input file path, args[1] = output directory
     */
    public static void main(String[] args) {
        try {
            String inputPath = args[0];
            String outputPath = args[1];
            File f = new File(inputPath);
            System.out.println("path is : " + f.getAbsoluteFile());
            // try-with-resources: the original never closed the stream (leak).
            try (FileInputStream fileInputStream = new FileInputStream(f)) {
                Parser parser = new AutoDetectParser();
                // -1 removes BodyContentHandler's default 100k character limit.
                BodyContentHandler handler = new BodyContentHandler(-1);
                ParseContext parseContext = new ParseContext();
                parseContext.set(Parser.class, parser);
                Metadata metadatafromtika = new Metadata();
                // NOTE(review): only a hint; the binary .doc parser ignores it.
                metadatafromtika.add(Metadata.CONTENT_ENCODING, "UTF-8");
                parser.parse(fileInputStream, handler, metadatafromtika, parseContext);
                String text = handler.toString();
                System.out.println("done parsing for file : " + f.getAbsolutePath());
                System.out.println("text is : \n" + text);
                // Bug fix: text.getBytes() used the platform default charset
                // while the decode below used UTF-8 — on a non-UTF-8 platform
                // that mismatch corrupts exactly the non-ASCII (Greek/Symbol)
                // characters in question. Encode and decode with the same charset.
                byte[] bytes = text.getBytes(StandardCharsets.UTF_8);
                String encodedText = new String(bytes, StandardCharsets.UTF_8);
                System.out.println("encoded text is : " + encodedText);
                FileUtils.writeStringToFile(new File(outputPath + File.separator + f.getName() + "_content.txt"),
                        text, "UTF-8");
            }
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
}
编辑 2:下面是使用 PrintWriter 的代码
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.PrintWriter;
public class TikaTmp {

    /**
     * Same extraction as the Tika class, but writes the result through a
     * UTF-8 PrintWriter instead of Commons IO.
     *
     * @param args args[0] = input file path, args[1] = output directory
     */
    public static void main(String[] args) {
        FileInputStream fileInputStream = null;
        try {
            String inputPath = args[0];
            String outputPath = args[1];
            File f = new File(inputPath);
            System.out.println("path is : " + f.getAbsoluteFile());
            fileInputStream = new FileInputStream(f);
            Parser parser = new AutoDetectParser();
            // -1 removes BodyContentHandler's default 100k character limit.
            BodyContentHandler handler = new BodyContentHandler(-1);
            ParseContext parseContext = new ParseContext();
            parseContext.set(Parser.class, parser);
            Metadata metadatafromtika = new Metadata();
            // NOTE(review): only a hint; the binary .doc parser ignores it.
            metadatafromtika.add(Metadata.CONTENT_ENCODING, "UTF-8");
            parser.parse(fileInputStream, handler, metadatafromtika, parseContext);
            // Bug fix: the original referenced "output_path", which does not
            // exist — the declared local is "outputPath" — so it cannot compile.
            // try-with-resources also closes the writer even if write() throws.
            try (PrintWriter printWriter = new PrintWriter(new File(
                    outputPath + File.separator + f.getName() + "_content.txt"), "UTF-8")) {
                printWriter.write(String.valueOf(handler));
                printWriter.flush();
            }
        }
        catch (Exception e) {
            e.printStackTrace();
        }
        finally {
            try {
                if (fileInputStream != null)
                    fileInputStream.close();
            }
            catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
编辑 3:我试图解析的字符来自 microsoft word 使用的符号字体,Tika 仅对符号字体中的字符失败
我假设这些不是实际的希腊字符,但看起来像希腊字符