我需要获取不同文件的元数据和内容。
为此,我编写了一个 C++ 程序(program),它通过 JNI 将文件路径发送给 Java,Java 类调用 Tika 得到所需的结果,再把结果返回给 C++。我的问题是:对于一个大约 10 MB 的文件,整个过程需要 130 秒,这太慢了。
我想知道:如果 Tika server 已经初始化并正在运行,JNI 是否允许我与这个 Tika 进程进行通信?
如果可以,那么如何在 C++ 中发送文件名并从 Tika 获取结果?又该如何获得指向 JavaVM 和 JNIEnv 的指针?
下面是我当前的 C++ 和 Java 代码:
// ConsoleApplication8.cpp : Defines the entry point for the console application.
#include <windows.h>
#include "stdafx.h"
#include <iostream>
#include <jni.h>
#include <ctime>
using namespace std;
int _tmain(int argc, _TCHAR* argv[])
{
clock_t begin = clock();
JavaVM *jvm; /* denotes a Java VM */
JNIEnv *env; /* pointer to native method interface */
JavaVMInitArgs vm_args; /* JDK/JRE 6 VM initialization arguments */
JavaVMOption* options = new JavaVMOption[4];
options[0].optionString = "-Djava.compiler=NONE"; /* disable JIT */
options[1].optionString = "-Djava.class.path=C:/Users/yv/workspace/TServer/bin;C:/Users/yv/workspace/TServer/src/tikaserver12.jar"; // my class and jar is under this directory. /* user classes */
// options[2].optionString = "-Djava.library.path=c:\\Program Files\\Java\\jdk1.7.0_17\\lib"; /* set native library path C:\Program Files\Java\jdk1.6.0_38\lib */
options[2].optionString = "-Djava.library.path=c:\\Program Files\\Java\\jdk1.6.0_38\\lib"; /* set native library path */
options[3].optionString = "-verbose:jni";
vm_args.version = JNI_VERSION_1_6;
vm_args.nOptions = 4;
vm_args.options = options;
vm_args.ignoreUnrecognized = false;
/* load and initialize a Java VM, return a JNI interface pointer in env */
JNI_CreateJavaVM(&jvm, (void **)&env, &vm_args);
jclass cls = env->FindClass("TikaWrapper");
jmethodID mid = env->GetStaticMethodID(cls, "pFile","(Ljava/lang/String;)Ljava/lang/String;");
const char* utf_string = "Not copied anything yet";
jboolean isCopy;
jstring estr = env->NewStringUTF(" Getten bu string sonuç olarak dönecek");
jstring estr3 = env->NewStringUTF("C:/cu.docx");
estr=(jstring) env->CallStaticObjectMethod(cls, mid, estr3);
utf_string = env->GetStringUTFChars(estr, &isCopy);
cout<<utf_string;
clock_t end = clock();
double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
cout<<elapsed_secs;
if (isCopy == JNI_TRUE) {
env->ReleaseStringUTFChars(estr, utf_string);
}
delete[] options;
jvm->DestroyJavaVM();
system("pause");
return 0;
}
而 Java 代码如下:
import java.io.BufferedInputStream;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.Reader;
import java.net.URL;
import org.apache.tika.Tika;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
 * Small facade over Apache Tika that extracts metadata and body text from a
 * document and renders them as an XML-like string. {@link #pFile(String)} is
 * the entry point invoked from native code; {@link #main(String[])} allows
 * the same logic to be run from the command line.
 */
public class TikaWrapper {
    static final int BUFFER_SIZE = 8192;

    /**
     * Parses the given document with Tika's auto-detecting parser and returns
     * selected metadata fields plus the extracted body text.
     *
     * <p>As a side effect the extracted body text is also written to
     * {@code Z:/out.txt}.
     *
     * <p>NOTE(review): metadata values and body text are inserted verbatim,
     * without XML escaping, exactly as before; a document containing
     * {@code <} or {@code &} still yields output that an XML parser rejects.
     *
     * @param filename path of a local file, or a URL string if no such file exists
     * @return the rendered document description
     * @throws Exception if the dump file cannot be opened, the input cannot
     *         be read, or Tika fails to parse it
     */
    @SuppressWarnings("deprecation")
    public static String pFile(String filename) throws Exception {
        System.out.println("Java ParseFile a geldim elimde : ");
        System.out.println(filename);

        ParseContext context = new ParseContext();
        Detector detector = new DefaultDetector();
        Parser parser = new AutoDetectParser(detector);
        context.set(Parser.class, parser);

        ByteArrayOutputStream outputstream = new ByteArrayOutputStream();
        Metadata metadata = new Metadata();

        // try/finally instead of try-with-resources: the host program targets JDK 1.6.
        PrintWriter out = new PrintWriter("Z:/out.txt");
        try {
            URL url;
            File file = new File(filename);
            if (file.isFile()) {
                url = file.toURI().toURL();
            } else {
                url = new URL(filename);
            }

            InputStream input = TikaInputStream.get(url, metadata);
            try {
                ContentHandler handler = new BodyContentHandler(outputstream);
                parser.parse(input, handler, metadata, context);
            } finally {
                // Close the stream even when parsing throws.
                input.close();
            }
            metadata.add("metaerdem", "deneme");

            String content = outputstream.toString();

            StringBuilder result = new StringBuilder();
            result.append("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n");
            result.append("<document>\n");
            result.append("<Name>").append(metadata.get(Metadata.RESOURCE_NAME_KEY)).append("</Name>\n");
            result.append("<Title>").append(metadata.get(Metadata.TITLE)).append("</Title>\n");
            result.append("<Author>").append(metadata.get(Metadata.AUTHOR)).append("</Author>\n");
            result.append("<LastSavedBy>").append(metadata.get(Metadata.LAST_AUTHOR)).append("</LastSavedBy>\n");
            result.append("<ContentCreated>").append(metadata.get(Metadata.CREATION_DATE)).append("</ContentCreated>\n");
            result.append("<DateLastSaved>").append(metadata.get(Metadata.LAST_SAVED)).append("</DateLastSaved>\n");
            result.append("<TotalEditingTime>").append(metadata.get(Metadata.TOTAL_TIME)).append("</TotalEditingTime>\n");
            result.append("<Company>").append(metadata.get(Metadata.COMPANY)).append("</Company>\n");
            result.append("<CharacterCount>").append(metadata.get(Metadata.CHARACTER_COUNT)).append("</CharacterCount>\n");
            result.append("<WordCount>").append(metadata.get(Metadata.WORD_COUNT)).append("</WordCount>\n");
            result.append("<ParagraphCount>").append(metadata.get(Metadata.PARAGRAPH_COUNT)).append("</ParagraphCount>\n");
            result.append("<Pages>").append(metadata.get(Metadata.PAGE_COUNT)).append("</Pages>\n");
            result.append("<ERDEM>").append(metadata.get("metaerdem")).append("</ERDEM>\n");
            result.append("<Tags>\n");
            result.append("<Content>").append(content).append("</Content>\n");
            // The original left <Tags> and <document> open; close them so the
            // element structure is balanced. Everything before this point is
            // unchanged.
            result.append("</Tags>\n");
            result.append("</document>\n");

            out.println(content);
            return result.toString();
        } finally {
            // PrintWriter buffers its output; without close() the dump file
            // could stay empty and the file handle would leak.
            out.close();
        }
    }

    /**
     * Command-line entry point. With no arguments it parses {@code C:\a.pdf}
     * and prints the result; otherwise it echoes the arguments, parses the
     * first one and prints status messages only.
     *
     * @param args optional; {@code args[0]} is the file to parse
     * @throws Exception propagated from {@link #pFile(String)}
     */
    public static void main(String[] args) throws Exception {
        String filename;
        if (args.length == 0) {
            System.out.println("(None)");
            filename = "C:\\a.pdf";
            System.out.print("Got the file from Java: " + filename);
            System.out.print(pFile(filename));
        } else {
            System.out.println(args.length + " arguments sent to this program");
            for (int i = 0; i < args.length; i++) {
                System.out.println(args[i] + " ");
            }
            filename = args[0];
            // Result intentionally not printed here; only the side effects
            // of pFile (console echo, dump file) are wanted on this path.
            pFile(filename);
            System.out.print("File is sent from C++ : " + filename);
            System.out.print("Sonuç C++ da yazılacak");
        }
        System.out.print("bitti");
    }
}