1

我需要获取不同文件的元数据和内容。

为了做到这一点，我编写了一个 C++ 程序，它通过 JNI 将文件路径发送给 Java，由 Java 类调用 Tika 获得所需的结果，再将结果返回给 C++。我的问题是：对于 10 MB 左右的文件，这个过程需要 130 秒，这太慢了。

我想知道：如果 Tika server 已经初始化并在运行，JNI 是否允许我与这个 Tika 进程进行通信？

那么，如何在 C++ 中发送文件名并从 Tika 获取结果？如何获得指向 JVM 和 JNIEnv 的指针？

这是我当前的 C++ 和 Java 代码

// ConsoleApplication8.cpp : Defines the entry point for the console application.
#include <windows.h> 
#include "stdafx.h"
#include <iostream>
#include <jni.h>       
#include <ctime>


using namespace std;

int _tmain(int argc, _TCHAR* argv[])
{ 
clock_t begin = clock();
JavaVM *jvm;       /* denotes a Java VM */
JNIEnv *env;       /* pointer to native method interface */
JavaVMInitArgs vm_args; /* JDK/JRE 6 VM initialization arguments */
JavaVMOption* options = new JavaVMOption[4];

options[0].optionString = "-Djava.compiler=NONE";           /* disable JIT */
options[1].optionString = "-Djava.class.path=C:/Users/yv/workspace/TServer/bin;C:/Users/yv/workspace/TServer/src/tikaserver12.jar";    // my class and jar is under this directory. /* user classes   */
//  options[2].optionString = "-Djava.library.path=c:\\Program Files\\Java\\jdk1.7.0_17\\lib";  /* set native library path C:\Program Files\Java\jdk1.6.0_38\lib */ 
options[2].optionString = "-Djava.library.path=c:\\Program Files\\Java\\jdk1.6.0_38\\lib";  /* set native library path */
options[3].optionString = "-verbose:jni";

vm_args.version = JNI_VERSION_1_6; 
vm_args.nOptions = 4;
vm_args.options = options;
vm_args.ignoreUnrecognized = false;

/* load and initialize a Java VM, return a JNI interface  pointer in env */
  JNI_CreateJavaVM(&jvm, (void **)&env, &vm_args);      

jclass cls = env->FindClass("TikaWrapper");
jmethodID mid = env->GetStaticMethodID(cls, "pFile","(Ljava/lang/String;)Ljava/lang/String;");

const char* utf_string = "Not copied anything yet";
jboolean isCopy;
jstring estr = env->NewStringUTF(" Getten bu string sonuç olarak dönecek");

jstring estr3 = env->NewStringUTF("C:/cu.docx");
estr=(jstring) env->CallStaticObjectMethod(cls, mid, estr3);
utf_string = env->GetStringUTFChars(estr, &isCopy);
cout<<utf_string;


clock_t end = clock();
double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
cout<<elapsed_secs;
if (isCopy == JNI_TRUE) {
    env->ReleaseStringUTFChars(estr, utf_string);
}

delete[] options;
jvm->DestroyJavaVM();

system("pause");
return 0;
}

而Java代码如下

import java.io.BufferedInputStream;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.Reader;
import java.net.URL;


import org.apache.tika.Tika;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * Wrapper around Apache Tika that parses a single document and returns its
 * metadata and body text as one XML-like string. {@link #pFile(String)} is
 * the method looked up and invoked from native code via JNI.
 */
public class TikaWrapper {
  static final int BUFFER_SIZE = 8192;

  /**
   * Parses the given document with Tika's auto-detecting parser and returns
   * selected metadata fields plus the extracted body text.
   *
   * <p>The extracted body text is additionally written to {@code Z:/out.txt}.
   *
   * <p>NOTE(review): metadata values and body text are inserted verbatim;
   * characters such as {@code <} or {@code &} are not escaped, so the result
   * is only well-formed XML when the document contains none of them.
   *
   * @param filename path of a local file, or a URL string if no such file exists
   * @return the XML-like report; missing metadata values appear as {@code null}
   * @throws Exception if the source cannot be opened, parsing fails, or the
   *     text dump cannot be written
   */
  @SuppressWarnings("deprecation")
  public static String pFile(String filename) throws Exception {
    System.out.println("Java ParseFile a geldim elimde : ");
    System.out.println(filename);

    ParseContext context = new ParseContext();
    Detector detector = new DefaultDetector();
    Parser parser = new AutoDetectParser(detector);
    context.set(Parser.class, parser);
    ByteArrayOutputStream outputstream = new ByteArrayOutputStream();
    Metadata metadata = new Metadata();

    // Treat the argument as a local file when one exists, otherwise as a URL.
    URL url;
    File file = new File(filename);
    if (file.isFile()) {
      url = file.toURI().toURL();
    } else {
      url = new URL(filename);
    }

    InputStream input = TikaInputStream.get(url, metadata);
    try {
      ContentHandler handler = new BodyContentHandler(outputstream);
      parser.parse(input, handler, metadata, context);
    } finally {
      // Close the source even when parsing throws.
      input.close();
    }
    metadata.add("metaerdem", "deneme");

    // Decode the captured text once and reuse it for the report and the dump.
    String content = outputstream.toString();

    StringBuilder result = new StringBuilder();
    result.append("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n");
    result.append("<document>\n");
    appendElement(result, "Name", metadata.get(Metadata.RESOURCE_NAME_KEY));
    appendElement(result, "Title", metadata.get(Metadata.TITLE));
    appendElement(result, "Author", metadata.get(Metadata.AUTHOR));
    appendElement(result, "LastSavedBy", metadata.get(Metadata.LAST_AUTHOR));
    appendElement(result, "ContentCreated", metadata.get(Metadata.CREATION_DATE));
    appendElement(result, "DateLastSaved", metadata.get(Metadata.LAST_SAVED));
    appendElement(result, "TotalEditingTime", metadata.get(Metadata.TOTAL_TIME));
    appendElement(result, "Company", metadata.get(Metadata.COMPANY));
    appendElement(result, "CharacterCount", metadata.get(Metadata.CHARACTER_COUNT));
    appendElement(result, "WordCount", metadata.get(Metadata.WORD_COUNT));
    appendElement(result, "ParagraphCount", metadata.get(Metadata.PARAGRAPH_COUNT));
    appendElement(result, "Pages", metadata.get(Metadata.PAGE_COUNT));
    appendElement(result, "ERDEM", metadata.get("metaerdem"));
    result.append("<Tags>\n");
    appendElement(result, "Content", content);
    // Close the elements opened above so the report is a complete document.
    result.append("</Tags>\n");
    result.append("</document>\n");

    // The writer buffers its output; closing it is what flushes the text to disk.
    PrintWriter out = new PrintWriter("Z:/out.txt");
    try {
      out.println(content);
    } finally {
      out.close();
    }

    return result.toString();
  }

  /** Appends {@code <name>value</name>} followed by a newline to {@code sb}. */
  private static void appendElement(StringBuilder sb, String name, String value) {
    sb.append('<').append(name).append('>')
        .append(value)
        .append("</").append(name).append(">\n");
  }

  /**
   * Command-line entry point. With no arguments it parses {@code C:\a.pdf}
   * and prints the report; otherwise it echoes the arguments and parses the
   * first one without printing the report.
   *
   * @param args optional; {@code args[0]} is the document to parse
   * @throws Exception if {@link #pFile(String)} fails
   */
  public static void main(String[] args) throws Exception {
    String filename;
    if (args.length == 0) {
      System.out.println("(None)");
      filename = "C:\\a.pdf";
      System.out.print("Got the file from Java: " + filename);
      System.out.print(pFile(filename));
    } else {
      System.out.println(args.length + " arguments sent to this program");
      for (String arg : args) {
        System.out.println(arg + " ");
      }
      filename = args[0];
      // The report is not printed on this path; the call still writes the text dump.
      pFile(filename);
      System.out.print("File is sent from C++ : " + filename);
      System.out.print("Sonuç C++ da yazılacak");
    }
    System.out.print("bitti");
  }

}
4

0 回答 0