我正在尝试使用 Tesseract(Tess4J)OCR 将 PDF 文件中的图像(每一页都是一张扫描图像)转换为文本,但无法正常工作:`reader.getPdfObject(i)` 每次返回的对象都是 null(参见代码中的注释)。
/**
 * Runs Tesseract OCR on every image stream found in the PDF at {@code PREFACE}
 * and prints the recognized text to standard output.
 *
 * <p>The cross-reference table is scanned object by object. Only streams whose
 * {@code /Subtype} is {@code /Image} are decoded; every other stream (page
 * content, fonts, object streams, ...) is skipped, because it cannot be
 * interpreted as an image.
 *
 * @param args unused
 * @throws IOException if the PDF cannot be read or an image stream cannot be decoded
 */
public static void main(String[] args) throws IOException
{
    Tesseract instance = Tesseract.getInstance(); // JNA Interface Mapping
    PdfReader reader = new PdfReader(PREFACE); // a PDF File D:\pdf\test.pdf
    try {
        int n = reader.getXrefSize();
        // Looking for image streams and running OCR on each of them
        for (int i = 0; i < n; i++)
        {
            // Unused xref slots yield null here, so a null result just means
            // "nothing stored under this number" and is skipped.
            PdfObject object = reader.getPdfObject(i);
            if (object == null || !object.isStream())
                continue;
            PRStream stream = (PRStream) object;
            // Fully qualified because the file's import list is not visible from here.
            if (!com.itextpdf.text.pdf.PdfName.IMAGE.equals(
                    stream.getAsName(com.itextpdf.text.pdf.PdfName.SUBTYPE)))
                continue; // not an image XObject
            PdfImageObject image = new PdfImageObject(stream);
            BufferedImage bi = image.getBufferedImage();
            if (bi == null) {
                // No decodable image was produced for this stream; nothing to OCR.
                System.err.println("Skipping image object " + i + ": could not be decoded");
                continue;
            }
            try {
                String result = instance.doOCR(bi); // recognize the text in the decoded page image
                System.out.println(result);
            } catch (TesseractException e) {
                // Report the failure and carry on with the remaining images.
                System.err.println(e.getMessage());
            }
        }
    } finally {
        reader.close(); // release the underlying file handle
    }
}