0

我正在使用 Tesseract 处理一些图像,然后尝试使用 PoDoFo C++ 库合并所有生成的 PDF。

尝试了 2 种方法(第一种是我需要的):

  1. 使用 Tesseract C++ API 和 PoDoFo C++ 库

我的代码有点像这样:

对于 OCR 部分(运行 001.jpg & 002.jpg):

    const char* input_image = "001.jpg";
    const char* output_base = "001";
    const char* datapath = "/home/test/Desktop/Example2";

    int timeout_ms = 5000;
    const char* retry_config = nullptr;
    bool textonly = false;

    tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();
    if (api->Init(datapath, "eng")) {
        fprintf(stderr, "Could not initialize tesseract.\n");
        exit(1);
    }

   tesseract::TessPDFRenderer *renderer = new tesseract::TessPDFRenderer(
              output_base, api->GetDatapath(), textonly);

    bool succeed = api->ProcessPages(input_image, retry_config, timeout_ms, renderer);
    if (!succeed) {
      fprintf(stderr, "Error during processing.\n");
      return EXIT_FAILURE;
    }

    api->End();
    return EXIT_SUCCESS;

对于 PDF 合并部分:

void mergePDF(std::vector<char*> inputfiles,char* outputfile) {
    try {

        /*Reading first PDF */
        fprintf(stdout,"Reading file: %s\n",inputfiles[0]);
        PoDoFo::PdfMemDocument doc1;
        doc1.Load(inputfiles[0]);

        /*Reading Second PDF */
        fprintf(stdout,"Reading file: %s\n",inputfiles[1]);
        PoDoFo::PdfMemDocument doc2;
        doc2.Load(inputfiles[1]);


        /* Appending doc1 to doc1 */
        doc1.Append(doc2);


        fprintf(stdout,"Writing files to %s\n ",outputfile);
        doc1.Write(outputfile);
    }
    catch(const PoDoFo::PdfError& e) {
        throw e;
    }
}

/**
 * Command-line entry point: merge two PDFs into one.
 *
 * Expected arguments: argv[1] and argv[2] are the input PDFs, argv[3] is the
 * output path. Calls printHelp() and exits with EXIT_FAILURE when fewer than
 * three arguments are supplied.
 *
 * @return EXIT_SUCCESS on success, or the PoDoFo error code if merging failed
 */
int main(int argc,char* argv[]) {
    // argv[1], argv[2] and argv[3] are all read below, so argc must be >= 4.
    if (argc < 4) {
        printHelp();
        exit(EXIT_FAILURE);
    }

    PoDoFo::PdfError::EnableDebug(false);

    std::vector<char*> inputfiles;
    inputfiles.emplace_back(argv[1]);
    inputfiles.emplace_back(argv[2]);
    char* outputfile = argv[3];

    try {
        mergePDF(inputfiles,outputfile);
    }
    catch(const PoDoFo::PdfError &e) {
        fprintf(stderr,"Error %i occured!\n",e.GetError());
        e.PrintErrorMsg();
        return e.GetError();
    }
    return EXIT_SUCCESS;
}

输出:

Warning: Invalid resolution 0 dpi. Using 70 instead.
Warning: Invalid resolution 0 dpi. Using 70 instead.

Reading file: /home/test/Desktop/Example2/001.pdf
Error 17 occured!


PoDoFo encountered an error. Error: 17 ePdfError_NoEOFToken
    Error Description: No EOF Marker was found in the PDF file.
    Callstack:
    #0 Error Source: /home/test/podofo/src/podofo/doc/PdfMemDocument.cpp:263
        Information: Handler fixes issue #49
    #1 Error Source: /home/test/podofo/src/podofo/base/PdfParser.cpp:272
        Information: Unable to load objects from file.
    #2 Error Source: /home/test/podofo/src/podofo/base/PdfParser.cpp:310
        Information: EOF marker could not be found.
    #3 Error Source: /home/test/podofo/src/podofo/base/PdfParser.cpp:1528
  2. 使用 Tesseract 命令行实用程序和 PoDoFo C++ 库

对于 OCR 部分,我使用 Tesseract CLI 工具,如下所示:

tesseract 001.jpg 001 pdf
tesseract 002.jpg 002 pdf

对于 PDF 合并部分,代码与上面第 1) 点相同。

输出:

Reading file: /home/test/Desktop/Example2/001.pdf
Reading file: /home/test/Desktop/Example2/002.pdf
Fixing references in 13 0 R by 12
Fixing references in 14 0 R by 12
Fixing references in 15 0 R by 12
Fixing references in 16 0 R by 12
Fixing references in 17 0 R by 12
Fixing references in 18 0 R by 12
Fixing references in 19 0 R by 12
Fixing references in 20 0 R by 12
Fixing references in 21 0 R by 12
Fixing references in 22 0 R by 12
Fixing references in 23 0 R by 12
Fixing references in 24 0 R by 12
Reading file: /home/test/Desktop/Example2/output.pdf

我想知道为什么在使用 Tesseract C++ API 后会出现 EOF 标记问题,但在使用 Tesseract CLI 工具后却没有这样的问题。

我是否在上面第 1) 点的 OCR 代码部分中遗漏了某些内容?

4

0 回答 0