0

我的任务是从扫描的文档/JPG 中提取文本,然后只获取下面提到的 6 个值,以便我可以在下一个屏幕/活动中自动填写表单数据。

我在带有 Blaze 版本(付费)的 android 应用程序中使用了 google cloud vision api,我得到了一个文本块的结果,但我只想从中提取一些信息,我该如何实现呢?

账单或收据的格式可能每次都不同,但我想从所有发票的文本块中提取以下 6 项内容,例如 -

  1. 供应商
  2. 帐户
  3. 描述
  4. 截止日期
  5. 发票号码
  6. 金额

是否有任何可用的工具/第三方库,可以让我在 Android 开发中使用?

注意 - 我认为不需要任何收据或账单图像样本,因为它可以是任何类型的账单或发票,我们只需要从提取的文本中提取 6 个提到的东西。

4

1 回答 1

1

在接下来的场景中,我将创建两种虚构的账单格式,然后编写代码算法来解析它们。我只会写算法,因为我不懂JAVA。

在此处输入图像描述

在第一列,我们有两张账单的精彩图片。在第二列中,我们有从 OCR 软件获得的文本数据。它就像一个简单的文本文件,没有实现任何逻辑。但是我们知道某些可以使它有意义的关键字。下面是将这个无意义的文件转换成逻辑清晰的 JSON 的算法。

// Text obtained from BILL format 1
// A quoted JavaScript string cannot span several physical lines, so each OCR
// line is listed separately and the lines are joined with "\r\n", the line
// separator the parsing step splits on.
var TEXT_FROM_OCR = [
    "Invoice no 12 Amount 55$",
    "Vendor name BusinessTest 1 Account No 1213113",
    "Due date 2019-12-07  ",
    "Description Lorem ipsum dolor est"
].join("\r\n");




// Text obtained from BILL format 2
// A quoted JavaScript string cannot span several physical lines, so each OCR
// line is listed separately and the lines are joined with "\r\n", the line
// separator the parsing step splits on. The column padding produced by the
// OCR step is kept as-is.
var TEXT_FROM_OCR = [
    "    BusinessTest22        ",
    "Invoice no    19    Amount    12$",
    "Account    4564544    Due date    2019-12-15",
    "Description            ",
    "Lorem ipsum dolor est            ",
    "Another description line            ",
    "Last description line"
].join("\r\n");




// Parsing templates, one entry per known bill layout.
// This is a JavaScript object literal written in a JSON-like shape (strict JSON
// would not allow the comments). Each field of a template is described by:
//   line_no_start / line_no_end     - zero-based line range holding the value, or null when unknown
//   start_delimiter / end_delimiter - text surrounding the value on a single line, or null
//   value_found                     - filled in by the parser with the extracted value
var TEMPLATES = {


    "bill_template_1": {
        "vendor": {
            "line_no_start": null,                // Unknown, so the text parser ignores it
            "line_no_end": null,                  // Unknown, so the text parser ignores it
            "start_delimiter": "Vendor name",     // Searched value starts immediately after this start delimiter
            "end_delimiter": "Account",           // Searched value ends just before this end delimiter
            "value_found": null                   // Save here the value we found
        },
        "account": {
            "line_no_start": null,                // Unknown, so the text parser ignores it
            "line_no_end": null,                  // Unknown, so the text parser ignores it
            "start_delimiter": "Account No",      // Searched value starts immediately after this start delimiter
            "end_delimiter": null,                // Extract everything until the end of the current line
            "value_found": null                   // Save here the value we found
        },
        "description": {
            // apply same logic as above
        },
        "due_date": {
            // apply same logic as above
        },
        "invoice_number": {
            // apply same logic as above
        },
        "amount": {
            // apply same logic as above
        },
    },


    "bill_template_2": {
        "vendor": {
            "line_no_start": 0,                   // Extract data from line zero
            "line_no_end": 0,                     // Extract data until line zero
            "start_delimiter": null,              // Ignore this, because our delimiter is a complete line
            "end_delimiter": null,                // Ignore this, because our delimiter is a complete line
            "value_found": null                   // Save here the value we found
        },
        "account": {
            "line_no_start": null,                // Unknown, so the text parser ignores it
            "line_no_end": null,                  // Unknown, so the text parser ignores it
            "start_delimiter": "Account",         // Searched value starts immediately after this start delimiter
            "end_delimiter": "Due date",          // Searched value ends just before this end delimiter
            "value_found": null                   // Save here the value we found
        },
        "description": {
            // In the format 2 sample the "Description" heading is on line 3 and the
            // description text itself occupies lines 4-6 (zero-based), so start at 4.
            "line_no_start": 4,                   // Extract data from line 4 (first line after the heading)
            "line_no_end": 99999,                 // Extract data until line 99999 (a very big number which means EOF)
            "start_delimiter": null,              // Ignore this, because our delimiter is a complete line
            "end_delimiter": null,                // Ignore this, because our delimiter is a complete line
            "value_found": null                   // Save here the value we found
        },
        "due_date": {
            // apply same logic as above
        },
        "invoice_number": {
            // apply same logic as above
        },
        "amount": {
            // apply same logic as above
        },
    }
}


// ALGORITHM
//
// Tries every template in TEMPLATES against the OCR text, stores each value it
// manages to extract in that field's 'value_found', and prints how many fields
// each template found. Reads the TEXT_FROM_OCR and TEMPLATES variables.

// 1. Convert the TEXT_FROM_OCR variable into an array (each index is one line of the file).
// The pattern accepts both "\r\n" and "\n", so the line ending used by the OCR output does not matter.
TEXT_FROM_OCR = TEXT_FROM_OCR.split(/\r?\n/);


var MAXIMUM_SCORE = 6; // we are looking to extract 6 values, out of 6


// 2. Score every template: one point per field it extracts from the text.
for (const [TEMPLATE_TO_PARSE, PARSE_METADATA] of Object.entries(TEMPLATES)) {

    let SCORE = 0; // for each field we find, we increment score

    for (const DELIMITERS_METADATA of Object.values(PARSE_METADATA)) {

        const LINE_NO_START = DELIMITERS_METADATA['line_no_start'];
        const LINE_NO_END = DELIMITERS_METADATA['line_no_end'];

        // Search by line first.
        // `!= null` rejects both null and undefined, so fields that are still
        // empty placeholders (no keys at all) are skipped instead of matched.
        if (LINE_NO_START != null && LINE_NO_END != null) {

            // Clamp the range to the lines that really exist; 'line_no_end' may be
            // a huge "until EOF" marker and must not read past the end of the text.
            const FIRST_LINE = Math.max(0, LINE_NO_START);
            const LAST_LINE = Math.min(LINE_NO_END, TEXT_FROM_OCR.length - 1);

            // Only count the field when the requested lines are present in this text.
            if (FIRST_LINE <= LAST_LINE) {

                // Join the lines with a single space, dropping the OCR column padding.
                DELIMITERS_METADATA['value_found'] = TEXT_FROM_OCR
                    .slice(FIRST_LINE, LAST_LINE + 1)
                    .map((LINE_CONTENT) => LINE_CONTENT.trim())
                    .join(' ')
                    .trim();

                SCORE++;
            }

            // Line-based fields never fall through to the delimiter search.
            continue;
        }

        // Search by text delimiters
        const START_DELIMITER = DELIMITERS_METADATA['start_delimiter'];
        const END_DELIMITER = DELIMITERS_METADATA['end_delimiter'];

        if (START_DELIMITER == null) {
            continue;
        }

        // Search for text inside each line of the file
        for (const LINE_CONTENT of TEXT_FROM_OCR) {

            const DELIMITER_OFFSET = LINE_CONTENT.indexOf(START_DELIMITER);

            // The start delimiter is not on this line, try the next one
            if (DELIMITER_OFFSET === -1) {
                continue;
            }

            // The searched value starts right after the start delimiter
            const START_POSITION = DELIMITER_OFFSET + START_DELIMITER.length;

            // By default extract everything from START_POSITION until the end of the current line
            let END_POSITION = LINE_CONTENT.length;

            // However, if an end delimiter is defined and present after the start
            // delimiter on this line, the value stops just before it.
            if (END_DELIMITER != null) {
                const END_OFFSET = LINE_CONTENT.indexOf(END_DELIMITER, START_POSITION);
                if (END_OFFSET > -1) {
                    END_POSITION = END_OFFSET;
                }
            }

            // substring() takes (start, end) offsets; substr() would treat the
            // second argument as a length and extract too much.
            DELIMITERS_METADATA['value_found'] = LINE_CONTENT.substring(START_POSITION, END_POSITION).trim();

            // We have found a value, increment the score
            SCORE++;

            // Stop scanning lines: this field is done, move on to the next field
            break;
        }
    }


    console.log(`${TEMPLATE_TO_PARSE} obtained a score of ${SCORE} out of ${MAXIMUM_SCORE}`);
}

最后,您将知道哪个模板提取到的数据最多,并据此判断该账单应使用哪个模板。随时在评论中提出任何问题。既然我花了 45 分钟来写这个答案,我肯定也会回复你的评论。:)

于 2019-12-04T13:18:48.813 回答