c# - 使用 Microsoft OCR 获取扫描图像文本的坐标

Question

我想在扫描的图像中找到文本的坐标。扫描的图像包含大量文本，需要先将图像数据转换为文本，然后获取这些文本的坐标。这里的坐标指文本所在的边界框，即 X、Y 轴位置以及高度和宽度。

使用 Microsoft OCR ProjectOxford Vision

using Microsoft.ProjectOxford.Vision;
using Microsoft.ProjectOxford.Vision.Contract;
using System;
using System.Collections.Generic;
using System.IO;
using System.Threading.Tasks;

 namespace TextExtraction
 {
 /// <summary>
 /// Console sample that sends a local image to the Vision handwriting
 /// recognition endpoint and prints the recognized text, either one entry per
 /// line of text or one entry per word.
 /// </summary>
 class Program
 {
    // Placeholder subscription key; replace with a real key before running.
    const string API_key = "<<Key>>";
    // Base URL of the Vision API endpoint passed to the client.
    const string API_location =
    "https://westcentralus.api.cognitive.microsoft.com/vision/v1.0";

    /// <summary>
    /// Entry point: runs line-level extraction on a hard-coded image path and
    /// then waits for Enter so the console window stays open.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        string imgToAnalyze = @"C:\Users\abhis\Desktop\image.jpg";
        HandwritingExtraction(imgToAnalyze, false);

        Console.ReadLine();
    }

    /// <summary>
    /// Writes each entry of <paramref name="res"/> on its own console line and
    /// then blocks until the user presses Enter.
    /// </summary>
    /// <param name="res">Recognized strings; a null array prints nothing.</param>
    public static void PrintResults(string[] res)
    {
        // Guard against null so a missing result cannot crash the enumeration.
        if (res != null)
        {
            foreach (string r in res)
            {
                Console.WriteLine(r);
            }
        }

        Console.ReadLine();
    }

    /// <summary>
    /// Synchronous wrapper: runs <see cref="HandwritingExtractionCore"/> and
    /// prints its result, blocking the caller until both have finished.
    /// </summary>
    /// <param name="fname">Path of the image file to analyze.</param>
    /// <param name="wrds">True for one entry per word, false for one per line.</param>
    public static void HandwritingExtraction(string fname, bool wrds)
    {
        // Wait() blocks this thread; exceptions thrown inside the task surface
        // wrapped in an AggregateException.
        Task.Run(async () =>
        {
            string[] res = await HandwritingExtractionCore(fname, wrds);
            PrintResults(res);

        }).Wait();
    }

    /// <summary>
    /// Opens the image file, submits it for handwriting recognition and
    /// converts the returned result into plain strings.
    /// </summary>
    /// <param name="fname">Path of the image file to analyze.</param>
    /// <param name="wrds">True for one entry per word, false for one per line.</param>
    /// <returns>
    /// The recognized strings, or an empty array when the file does not exist.
    /// </returns>
    public static async Task<string[]> HandwritingExtractionCore(string fname, bool wrds)
    {
        // Return an empty array (never null) so callers can enumerate safely.
        if (!File.Exists(fname))
        {
            return new string[0];
        }

        VisionServiceClient client = new VisionServiceClient(API_key, API_location);

        using (Stream stream = File.OpenRead(fname))
        {
            // Two-step call: create the recognition operation, then fetch its
            // result. NOTE(review): the SDK presumably polls until the
            // operation completes - confirm against the SDK documentation.
            HandwritingRecognitionOperation op =
                await client.CreateHandwritingRecognitionOperationAsync(stream);
            HandwritingRecognitionOperationResult res =
                await client.GetHandwritingRecognitionOperationResultAsync(op);

            return GetExtracted(res, wrds);
        }
    }

    /// <summary>
    /// Flattens a recognition result into strings.
    /// </summary>
    /// <param name="res">Result returned by the recognition operation.</param>
    /// <param name="wrds">
    /// True to emit every word separately; false to emit each line as a single
    /// space-joined string.
    /// </param>
    /// <returns>
    /// The extracted strings; empty when the result carries no lines.
    /// </returns>
    public static string[] GetExtracted(HandwritingRecognitionOperationResult res, bool wrds)
    {
        List<string> items = new List<string>();

        // A result without recognition data yields no entries instead of
        // throwing a NullReferenceException.
        if (res == null || res.RecognitionResult == null || res.RecognitionResult.Lines == null)
        {
            return items.ToArray();
        }

        foreach (HandwritingTextLine l in res.RecognitionResult.Lines)
        {
            if (wrds)
            {
                items.AddRange(GetWords(l));
            }
            else
            {
                items.Add(GetLineAsString(l));
            }
        }

        return items.ToArray();
    }

    /// <summary>
    /// Collects the text of every word in a recognized line, in order.
    /// </summary>
    /// <param name="line">Recognized line.</param>
    /// <returns>
    /// The word texts; empty when the line is null or has no words.
    /// </returns>
    public static List<string> GetWords(HandwritingTextLine line)
    {
        List<string> words = new List<string>();

        if (line == null || line.Words == null)
        {
            return words;
        }

        foreach (HandwritingTextWord w in line.Words)
        {
            words.Add(w.Text);
        }

        return words;
    }

    /// <summary>
    /// Joins the words of a recognized line with single spaces.
    /// </summary>
    /// <param name="line">Recognized line.</param>
    /// <returns>The joined text, or an empty string when there are no words.</returns>
    public static string GetLineAsString(HandwritingTextLine line)
    {
        // string.Join already returns an empty string for an empty list, so no
        // separate count check is needed.
        return string.Join(" ", GetWords(line));
    }
}
}

预期输出： 获取具有各自坐标的文本（x，y，高度，宽度）

输入图像

json输出

{
  "status": "Succeeded",
  "succeeded": true,
  "failed": false,
  "finished": true,
  "recognitionResults": [
    {
      "page": 1,
      "clockwiseOrientation": 359.62,
      "width": 505,
      "height": 399,
      "unit": "pixel",
      "lines": [
        {
          "boundingBox": [ 224, 58, 380, 57, 381, 74, 225, 75 ],
          "text": "GOVERNMENT OF INDIA",
          "words": [
            { "boundingBox": [ 229, 59, 321, 58, 320, 75, 229, 75 ], "text": "GOVERNMENT" },
            { "boundingBox": [ 324, 58, 341, 58, 341, 75, 323, 75 ], "text": "OF" },
            { "boundingBox": [ 344, 58, 381, 58, 381, 75, 344, 75 ], "text": "INDIA" }
          ]
        },
        {
          "boundingBox": [ 211, 159, 429, 160, 428, 180, 210, 178 ],
          "text": "FH faPet/出生日期：27/07/1982",
          "words": [
            { "boundingBox": [ 225, 160, 243, 160, 243, 179, 225, 179 ], "text": "FH" },
            { "boundingBox": [ 247, 160, 286, 160, 286, 179, 247, 179 ], "text": "faPet/" },
            { "boundingBox": [ 290, 160, 333, 160, 333, 179, 290, 179 ], "text": "出生日期：" },
            { "boundingBox": [ 337, 160, 428, 162, 428, 180, 337, 179 ], "text": "27/07/1982" }
          ]
        },
        {
          "boundingBox": [ 209, 192, 313, 190, 314, 208, 210, 210 ],
          "text": "you / MALE",
          "words": [
            { "boundingBox": [ 214, 192, 247, 192, 246, 209, 214, 210 ], "text": "you" },
            { "boundingBox": [ 254, 192, 260, 192, 260, 209, 254, 209 ], "text": "/" },
            { "boundingBox": [ 264, 192, 314, 192, 313, 208, 263, 209 ], "text": "MALE" }
          ]
        },
        {
          "boundingBox": [ 201, 314, 351, 313, 352, 330, 202, 331 ],
          "text": "66 66 6666 6666",
          "words": [
            { "boundingBox": [ 204, 315, 225, 314, 225, 330, 204, 331 ], "text": "66" },
            { "boundingBox": [ 229, 314, 251, 314, 251, 330, 229, 330 ], "text": "66" },
            { "boundingBox": [ 255, 314, 301, 314, 301, 330, 255, 330 ], "text": "6666" },
            { "boundingBox": [ 307, 314, 352, 314, 351, 331, 306, 330 ], "text": "6666" }
          ]
        }
      ]
    }
  ]
}

score 2 · Accepted Answer

我猜你正在使用类似 Microsoft C# Azure 应用程序的东西。这是您问题的详细链接。

https://docs.microsoft.com/en-us/azure/cognitive-services/computer-vision/quickstarts/csharp-print-text

返回的内容字符串大致如下：

"language": "en",
    "textAngle": -1.5000000000000335,
    "orientation": "Up",
    "regions": [
        {
            "boundingBox": "154,49,351,575",
            "lines": [
                {
                    "boundingBox": "165,49,340,117",
                    "words": [
                        {
                            "boundingBox": "165,49,63,109",
                            "text": "A"
                        },
                        {
                            "boundingBox": "261,50,244,116",
                            "text": "GOAL"
                        }
                    ]
                },
                {

我用 Azure C# 做过一些项目，但是你的代码看起来不太熟悉。

我建议您查看代码中 textres 或 res 里的全部数据及其格式。我认为其中包含与上面字符串所示相同的 boundingBox 信息。

score 2 · Accepted Answer

首先，请注意 Microsoft 认知服务中有两种不同的文本识别 API。Yuan 博士的输出来自具有更广泛语言覆盖范围的 OCR API，而 Tony 的输出显示他正在调用更新和改进的 Read API。

其次，请注意上面代码示例中引用的客户端 SDK Microsoft.ProjectOxford.Vision 已弃用，您需要切换到替代版本 Microsoft.Azure.CognitiveServices.Vision.ComputerVision，您可以在此处找到该示例。

最后，回答具体问题。识别出的文本在文档中的位置由 boundingBox 字段给出。因此，对于您的示例输出 JSON，文本行 GOVERNMENT OF INDIA 由坐标 (224, 58)、(380, 57)、(381, 74) 和 (225, 75) 围成，这四个点代表边界框的四个角。之所以没有采用 x, y, width, height 格式，是为了能够表示旋转的文本。请注意，边界框的单位也包含在 JSON 中（在您的情况下为像素）。如果您需要，该行中每个单词的位置同样包含在响应 JSON 中。

c# - 使用 Microsoft OCR 获取扫描图像文本的坐标

2 回答 2

Related

Reference