2

我正在使用亚马逊的 Textract 服务从 PDF 文档中提取表单和表格。此处 GitHub 提供的示例仅适用于单页文档。但根据 AWS 提供的演示,该服务也能够处理多页 PDF 文档。

根据文档,我们也必须为多页调用相同的服务。但这对我不起作用。

他们提供的所有示例都在 python 或 java 中。

我在 dotnet core 中做这件事。

有人能提供帮助吗?

这是我的代码。

/// <summary>
/// Analyses the given document twice through <c>PrepareDocument</c> (once with
/// "FORMS", once with "TABLES") and returns the extracted lines, words, form
/// fields and table cells as HTML/text fragments on <c>objJsonResponse</c>.
/// </summary>
/// <param name="filename">Document name forwarded unchanged to <c>PrepareDocument</c>.</param>
/// <returns>
/// The JSON-serialised <c>objJsonResponse</c>: <c>responsecode</c> is 1 on success,
/// or -1 with <c>error</c> set to "failed" when any exception is thrown.
/// </returns>
public IActionResult FileExtract(string filename)
    {
        // Text recognised inside the document is untrusted: encode it before it is
        // spliced into markup so document content cannot inject HTML or script.
        string Html(object value) => System.Net.WebUtility.HtmlEncode(value?.ToString());

        try
        {
            // StringBuilder avoids re-copying the whole string on every append.
            var lineText = new System.Text.StringBuilder();
            var wordText = new System.Text.StringBuilder();
            var fieldsText = new System.Text.StringBuilder();
            var fieldsText2 = new System.Text.StringBuilder();
            var tableText = new System.Text.StringBuilder();

            // First pass: lines, words and form fields.
            var textractAnalysisClient = BuildTextractClient();
            var document = PrepareDocument(textractAnalysisClient, "FORMS", filename);
            foreach (var page in document.Pages)
            {
                foreach (var line in page.Lines)
                {
                    lineText.Append("<button class='rawlabel'>").Append(Html(line.Text)).Append("</button>");
                    foreach (var word in line.Words)
                    {
                        // Plain text with no markup, exactly as before.
                        // NOTE(review): encode on the client if this is ever inserted as HTML.
                        wordText.Append(word.Text);
                    }
                }

                foreach (var f in page.Form.Fields)
                {
                    fieldsText
                        .Append("<div><h5>").Append(Html(f.Key))
                        .Append("</h5><p style='background-color:lightgray;width: 200px;padding: 6px;'>")
                        .Append(Html(f.Value)).Append("</p></div>");
                }

                // Looked up on every page; each page that has the key appends one entry.
                var key = "Phone Number:";
                var field = page.Form.GetFieldByKey(key);
                if (field != null)
                {
                    // Plain text with no markup, left unencoded like wordText.
                    fieldsText2.Append("Key: ").Append(field.Key).Append("  | Value: ").Append(field.Value);
                }
            }

            // Second pass: tables, rendered as a single HTML table across all pages.
            tableText.Append("<table id='customers'>");
            document = PrepareDocument(textractAnalysisClient, "TABLES", filename);
            foreach (var page in document.Pages)
            {
                foreach (var table in page.Tables)
                {
                    foreach (var row in table.Rows)
                    {
                        tableText.Append("<tr>");
                        foreach (var cell in row.Cells)
                        {
                            tableText.Append("<td>").Append(Html(cell.Text)).Append("</td>");
                        }
                        tableText.Append("</tr>");
                    }
                }
            }

            tableText.Append("</table>");

            objJsonResponse.fieldsText = fieldsText.ToString();
            objJsonResponse.fieldsText2 = fieldsText2.ToString();
            objJsonResponse.lineText = lineText.ToString();
            objJsonResponse.tableText = tableText.ToString();
            objJsonResponse.wordText = wordText.ToString();
            objJsonResponse.responsecode = 1;
            return Json(objJsonResponse);
        }
        catch (Exception ex)
        {
            // The client still receives only the generic "failed" message, but the
            // cause is recorded instead of being silently discarded.
            System.Diagnostics.Trace.TraceError(ex.ToString());
            this.objJsonResponse.responsecode = -1;
            this.objJsonResponse.error = "failed";
            return Json(this.objJsonResponse);
        }
    }

    /// <summary>
    /// Builds a <c>TextractTextAnalysisService</c> around an <c>IAmazonTextract</c> client
    /// created from the AWS options found in appsettings.json and environment variables.
    /// </summary>
    /// <returns>A new service wrapper; a fresh configuration and client are built on every call.</returns>
    static TextractTextAnalysisService BuildTextractClient()
    {
        // reloadOnChange is off on purpose: this configuration is read once below and
        // then discarded, so a change watcher would never be observed and would only
        // accumulate (one per call) because the configuration root is never disposed.
        var configuration = new ConfigurationBuilder()
            .SetBasePath(Environment.CurrentDirectory)
            .AddJsonFile("appsettings.json", optional: false, reloadOnChange: false)
            .AddEnvironmentVariables()
            .Build();
        var awsOptions = configuration.GetAWSOptions();
        return new TextractTextAnalysisService(awsOptions.CreateServiceClient<IAmazonTextract>());
    }

    /// <summary>
    /// Starts a document analysis for <paramref name="FormFile"/> in <c>BucketName</c>,
    /// blocks until the started task yields a job id, waits on that job through the
    /// client, and wraps the job results in a <c>TextractDocument</c>.
    /// </summary>
    /// <param name="textractAnalysisClient">Service wrapper used for all three calls.</param>
    /// <param name="type">Feature type passed to the analysis (callers use "FORMS" or "TABLES").</param>
    /// <param name="FormFile">Document name passed to <c>StartDocumentAnalysis</c>.</param>
    /// <returns>A <c>TextractDocument</c> constructed from the job results.</returns>
    static TextractDocument PrepareDocument(TextractTextAnalysisService textractAnalysisClient, string type, string FormFile)
    {
        // Synchronous signature: the started task is blocked on via .Result.
        var analysisJobId = textractAnalysisClient
            .StartDocumentAnalysis(BucketName, FormFile, type)
            .Result;

        textractAnalysisClient.WaitForJobCompletion(analysisJobId);

        return new TextractDocument(textractAnalysisClient.GetJobResults(analysisJobId));
    }
4

0 回答 0