我正在使用亚马逊的 Textract服务从 pdf 文档中提取表格和表格。此处Github 提供的示例仅适用于单页文档。但根据 AWS 提供的演示,他们也能够提取多页 pdf 文档。
根据文档,我们也必须为多页调用相同的服务。但这对我不起作用。
他们提供的所有示例都在 python 或 java 中。
我在 dotnet core 中做这件事。
有什么帮助吗?
这是我的代码。
public IActionResult FileExtract(string filename)
{
try
{
string lineText = "";
string wordText = "";
string fieldsText = "";
string fieldsText2 = "";
string tableText = "";
// Extracting file in below code.
var textractAnalysisClient = BuildTextractClient();
var document = PrepareDocument(textractAnalysisClient, "FORMS", filename);
document.Pages.ForEach(page =>
{
page.Lines.ForEach(line =>
{
lineText += "<button class='rawlabel'>" + line.Text + "</button>";
line.Words.ForEach(word =>
{
wordText += word.Text;
});
});
page.Form.Fields.ForEach(f =>
{
fieldsText += "<div><h5>" + f.Key + "</h5><p style='background-color:lightgray;width: 200px;padding: 6px;'>"
+ f.Value + "</p></div>";
});
var key = "Phone Number:";
var field = page.Form.GetFieldByKey(key);
if (field != null)
{
fieldsText2 += "Key: " + field.Key + " | Value: " + field.Value;
}
});
tableText = "<table id='customers'>";
document = PrepareDocument(textractAnalysisClient, "TABLES", filename);
document.Pages.ForEach(page =>
{
page.Tables.ForEach(table =>
{
var r = 0;
table.Rows.ForEach(row =>
{
r++;
tableText += "<tr>";
var c = 0;
row.Cells.ForEach(cell =>
{
c++;
tableText += "<td>";
tableText += cell.Text + "</td>";
});
tableText += "</tr>";
});
});
});
tableText += "</table>";
objJsonResponse.fieldsText = fieldsText;
objJsonResponse.fieldsText2 = fieldsText2;
objJsonResponse.lineText = lineText;
objJsonResponse.tableText = tableText;
objJsonResponse.wordText = wordText;
objJsonResponse.responsecode = 1;
return Json(objJsonResponse);
}
catch (Exception ex)
{
this.objJsonResponse.responsecode = -1;
this.objJsonResponse.error = "failed";
return Json(this.objJsonResponse);
}
}
static TextractTextAnalysisService BuildTextractClient()
{
var builder = new ConfigurationBuilder()
.SetBasePath(Environment.CurrentDirectory)
.AddJsonFile("appsettings.json", optional: false, reloadOnChange: true)
.AddEnvironmentVariables()
.Build();
var awsOptions = builder.GetAWSOptions();
return new TextractTextAnalysisService(awsOptions.CreateServiceClient<IAmazonTextract>());
}
static TextractDocument PrepareDocument(TextractTextAnalysisService textractAnalysisClient, string type, string FormFile)
{
var task = textractAnalysisClient.StartDocumentAnalysis(BucketName, FormFile, type);
var jobId = task.Result;
textractAnalysisClient.WaitForJobCompletion(jobId);
var results = textractAnalysisClient.GetJobResults(jobId);
return new TextractDocument(results);
}