node.js - 使用nodejs服务器和reactjs网页从pdf中提取文本的问题

Question

以下是我的 textractUtils.js 代码 -

const _ = require("lodash");
const aws = require("aws-sdk");
const config = require("./config");

// Apply the credentials and region read from the local ./config module to the
// SDK's global configuration.
aws.config.update({
  accessKeyId: config.awsAccesskeyID,
  secretAccessKey: config.awsSecretAccessKey,
  region: config.awsRegion
});

// Module-level Textract client, constructed after the configuration update
// above and used by the exported function at the bottom of this file.
const textract = new aws.Textract();

/**
 * Build the display text of a block from its CHILD relationships.
 *
 * WORD children contribute their `Text`; SELECTION_ELEMENT children
 * contribute an "X" when their `SelectionStatus` is "SELECTED". Pieces are
 * separated by a single space and the result is trimmed.
 *
 * @param {object} [result] - Block whose `Relationships` are walked. When it
 *   is missing or has no `Relationships` array, an empty string is returned.
 * @param {object} blocksMap - Lookup of block Id -> block.
 * @returns {string} The concatenated text, or "" when there is nothing to read.
 */
const getText = (result, blocksMap) => {
  if (!result || !Array.isArray(result.Relationships)) {
    return "";
  }

  let text = "";

  result.Relationships.forEach(relationship => {
    if (relationship.Type !== "CHILD" || !Array.isArray(relationship.Ids)) {
      return;
    }

    relationship.Ids.forEach(childId => {
      const child = blocksMap ? blocksMap[childId] : undefined;
      // A referenced child may be absent from the lookup; skip it rather
      // than crash on `child.BlockType`.
      if (!child) {
        return;
      }

      if (child.BlockType === "WORD") {
        text += `${child.Text} `;
      } else if (
        child.BlockType === "SELECTION_ELEMENT" &&
        child.SelectionStatus === "SELECTED"
      ) {
        text += "X ";
      }
    });
  });

  return text.trim();
};

/**
 * Locate the value block that a key block points to.
 *
 * Walks the key block's VALUE relationships and returns the block for the
 * first referenced Id that exists in `valueMap`. Every Id is considered, not
 * only the first one in the list.
 *
 * @param {object} keyBlock - Key block whose `Relationships` are searched.
 * @param {object} valueMap - Lookup of block Id -> value block.
 * @returns {object|undefined} The matching value block, or undefined when
 *   no referenced Id is present in `valueMap`.
 */
const findValueBlock = (keyBlock, valueMap) => {
  const relationships =
    keyBlock && Array.isArray(keyBlock.Relationships)
      ? keyBlock.Relationships
      : [];
  const isKnownValue = id =>
    valueMap != null && Object.prototype.hasOwnProperty.call(valueMap, id);

  for (const relationship of relationships) {
    if (relationship.Type !== "VALUE" || !Array.isArray(relationship.Ids)) {
      continue;
    }

    // `find` keeps scanning past non-matching ids; it stops only on a hit.
    const matchedId = relationship.Ids.find(isKnownValue);
    if (matchedId !== undefined) {
      return valueMap[matchedId];
    }
  }

  return undefined;
};

/**
 * Pair the text of every key block with the text of its value block.
 *
 * For each block in `keyMap`, the value block is looked up with
 * `findValueBlock` and both texts are produced by `getText`. Entries are
 * stored on a plain object, so a later key with the same text replaces an
 * earlier one.
 *
 * @param {object} keyMap - Lookup of block Id -> key block.
 * @param {object} valueMap - Lookup of block Id -> value block.
 * @param {object} blockMap - Lookup of block Id -> block, used to resolve text.
 * @returns {object} Object mapping key text to value text.
 */
const getKeyValueRelationship = (keyMap, valueMap, blockMap) => {
  const pairs = {};

  for (const keyBlock of _.values(keyMap)) {
    const matchedValue = findValueBlock(keyBlock, valueMap);
    const label = getText(keyBlock, blockMap);
    pairs[label] = getText(matchedValue, blockMap);
  }

  return pairs;
};

/**
 * Index a list of blocks by Id and separate the key/value-set blocks.
 *
 * Every block is recorded in `blockMap`. Blocks whose `BlockType` is
 * "KEY_VALUE_SET" are also recorded in `keyMap` when their `EntityTypes`
 * include "KEY", and in `valueMap` otherwise.
 *
 * @param {object[]} blocks - Blocks to index.
 * @returns {{keyMap: object, valueMap: object, blockMap: object}} The three
 *   lookups, each keyed by block Id.
 */
const getKeyValueMap = blocks => {
  const keyMap = {};
  const valueMap = {};
  const blockMap = {};

  blocks.forEach(block => {
    const id = block.Id;
    blockMap[id] = block;

    if (block.BlockType !== "KEY_VALUE_SET") {
      return;
    }

    const target = _.includes(block.EntityTypes, "KEY") ? keyMap : valueMap;
    target[id] = block;
  });

  return { keyMap, valueMap, blockMap };
};

module.exports = async buffer => {
  const params = {
    Document: {
      /* required */
      Bytes: buffer
    },
    FeatureTypes: ["FORMS"]
  };

  const request = textract.analyzeDocument(params);
  const data = await request.promise();

  if (data && data.Blocks) {
    const { keyMap, valueMap, blockMap } = getKeyValueMap(data.Blocks);
    const keyValues = getKeyValueRelationship(keyMap, valueMap, blockMap);

    return keyValues;
  }

  // in case no blocks are found return undefined
  return undefined;
};

它适用于图像，但不适用于 pdf（单页和多页）。以下是我通过导入pdf运行它时的错误-

(node:2001) UnhandledPromiseRejectionWarning: UnsupportedDocumentException: Request has unsupported document format
    at Request.extractError (/home/<user>/textract-lab/node_modules/aws-sdk/lib/protocol/json.js:51:27)
    at Request.callListeners (/home/<user>/textract-lab/node_modules/aws-sdk/lib/sequential_executor.js:106:20)
    at Request.emit (/home/<user>/textract-lab/node_modules/aws-sdk/lib/sequential_executor.js:78:10)
    at Request.emit (/home/<user>/textract-lab/node_modules/aws-sdk/lib/request.js:683:14)
    at Request.transition (/home/<user>/textract-lab/node_modules/aws-sdk/lib/request.js:22:10)
    at AcceptorStateMachine.runTo (/home/<user>/textract-lab/node_modules/aws-sdk/lib/state_machine.js:14:12)
    at /home/<user>/textract-lab/node_modules/aws-sdk/lib/state_machine.js:26:10
    at Request.<anonymous> (/home/<user>/textract-lab/node_modules/aws-sdk/lib/request.js:38:9)
    at Request.<anonymous> (/home/<user>/textract-lab/node_modules/aws-sdk/lib/request.js:685:12)
    at Request.callListeners (/home/<user>/textract-lab/node_modules/aws-sdk/lib/sequential_executor.js:116:18)
(node:2001) UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not handled with .catch(). (rejection id: 1)
(node:2001) [DEP0018] DeprecationWarning: Unhandled promise rejections are deprecated. In the future, promise rejections that are not handled will terminate the Node.js process with a non-zero exit code.

我尝试过的文档包括：不含文本的图像、含文本的图像、含表格的图像、单页 PDF 和多页 PDF。我还有一个概念上的疑问：既然我已经导入了 aws-sdk，而 Textract 的 aws-sdk 应该能处理 pdf、png、jpeg 和 jpg 格式的文件，为什么还需要专门为 PDF 编写代码？我必须对 textractUtils.js 做哪些更改才能处理 PDF 文件？

score 2 · Accepted Answer

AnalyzeDocument API 操作仅支持 PNG 或 JPEG 格式的图像。摘自 Textract 文档：

Amazon Textract 同步操作（DetectDocumentText 和 AnalyzeDocument）支持 PNG 和 JPEG 图像格式。异步操作（StartDocumentTextDetection、StartDocumentAnalysis）还支持 PDF 文件格式。

您应该使用异步操作来处理 PDF 文档。否则，一种变通方法是在代码中先将 PDF 文档转换为图像，然后对这些图像使用同步 API 操作来处理文档。

node.js - 使用nodejs服务器和reactjs网页从pdf中提取文本的问题

1 回答 1

Related

Reference