我有类似的东西:
public class AnalyzeDocument {
/**
 * Builds a Textract client for the EU_WEST_2 region, using credentials read from
 * environment variables, and analyzes the given document bytes with it.
 *
 * @param content raw bytes of the document to analyze
 * @return whatever {@code analyzeDoc} returns for this content
 */
public DocumentModel startProcess(byte[] content) {
    Region region = Region.EU_WEST_2;
    // The client is closeable; try-with-resources releases it once the analysis
    // call has finished instead of leaking one client per invocation.
    try (TextractClient textractClient = TextractClient.builder()
            .region(region)
            .credentialsProvider(EnvironmentVariableCredentialsProvider.create())
            .build()) {
        return analyzeDoc(textractClient, content);
    }
}
/**
 * Sends the document bytes to Textract requesting FORMS and TABLES analysis and
 * wraps the extracted table data in a {@link DocumentModel}.
 *
 * @param textractClient client used to issue the analyze-document call
 * @param content raw bytes of the document to analyze
 * @return a document model containing the page of extracted tables (no page is added
 *         when no table was found), or {@code null} if a {@code TextractException}
 *         is thrown during the call
 */
public DocumentModel analyzeDoc(TextractClient textractClient, byte[] content) {
    try {
        SdkBytes sourceBytes = SdkBytes.fromByteArray(content);
        Util util = new Util();
        Document myDoc = Document.builder().bytes(sourceBytes).build();

        List<FeatureType> featureTypes = new ArrayList<>();
        featureTypes.add(FeatureType.FORMS);
        featureTypes.add(FeatureType.TABLES);

        AnalyzeDocumentRequest analyzeDocumentRequest = AnalyzeDocumentRequest.builder()
                .featureTypes(featureTypes)
                .document(myDoc)
                .build();
        AnalyzeDocumentResponse analyzeDocument = textractClient.analyzeDocument(analyzeDocumentRequest);
        List<Block> docInfo = analyzeDocument.blocks();

        DocumentModel documentModel = new DocumentModel();
        PageModel pageModel = util.getTableResults(docInfo);
        // getTableResults yields null when the document has no TABLE block;
        // storing that null would hand callers a page list containing null.
        if (pageModel != null) {
            documentModel.getPages().add(pageModel);
        }

        for (Block block : docInfo) {
            log.debug("The block type is " + block.blockType().toString());
        }
        return documentModel;
    } catch (TextractException e) {
        // Log through the file's logger and keep the stack trace; callers still
        // receive null for a failed analysis, as before.
        log.error("Textract document analysis failed: " + e.getMessage(), e);
        return null;
    }
}
这是 util 文件:
/**
 * Scans the given blocks, indexes all of them by id, and converts every block of
 * type TABLE into a table model on a single page.
 *
 * @param blocks blocks to scan
 * @return a page holding one table per TABLE block, in encounter order and numbered
 *         from 0, or {@code null} when no TABLE block is present
 */
public PageModel getTableResults(List<Block> blocks) {
    Map<String, Block> blocksById = new HashMap<>();
    List<Block> tables = new ArrayList<>();

    for (Block candidate : blocks) {
        // Every block is indexed, not only tables: cells and words are looked up by id later.
        blocksById.put(candidate.id(), candidate);
        boolean isTable = candidate.blockType().equals(BlockType.TABLE);
        if (!isTable) {
            continue;
        }
        tables.add(candidate);
        log.debug("added table: " + candidate.text());
    }

    PageModel result = new PageModel();
    if (tables.isEmpty()) {
        return null;
    }

    for (int tableIndex = 0; tableIndex < tables.size(); tableIndex++) {
        result.getTables().add(generateTable(tables.get(tableIndex), blocksById, tableIndex));
    }
    return result;
}
/**
 * Builds the table model for one TABLE block, emitting rows in ascending row index
 * and, within each row, cells in ascending column index.
 *
 * @param table TABLE block whose cells are collected
 * @param blockMap all blocks keyed by id, used to resolve child cells and words
 * @param index number appended to {@code "Table_"} to form the table id
 * @return the populated table model
 */
private TableModel generateTable(Block table, Map<String, Block> blockMap, int index) {
    TableModel model = new TableModel();
    model.setTableId("Table_" + index);

    // Textract documents RowIndex/ColumnIndex as starting at 1, so reading columns
    // 0..size-1 produced a null first cell and dropped the last column. Walking the
    // keys in sorted order reads every stored cell regardless of the index base, and
    // also fixes the row order, which a HashMap does not guarantee.
    // (Fully qualified because this file's import list is not visible here.)
    Map<Integer, Map<Integer, String>> rows =
            new java.util.TreeMap<>(getRowsColumnsMap(table, blockMap));

    for (Map<Integer, String> cellsByColumn : rows.values()) {
        RowModel rowModel = new RowModel();
        // NOTE(review): a missing column index is skipped, not padded — confirm that
        // consumers do not rely on positional alignment for sparse rows.
        for (String cellText : new java.util.TreeMap<>(cellsByColumn).values()) {
            rowModel.getCells().add(cellText);
        }
        model.getRows().add(rowModel);
    }
    return model;
}
/**
 * Arranges the text of a block's CHILD blocks into a row-index to
 * (column-index to text) lookup.
 *
 * @param block block whose CHILD relationships are followed
 * @param blockMap all blocks keyed by id; child ids missing from it are skipped
 * @return cell text keyed first by row index, then by column index
 */
private Map<Integer, Map<Integer, String>> getRowsColumnsMap(Block block, Map<String, Block> blockMap) {
    Map<Integer, Map<Integer, String>> grid = new HashMap<>();

    for (Relationship link : block.relationships()) {
        if (!link.type().equals(RelationshipType.CHILD)) {
            continue;
        }
        for (String cellId : link.ids()) {
            Block cell = blockMap.get(cellId);
            if (cell == null) {
                continue;
            }
            int row = cell.rowIndex();
            int column = cell.columnIndex();
            // Create the row's map on first sight, then record this cell's text in it.
            grid.computeIfAbsent(row, key -> new HashMap<>()).put(column, getText(cell, blockMap));
        }
    }
    return grid;
}
public String getText(Block block, Map<String, Block> blockMap) {
String text = "";
if (block.relationships() != null && block.relationships().size() > 0) {
for (Relationship relationship : block.relationships()) {
if (relationship.type().equals(RelationshipType.CHILD)) {
for (String childId : relationship.ids()) {
Block wordBlock = blockMap.get(childId);
if (wordBlock != null && wordBlock.blockType() != null) {
if (wordBlock.blockType().equals(BlockType.WORD))) {
text += wordBlock.text() + " ";
}
}
}
}
}
}
return text;
}