I found the article below, which shows how to do this in Python.
https://docs.aws.amazon.com/textract/latest/dg/examples-export-table-csv.html
I also used the article below to extract text.
https://docs.aws.amazon.com/textract/latest/dg/detecting-document-text.html
However, the article above only helped me get the text. I also called "block.getBlockType()" on each Block, but none of the blocks returned "CELL" as its type, even though there are tables in the image/PDF.
Please help me find a Java library similar to "boto3" that can extract all tables.
What I did: I created a model for each data set in the JSON response, and I can use these models to build a table view in JSF.
/**
 * Builds one {@link TableModel} per block of type {@code "TABLE"} in the given model.
 *
 * <p>All blocks are first indexed by id so that the ids listed in {@code "CHILD"}
 * relationships can be resolved; each table's {@code "CELL"} children are then
 * grouped into a row-index to (column-index to text) map.
 *
 * @param textractModel the parsed response; may be {@code null}
 * @return {@code null} when {@code textractModel} is {@code null}; otherwise the list of
 *         tables found (possibly empty, and possibly incomplete if an unexpected error
 *         was logged while building it)
 */
public static List<TableModel> getTablesFromTextract(TextractModel textractModel) {
    if (textractModel == null) {
        // Preserved contract: callers receive null when there is no input.
        return null;
    }
    List<TableModel> tables = new ArrayList<>();
    try {
        if (textractModel.getBlocks() == null) {
            return tables;
        }
        List<BlockModel> tableBlocks = new ArrayList<>();
        Map<String, BlockModel> blockMap = new HashMap<>();
        for (BlockModel block : textractModel.getBlocks()) {
            if (block == null) {
                continue;
            }
            // Constant-first equals: a block without a type must not abort the scan.
            if ("TABLE".equals(block.getBlockType())) {
                tableBlocks.add(block);
            }
            blockMap.put(block.getId(), block);
        }
        for (BlockModel tableBlock : tableBlocks) {
            tables.add(new TableModel(tableBlock, buildRowMap(tableBlock, blockMap)));
        }
        LOG.debug("row Map " + tables);
    } catch (Exception e) {
        // Boundary catch kept so this method never throws to its callers.
        LOG.error("Could not get table from textract model", e);
    }
    return tables;
}

/**
 * Groups the {@code "CELL"} children of one table block by row index, then column index.
 *
 * <p>Sorted maps are used so that iterating the result visits rows and columns in
 * ascending index order. Child ids that are not present in {@code blockMap} are skipped.
 */
private static Map<Long, Map<Long, String>> buildRowMap(BlockModel table, Map<String, BlockModel> blockMap) {
    Map<Long, Map<Long, String>> rowMap = new java.util.TreeMap<>();
    if (table.getRelationships() == null) {
        // A table block without relationships has no cells.
        return rowMap;
    }
    for (RelationshipModel relationship : table.getRelationships()) {
        if (relationship == null
                || !"CHILD".equals(relationship.getType())
                || relationship.getIds() == null) {
            continue;
        }
        for (String id : relationship.getIds()) {
            BlockModel cell = blockMap.get(id);
            if (cell == null || !"CELL".equals(cell.getBlockType())) {
                continue;
            }
            long rowIndex = cell.getRowIndex();
            long columnIndex = cell.getColumnIndex();
            rowMap.computeIfAbsent(rowIndex, key -> new java.util.TreeMap<>())
                    .put(columnIndex, getCellText(cell, blockMap));
        }
    }
    return rowMap;
}
/**
 * Concatenates the text of a cell's {@code "CHILD"} blocks.
 *
 * <p>Each {@code "WORD"} child contributes its text followed by a single space; each
 * {@code "SELECTION_ELEMENT"} child whose selection status is {@code "SELECTED"}
 * contributes {@code "X "}. The result therefore keeps a trailing space when non-empty.
 * Child ids that are not present in {@code blockMap} are skipped.
 *
 * @param cell     the cell block; may be {@code null}
 * @param blockMap all blocks indexed by id
 * @return the concatenated text, or an empty string when the cell is {@code null} or has
 *         no relationships; if an unexpected error occurs it is logged and the text
 *         collected so far is returned
 */
private static String getCellText(BlockModel cell, Map<String, BlockModel> blockMap) {
    StringBuilder text = new StringBuilder();
    try {
        if (cell == null || cell.getRelationships() == null) {
            return "";
        }
        for (RelationshipModel relationship : cell.getRelationships()) {
            if (relationship == null
                    || !"CHILD".equals(relationship.getType())
                    || relationship.getIds() == null) {
                continue;
            }
            for (String id : relationship.getIds()) {
                BlockModel child = blockMap.get(id);
                if (child == null) {
                    // Unresolvable id: skip it rather than lose the rest of the cell text.
                    continue;
                }
                if ("WORD".equals(child.getBlockType())) {
                    text.append(child.getText()).append(' ');
                } else if ("SELECTION_ELEMENT".equals(child.getBlockType())
                        && "SELECTED".equals(child.getSelectionStatus())) {
                    text.append("X ");
                }
            }
        }
    } catch (Exception e) {
        // Boundary catch kept so this helper never throws to its caller.
        LOG.error("Could not get cell text of table", e);
    }
    return text.toString();
}
TableModel to create the view from:
public class TableModel {
private BlockModel table;
private Map<Long, Map<Long, String>> rowMap;
public TableModel(BlockModel table, Map<Long, Map<Long, String>> rowMap) {
this.table = table;
this.rowMap = rowMap;
}
public BlockModel getTable() {
return table;
}
public void setTable(BlockModel table) {
this.table = table;
}
public Map<Long, Map<Long, String>> getRowMap() {
return rowMap;
}
public void setRowMap(Map<Long, Map<Long, String>> rowMap) {
this.rowMap = rowMap;
}
@Override
public String toString() {
return table.getId() + " - " + rowMap.toString();
}
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With