private void replaceMissPlacedCells() { for (ArrayList<Element> cluster : missPlacedCells) { for (Column2 column : dataInColumns) { if (column.fitsInColumn(Line.getClusterX1(cluster), Line.getClusterX2(cluster)) && column.columnFitsIn(Line.getClusterX1(cluster), Line.getClusterX2(cluster))) { // then we need to add this cluster to that column: column.addCell(cluster); } else if (column.fitsInColumn(Line.getClusterX1(cluster), Line.getClusterX2(cluster)) || column.columnFitsIn(Line.getClusterX1(cluster), Line.getClusterX2(cluster))) { // then we need to add this cluster to that column: column.addCell(cluster); } else if (column.touchesColumn(Line.getClusterX1(cluster), Line.getClusterX2(cluster))) { // System.out.println("CLUSTER: " + cluster + " touches: " + column); column.addCell(cluster); } } } }
/** * This method adds lines with missing partitions to the current columns. It loops trough lines * that were flagged for containing missing data and then adds the ones to columns that they fit * in (or to columns that fit in the partition). If this fails it will also try to check if the * partition merely touches a column, although this will return a lower validation score if * successful. */ private void addLinesWithMissingDataToColumns() { ArrayList<Cell> cellsWithMissingDataAdded = new ArrayList<Cell>(); ArrayList<Column2> newDataInColumns = new ArrayList<Column2>(); for (Line line : linesWithMissingData) { ArrayList<ArrayList<Element>> clusters = line.getClusters(); for (ArrayList<Element> cluster : clusters) { for (Column2 column : dataInColumns) { if (column.fitsInColumn(Line.getClusterX1(cluster), Line.getClusterX2(cluster)) && column.columnFitsIn(Line.getClusterX1(cluster), Line.getClusterX2(cluster))) { // then we need to add this cluster to that column: newDataInColumns.remove(column); column.addCell(cluster); newDataInColumns.add(column); Cell cell = new Cell(cluster, 3, line.getLineNumber()); cellsWithMissingDataAdded.add(cell); } else if (column.fitsInColumn(Line.getClusterX1(cluster), Line.getClusterX2(cluster)) || column.columnFitsIn(Line.getClusterX1(cluster), Line.getClusterX2(cluster))) { // then we need to add this cluster to that column: newDataInColumns.remove(column); column.addCell(cluster); newDataInColumns.add(column); Cell cell = new Cell(cluster, 2, line.getLineNumber()); cellsWithMissingDataAdded.add(cell); } else if (column.touchesColumn(Line.getClusterX1(cluster), Line.getClusterX2(cluster))) { // System.out.println("CLUSTER: " + cluster + " touches: " + // column); newDataInColumns.remove(column); column.addCell(cluster); newDataInColumns.add(column); Cell cell = new Cell(cluster, 1, line.getLineNumber()); cellsWithMissingDataAdded.add(cell); } } } } validation.setCellsWithMissingDataAdded(cellsWithMissingDataAdded.size()); validation.setCellsWithMissingDataAddedScores(cellsWithMissingDataAdded); }
/** * This method is used for extraction of tables with lot of empty cells in it. It is required for * the successful extraction of most Matrix tables. */ private void fillBlankCells() { // We say: cells get a line number. If a column does not contain a cell on a certain line, add a // whitespace. // Any cell that is not filled must be empty: for (Line line : data) { int lineNumber = line.getLineNumber(); COLUMNLOOP: for (Column2 column : dataInColumns) { for (Cell cell : column.getCellObjects()) { if (cell.getLineNumber() == lineNumber) { break; } if (cell.getLineNumber() > line.getLineNumber()) { // the last cell? // Add a blank cell to this column. // System.out.println("Add line to :" + column + " in line: " + // line.getLineNumber()); // <span class='ocrx_word' id='word_9' title="bbox 2175 514 2346 555">were</span> Tag t = Tag.valueOf("span"); Attributes attributes = new Attributes(); attributes.put("class", "ocrx_word"); attributes.put("id", "word_ADDEDBYTEA"); attributes.put( "title", "bbox " + column.getAverageX1() + " " + (int) line.getAverageY1() + " " + column.getAverageX2() + " " + (int) line.getAverageY2()); Element newElement = new Element(t, "localhost:8080", attributes); newElement.text(" "); ArrayList<Element> newCell = new ArrayList<Element>(); newCell.add(newElement); // System.out.println("adding: " +newElement.text()); column.addCell(newCell); break COLUMNLOOP; } } } } }