/** * This method is used for extraction of tables with lot of empty cells in it. It is required for * the successful extraction of most Matrix tables. */ private void fillBlankCells() { // We say: cells get a line number. If a column does not contain a cell on a certain line, add a // whitespace. // Any cell that is not filled must be empty: for (Line line : data) { int lineNumber = line.getLineNumber(); COLUMNLOOP: for (Column2 column : dataInColumns) { for (Cell cell : column.getCellObjects()) { if (cell.getLineNumber() == lineNumber) { break; } if (cell.getLineNumber() > line.getLineNumber()) { // the last cell? // Add a blank cell to this column. // System.out.println("Add line to :" + column + " in line: " + // line.getLineNumber()); // <span class='ocrx_word' id='word_9' title="bbox 2175 514 2346 555">were</span> Tag t = Tag.valueOf("span"); Attributes attributes = new Attributes(); attributes.put("class", "ocrx_word"); attributes.put("id", "word_ADDEDBYTEA"); attributes.put( "title", "bbox " + column.getAverageX1() + " " + (int) line.getAverageY1() + " " + column.getAverageX2() + " " + (int) line.getAverageY2()); Element newElement = new Element(t, "localhost:8080", attributes); newElement.text(" "); ArrayList<Element> newCell = new ArrayList<Element>(); newCell.add(newElement); // System.out.println("adding: " +newElement.text()); column.addCell(newCell); break COLUMNLOOP; } } } } }
/** * This method creates a list of cell objects derived from the partitions in this line. * * @return A list containing the cell objects in this line. */ public ArrayList<Cell> getCellObjects() { ArrayList<Cell> cellObjects = new ArrayList<Cell>(); for (ArrayList<Element> cell : cells) { // Now we need to find out in which line this cell is: String pos; String[] positions; pos = cell.get(0).attr("title"); positions = pos.split("\\s+"); int cellY1 = Integer.parseInt(positions[4]); int lineNumber = 0; for (Line line : data) { if (line.getAverageY1() > cellY1) { break; } lineNumber = line.getLineNumber(); } Cell currentCell = new Cell(cell, lineNumber); cellObjects.add(currentCell); } return cellObjects; }
// This method will try to recreate the original lines of the table using all the previous // information about the table. private ArrayList<ArrayList<Cell>> recreateTableLines(SemanticFramework semanticFramework) { ArrayList<ArrayList<Cell>> table = new ArrayList<ArrayList<Cell>>(); for (Column2 column : dataInColumns) { ArrayList<Cell> cells = column.getCellObjects(); table.add(cells); } int maxLength = Integer.MIN_VALUE; for (ArrayList<Cell> column : table) { int length = column.size(); if (length > maxLength) { maxLength = length; } } ArrayList<ArrayList<Cell>> newTable = new ArrayList<ArrayList<Cell>>(); ArrayList<Column2> incompleteRowColumns = new ArrayList<Column2>(); ArrayList<Cell> col = new ArrayList<Cell>(); int counter = 0; while (counter < maxLength) { ArrayList<Cell> line = new ArrayList<Cell>(); newTable.add(line); counter++; } for (Column2 column : dataInColumns) { if (column.getCellObjects().size() == maxLength) { col = column.getCellObjects(); for (ArrayList<Cell> line : newTable) { if (line.size() >= dataInColumns.indexOf(column)) { line.add(dataInColumns.indexOf(column), col.get(newTable.indexOf(line))); } else { line.add(col.get(newTable.indexOf(line))); } } } else { incompleteRowColumns.add(column); // gonna be mapped } } for (Column2 column : incompleteRowColumns) { ArrayList<Cell> cells = column.getCellObjects(); for (Cell cellOfNewColumn : cells) { System.out.println(cellOfNewColumn); boolean breaking = false; Cells: for (ArrayList<Cell> line : newTable) { for (Cell cell : line) { if (cell.getY1() >= cellOfNewColumn .getY2()) { // ||(CommonMethods.calcDistance(cell.getY2(), // cellOfNewColumn.getY1())>(averageLineDistance*verticalThresholdModifier))&&line.indexOf(cellOfNewColumn)!=0) System.out.println( newTable.indexOf(line) + " " + cell + " " + cellOfNewColumn + " " + newTable.size() + " " + dataInColumns.indexOf(column) + " " + line.size()); if (newTable.indexOf(line) != 0) { System.out.println( newTable.get(newTable.indexOf(line) - 1).size() + " " + dataInColumns.indexOf(column)); if (newTable.get(newTable.indexOf(line) - 1).size() >= dataInColumns.indexOf(column)) { newTable .get(newTable.indexOf(line) - 1) .add(dataInColumns.indexOf(column), cellOfNewColumn); breaking = true; break Cells; } else { newTable.get(newTable.indexOf(line) - 1).add(cellOfNewColumn); breaking = true; break Cells; } } else { newTable.add(0, new ArrayList<Cell>()); newTable.get(0).add(cellOfNewColumn); breaking = true; break Cells; } } } } if (!breaking) { System.out.println("wasn't breaking. Now it is!"); System.out.println( newTable.size() + " " + dataInColumns.indexOf(column) + " " + newTable.get(newTable.size() - 1).size()); System.out.println(dataInColumns.indexOf(column) + " " + (newTable.size() - 1)); if (dataInColumns.indexOf(column) <= newTable.get(newTable.size() - 1).size() - 1) { newTable.get(newTable.size() - 1).add(dataInColumns.indexOf(column), cellOfNewColumn); breaking = true; } else { newTable.get(newTable.size() - 1).add(cellOfNewColumn); breaking = true; } } } } // TODO: Implement missing data by looking at the touchcolumn of the rowspan and add all the // columns from data in columns as empty cells. double totalY2 = 0; double averageY2; for (Line rowSpanner : semanticFramework.getValidatedRowSpanners()) { for (ArrayList<Cell> line : newTable) { for (Cell cell : line) { totalY2 = totalY2 + cell.getY2(); } averageY2 = totalY2 / line.size(); if (rowSpanner.getAverageY1() < averageY2) { newTable.add(newTable.indexOf(line), new ArrayList<Cell>()); newTable.get(newTable.indexOf(line) - 1).add(rowSpanner.getCellObject()); break; } } } for (Line subHeader : semanticFramework.getRowSpanners()) { for (ArrayList<Cell> line : newTable) { for (Cell cell : line) { totalY2 = totalY2 + cell.getY2(); } int firstMappedColumn = 0; averageY2 = totalY2 / line.size(); if (subHeader.getAverageY1() < averageY2) { newTable.add(newTable.indexOf(line), new ArrayList<Cell>()); for (Column2 column : dataInColumns) { newTable.get(newTable.indexOf(line) - 1).add(new Cell()); if (column.touchesColumn(subHeader.getFirstX1(), subHeader.getLastX2())) { firstMappedColumn = dataInColumns.indexOf(column); } } newTable .get(newTable.indexOf(line) - 1) .set(firstMappedColumn, subHeader.getCellObject()); break; } } } System.out.println("NEWTABLE:"); System.out.println(newTable); return newTable; }