/** * This method is used for extraction of tables with lot of empty cells in it. It is required for * the successful extraction of most Matrix tables. */ private void fillBlankCells() { // We say: cells get a line number. If a column does not contain a cell on a certain line, add a // whitespace. // Any cell that is not filled must be empty: for (Line line : data) { int lineNumber = line.getLineNumber(); COLUMNLOOP: for (Column2 column : dataInColumns) { for (Cell cell : column.getCellObjects()) { if (cell.getLineNumber() == lineNumber) { break; } if (cell.getLineNumber() > line.getLineNumber()) { // the last cell? // Add a blank cell to this column. // System.out.println("Add line to :" + column + " in line: " + // line.getLineNumber()); // <span class='ocrx_word' id='word_9' title="bbox 2175 514 2346 555">were</span> Tag t = Tag.valueOf("span"); Attributes attributes = new Attributes(); attributes.put("class", "ocrx_word"); attributes.put("id", "word_ADDEDBYTEA"); attributes.put( "title", "bbox " + column.getAverageX1() + " " + (int) line.getAverageY1() + " " + column.getAverageX2() + " " + (int) line.getAverageY2()); Element newElement = new Element(t, "localhost:8080", attributes); newElement.text(" "); ArrayList<Element> newCell = new ArrayList<Element>(); newCell.add(newElement); // System.out.println("adding: " +newElement.text()); column.addCell(newCell); break COLUMNLOOP; } } } } }
/** * This is the constructor of the table class. It takes it's parameters and sets them as local * variables. It also puts the default values for the rest of the table and then starts calling * the other methods in this class. to extract the table according to the rules of TEA. * * @param spans These are the words below the table detection from the Page class. * @param charLengthThreshold This is the character length threshold as calculated in the Page * class. * @param file This is the File that was used to extract the table from. It is only used for the * creation of provenance. * @param workspace This is the workspace as specified by the user. * @param tableID This is the ID of the detected table. It is mainly used for the creation of the * output file and for provenance. * @param verticalThresholdModifier The modifier from the configuration file that should be used * to indicate how much space there should be between lines * @param horizontalThresholdModifier The modifier used for creating the threshold in horizontal * partitioning. * @param averageLineDistance The average (vertical) distance between lines as calculated in the * Page class. * @param debugging is true if the program is in debugging mode. * @param allowedHeaderIterations The amount of iterations that the program is allowed to run, * searching for headers. * @param allowedHeaderSize The amount of headers supported by the program. Implemented as a last * cut-off if thresholding fails. * @throws IOException When one of the files cant be found */ public Table2( Elements spans, double charLengthThreshold, File file, int pageNumber, String workspace, int tableID, double verticalThresholdModifier, double horizontalThresholdModifier, double averageLineDistance, boolean debugging, int allowedHeaderSize, int allowedHeaderIterations) throws IOException { String debugContent = ""; this.averageLineDistance = averageLineDistance; this.maxY1 = 0; this.spans = spans; this.name = ""; this.horizontalThresholdModifier = horizontalThresholdModifier; this.verticalThresholdModifier = verticalThresholdModifier; this.validation = new Validation(); this.validation.setAverageDistanceBetweenRows(averageLineDistance); this.pageNumber = pageNumber; if (spans.size() > 0) { setMaxY1(); this.table = new ArrayList<Line>(); createLines(charLengthThreshold); separateDataByCluster(); filterLinesThatAreAboveY1(); if (data.size() > 1) { System.out.println(getRawTable()); debugContent = debugContent + getRawTable() + "\n"; filterEmptyLines(); findMissingData(); findColumns(); createColumns(charLengthThreshold); checkColumns(); debugContent = debugContent + "lines with missing data: " + linesWithMissingData + "\n"; if (linesWithMissingData != null) { addLinesWithMissingDataToColumns(); } fillBlankCells(); } else { LOGGER.info( "The word Table was detected but no clusters were found.\n" + "It was found at position: " + maxY1); } if (data.size() > 1) { for (Line line : data) { validation.setClusterCertainty(line.getDistances(), line.getDistanceThreshold()); validation.setLineThreshold(line.getDistanceThreshold()); } LOGGER.info("Table: " + getName()); System.out.println("In Table: " + getName()); System.out.println( "~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-"); System.out.println("Data in this table is: "); ArrayList<Integer> distances = new ArrayList<Integer>(); Column2 lastColumn = null; for (Column2 column : dataInColumns) { System.out.println(column); if (dataInColumns.indexOf(column) == 0) { lastColumn = column; continue; } if (lastColumn != null) { distances.add(column.getAverageX1() - lastColumn.getAverageX2()); } } validation.setClusterCertainty( distances, averageLineDistance * horizontalThresholdModifier); if (linesWithMissingData != null && linesWithMissingData.size() > 0) { System.out.println( "~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-"); System.out.println( "The following lines were detected for having missing data or it was a line that had more clusters then the rest of the table.: "); for (Line line : linesWithMissingData) { System.out.println(line); } } System.out.println( "~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-"); if (rowSpanners.size() > 0) { System.out.println("Potential rowspanners: "); for (Line line : rowSpanners) { System.out.println(line); } } System.out.println("Validation:\n" + validation); System.out.println(table); setClusterCertainties(); System.out.println("Checking out the semantics."); SemanticFramework semanticFramework = new SemanticFramework( dataInColumns, (averageLineDistance * verticalThresholdModifier), rowSpanners, charLengthThreshold * horizontalThresholdModifier, table, validation, titleAndHeaders, allowedHeaderSize, allowedHeaderIterations); System.out.println("Checking for false positive..."); checkForFalsePositive(); System.out.println("False positive: " + validation.getFalsePositive()); LOGGER.info("False positive: " + validation.getFalsePositive()); System.out.println(); fillBlankCells(); System.out.println(semanticFramework); System.out.println("Calculating final table statistics."); setTableBoundaries(semanticFramework); System.out.println("Now writing to file."); write2( (workspace), file, tableID, semanticFramework); // write: getXMLContent(file, tableID, semanticFramework.getXML()), if (debugging) { writeDebugFile(debugContent, workspace, file); } } else { LOGGER.info("All the found data was filtered out!"); } } }