/**
 * Finds the average distances between the partitions over all lines in {@code data} and passes
 * them to the validation object for the calculation of the column confidence.
 *
 * <p>The distances of the first line are the starting totals; the distances of every following
 * line are added to them position by position, and each total is then divided by the number of
 * lines (integer division). If a following line has more distances than the first line, the
 * table is given a very low confidence instead: a list of zeros, one per distance of the
 * offending line.
 *
 * <p>In both cases the distance threshold of the first line is used as the cluster threshold and
 * as the line threshold of the validation object.
 *
 * <p>Expects {@code data} to contain at least one line.
 */
private void setClusterCertainties() {
  Line firstLine = data.get(0);

  // Accumulate in a copy: summing directly into the list returned by getDistances() would
  // overwrite the first line's own distances if that getter exposes its internal list.
  ArrayList<Integer> totalDistances = new ArrayList<Integer>(firstLine.getDistances());

  // Start at index 1 because the first line already seeded the totals.
  for (int lineIndex = 1; lineIndex < data.size(); lineIndex++) {
    ArrayList<Integer> lineDistances = data.get(lineIndex).getDistances();

    // A line with more partitions than the first line cannot be summed position by position.
    if (lineDistances.size() > totalDistances.size()) {
      LOGGER.info(
          "Found a problem during the cluster certainties. I've given the table a very low confidence");
      ArrayList<Integer> lowValidation = new ArrayList<Integer>();
      for (int i = 0; i < lineDistances.size(); i++) {
        lowValidation.add(0);
      }
      validation.setClusterCertainty(lowValidation, firstLine.getDistanceThreshold());
      validation.setLineThreshold(firstLine.getDistanceThreshold());
      return;
    }

    // Shorter lines only contribute to the positions they actually have.
    for (int x = 0; x < lineDistances.size(); x++) {
      totalDistances.set(x, totalDistances.get(x) + lineDistances.get(x));
    }
  }

  ArrayList<Integer> averageDistances = new ArrayList<Integer>();
  for (int distance : totalDistances) {
    averageDistances.add(distance / data.size());
  }
  validation.setClusterCertainty(averageDistances, firstLine.getDistanceThreshold());
  validation.setLineThreshold(firstLine.getDistanceThreshold());
}
/** * This is the constructor of the table class. It takes it's parameters and sets them as local * variables. It also puts the default values for the rest of the table and then starts calling * the other methods in this class. to extract the table according to the rules of TEA. * * @param spans These are the words below the table detection from the Page class. * @param charLengthThreshold This is the character length threshold as calculated in the Page * class. * @param file This is the File that was used to extract the table from. It is only used for the * creation of provenance. * @param workspace This is the workspace as specified by the user. * @param tableID This is the ID of the detected table. It is mainly used for the creation of the * output file and for provenance. * @param verticalThresholdModifier The modifier from the configuration file that should be used * to indicate how much space there should be between lines * @param horizontalThresholdModifier The modifier used for creating the threshold in horizontal * partitioning. * @param averageLineDistance The average (vertical) distance between lines as calculated in the * Page class. * @param debugging is true if the program is in debugging mode. * @param allowedHeaderIterations The amount of iterations that the program is allowed to run, * searching for headers. * @param allowedHeaderSize The amount of headers supported by the program. Implemented as a last * cut-off if thresholding fails. 
* @throws IOException When one of the files cant be found */ public Table2( Elements spans, double charLengthThreshold, File file, int pageNumber, String workspace, int tableID, double verticalThresholdModifier, double horizontalThresholdModifier, double averageLineDistance, boolean debugging, int allowedHeaderSize, int allowedHeaderIterations) throws IOException { String debugContent = ""; this.averageLineDistance = averageLineDistance; this.maxY1 = 0; this.spans = spans; this.name = ""; this.horizontalThresholdModifier = horizontalThresholdModifier; this.verticalThresholdModifier = verticalThresholdModifier; this.validation = new Validation(); this.validation.setAverageDistanceBetweenRows(averageLineDistance); this.pageNumber = pageNumber; if (spans.size() > 0) { setMaxY1(); this.table = new ArrayList<Line>(); createLines(charLengthThreshold); separateDataByCluster(); filterLinesThatAreAboveY1(); if (data.size() > 1) { System.out.println(getRawTable()); debugContent = debugContent + getRawTable() + "\n"; filterEmptyLines(); findMissingData(); findColumns(); createColumns(charLengthThreshold); checkColumns(); debugContent = debugContent + "lines with missing data: " + linesWithMissingData + "\n"; if (linesWithMissingData != null) { addLinesWithMissingDataToColumns(); } fillBlankCells(); } else { LOGGER.info( "The word Table was detected but no clusters were found.\n" + "It was found at position: " + maxY1); } if (data.size() > 1) { for (Line line : data) { validation.setClusterCertainty(line.getDistances(), line.getDistanceThreshold()); validation.setLineThreshold(line.getDistanceThreshold()); } LOGGER.info("Table: " + getName()); System.out.println("In Table: " + getName()); System.out.println( "~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-"); System.out.println("Data in this table is: "); ArrayList<Integer> distances = new ArrayList<Integer>(); Column2 lastColumn = null; for (Column2 column : dataInColumns) { 
System.out.println(column); if (dataInColumns.indexOf(column) == 0) { lastColumn = column; continue; } if (lastColumn != null) { distances.add(column.getAverageX1() - lastColumn.getAverageX2()); } } validation.setClusterCertainty( distances, averageLineDistance * horizontalThresholdModifier); if (linesWithMissingData != null && linesWithMissingData.size() > 0) { System.out.println( "~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-"); System.out.println( "The following lines were detected for having missing data or it was a line that had more clusters then the rest of the table.: "); for (Line line : linesWithMissingData) { System.out.println(line); } } System.out.println( "~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-"); if (rowSpanners.size() > 0) { System.out.println("Potential rowspanners: "); for (Line line : rowSpanners) { System.out.println(line); } } System.out.println("Validation:\n" + validation); System.out.println(table); setClusterCertainties(); System.out.println("Checking out the semantics."); SemanticFramework semanticFramework = new SemanticFramework( dataInColumns, (averageLineDistance * verticalThresholdModifier), rowSpanners, charLengthThreshold * horizontalThresholdModifier, table, validation, titleAndHeaders, allowedHeaderSize, allowedHeaderIterations); System.out.println("Checking for false positive..."); checkForFalsePositive(); System.out.println("False positive: " + validation.getFalsePositive()); LOGGER.info("False positive: " + validation.getFalsePositive()); System.out.println(); fillBlankCells(); System.out.println(semanticFramework); System.out.println("Calculating final table statistics."); setTableBoundaries(semanticFramework); System.out.println("Now writing to file."); write2( (workspace), file, tableID, semanticFramework); // write: getXMLContent(file, tableID, semanticFramework.getXML()), if (debugging) { writeDebugFile(debugContent, workspace, file); } } else { 
LOGGER.info("All the found data was filtered out!"); } } }