/** * This method removes the lines that have missing data and stores them in a separate variable. * These lines might contain valuable information about the content or could be a mistake by the * OCR or separator. They need special processing in order to be useful (as done in the * addLinesWithMissingDataToColumns method). */ private void findMissingData() { ArrayList<Line> dataWithoutMissingLines = new ArrayList<Line>(); ArrayList<Line> linesWithMissingData = new ArrayList<Line>(); ArrayList<Integer> numberOfClusters = new ArrayList<Integer>(); int highestAmountOfClusters = 0; // calculating the highest amount of clusters: for (Line line : data) { numberOfClusters.add(line.getClusterSize()); if (line.getClusterSize() > highestAmountOfClusters) { highestAmountOfClusters = line.getClusterSize(); } } // calculate the highest amount of cluster occurrences: int highestAmountOfClustersOccurrences = 0; ArrayList<Integer> numberOfClustersSave = new ArrayList<Integer>(numberOfClusters); while (numberOfClusters.contains(highestAmountOfClusters)) { highestAmountOfClustersOccurrences++; numberOfClusters.remove(numberOfClusters.indexOf(highestAmountOfClusters)); } numberOfClusters = new ArrayList<Integer>(numberOfClustersSave); validation.setHighestAmountOfClustersOccurrences(highestAmountOfClustersOccurrences); if (highestAmountOfClustersOccurrences > 4) { int mostFrequentAmountOfClusters = CommonMethods.mostCommonElement(numberOfClusters); validation.setMostFrequentNumberOfClusters(mostFrequentAmountOfClusters); validation.setHighestAmountOfClusters(highestAmountOfClusters); for (Line line : data) { if (line.getClusterSize() < highestAmountOfClusters) { linesWithMissingData.add(line); } else { dataWithoutMissingLines.add(line); } } this.linesWithMissingData = linesWithMissingData; this.data = dataWithoutMissingLines; } else if (numberOfClusters.size() > 0) { int mostFrequentAmountOfClusters = CommonMethods.mostCommonElement(numberOfClusters); validation.setMostFrequentNumberOfClusters(mostFrequentAmountOfClusters); validation.setHighestAmountOfClusters(highestAmountOfClusters); for (Line line : data) { if (line.getClusterSize() < mostFrequentAmountOfClusters) { // Now we now this line got missing data linesWithMissingData.add(line); } else { dataWithoutMissingLines.add(line); } } this.linesWithMissingData = linesWithMissingData; this.data = dataWithoutMissingLines; // System.out.println("Lines without missing data: "); // for (Line line : data){ // System.out.println(line); // } } }
/** * This method creates the lines by reading trough the file. * * @param charLengthThreshold This is the average characterLength as calculated in the Page class. */ public void createLines(double charLengthThreshold) { String pos; String[] positions; int lastX2 = 0; int lastY2 = 0; Elements currentLine = new Elements(); int lineNumber = 0; for (Element span : spans) { pos = span.attr("title"); positions = pos.split("\\s+"); int x1 = Integer.parseInt(positions[1]); int y1 = Integer.parseInt(positions[2]); int x2 = Integer.parseInt(positions[3]); int y2 = Integer.parseInt(positions[4]); // This is where the modifier can be placed for the 1,2,3 parameter as described in the // version test: if (((x1 <= lastX2) || y1 > lastY2 || CommonMethods.calcDistance(lastY2, y1) > (averageLineDistance * verticalThresholdModifier)) && spans.indexOf(span) != 0) { Line line = new Line(currentLine, charLengthThreshold, horizontalThresholdModifier); // System.out.println(line); line.setLineNumber(lineNumber); table.add(line); currentLine = new Elements(); lineNumber += 1; } lastX2 = x2; lastY2 = y2; currentLine.add(span); } if (currentLine.size() > 4) { // For in case the last line is part of the table Line line = new Line(currentLine, charLengthThreshold, horizontalThresholdModifier); table.add(line); } }