/** * This method removes the lines that have missing data and stores them in a separate variable. * These lines might contain valuable information about the content or could be a mistake by the * OCR or separator. They need special processing in order to be useful (as done in the * addLinesWithMissingDataToColumns method). */ private void findMissingData() { ArrayList<Line> dataWithoutMissingLines = new ArrayList<Line>(); ArrayList<Line> linesWithMissingData = new ArrayList<Line>(); ArrayList<Integer> numberOfClusters = new ArrayList<Integer>(); int highestAmountOfClusters = 0; // calculating the highest amount of clusters: for (Line line : data) { numberOfClusters.add(line.getClusterSize()); if (line.getClusterSize() > highestAmountOfClusters) { highestAmountOfClusters = line.getClusterSize(); } } // calculate the highest amount of cluster occurrences: int highestAmountOfClustersOccurrences = 0; ArrayList<Integer> numberOfClustersSave = new ArrayList<Integer>(numberOfClusters); while (numberOfClusters.contains(highestAmountOfClusters)) { highestAmountOfClustersOccurrences++; numberOfClusters.remove(numberOfClusters.indexOf(highestAmountOfClusters)); } numberOfClusters = new ArrayList<Integer>(numberOfClustersSave); validation.setHighestAmountOfClustersOccurrences(highestAmountOfClustersOccurrences); if (highestAmountOfClustersOccurrences > 4) { int mostFrequentAmountOfClusters = CommonMethods.mostCommonElement(numberOfClusters); validation.setMostFrequentNumberOfClusters(mostFrequentAmountOfClusters); validation.setHighestAmountOfClusters(highestAmountOfClusters); for (Line line : data) { if (line.getClusterSize() < highestAmountOfClusters) { linesWithMissingData.add(line); } else { dataWithoutMissingLines.add(line); } } this.linesWithMissingData = linesWithMissingData; this.data = dataWithoutMissingLines; } else if (numberOfClusters.size() > 0) { int mostFrequentAmountOfClusters = CommonMethods.mostCommonElement(numberOfClusters); validation.setMostFrequentNumberOfClusters(mostFrequentAmountOfClusters); validation.setHighestAmountOfClusters(highestAmountOfClusters); for (Line line : data) { if (line.getClusterSize() < mostFrequentAmountOfClusters) { // Now we now this line got missing data linesWithMissingData.add(line); } else { dataWithoutMissingLines.add(line); } } this.linesWithMissingData = linesWithMissingData; this.data = dataWithoutMissingLines; // System.out.println("Lines without missing data: "); // for (Line line : data){ // System.out.println(line); // } } }