/** * This method finds columns by taking the lines in the data variable and storing them in a map * based on their partition positions. */ private void findColumns() { int counterForColumns = 0; Map<Integer, ArrayList<ArrayList<Element>>> columnMap = new HashMap<Integer, ArrayList<ArrayList<Element>>>(); // Find highest amount of clusters: int highestAmountOfClusters = 0; for (Line line : data) { if (line.getClusterSize() > highestAmountOfClusters) { highestAmountOfClusters = line.getClusterSize(); } } // for(Line line : data){ // System.out.println(line.getClusterSize()); // } for (Line line : data) { for (ArrayList<Element> cluster : line.getClusters()) { if (columnMap.containsKey(counterForColumns)) { ArrayList<ArrayList<Element>> fullClusters = columnMap.get(counterForColumns); fullClusters.add(cluster); columnMap.put(counterForColumns, fullClusters); } else { ArrayList<ArrayList<Element>> fullClusters = new ArrayList<ArrayList<Element>>(); fullClusters.add(cluster); columnMap.put(counterForColumns, fullClusters); } counterForColumns++; } line.getClusters(); counterForColumns = 0; } this.dataByColumn = columnMap; System.out.println("columMap: "); System.out.println(dataByColumn); }
/** * This method removes the lines that have missing data and stores them in a separate variable. * These lines might contain valuable information about the content or could be a mistake by the * OCR or separator. They need special processing in order to be useful (as done in the * addLinesWithMissingDataToColumns method). */ private void findMissingData() { ArrayList<Line> dataWithoutMissingLines = new ArrayList<Line>(); ArrayList<Line> linesWithMissingData = new ArrayList<Line>(); ArrayList<Integer> numberOfClusters = new ArrayList<Integer>(); int highestAmountOfClusters = 0; // calculating the highest amount of clusters: for (Line line : data) { numberOfClusters.add(line.getClusterSize()); if (line.getClusterSize() > highestAmountOfClusters) { highestAmountOfClusters = line.getClusterSize(); } } // calculate the highest amount of cluster occurrences: int highestAmountOfClustersOccurrences = 0; ArrayList<Integer> numberOfClustersSave = new ArrayList<Integer>(numberOfClusters); while (numberOfClusters.contains(highestAmountOfClusters)) { highestAmountOfClustersOccurrences++; numberOfClusters.remove(numberOfClusters.indexOf(highestAmountOfClusters)); } numberOfClusters = new ArrayList<Integer>(numberOfClustersSave); validation.setHighestAmountOfClustersOccurrences(highestAmountOfClustersOccurrences); if (highestAmountOfClustersOccurrences > 4) { int mostFrequentAmountOfClusters = CommonMethods.mostCommonElement(numberOfClusters); validation.setMostFrequentNumberOfClusters(mostFrequentAmountOfClusters); validation.setHighestAmountOfClusters(highestAmountOfClusters); for (Line line : data) { if (line.getClusterSize() < highestAmountOfClusters) { linesWithMissingData.add(line); } else { dataWithoutMissingLines.add(line); } } this.linesWithMissingData = linesWithMissingData; this.data = dataWithoutMissingLines; } else if (numberOfClusters.size() > 0) { int mostFrequentAmountOfClusters = CommonMethods.mostCommonElement(numberOfClusters); 
validation.setMostFrequentNumberOfClusters(mostFrequentAmountOfClusters); validation.setHighestAmountOfClusters(highestAmountOfClusters); for (Line line : data) { if (line.getClusterSize() < mostFrequentAmountOfClusters) { // Now we now this line got missing data linesWithMissingData.add(line); } else { dataWithoutMissingLines.add(line); } } this.linesWithMissingData = linesWithMissingData; this.data = dataWithoutMissingLines; // System.out.println("Lines without missing data: "); // for (Line line : data){ // System.out.println(line); // } } }
/**
 * Drops the leading lines of {@code table} that contain less than one cluster.
 *
 * <p>Lines are skipped until the first line with at least one cluster is found. From that line
 * on, every line is kept, including later lines with less than one cluster. The filtered list
 * replaces {@code table}.
 */
private void filterEmptyLines() {
  ArrayList<Line> kept = new ArrayList<Line>();
  boolean contentStarted = false;
  for (Line row : table) {
    if (row.getClusterSize() >= 1) {
      contentStarted = true;
    }
    if (contentStarted) {
      kept.add(row);
    }
  }
  this.table = kept;
}