/** * This method finds columns by taking the lines in the data variable and storing them in a map * based on their partition positions. */ private void findColumns() { int counterForColumns = 0; Map<Integer, ArrayList<ArrayList<Element>>> columnMap = new HashMap<Integer, ArrayList<ArrayList<Element>>>(); // Find highest amount of clusters: int highestAmountOfClusters = 0; for (Line line : data) { if (line.getClusterSize() > highestAmountOfClusters) { highestAmountOfClusters = line.getClusterSize(); } } // for(Line line : data){ // System.out.println(line.getClusterSize()); // } for (Line line : data) { for (ArrayList<Element> cluster : line.getClusters()) { if (columnMap.containsKey(counterForColumns)) { ArrayList<ArrayList<Element>> fullClusters = columnMap.get(counterForColumns); fullClusters.add(cluster); columnMap.put(counterForColumns, fullClusters); } else { ArrayList<ArrayList<Element>> fullClusters = new ArrayList<ArrayList<Element>>(); fullClusters.add(cluster); columnMap.put(counterForColumns, fullClusters); } counterForColumns++; } line.getClusters(); counterForColumns = 0; } this.dataByColumn = columnMap; System.out.println("columMap: "); System.out.println(dataByColumn); }
/** * This method separates the data based on the amount of partitions (clusters) in each line. The * lines before we can find any lines with partitions are stored in the private titleAndHeaders * variable The lines that contain partitions are stored in the private data variable The lines * that were inside the data lines and contained just one partition are stored in the private * rowspanner variable. */ private void separateDataByCluster() { ArrayList<Line> titleAndHeaders = new ArrayList<Line>(); ArrayList<Line> data = new ArrayList<Line>(); boolean foundData = false; Line breakingLine = null; Line doubleBreakingLine = null; ArrayList<Line> rowSpanners = new ArrayList<Line>(); boolean breaking = false; // System.out.println("---------------------------------------------------------"); for (Line line : table) { ArrayList<ArrayList<Element>> clusters = line.getClusters(); int size = clusters.size(); // System.out.println(size + " " + line); if (size < 1 && foundData && breakingLine != null && doubleBreakingLine != null) { break; // then we have reached the end of the table. } else if (size < 1 && foundData && breakingLine != null) { doubleBreakingLine = line; } else if (size < 1 && foundData && breakingLine == null) { breakingLine = line; } else if (size < 2) { // Found no data, so should be above it. titleAndHeaders.add(line); } else if (breakingLine == null) { data.add(line); // Hooray, data! foundData = true; } else if (size > 1 && doubleBreakingLine != null && breakingLine != null) { rowSpanners.add(breakingLine); rowSpanners.add(doubleBreakingLine); data.add(line); breakingLine = null; doubleBreakingLine = null; } else if (size > 1 && breakingLine != null) { rowSpanners.add(breakingLine); data.add(line); breakingLine = null; } } // System.out.println("breaking line: " + breakingLine); this.titleAndHeaders = titleAndHeaders; this.data = data; this.rowSpanners = rowSpanners; }
/** * This method adds lines with missing partitions to the current columns. It loops trough lines * that were flagged for containing missing data and then adds the ones to columns that they fit * in (or to columns that fit in the partition). If this fails it will also try to check if the * partition merely touches a column, although this will return a lower validation score if * successful. */ private void addLinesWithMissingDataToColumns() { ArrayList<Cell> cellsWithMissingDataAdded = new ArrayList<Cell>(); ArrayList<Column2> newDataInColumns = new ArrayList<Column2>(); for (Line line : linesWithMissingData) { ArrayList<ArrayList<Element>> clusters = line.getClusters(); for (ArrayList<Element> cluster : clusters) { for (Column2 column : dataInColumns) { if (column.fitsInColumn(Line.getClusterX1(cluster), Line.getClusterX2(cluster)) && column.columnFitsIn(Line.getClusterX1(cluster), Line.getClusterX2(cluster))) { // then we need to add this cluster to that column: newDataInColumns.remove(column); column.addCell(cluster); newDataInColumns.add(column); Cell cell = new Cell(cluster, 3, line.getLineNumber()); cellsWithMissingDataAdded.add(cell); } else if (column.fitsInColumn(Line.getClusterX1(cluster), Line.getClusterX2(cluster)) || column.columnFitsIn(Line.getClusterX1(cluster), Line.getClusterX2(cluster))) { // then we need to add this cluster to that column: newDataInColumns.remove(column); column.addCell(cluster); newDataInColumns.add(column); Cell cell = new Cell(cluster, 2, line.getLineNumber()); cellsWithMissingDataAdded.add(cell); } else if (column.touchesColumn(Line.getClusterX1(cluster), Line.getClusterX2(cluster))) { // System.out.println("CLUSTER: " + cluster + " touches: " + // column); newDataInColumns.remove(column); column.addCell(cluster); newDataInColumns.add(column); Cell cell = new Cell(cluster, 1, line.getLineNumber()); cellsWithMissingDataAdded.add(cell); } } } } validation.setCellsWithMissingDataAdded(cellsWithMissingDataAdded.size()); validation.setCellsWithMissingDataAddedScores(cellsWithMissingDataAdded); }