예제 #1
0
  /**
   * Groups the clusters of every line in {@code data} by their position within the line and stores
   * the result in {@code dataByColumn}.
   *
   * <p>The map key is the zero-based index of a cluster inside its line. The value collects, in
   * the iteration order of {@code data}, the cluster found at that index in each line that has
   * one. A line with fewer clusters than another simply contributes nothing to the higher indices.
   *
   * <p>The resulting map is also printed to standard output.
   */
  private void findColumns() {
    Map<Integer, ArrayList<ArrayList<Element>>> columnMap =
        new HashMap<Integer, ArrayList<ArrayList<Element>>>();

    for (Line line : data) {
      // The column index restarts at 0 for every line.
      int columnIndex = 0;
      for (ArrayList<Element> cluster : line.getClusters()) {
        ArrayList<ArrayList<Element>> column = columnMap.get(columnIndex);
        if (column == null) {
          // First cluster seen at this position: open a new column for it.
          column = new ArrayList<ArrayList<Element>>();
          columnMap.put(columnIndex, column);
        }
        column.add(cluster);
        columnIndex++;
      }
    }

    this.dataByColumn = columnMap;
    // Console output is kept exactly as before (including the label's spelling), since it is
    // observable program output.
    System.out.println("columMap: ");
    System.out.println(dataByColumn);
  }
예제 #2
0
  /**
   * Splits {@code data} into lines that are considered complete and lines with missing data.
   *
   * <p>Lines with missing data are moved to {@code linesWithMissingData}; {@code data} is replaced
   * by the remaining lines. The removed lines might contain valuable information about the content
   * or could be a mistake by the OCR or separator. They need special processing in order to be
   * useful (as done in the addLinesWithMissingDataToColumns method).
   *
   * <p>A line counts as having missing data when its cluster count is below a threshold:
   *
   * <ul>
   *   <li>the highest cluster count of any line, if that highest count occurs in more than four
   *       lines;
   *   <li>otherwise the most frequent cluster count, as reported by {@code
   *       CommonMethods.mostCommonElement}.
   * </ul>
   *
   * <p>The highest count, its number of occurrences and the most frequent count are reported to
   * {@code validation}. When {@code data} is empty only the occurrence count (0) is reported and
   * neither {@code data} nor {@code linesWithMissingData} is modified.
   */
  private void findMissingData() {
    // NOTE(review): presumably the point at which the maximum is trusted as the real column
    // count rather than an outlier - confirm the rationale for this value.
    final int minOccurrencesToTrustHighest = 4;

    // Collect the cluster count of every line and track the highest one.
    ArrayList<Integer> numberOfClusters = new ArrayList<Integer>();
    int highestAmountOfClusters = 0;
    for (Line line : data) {
      int clusterSize = line.getClusterSize();
      numberOfClusters.add(clusterSize);
      if (clusterSize > highestAmountOfClusters) {
        highestAmountOfClusters = clusterSize;
      }
    }

    // Count how many lines reach the highest cluster count.
    int highestAmountOfClustersOccurrences = 0;
    for (int clusterSize : numberOfClusters) {
      if (clusterSize == highestAmountOfClusters) {
        highestAmountOfClustersOccurrences++;
      }
    }
    validation.setHighestAmountOfClustersOccurrences(highestAmountOfClustersOccurrences);

    if (numberOfClusters.isEmpty()) {
      // No lines at all: nothing to classify.
      return;
    }

    int mostFrequentAmountOfClusters = CommonMethods.mostCommonElement(numberOfClusters);
    validation.setMostFrequentNumberOfClusters(mostFrequentAmountOfClusters);
    validation.setHighestAmountOfClusters(highestAmountOfClusters);

    // Use the maximum as the reference only when enough lines reach it; otherwise fall back to
    // the most frequent count.
    int requiredClusters =
        highestAmountOfClustersOccurrences > minOccurrencesToTrustHighest
            ? highestAmountOfClusters
            : mostFrequentAmountOfClusters;

    ArrayList<Line> completeLines = new ArrayList<Line>();
    ArrayList<Line> incompleteLines = new ArrayList<Line>();
    for (Line line : data) {
      if (line.getClusterSize() < requiredClusters) {
        // Now we know this line has missing data.
        incompleteLines.add(line);
      } else {
        completeLines.add(line);
      }
    }
    this.linesWithMissingData = incompleteLines;
    this.data = completeLines;
  }
예제 #3
0
 /**
  * Rebuilds {@code table} without the leading lines that contain fewer than one cluster.
  *
  * <p>Lines are visited in order. Every line is dropped until the first line with at least one
  * cluster is found; from that line on, all lines are kept, including later lines that have no
  * clusters.
  */
 private void filterEmptyLines() {
   ArrayList<Line> kept = new ArrayList<Line>();
   // Becomes true once the first line holding at least one cluster has been seen.
   boolean contentStarted = false;
   for (Line candidate : table) {
     if (candidate.getClusterSize() >= 1) {
       contentStarted = true;
     }
     if (contentStarted) {
       kept.add(candidate);
     }
   }
   this.table = kept;
 }