コード例 #1
0
ファイル: Table2.java プロジェクト: Traple/TableExtraction
  /**
   * Splits {@code data} into lines that are considered complete and lines with missing data.
   *
   * <p>The cluster count of every line in {@code data} is collected and a threshold is chosen:
   *
   * <ul>
   *   <li>if the highest cluster count occurs on more than four lines, the highest count is the
   *       threshold;
   *   <li>otherwise the most frequent cluster count, as reported by
   *       {@code CommonMethods.mostCommonElement}, is the threshold.
   * </ul>
   *
   * <p>Every line with fewer clusters than the threshold is moved to {@code linesWithMissingData};
   * all other lines remain in {@code data}. The removed lines might contain valuable information
   * about the content, or could be a mistake by the OCR or separator, and need special processing
   * to be useful (see {@code addLinesWithMissingDataToColumns}).
   *
   * <p>Side effects: the number of occurrences of the highest cluster count is always stored in
   * {@code validation}. When {@code data} contains at least one line, the most frequent and the
   * highest cluster counts are stored in {@code validation} as well, and both {@code data} and
   * {@code linesWithMissingData} are replaced by freshly built lists. When {@code data} is empty,
   * neither field is reassigned.
   */
  private void findMissingData() {
    // The highest cluster count is used as the threshold only when it occurs on more than this
    // many lines; otherwise the most frequent cluster count is used.
    final int highestCountOccurrenceLimit = 4;

    ArrayList<Integer> numberOfClusters = new ArrayList<Integer>();
    int highestAmountOfClusters = 0;
    int highestAmountOfClustersOccurrences = 0;

    // Single pass: record every cluster count, track the maximum and how often it occurs.
    for (Line line : data) {
      int clusterSize = line.getClusterSize();
      numberOfClusters.add(clusterSize);
      if (clusterSize > highestAmountOfClusters) {
        // New maximum found: restart the occurrence count for it.
        highestAmountOfClusters = clusterSize;
        highestAmountOfClustersOccurrences = 1;
      } else if (clusterSize == highestAmountOfClusters) {
        highestAmountOfClustersOccurrences++;
      }
    }
    validation.setHighestAmountOfClustersOccurrences(highestAmountOfClustersOccurrences);

    if (numberOfClusters.isEmpty()) {
      // No lines to classify; leave data and linesWithMissingData untouched.
      return;
    }

    int mostFrequentAmountOfClusters = CommonMethods.mostCommonElement(numberOfClusters);
    validation.setMostFrequentNumberOfClusters(mostFrequentAmountOfClusters);
    validation.setHighestAmountOfClusters(highestAmountOfClusters);

    int requiredAmountOfClusters =
        highestAmountOfClustersOccurrences > highestCountOccurrenceLimit
            ? highestAmountOfClusters
            : mostFrequentAmountOfClusters;

    ArrayList<Line> dataWithoutMissingLines = new ArrayList<Line>();
    ArrayList<Line> linesWithMissingData = new ArrayList<Line>();
    for (Line line : data) {
      if (line.getClusterSize() < requiredAmountOfClusters) {
        // Fewer clusters than the threshold: this line has missing data.
        linesWithMissingData.add(line);
      } else {
        dataWithoutMissingLines.add(line);
      }
    }
    this.linesWithMissingData = linesWithMissingData;
    this.data = dataWithoutMissingLines;
  }