コード例 #1
0
ファイル: Table2.java プロジェクト: Traple/TableExtraction
  /**
   * Splits {@code data} into lines that look complete and lines that are missing data.
   *
   * <p>Every line is judged by its cluster count ({@code Line.getClusterSize()}) against a
   * reference count, chosen as follows:
   *
   * <ul>
   *   <li>if the highest cluster count found occurs on more than four lines, that highest count
   *       is the reference;
   *   <li>otherwise the most common cluster count, as returned by
   *       {@code CommonMethods.mostCommonElement}, is the reference.
   * </ul>
   *
   * <p>Lines with fewer clusters than the reference are stored in {@code linesWithMissingData};
   * the remaining lines replace {@code data}. The removed lines might still contain valuable
   * information, or could be a mistake by the OCR or separator, and need special processing to be
   * useful (as done in the addLinesWithMissingDataToColumns method).
   *
   * <p>The highest count, how often it occurs, and the most frequent count are also reported to
   * {@code validation}. When {@code data} is empty only the occurrence count (zero) is reported
   * and neither {@code data} nor {@code linesWithMissingData} is reassigned.
   */
  private void findMissingData() {
    ArrayList<Integer> numberOfClusters = new ArrayList<Integer>();
    int highestAmountOfClusters = 0;
    int highestAmountOfClustersOccurrences = 0;

    // Single pass: collect every cluster count while tracking the maximum and how often it occurs.
    for (Line line : data) {
      int clusterSize = line.getClusterSize();
      numberOfClusters.add(clusterSize);
      if (clusterSize > highestAmountOfClusters) {
        // A new maximum invalidates the tally kept for the previous one.
        highestAmountOfClusters = clusterSize;
        highestAmountOfClustersOccurrences = 1;
      } else if (clusterSize == highestAmountOfClusters) {
        highestAmountOfClustersOccurrences++;
      }
    }
    validation.setHighestAmountOfClustersOccurrences(highestAmountOfClustersOccurrences);

    if (numberOfClusters.isEmpty()) {
      // No lines at all: nothing to classify.
      return;
    }

    int mostFrequentAmountOfClusters = CommonMethods.mostCommonElement(numberOfClusters);
    validation.setMostFrequentNumberOfClusters(mostFrequentAmountOfClusters);
    validation.setHighestAmountOfClusters(highestAmountOfClusters);

    // The maximum is only used as the reference when it shows up on more than four lines;
    // otherwise the most frequent count is used instead.
    final int requiredAmountOfClusters =
        highestAmountOfClustersOccurrences > 4
            ? highestAmountOfClusters
            : mostFrequentAmountOfClusters;

    ArrayList<Line> dataWithoutMissingLines = new ArrayList<Line>();
    ArrayList<Line> linesWithMissingData = new ArrayList<Line>();
    for (Line line : data) {
      if (line.getClusterSize() < requiredAmountOfClusters) {
        // Fewer clusters than the reference: this line is missing data.
        linesWithMissingData.add(line);
      } else {
        dataWithoutMissingLines.add(line);
      }
    }
    this.linesWithMissingData = linesWithMissingData;
    this.data = dataWithoutMissingLines;
  }
コード例 #2
0
ファイル: Table2.java プロジェクト: Traple/TableExtraction
  /**
   * Groups the elements in {@code spans} into {@code Line} objects and appends them to
   * {@code table}.
   *
   * <p>Each span's {@code title} attribute is split on whitespace and tokens 1 to 4 are parsed as
   * the integer coordinates {@code x1 y1 x2 y2}. A span closes the current line and starts a new
   * one when any of the following holds relative to the previous span:
   *
   * <ul>
   *   <li>its {@code x1} is not to the right of the previous {@code x2};
   *   <li>its {@code y1} is greater than the previous {@code y2};
   *   <li>{@code CommonMethods.calcDistance(previousY2, y1)} exceeds
   *       {@code averageLineDistance * verticalThresholdModifier}.
   * </ul>
   *
   * <p>The very first span never closes a line. Closed lines are numbered consecutively from 0.
   * The spans left over after the loop are only added as a final line when there are more than
   * four of them.
   *
   * @param charLengthThreshold This is the average character length as calculated in the Page
   *     class; it is passed on to every {@code Line} that is created.
   * @throws NumberFormatException if one of the coordinate tokens is not an integer
   * @throws ArrayIndexOutOfBoundsException if a {@code title} has fewer than five tokens
   */
  public void createLines(double charLengthThreshold) {
    int lastX2 = 0;
    int lastY2 = 0;
    int lineNumber = 0;
    // Tracks the first iteration explicitly instead of looking the span up with indexOf, which
    // is linear per span and depends on how the element type defines equals.
    boolean firstSpan = true;
    Elements currentLine = new Elements();

    for (Element span : spans) {
      String[] positions = span.attr("title").split("\\s+");
      int x1 = Integer.parseInt(positions[1]);
      int y1 = Integer.parseInt(positions[2]);
      int x2 = Integer.parseInt(positions[3]);
      int y2 = Integer.parseInt(positions[4]);

      // This is where the modifier can be placed for the 1,2,3 parameter as described in the
      // version test:
      boolean breaksLine =
          x1 <= lastX2
              || y1 > lastY2
              || CommonMethods.calcDistance(lastY2, y1)
                  > (averageLineDistance * verticalThresholdModifier);

      if (breaksLine && !firstSpan) {
        Line line = new Line(currentLine, charLengthThreshold, horizontalThresholdModifier);
        line.setLineNumber(lineNumber);
        table.add(line);
        currentLine = new Elements();
        lineNumber += 1;
      }

      lastX2 = x2;
      lastY2 = y2;
      currentLine.add(span);
      firstSpan = false;
    }

    if (currentLine.size() > 4) { // For in case the last line is part of the table
      Line line = new Line(currentLine, charLengthThreshold, horizontalThresholdModifier);
      // Number the trailing line like every other line; previously it kept the default number.
      line.setLineNumber(lineNumber);
      table.add(line);
    }
  }