Пример #1
0
 /**
  * This method finds the average distances between the partitions and parses those to the
  * validation object for the calculation of the column confidence.
  */
 private void setClusterCertainties() {
   method:
   while (true) {
     ArrayList<Integer> totalDistances = data.get(0).getDistances();
     for (Line line : data) {
       if (data.indexOf(line) > 0) {
         for (int x = 0; x < line.getDistances().size(); x++) {
           if (!(x >= totalDistances.size() || x >= line.getDistances().size())) {
             int totalDistance = totalDistances.get(x) + line.getDistances().get(x);
             totalDistances.set(x, totalDistance);
           } else {
             LOGGER.info(
                 "Found a problem during the cluster certainties. I've given the table a very low confidence");
             ArrayList<Integer> lowValidation = new ArrayList<Integer>();
             for (int o : line.getDistances()) {
               lowValidation.add(0);
             }
             validation.setClusterCertainty(lowValidation, data.get(0).getDistanceThreshold());
             validation.setLineThreshold(data.get(0).getDistanceThreshold());
             break method;
           }
         }
       }
     }
     ArrayList<Integer> averageDistances = new ArrayList<Integer>();
     for (int distance : totalDistances) {
       averageDistances.add(distance / data.size());
     }
     validation.setClusterCertainty(averageDistances, data.get(0).getDistanceThreshold());
     validation.setLineThreshold(data.get(0).getDistanceThreshold());
     break method;
   }
 }
Пример #2
0
  /**
   * This is the constructor of the table class. It takes it's parameters and sets them as local
   * variables. It also puts the default values for the rest of the table and then starts calling
   * the other methods in this class. to extract the table according to the rules of TEA.
   *
   * @param spans These are the words below the table detection from the Page class.
   * @param charLengthThreshold This is the character length threshold as calculated in the Page
   *     class.
   * @param file This is the File that was used to extract the table from. It is only used for the
   *     creation of provenance.
   * @param workspace This is the workspace as specified by the user.
   * @param tableID This is the ID of the detected table. It is mainly used for the creation of the
   *     output file and for provenance.
   * @param verticalThresholdModifier The modifier from the configuration file that should be used
   *     to indicate how much space there should be between lines
   * @param horizontalThresholdModifier The modifier used for creating the threshold in horizontal
   *     partitioning.
   * @param averageLineDistance The average (vertical) distance between lines as calculated in the
   *     Page class.
   * @param debugging is true if the program is in debugging mode.
   * @param allowedHeaderIterations The amount of iterations that the program is allowed to run,
   *     searching for headers.
   * @param allowedHeaderSize The amount of headers supported by the program. Implemented as a last
   *     cut-off if thresholding fails.
   * @throws IOException When one of the files cant be found
   */
  public Table2(
      Elements spans,
      double charLengthThreshold,
      File file,
      int pageNumber,
      String workspace,
      int tableID,
      double verticalThresholdModifier,
      double horizontalThresholdModifier,
      double averageLineDistance,
      boolean debugging,
      int allowedHeaderSize,
      int allowedHeaderIterations)
      throws IOException {
    String debugContent = "";
    this.averageLineDistance = averageLineDistance;
    this.maxY1 = 0;
    this.spans = spans;
    this.name = "";
    this.horizontalThresholdModifier = horizontalThresholdModifier;
    this.verticalThresholdModifier = verticalThresholdModifier;
    this.validation = new Validation();
    this.validation.setAverageDistanceBetweenRows(averageLineDistance);
    this.pageNumber = pageNumber;

    if (spans.size() > 0) {
      setMaxY1();
      this.table = new ArrayList<Line>();

      createLines(charLengthThreshold);
      separateDataByCluster();
      filterLinesThatAreAboveY1();

      if (data.size() > 1) {
        System.out.println(getRawTable());
        debugContent = debugContent + getRawTable() + "\n";
        filterEmptyLines();
        findMissingData();
        findColumns();
        createColumns(charLengthThreshold);
        checkColumns();
        debugContent = debugContent + "lines with missing data: " + linesWithMissingData + "\n";
        if (linesWithMissingData != null) {
          addLinesWithMissingDataToColumns();
        }
        fillBlankCells();
      } else {
        LOGGER.info(
            "The word Table was detected but no clusters were found.\n"
                + "It was found at position: "
                + maxY1);
      }
      if (data.size() > 1) {
        for (Line line : data) {
          validation.setClusterCertainty(line.getDistances(), line.getDistanceThreshold());
          validation.setLineThreshold(line.getDistanceThreshold());
        }
        LOGGER.info("Table: " + getName());
        System.out.println("In Table: " + getName());
        System.out.println(
            "~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-");
        System.out.println("Data in this table is: ");

        ArrayList<Integer> distances = new ArrayList<Integer>();
        Column2 lastColumn = null;
        for (Column2 column : dataInColumns) {
          System.out.println(column);
          if (dataInColumns.indexOf(column) == 0) {
            lastColumn = column;
            continue;
          }
          if (lastColumn != null) {
            distances.add(column.getAverageX1() - lastColumn.getAverageX2());
          }
        }
        validation.setClusterCertainty(
            distances, averageLineDistance * horizontalThresholdModifier);
        if (linesWithMissingData != null && linesWithMissingData.size() > 0) {
          System.out.println(
              "~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-");
          System.out.println(
              "The following lines were detected for having missing data or it was a line that had more clusters then the rest of the table.: ");
          for (Line line : linesWithMissingData) {
            System.out.println(line);
          }
        }
        System.out.println(
            "~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-");
        if (rowSpanners.size() > 0) {
          System.out.println("Potential rowspanners: ");
          for (Line line : rowSpanners) {
            System.out.println(line);
          }
        }
        System.out.println("Validation:\n" + validation);
        System.out.println(table);
        setClusterCertainties();
        System.out.println("Checking out the semantics.");
        SemanticFramework semanticFramework =
            new SemanticFramework(
                dataInColumns,
                (averageLineDistance * verticalThresholdModifier),
                rowSpanners,
                charLengthThreshold * horizontalThresholdModifier,
                table,
                validation,
                titleAndHeaders,
                allowedHeaderSize,
                allowedHeaderIterations);
        System.out.println("Checking for false positive...");
        checkForFalsePositive();
        System.out.println("False positive: " + validation.getFalsePositive());
        LOGGER.info("False positive: " + validation.getFalsePositive());
        System.out.println();
        fillBlankCells();
        System.out.println(semanticFramework);
        System.out.println("Calculating final table statistics.");
        setTableBoundaries(semanticFramework);
        System.out.println("Now writing to file.");
        write2(
            (workspace),
            file,
            tableID,
            semanticFramework); // write: getXMLContent(file, tableID, semanticFramework.getXML()),
        if (debugging) {
          writeDebugFile(debugContent, workspace, file);
        }
      } else {
        LOGGER.info("All the found data was filtered out!");
      }
    }
  }