Ejemplo n.º 1
0
  /** This method calculates the X1, X2, Y1 and Y2 values of the table. */
  private void setTableBoundaries(SemanticFramework semanticFramework) {
    this.minY1 = semanticFramework.getTitle().get(0).getLowestY1();

    int maxY2 = Integer.MIN_VALUE;
    int y2 = 0;
    for (Column2 column : dataInColumns) {
      y2 = column.getColumnBoundaryY2();
      if (y2 > maxY2) {
        maxY2 = y2;
      }
    }
    this.maxY2 = y2;
    this.minX1 = dataInColumns.get(0).getColumnBoundaryX1();
    this.maxX2 = dataInColumns.get(dataInColumns.size() - 1).getColumnBoundaryX2();
  }
Ejemplo n.º 2
0
  /**
   * This is the new write method that uses XML methods to generate an XML file.
   *
   * @param location The location where the new file should be stored.
   * @param file The file that was being used to create this table.
   * @param tableID The ID of the table.
   * @param semanticFramework The semantic framework object. This contains get methods that are
   *     required for the output File.
   * @throws IOException If the given location doesn't exist.
   */
  private void write2(String location, File file, int tableID, SemanticFramework semanticFramework)
      throws IOException {
    try {
      DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
      DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
      Document doc = docBuilder.newDocument();

      org.w3c.dom.Element rootElement = doc.createElement("TEAFile");
      doc.appendChild(rootElement);

      // provenance:
      org.w3c.dom.Element provenance = doc.createElement("provenance");
      rootElement.appendChild(provenance);

      org.w3c.dom.Element detectionID = doc.createElement("DetectionID");
      detectionID.appendChild(doc.createTextNode(tableID + ""));
      provenance.appendChild(detectionID);
      org.w3c.dom.Element fromFile = doc.createElement("fromFile");
      fromFile.appendChild(doc.createTextNode(file.getName()));
      provenance.appendChild(fromFile);
      org.w3c.dom.Element fromPath = doc.createElement("fromPath");
      fromPath.appendChild(doc.createTextNode(file.getAbsolutePath()));
      provenance.appendChild(fromPath);
      org.w3c.dom.Element fromPage = doc.createElement("fromPage");
      fromPage.appendChild(doc.createTextNode(Integer.toString(pageNumber)));
      provenance.appendChild(fromPage);

      org.w3c.dom.Element horizontalThresholdModifier =
          doc.createElement("horizontalThresholdModifier");
      horizontalThresholdModifier.appendChild(
          doc.createTextNode(this.horizontalThresholdModifier + ""));
      provenance.appendChild(horizontalThresholdModifier);
      org.w3c.dom.Element verticalThresholdModifier =
          doc.createElement("horizontalThresholdModifier");
      verticalThresholdModifier.appendChild(
          doc.createTextNode(this.verticalThresholdModifier + ""));
      provenance.appendChild(verticalThresholdModifier);

      // Results:
      org.w3c.dom.Element results = doc.createElement("results");
      rootElement.appendChild(results);

      org.w3c.dom.Element minX1 = doc.createElement("TableBoundaryX1");
      minX1.appendChild(doc.createTextNode(this.minX1 + ""));
      results.appendChild(minX1);
      org.w3c.dom.Element maxX2 = doc.createElement("TableBoundaryX2");
      maxX2.appendChild(doc.createTextNode(this.maxX2 + ""));
      results.appendChild(maxX2);
      org.w3c.dom.Element minY1 = doc.createElement("TableBoundaryY1");
      minY1.appendChild(doc.createTextNode(this.minY1 + ""));
      results.appendChild(minY1);
      org.w3c.dom.Element maxY2 = doc.createElement("TableBoundaryY2");
      maxY2.appendChild(doc.createTextNode(this.maxY2 + ""));
      results.appendChild(maxY2);

      org.w3c.dom.Element title1 = doc.createElement("title1");
      title1.appendChild(doc.createTextNode(name));
      results.appendChild(title1);
      org.w3c.dom.Element title2 = doc.createElement("title2");
      title2.appendChild(doc.createTextNode(titleAndHeaders.toString()));
      results.appendChild(title2);

      org.w3c.dom.Element columns = doc.createElement("columns");
      results.appendChild(columns);

      for (Column2 columnContent : dataInColumns) {
        org.w3c.dom.Element column = doc.createElement("column");
        column.appendChild(doc.createTextNode(columnContent.toString().replace("�", "")));
        columns.appendChild(column);
      }

      org.w3c.dom.Element lines = doc.createElement("lines");
      results.appendChild(lines);

      ArrayList<ArrayList<Cell>> table = recreateTableLines(semanticFramework);

      for (ArrayList<Cell> line : table) {
        org.w3c.dom.Element XMLLine = doc.createElement("line");
        XMLLine.appendChild(doc.createTextNode(line.toString().replace("�", "")));
        lines.appendChild(XMLLine);
      }

      /*System.out.println("table:" + table);

                  for(Line line : data){
                      if(line.getHighestY2()<=this.maxY2&&line.getLowestY1()>=this.minY1&&line.getClusterSize()>1){
                          org.w3c.dom.Element XMLLine = doc.createElement("line");
                          XMLLine.appendChild(doc.createTextNode(line.toString().replace("�", "")));
                          lines.appendChild(XMLLine);
                      }
                      else{
                          System.out.println("NO GOOD: " + line);
                      }
                  }
      */
      if (rowSpanners.size() > 0) {
        org.w3c.dom.Element rowSpanners = doc.createElement("rowSpanners");
        results.appendChild(rowSpanners);
        for (Line line : this.rowSpanners) {
          org.w3c.dom.Element rowSpanner = doc.createElement("rowSpanner");
          rowSpanner.appendChild(doc.createTextNode(line.toString()));
          rowSpanners.appendChild(rowSpanner);
        }
      }

      // Semantics:
      org.w3c.dom.Element semantics = doc.createElement("tableSemantics");
      rootElement.appendChild(semantics);
      org.w3c.dom.Element title = doc.createElement("title");
      title.appendChild(doc.createTextNode(semanticFramework.getTitle().toString()));
      semantics.appendChild(title);
      org.w3c.dom.Element titleConfidence = doc.createElement("titleConfidence");
      Double semanticFrameworkDouble = semanticFramework.getTitleConfidence();
      titleConfidence.appendChild(doc.createTextNode(semanticFrameworkDouble.toString()));
      semantics.appendChild(titleConfidence);

      if (!semanticFramework.getRowSpanners().isEmpty()) {
        org.w3c.dom.Element rowSpanners = doc.createElement("subHeaders");
        rowSpanners.appendChild(doc.createTextNode(semanticFramework.getRowSpanners().toString()));
        semantics.appendChild(rowSpanners);
        org.w3c.dom.Element IdentifiersConfidenceAlignment =
            doc.createElement("subHeadersConfidenceAlignment");
        IdentifiersConfidenceAlignment.appendChild(
            doc.createTextNode(semanticFramework.getIdentifiersConfidenceAlignment().toString()));
        semantics.appendChild(IdentifiersConfidenceAlignment);
        org.w3c.dom.Element getIdentifiersConfidenceColumnsSpanned =
            doc.createElement("subHeadersConfidenceColumnsSpanned");
        getIdentifiersConfidenceColumnsSpanned.appendChild(
            doc.createTextNode(
                semanticFramework.getIdentifiersConfidenceColumnsSpanned().toString()));
        semantics.appendChild(getIdentifiersConfidenceColumnsSpanned);
        org.w3c.dom.Element IdentifiersConfidenceLineDistance =
            doc.createElement("subHeadersConfidenceLineDistance");
        IdentifiersConfidenceLineDistance.appendChild(
            doc.createTextNode(
                semanticFramework.getIdentifiersConfidenceLineDistance().toString()));
        semantics.appendChild(IdentifiersConfidenceLineDistance);
      }
      if (!semanticFramework.getValidatedRowSpanners().isEmpty()) {
        org.w3c.dom.Element rowSpanners = doc.createElement("rowSpanners");
        rowSpanners.appendChild(
            doc.createTextNode(semanticFramework.getValidatedRowSpanners().toString()));
        semantics.appendChild(rowSpanners);
        org.w3c.dom.Element rowSpannersConfidenceAlignment =
            doc.createElement("rowSpannersConfidenceAlignment");
        rowSpannersConfidenceAlignment.appendChild(
            doc.createTextNode(semanticFramework.getRowSpannersConfidenceAlignment().toString()));
        semantics.appendChild(rowSpannersConfidenceAlignment);
        org.w3c.dom.Element rowSpannersConfidenceColumnsSpanned =
            doc.createElement("rowSpannersConfidenceColumnsSpanned");
        rowSpannersConfidenceColumnsSpanned.appendChild(
            doc.createTextNode(
                semanticFramework.getRowSpannersConfidenceColumnsSpanned().toString()));
        semantics.appendChild(rowSpannersConfidenceColumnsSpanned);
        org.w3c.dom.Element rowSpannersConfidenceLineDistance =
            doc.createElement("rowSpannersConfidenceLineDistance");
        rowSpannersConfidenceLineDistance.appendChild(
            doc.createTextNode(
                semanticFramework.getRowSpannersConfidenceLineDistance().toString()));
        semantics.appendChild(rowSpannersConfidenceLineDistance);
      }
      org.w3c.dom.Element headers = doc.createElement("headers");
      headers.appendChild(doc.createTextNode(semanticFramework.getHeaders().toString()));
      semantics.appendChild(headers);
      org.w3c.dom.Element headersConfidence = doc.createElement("headersConfidence");
      headersConfidence.appendChild(
          doc.createTextNode(semanticFramework.getHeaderConfidence().toString()));
      semantics.appendChild(headersConfidence);

      // validation:
      org.w3c.dom.Element validation = doc.createElement("validation");
      rootElement.appendChild(validation);

      org.w3c.dom.Element clusterCertainty = doc.createElement("columnConfidence");
      clusterCertainty.appendChild(
          doc.createTextNode(this.validation.getClusterCertainty().toString()));
      validation.appendChild(clusterCertainty);
      org.w3c.dom.Element mostFrequentNumberOfClusters =
          doc.createElement("mostFrequentNumberOfClusters");
      mostFrequentNumberOfClusters.appendChild(
          doc.createTextNode(this.validation.getMostFrequentNumberOfClusters() + ""));
      validation.appendChild(mostFrequentNumberOfClusters);
      org.w3c.dom.Element highestAmountOfClusters = doc.createElement("highestAmountOfClusters");
      highestAmountOfClusters.appendChild(
          doc.createTextNode(this.validation.getHighestAmountOfClusters() + ""));
      validation.appendChild(highestAmountOfClusters);
      org.w3c.dom.Element highestAmountOfClustersOccurrences =
          doc.createElement("highestAmountOfClustersOccurrences");
      highestAmountOfClustersOccurrences.appendChild(
          doc.createTextNode(this.validation.getHighestAmountOfClustersOccurrences() + ""));
      validation.appendChild(highestAmountOfClustersOccurrences);
      org.w3c.dom.Element clusterThreshold = doc.createElement("clusterThreshold");
      clusterThreshold.appendChild(doc.createTextNode(this.validation.getLineThreshold() + ""));
      validation.appendChild(clusterThreshold);
      org.w3c.dom.Element cellsWithMissingDataAdded =
          doc.createElement("cellsWithMissingDataAdded");
      cellsWithMissingDataAdded.appendChild(
          doc.createTextNode(this.validation.getCellsWithMissingDataAdded() + ""));
      validation.appendChild(cellsWithMissingDataAdded);
      if (this.validation.getCellsWithMissingDataAdded() > 0) {
        org.w3c.dom.Element cellsWithMissingDataAddedScores =
            doc.createElement("cellsWithMissingDataAddedScores");
        cellsWithMissingDataAddedScores.appendChild(
            doc.createTextNode(this.validation.getCellsWithMissingDataAddedObjects() + ""));
        validation.appendChild(cellsWithMissingDataAddedScores);
      }
      org.w3c.dom.Element averageDistanceBetweenRows =
          doc.createElement("averageDistanceBetweenRows");
      averageDistanceBetweenRows.appendChild(
          doc.createTextNode(this.validation.getAverageDistanceBetweenRows() + ""));
      validation.appendChild(averageDistanceBetweenRows);
      if (this.validation.getTitleConfidence().size() > 0) {
        org.w3c.dom.Element TitleConfidence = doc.createElement("TitleConfidence");
        TitleConfidence.appendChild(doc.createTextNode(this.validation.getTitleConfidence() + ""));
        validation.appendChild(TitleConfidence);
      }
      org.w3c.dom.Element falsePositive = doc.createElement("falsePositive");
      falsePositive.appendChild(doc.createTextNode(this.validation.getFalsePositive() + ""));
      validation.appendChild(falsePositive);

      TransformerFactory transformerFactory = TransformerFactory.newInstance();
      Transformer transformer = transformerFactory.newTransformer();
      DOMSource source = new DOMSource(doc);
      LOGGER.info(
          "Written file: "
              + location
              + "\\results\\"
              + file.getName().substring(0, file.getName().length() - 5)
              + "-"
              + tableID
              + ".xml");
      File file2 =
          new File(
              location
                  + "\\results\\"
                  + file.getName().substring(0, file.getName().length() - 5)
                  + "-"
                  + tableID
                  + ".xml");
      Writer output = new BufferedWriter(new FileWriter(file2));
      StreamResult result = new StreamResult(output);

      // Output to console for testing
      transformer.setOutputProperty(OutputKeys.INDENT, "yes");
      transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2");
      transformer.transform(source, result);
      output.close();
      System.out.println("File saved.");

    } catch (ParserConfigurationException pce) {
      pce.printStackTrace();
    } catch (TransformerConfigurationException e) {
      e.printStackTrace();
    } catch (TransformerException e) {
      e.printStackTrace();
    }
  }
Ejemplo n.º 3
0
  // This method will try to recreate the original lines of the table using all the previous
  // information about the table.
  private ArrayList<ArrayList<Cell>> recreateTableLines(SemanticFramework semanticFramework) {
    ArrayList<ArrayList<Cell>> table = new ArrayList<ArrayList<Cell>>();
    for (Column2 column : dataInColumns) {
      ArrayList<Cell> cells = column.getCellObjects();
      table.add(cells);
    }
    int maxLength = Integer.MIN_VALUE;
    for (ArrayList<Cell> column : table) {
      int length = column.size();
      if (length > maxLength) {
        maxLength = length;
      }
    }
    ArrayList<ArrayList<Cell>> newTable = new ArrayList<ArrayList<Cell>>();
    ArrayList<Column2> incompleteRowColumns = new ArrayList<Column2>();
    ArrayList<Cell> col = new ArrayList<Cell>();

    int counter = 0;
    while (counter < maxLength) {
      ArrayList<Cell> line = new ArrayList<Cell>();
      newTable.add(line);
      counter++;
    }

    for (Column2 column : dataInColumns) {
      if (column.getCellObjects().size() == maxLength) {
        col = column.getCellObjects();
        for (ArrayList<Cell> line : newTable) {
          if (line.size() >= dataInColumns.indexOf(column)) {
            line.add(dataInColumns.indexOf(column), col.get(newTable.indexOf(line)));
          } else {
            line.add(col.get(newTable.indexOf(line)));
          }
        }
      } else {
        incompleteRowColumns.add(column); // gonna be mapped
      }
    }
    for (Column2 column : incompleteRowColumns) {
      ArrayList<Cell> cells = column.getCellObjects();
      for (Cell cellOfNewColumn : cells) {
        System.out.println(cellOfNewColumn);
        boolean breaking = false;
        Cells:
        for (ArrayList<Cell> line : newTable) {
          for (Cell cell : line) {
            if (cell.getY1()
                >= cellOfNewColumn
                    .getY2()) { // ||(CommonMethods.calcDistance(cell.getY2(),
                                // cellOfNewColumn.getY1())>(averageLineDistance*verticalThresholdModifier))&&line.indexOf(cellOfNewColumn)!=0)
              System.out.println(
                  newTable.indexOf(line)
                      + " "
                      + cell
                      + " "
                      + cellOfNewColumn
                      + " "
                      + newTable.size()
                      + " "
                      + dataInColumns.indexOf(column)
                      + " "
                      + line.size());
              if (newTable.indexOf(line) != 0) {
                System.out.println(
                    newTable.get(newTable.indexOf(line) - 1).size()
                        + " "
                        + dataInColumns.indexOf(column));
                if (newTable.get(newTable.indexOf(line) - 1).size()
                    >= dataInColumns.indexOf(column)) {
                  newTable
                      .get(newTable.indexOf(line) - 1)
                      .add(dataInColumns.indexOf(column), cellOfNewColumn);
                  breaking = true;
                  break Cells;
                } else {
                  newTable.get(newTable.indexOf(line) - 1).add(cellOfNewColumn);
                  breaking = true;
                  break Cells;
                }
              } else {
                newTable.add(0, new ArrayList<Cell>());
                newTable.get(0).add(cellOfNewColumn);
                breaking = true;
                break Cells;
              }
            }
          }
        }
        if (!breaking) {
          System.out.println("wasn't breaking. Now it is!");
          System.out.println(
              newTable.size()
                  + " "
                  + dataInColumns.indexOf(column)
                  + " "
                  + newTable.get(newTable.size() - 1).size());
          System.out.println(dataInColumns.indexOf(column) + " " + (newTable.size() - 1));
          if (dataInColumns.indexOf(column) <= newTable.get(newTable.size() - 1).size() - 1) {
            newTable.get(newTable.size() - 1).add(dataInColumns.indexOf(column), cellOfNewColumn);
            breaking = true;
          } else {
            newTable.get(newTable.size() - 1).add(cellOfNewColumn);
            breaking = true;
          }
        }
      }
    }

    // TODO: Implement missing data by looking at the touchcolumn of the rowspan and add all the
    // columns from data in columns as empty cells.
    double totalY2 = 0;
    double averageY2;
    for (Line rowSpanner : semanticFramework.getValidatedRowSpanners()) {
      for (ArrayList<Cell> line : newTable) {
        for (Cell cell : line) {
          totalY2 = totalY2 + cell.getY2();
        }
        averageY2 = totalY2 / line.size();
        if (rowSpanner.getAverageY1() < averageY2) {
          newTable.add(newTable.indexOf(line), new ArrayList<Cell>());
          newTable.get(newTable.indexOf(line) - 1).add(rowSpanner.getCellObject());
          break;
        }
      }
    }

    for (Line subHeader : semanticFramework.getRowSpanners()) {
      for (ArrayList<Cell> line : newTable) {
        for (Cell cell : line) {
          totalY2 = totalY2 + cell.getY2();
        }
        int firstMappedColumn = 0;
        averageY2 = totalY2 / line.size();
        if (subHeader.getAverageY1() < averageY2) {
          newTable.add(newTable.indexOf(line), new ArrayList<Cell>());
          for (Column2 column : dataInColumns) {
            newTable.get(newTable.indexOf(line) - 1).add(new Cell());
            if (column.touchesColumn(subHeader.getFirstX1(), subHeader.getLastX2())) {
              firstMappedColumn = dataInColumns.indexOf(column);
            }
          }
          newTable
              .get(newTable.indexOf(line) - 1)
              .set(firstMappedColumn, subHeader.getCellObject());
          break;
        }
      }
    }

    System.out.println("NEWTABLE:");
    System.out.println(newTable);
    return newTable;
  }