Ejemplo n.º 1
0
 /**
  * Removes from {@code data} every line whose first-word Y1 or last-word Y1 is smaller than
  * {@code maxY1}. The original author treats such a line as lying above the line detected in the
  * title; each offending line is reported through the logger before it is removed.
  *
  * <p>Offenders are gathered first and removed in a second pass, so {@code data} is never modified
  * while it is being iterated.
  */
 private void filterLinesThatAreAboveY1() {
   ArrayList<Line> linesAboveTitle = new ArrayList<Line>();

   // Pass 1: report and remember every line that starts or ends above maxY1.
   for (Line candidate : data) {
     boolean aboveTitle =
         maxY1 > candidate.getY1OfFirstWord() || maxY1 > candidate.getY1OfLastWord();
     if (!aboveTitle) {
       continue;
     }
     LOGGER.info(
         "Something is wrong, I detected the following line, which was above the title!");
     LOGGER.info(candidate.toString() + " " + maxY1 + " " + candidate.getY1OfFirstWord());
     linesAboveTitle.add(candidate);
   }

   // Pass 2: drop the remembered lines from the table data.
   for (int i = 0; i < linesAboveTitle.size(); i++) {
     data.remove(linesAboveTitle.get(i));
   }
 }
Ejemplo n.º 2
0
  /**
   * Builds an XML document describing this table and writes it to disk.
   *
   * <p>The document has the root element {@code TEAFile} with four children, in this order:
   * {@code provenance} (where the table came from and the thresholds used), {@code results}
   * (boundaries, titles, columns, lines and row spanners), {@code tableSemantics} (values read from
   * {@code semanticFramework}) and {@code validation} (values read from this object's validation
   * data).
   *
   * <p>The output file is {@code location + "\results\" + <file name without its last five
   * characters> + "-" + tableID + ".xml"}. It is written as UTF-8, matching the encoding announced
   * in the XML declaration.
   *
   * <p>Note: the vertical threshold is emitted as {@code verticalThresholdModifier}. Earlier
   * versions emitted it under a second {@code horizontalThresholdModifier} element by mistake.
   *
   * @param location The directory that contains the {@code results} folder the file is written to.
   * @param file The file that was being used to create this table.
   * @param tableID The ID of the table.
   * @param semanticFramework The semantic framework object. This contains get methods that are
   *     required for the output File.
   * @throws IOException If the output file cannot be opened, written or closed (for example
   *     because the given location doesn't exist).
   */
  private void write2(String location, File file, int tableID, SemanticFramework semanticFramework)
      throws IOException {
    try {
      DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
      DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
      Document doc = docBuilder.newDocument();

      org.w3c.dom.Element rootElement = doc.createElement("TEAFile");
      doc.appendChild(rootElement);

      // provenance:
      org.w3c.dom.Element provenance = doc.createElement("provenance");
      rootElement.appendChild(provenance);
      appendTextElement(doc, provenance, "DetectionID", tableID + "");
      appendTextElement(doc, provenance, "fromFile", file.getName());
      appendTextElement(doc, provenance, "fromPath", file.getAbsolutePath());
      appendTextElement(doc, provenance, "fromPage", Integer.toString(pageNumber));
      appendTextElement(
          doc, provenance, "horizontalThresholdModifier", this.horizontalThresholdModifier + "");
      // Previously written under the tag "horizontalThresholdModifier" (copy-paste slip).
      appendTextElement(
          doc, provenance, "verticalThresholdModifier", this.verticalThresholdModifier + "");

      // Results:
      org.w3c.dom.Element results = doc.createElement("results");
      rootElement.appendChild(results);
      appendTextElement(doc, results, "TableBoundaryX1", this.minX1 + "");
      appendTextElement(doc, results, "TableBoundaryX2", this.maxX2 + "");
      appendTextElement(doc, results, "TableBoundaryY1", this.minY1 + "");
      appendTextElement(doc, results, "TableBoundaryY2", this.maxY2 + "");
      appendTextElement(doc, results, "title1", name);
      appendTextElement(doc, results, "title2", titleAndHeaders.toString());

      org.w3c.dom.Element columns = doc.createElement("columns");
      results.appendChild(columns);
      for (Column2 columnContent : dataInColumns) {
        // The replacement character is stripped from the text before it is stored.
        appendTextElement(doc, columns, "column", columnContent.toString().replace("�", ""));
      }

      org.w3c.dom.Element lines = doc.createElement("lines");
      results.appendChild(lines);
      ArrayList<ArrayList<Cell>> table = recreateTableLines(semanticFramework);
      for (ArrayList<Cell> line : table) {
        appendTextElement(doc, lines, "line", line.toString().replace("�", ""));
      }

      // The rowSpanners container is only emitted when there is at least one row spanner.
      if (this.rowSpanners.size() > 0) {
        org.w3c.dom.Element rowSpannersElement = doc.createElement("rowSpanners");
        results.appendChild(rowSpannersElement);
        for (Line line : this.rowSpanners) {
          appendTextElement(doc, rowSpannersElement, "rowSpanner", line.toString());
        }
      }

      // Semantics:
      org.w3c.dom.Element semantics = doc.createElement("tableSemantics");
      rootElement.appendChild(semantics);
      appendTextElement(doc, semantics, "title", semanticFramework.getTitle().toString());
      Double titleConfidence = semanticFramework.getTitleConfidence();
      appendTextElement(doc, semantics, "titleConfidence", titleConfidence.toString());

      if (!semanticFramework.getRowSpanners().isEmpty()) {
        appendTextElement(
            doc, semantics, "subHeaders", semanticFramework.getRowSpanners().toString());
        appendTextElement(
            doc,
            semantics,
            "subHeadersConfidenceAlignment",
            semanticFramework.getIdentifiersConfidenceAlignment().toString());
        appendTextElement(
            doc,
            semantics,
            "subHeadersConfidenceColumnsSpanned",
            semanticFramework.getIdentifiersConfidenceColumnsSpanned().toString());
        appendTextElement(
            doc,
            semantics,
            "subHeadersConfidenceLineDistance",
            semanticFramework.getIdentifiersConfidenceLineDistance().toString());
      }
      if (!semanticFramework.getValidatedRowSpanners().isEmpty()) {
        appendTextElement(
            doc, semantics, "rowSpanners", semanticFramework.getValidatedRowSpanners().toString());
        appendTextElement(
            doc,
            semantics,
            "rowSpannersConfidenceAlignment",
            semanticFramework.getRowSpannersConfidenceAlignment().toString());
        appendTextElement(
            doc,
            semantics,
            "rowSpannersConfidenceColumnsSpanned",
            semanticFramework.getRowSpannersConfidenceColumnsSpanned().toString());
        appendTextElement(
            doc,
            semantics,
            "rowSpannersConfidenceLineDistance",
            semanticFramework.getRowSpannersConfidenceLineDistance().toString());
      }
      appendTextElement(doc, semantics, "headers", semanticFramework.getHeaders().toString());
      appendTextElement(
          doc, semantics, "headersConfidence", semanticFramework.getHeaderConfidence().toString());

      // validation:
      org.w3c.dom.Element validationElement = doc.createElement("validation");
      rootElement.appendChild(validationElement);
      appendTextElement(
          doc,
          validationElement,
          "columnConfidence",
          this.validation.getClusterCertainty().toString());
      appendTextElement(
          doc,
          validationElement,
          "mostFrequentNumberOfClusters",
          this.validation.getMostFrequentNumberOfClusters() + "");
      appendTextElement(
          doc,
          validationElement,
          "highestAmountOfClusters",
          this.validation.getHighestAmountOfClusters() + "");
      appendTextElement(
          doc,
          validationElement,
          "highestAmountOfClustersOccurrences",
          this.validation.getHighestAmountOfClustersOccurrences() + "");
      appendTextElement(
          doc, validationElement, "clusterThreshold", this.validation.getLineThreshold() + "");
      appendTextElement(
          doc,
          validationElement,
          "cellsWithMissingDataAdded",
          this.validation.getCellsWithMissingDataAdded() + "");
      if (this.validation.getCellsWithMissingDataAdded() > 0) {
        appendTextElement(
            doc,
            validationElement,
            "cellsWithMissingDataAddedScores",
            this.validation.getCellsWithMissingDataAddedObjects() + "");
      }
      appendTextElement(
          doc,
          validationElement,
          "averageDistanceBetweenRows",
          this.validation.getAverageDistanceBetweenRows() + "");
      if (this.validation.getTitleConfidence().size() > 0) {
        appendTextElement(
            doc, validationElement, "TitleConfidence", this.validation.getTitleConfidence() + "");
      }
      appendTextElement(
          doc, validationElement, "falsePositive", this.validation.getFalsePositive() + "");

      // Serialization settings: indented output, encoded as UTF-8.
      TransformerFactory transformerFactory = TransformerFactory.newInstance();
      Transformer transformer = transformerFactory.newTransformer();
      transformer.setOutputProperty(OutputKeys.INDENT, "yes");
      transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2");
      transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");

      // NOTE(review): the last five characters of the source file name are dropped, which
      // presumably strips a five-character extension such as ".html" - confirm against callers.
      // A name shorter than five characters makes substring throw.
      // NOTE(review): the backslash separators are Windows-specific; on other platforms they
      // become part of the file name - confirm whether this code must be portable.
      String sourceName = file.getName();
      String outputPath =
          location
              + "\\results\\"
              + sourceName.substring(0, sourceName.length() - 5)
              + "-"
              + tableID
              + ".xml";
      File file2 = new File(outputPath);

      // Write the bytes as UTF-8 so they agree with the encoding in the XML declaration; a plain
      // FileWriter would use the platform default charset instead.
      Writer output =
          new BufferedWriter(
              new java.io.OutputStreamWriter(new java.io.FileOutputStream(file2), "UTF-8"));
      try {
        transformer.transform(new DOMSource(doc), new StreamResult(output));
      } finally {
        // Always release the file handle, also when the transformation fails.
        output.close();
      }
      LOGGER.info("Written file: " + outputPath);
      System.out.println("File saved.");

    } catch (ParserConfigurationException pce) {
      // NOTE(review): XML set-up failures are only printed; the caller is not notified.
      pce.printStackTrace();
    } catch (TransformerException e) {
      // Also covers TransformerConfigurationException, which is a subclass.
      e.printStackTrace();
    }
  }

  /**
   * Creates an element named {@code tagName} holding {@code text} as its only text node and
   * appends it to {@code parent}.
   *
   * @param doc The document used to create the nodes.
   * @param parent The element that receives the new child.
   * @param tagName The tag name of the new element.
   * @param text The text content of the new element.
   * @return The newly created child element.
   */
  private org.w3c.dom.Element appendTextElement(
      Document doc, org.w3c.dom.Element parent, String tagName, String text) {
    org.w3c.dom.Element child = doc.createElement(tagName);
    child.appendChild(doc.createTextNode(text));
    parent.appendChild(child);
    return child;
  }