Esempio n. 1
0
  /**
   * Pads sparse columns with blank cells. According to the original author this is required for
   * the successful extraction of most matrix-style tables, which contain many empty cells.
   *
   * <p>For every line in {@code data} the columns in {@code dataInColumns} are scanned in order. A
   * column that already has a cell on that line is left alone. If a column reaches a cell with a
   * greater line number first, a single-space placeholder cell is added to that column and the
   * scan for the current line stops; the remaining columns are not examined for that line. A
   * column whose cells all carry smaller line numbers receives no placeholder.
   *
   * <p>NOTE(review): stopping after the first padded column per line looks at odds with the goal
   * of filling every gap; confirm against the caller before changing it.
   */
  private void fillBlankCells() {
    for (Line row : data) {
      int rowNumber = row.getLineNumber();
      for (Column2 column : dataInColumns) {
        if (padIfLineMissing(column, row, rowNumber)) {
          // One placeholder was added; the original logic abandons this line's column scan here.
          break;
        }
      }
    }
  }

  /**
   * Adds a placeholder cell to {@code column} when its cells skip past {@code rowNumber}.
   *
   * @return {@code true} if a placeholder was added, {@code false} if the column already has a
   *     cell on that line or never reaches a later line
   */
  private boolean padIfLineMissing(Column2 column, Line row, int rowNumber) {
    for (Cell cell : column.getCellObjects()) {
      if (cell.getLineNumber() == rowNumber) {
        return false;
      }
      if (cell.getLineNumber() > rowNumber) {
        // Presumably cells are ordered by line number, so a later line means this one is missing.
        column.addCell(buildBlankCell(column, row));
        return true;
      }
    }
    return false;
  }

  /**
   * Builds a one-element cell holding a {@code span} with class {@code ocrx_word}, a single space
   * as text, and a {@code bbox} title made from the column's average X bounds and the line's
   * average Y bounds (Y values truncated to int), e.g.
   * {@code <span class='ocrx_word' id='word_9' title="bbox 2175 514 2346 555">were</span>}.
   */
  private ArrayList<Element> buildBlankCell(Column2 column, Line row) {
    Tag spanTag = Tag.valueOf("span");
    Attributes attrs = new Attributes();
    attrs.put("class", "ocrx_word");
    attrs.put("id", "word_ADDEDBYTEA");
    String bbox =
        "bbox "
            + column.getAverageX1()
            + " "
            + (int) row.getAverageY1()
            + " "
            + column.getAverageX2()
            + " "
            + (int) row.getAverageY2();
    attrs.put("title", bbox);

    Element blank = new Element(spanTag, "localhost:8080", attrs);
    blank.text(" ");

    ArrayList<Element> placeholder = new ArrayList<Element>();
    placeholder.add(blank);
    return placeholder;
  }
Esempio n. 2
0
  /**
   * Tries to recreate the original lines (rows) of the table from the column data and the row
   * spanners known to the semantic framework.
   *
   * <p>Steps:
   *
   * <ol>
   *   <li>Create one row per cell of the longest column in {@code dataInColumns}.
   *   <li>Copy every column of that maximum length into the rows position by position.
   *   <li>Map each cell of a shorter column onto the row just above the first row containing a
   *       cell whose Y1 is at or below the new cell's Y2; a cell that precedes the first row gets
   *       a new row at the top, and a cell that precedes no row is added to the last row.
   *   <li>Insert a row for each validated row spanner, and then for each row spanner (sub-header),
   *       in front of the first row whose average Y2 lies below the spanner's average Y1.
   * </ol>
   *
   * <p>Rows are addressed by index rather than looked up with {@code indexOf}, and the average Y2
   * is computed per row; the previous running total was never reset, which skewed every
   * comparison after the first row.
   *
   * @param semanticFramework source of the validated row spanners and the row spanners
   * @return the reconstructed table as a list of rows, each a list of cells
   */
  private ArrayList<ArrayList<Cell>> recreateTableLines(SemanticFramework semanticFramework) {
    // The longest column determines how many rows the table starts with.
    int maxLength = Integer.MIN_VALUE;
    for (Column2 column : dataInColumns) {
      maxLength = Math.max(maxLength, column.getCellObjects().size());
    }

    ArrayList<ArrayList<Cell>> newTable = new ArrayList<ArrayList<Cell>>();
    for (int row = 0; row < maxLength; row++) {
      newTable.add(new ArrayList<Cell>());
    }

    // Full-length columns map one-to-one onto the rows; shorter ones are mapped by position below.
    ArrayList<Column2> incompleteRowColumns = new ArrayList<Column2>();
    for (Column2 column : dataInColumns) {
      ArrayList<Cell> cells = column.getCellObjects();
      if (cells.size() != maxLength) {
        incompleteRowColumns.add(column); // gonna be mapped
        continue;
      }
      int columnIndex = dataInColumns.indexOf(column);
      for (int lineIndex = 0; lineIndex < newTable.size(); lineIndex++) {
        ArrayList<Cell> line = newTable.get(lineIndex);
        if (line.size() >= columnIndex) {
          line.add(columnIndex, cells.get(lineIndex));
        } else {
          line.add(cells.get(lineIndex));
        }
      }
    }

    for (Column2 column : incompleteRowColumns) {
      int columnIndex = dataInColumns.indexOf(column);
      for (Cell cellOfNewColumn : column.getCellObjects()) {
        System.out.println(cellOfNewColumn);
        boolean placed = false;
        Cells:
        for (int lineIndex = 0; lineIndex < newTable.size(); lineIndex++) {
          ArrayList<Cell> line = newTable.get(lineIndex);
          for (Cell cell : line) {
            if (cell.getY1() < cellOfNewColumn.getY2()) {
              continue;
            }
            // This row starts at or below the new cell's bottom edge, so the new cell belongs to
            // the row before it.
            System.out.println(
                lineIndex
                    + " "
                    + cell
                    + " "
                    + cellOfNewColumn
                    + " "
                    + newTable.size()
                    + " "
                    + columnIndex
                    + " "
                    + line.size());
            if (lineIndex != 0) {
              ArrayList<Cell> previousLine = newTable.get(lineIndex - 1);
              System.out.println(previousLine.size() + " " + columnIndex);
              if (previousLine.size() >= columnIndex) {
                previousLine.add(columnIndex, cellOfNewColumn);
              } else {
                previousLine.add(cellOfNewColumn);
              }
            } else {
              // No row above the first one: open a new row at the top of the table.
              newTable.add(0, new ArrayList<Cell>());
              newTable.get(0).add(cellOfNewColumn);
            }
            placed = true;
            break Cells;
          }
        }
        if (!placed) {
          // No row lies below the new cell, so it goes into the last row.
          System.out.println("wasn't breaking. Now it is!");
          ArrayList<Cell> lastLine = newTable.get(newTable.size() - 1);
          System.out.println(newTable.size() + " " + columnIndex + " " + lastLine.size());
          System.out.println(columnIndex + " " + (newTable.size() - 1));
          if (columnIndex <= lastLine.size() - 1) {
            lastLine.add(columnIndex, cellOfNewColumn);
          } else {
            lastLine.add(cellOfNewColumn);
          }
        }
      }
    }

    // TODO: Implement missing data by looking at the touchcolumn of the rowspan and add all the
    // columns from data in columns as empty cells.
    for (Line rowSpanner : semanticFramework.getValidatedRowSpanners()) {
      for (int lineIndex = 0; lineIndex < newTable.size(); lineIndex++) {
        double averageY2 = averageY2OfLine(newTable.get(lineIndex));
        if (rowSpanner.getAverageY1() < averageY2) {
          ArrayList<Cell> spannerLine = new ArrayList<Cell>();
          newTable.add(lineIndex, spannerLine);
          spannerLine.add(rowSpanner.getCellObject());
          break;
        }
      }
    }

    for (Line subHeader : semanticFramework.getRowSpanners()) {
      for (int lineIndex = 0; lineIndex < newTable.size(); lineIndex++) {
        double averageY2 = averageY2OfLine(newTable.get(lineIndex));
        if (subHeader.getAverageY1() < averageY2) {
          ArrayList<Cell> subHeaderLine = new ArrayList<Cell>();
          newTable.add(lineIndex, subHeaderLine);
          // Fill the new row with one empty cell per column, then drop the sub-header into the
          // column it touches.
          // NOTE(review): despite its name this ends up as the LAST touching column, because
          // every match overwrites the previous one; confirm which is intended.
          int firstMappedColumn = 0;
          for (Column2 column : dataInColumns) {
            subHeaderLine.add(new Cell());
            if (column.touchesColumn(subHeader.getFirstX1(), subHeader.getLastX2())) {
              firstMappedColumn = dataInColumns.indexOf(column);
            }
          }
          subHeaderLine.set(firstMappedColumn, subHeader.getCellObject());
          break;
        }
      }
    }

    System.out.println("NEWTABLE:");
    System.out.println(newTable);
    return newTable;
  }

  /**
   * Returns the mean Y2 of the cells in {@code line}. For an empty line the result is NaN, which
   * compares false against any value and therefore never triggers an insertion.
   */
  private double averageY2OfLine(ArrayList<Cell> line) {
    double totalY2 = 0;
    for (Cell cell : line) {
      totalY2 = totalY2 + cell.getY2();
    }
    return totalY2 / line.size();
  }