/**
 * Strips the leading run of header lines from a message and collects them into a {@link Header}.
 *
 * <p>Lines are examined from the front of {@code remainingMessage}. Each line whose text matches
 * {@code PATTERN_HEADER_LINE} (and for which the matcher reports exactly two capture groups) is
 * removed from the list and recorded as a header: group 1, trimmed, is the name; group 2,
 * trimmed and passed through {@link Header#splitValuesByComma}, is the value list. Scanning
 * stops at the first line that does not qualify; that line and everything after it stay in the
 * list.
 *
 * @param remainingMessage the not-yet-consumed lines of the message; modified in place, so its
 *     iterator must support {@code remove()}
 * @return the collected headers, constructed with the line number of the first line of {@code
 *     remainingMessage}, or with {@code 0} when the list is empty
 */
public static Header consumeHeaders(List<Line> remainingMessage) {
  final int firstLineNumber;
  if (remainingMessage.size() == 0) {
    firstLineNumber = 0;
  } else {
    firstLineNumber = remainingMessage.get(0).getLineNumber();
  }
  final Header collected = new Header(firstLineNumber);

  for (Iterator<Line> cursor = remainingMessage.iterator(); cursor.hasNext(); ) {
    final Line candidate = cursor.next();
    final Matcher match = PATTERN_HEADER_LINE.matcher(candidate.toString());

    // The first line that is not a well-formed header ends the header section.
    if (!match.matches() || match.groupCount() != 2) {
      break;
    }

    // The line is taken out of the list before its groups are read.
    cursor.remove();

    collected.addHeader(
        match.group(1).trim(),
        Header.splitValuesByComma(match.group(2).trim()),
        candidate.getLineNumber());
  }

  return collected;
}
 /**
  * Returns a copy of the line reduced to capture group 1 of {@code PATTERN_LAST_CRLF}, or the
  * line itself when its text does not match that pattern.
  *
  * <p>Judging by the names, the pattern presumably isolates everything before a trailing CR/LF
  * sequence; the pattern is defined elsewhere, so verify against its definition.
  *
  * @param line the line to inspect; never modified
  * @return a new {@code Line} carrying group 1 and the original line number on a match,
  *     otherwise the very same {@code line} instance
  */
 public static Line removeEndingCRLF(final Line line) {
   final Matcher trailing = PATTERN_LAST_CRLF.matcher(line.toString());
   if (!trailing.matches()) {
     return line;
   }
   return new Line(trailing.group(1), line.getLineNumber());
 }
Beispiel #3
0
  /**
   * Pads columns with blank cells so that tables with many empty cells can still be extracted.
   * This is required for the successful extraction of most matrix tables.
   *
   * <p>Every cell carries a line number. For each line in {@code data} the columns in {@code
   * dataInColumns} are visited in order, and a column that has no cell on that line (but does
   * have a cell on a later line) receives a synthetic cell containing a single space.
   *
   * <p>NOTE(review): after a blank cell is appended, the remaining columns are skipped for the
   * current line, so at most one blank cell is added per line per invocation. Confirm that this
   * is intended rather than continuing with the next column.
   */
  private void fillBlankCells() {
    for (Line row : data) {
      final int rowNumber = row.getLineNumber();
      for (Column2 column : dataInColumns) {
        if (addBlankCellIfMissing(column, row, rowNumber)) {
          // A blank cell was appended: stop looking at further columns for this line.
          break;
        }
      }
    }
  }

  /**
   * Scans the cells of {@code column} in order and appends a blank cell when a cell belonging to
   * a later line is reached before any cell on line {@code rowNumber}.
   *
   * @return {@code true} if a blank cell was appended; {@code false} if the column already has a
   *     cell on that line or no cell on a later line was encountered
   */
  private boolean addBlankCellIfMissing(Column2 column, Line row, int rowNumber) {
    for (Cell existing : column.getCellObjects()) {
      if (existing.getLineNumber() == rowNumber) {
        // The column is already populated on this line.
        return false;
      }
      if (existing.getLineNumber() > rowNumber) {
        column.addCell(createBlankCell(column, row));
        return true;
      }
    }
    return false;
  }

  /**
   * Builds a one-element cell that mimics an OCR word such as
   * {@code <span class='ocrx_word' id='word_9' title="bbox 2175 514 2346 555">were</span>},
   * spanning the column's average X range and the line's average Y range (Y values truncated to
   * {@code int}) and containing a single space as text.
   */
  private ArrayList<Element> createBlankCell(Column2 column, Line row) {
    final Tag spanTag = Tag.valueOf("span");

    final Attributes attrs = new Attributes();
    attrs.put("class", "ocrx_word");
    attrs.put("id", "word_ADDEDBYTEA");
    final String boundingBox =
        "bbox "
            + column.getAverageX1()
            + " "
            + (int) row.getAverageY1()
            + " "
            + column.getAverageX2()
            + " "
            + (int) row.getAverageY2();
    attrs.put("title", boundingBox);

    final Element blank = new Element(spanTag, "localhost:8080", attrs);
    blank.text(" ");

    final ArrayList<Element> blankCell = new ArrayList<Element>();
    blankCell.add(blank);
    return blankCell;
  }
Beispiel #4
0
 /**
  * This method adds lines with missing partitions to the current columns. It loops through lines
  * that were flagged for containing missing data and then adds their clusters to the columns
  * they fit in (or to columns that fit in the partition). If this fails it will also check
  * whether the partition merely touches a column, although this yields a lower score.
  *
  * <p>Every cluster of every line in {@code linesWithMissingData} is compared against every
  * column in {@code dataInColumns}; all columns are always checked, so one cluster may be added
  * to several columns. For each match the cluster is appended to the column and a {@link Cell}
  * carrying the match score (see {@link #scoreClusterAgainstColumn}) and the line number is
  * recorded. The recorded cells are finally handed to {@code validation}, once as a count and
  * once as the list itself.
  */
 private void addLinesWithMissingDataToColumns() {
   ArrayList<Cell> cellsWithMissingDataAdded = new ArrayList<Cell>();
   for (Line line : linesWithMissingData) {
     ArrayList<ArrayList<Element>> clusters = line.getClusters();
     for (ArrayList<Element> cluster : clusters) {
       for (Column2 column : dataInColumns) {
         int score = scoreClusterAgainstColumn(column, cluster);
         if (score > 0) {
           column.addCell(cluster);
           cellsWithMissingDataAdded.add(new Cell(cluster, score, line.getLineNumber()));
         }
       }
     }
   }
   validation.setCellsWithMissingDataAdded(cellsWithMissingDataAdded.size());
   validation.setCellsWithMissingDataAddedScores(cellsWithMissingDataAdded);
 }

 /**
  * Rates how well a cluster's horizontal extent matches a column.
  *
  * @param column the column to test against
  * @param cluster the cluster whose X range (via {@code Line.getClusterX1/X2}) is tested
  * @return 3 if both {@code fitsInColumn} and {@code columnFitsIn} hold, 2 if exactly one of
  *     them holds, 1 if neither holds but {@code touchesColumn} does, and 0 if there is no match
  */
 private int scoreClusterAgainstColumn(Column2 column, ArrayList<Element> cluster) {
   boolean fitsInColumn =
       column.fitsInColumn(Line.getClusterX1(cluster), Line.getClusterX2(cluster));
   boolean columnFitsIn =
       column.columnFitsIn(Line.getClusterX1(cluster), Line.getClusterX2(cluster));
   if (fitsInColumn && columnFitsIn) {
     return 3;
   }
   if (fitsInColumn || columnFitsIn) {
     return 2;
   }
   // Weakest match: only consulted when neither containment test succeeded.
   if (column.touchesColumn(Line.getClusterX1(cluster), Line.getClusterX2(cluster))) {
     return 1;
   }
   return 0;
 }
Beispiel #5
0
  /**
   * Creates a {@link Cell} object for every entry in {@code cells}, tagging each one with the
   * number of the line it is positioned on.
   *
   * <p>The vertical position of an entry is read from the {@code title} attribute of its first
   * element: the attribute is split on whitespace and the fifth token is parsed as an integer.
   * For a title of the form {@code bbox x1 y1 x2 y2} that token is {@code y2}.
   *
   * @return a new list with one cell object per entry in {@code cells}, in the same order
   */
  public ArrayList<Cell> getCellObjects() {
    final ArrayList<Cell> result = new ArrayList<Cell>();
    for (ArrayList<Element> elements : cells) {
      final String[] titleTokens = elements.get(0).attr("title").split("\\s+");
      final int cellY = Integer.parseInt(titleTokens[4]);
      result.add(new Cell(elements, findLineNumber(cellY)));
    }
    return result;
  }

  /**
   * Walks {@code data} in order and returns the number of the last line visited before the first
   * line whose average Y1 is greater than {@code y}.
   *
   * @param y the vertical coordinate to locate
   * @return the matching line number; {@code 0} if {@code data} is empty or already the first
   *     line lies below {@code y}
   */
  private int findLineNumber(int y) {
    int lineNumber = 0;
    for (Line line : data) {
      if (line.getAverageY1() > y) {
        return lineNumber;
      }
      lineNumber = line.getLineNumber();
    }
    return lineNumber;
  }