/**
 * Consumes the leading header lines of {@code remainingMessage} and collects them in a
 * {@link Header}.
 *
 * <p>Lines are read from the front of the list for as long as they match
 * {@code PATTERN_HEADER_LINE} with exactly two capture groups. Each such line is removed from
 * the list; its first group (trimmed) is used as the header name and its second group (trimmed)
 * is split by comma into the header values. The first line that does not match ends the scan
 * and stays in the list together with everything after it.
 *
 * @param remainingMessage the lines still to be parsed; consumed header lines are removed
 * @return the collected headers, created with the line number of the first line of the list, or
 *     with 0 if the list is empty
 */
public static Header consumeHeaders(List<Line> remainingMessage) {
  int firstLineNumber = 0;
  if (remainingMessage.size() != 0) {
    firstLineNumber = remainingMessage.get(0).getLineNumber();
  }
  final Header collected = new Header(firstLineNumber);

  for (final Iterator<Line> lines = remainingMessage.iterator(); lines.hasNext(); ) {
    final Line candidate = lines.next();
    final Matcher matcher = PATTERN_HEADER_LINE.matcher(candidate.toString());
    if (!matcher.matches() || matcher.groupCount() != 2) {
      // First non-header line: leave it (and the rest) in the list.
      break;
    }

    lines.remove();
    final String name = matcher.group(1).trim();
    final String value = matcher.group(2).trim();
    collected.addHeader(name, Header.splitValuesByComma(value), candidate.getLineNumber());
  }

  return collected;
}
/**
 * Returns {@code line} with the part matched by the first capture group of
 * {@code PATTERN_LAST_CRLF}, or the line itself if the pattern does not match.
 *
 * <p>NOTE(review): the pattern is defined elsewhere; judging by its name, group 1 is presumably
 * the text without its trailing line break - confirm against the pattern definition.
 *
 * @param line the line to process
 * @return a new {@code Line} with the same line number built from group 1 when the whole text
 *     of {@code line} matches the pattern; otherwise the given instance unchanged
 */
public static Line removeEndingCRLF(final Line line) {
  final Matcher crlfMatcher = PATTERN_LAST_CRLF.matcher(line.toString());
  if (!crlfMatcher.matches()) {
    return line;
  }
  return new Line(crlfMatcher.group(1), line.getLineNumber());
}
/** * This method is used for extraction of tables with lot of empty cells in it. It is required for * the successful extraction of most Matrix tables. */ private void fillBlankCells() { // We say: cells get a line number. If a column does not contain a cell on a certain line, add a // whitespace. // Any cell that is not filled must be empty: for (Line line : data) { int lineNumber = line.getLineNumber(); COLUMNLOOP: for (Column2 column : dataInColumns) { for (Cell cell : column.getCellObjects()) { if (cell.getLineNumber() == lineNumber) { break; } if (cell.getLineNumber() > line.getLineNumber()) { // the last cell? // Add a blank cell to this column. // System.out.println("Add line to :" + column + " in line: " + // line.getLineNumber()); // <span class='ocrx_word' id='word_9' title="bbox 2175 514 2346 555">were</span> Tag t = Tag.valueOf("span"); Attributes attributes = new Attributes(); attributes.put("class", "ocrx_word"); attributes.put("id", "word_ADDEDBYTEA"); attributes.put( "title", "bbox " + column.getAverageX1() + " " + (int) line.getAverageY1() + " " + column.getAverageX2() + " " + (int) line.getAverageY2()); Element newElement = new Element(t, "localhost:8080", attributes); newElement.text(" "); ArrayList<Element> newCell = new ArrayList<Element>(); newCell.add(newElement); // System.out.println("adding: " +newElement.text()); column.addCell(newCell); break COLUMNLOOP; } } } } }
/** * This method adds lines with missing partitions to the current columns. It loops trough lines * that were flagged for containing missing data and then adds the ones to columns that they fit * in (or to columns that fit in the partition). If this fails it will also try to check if the * partition merely touches a column, although this will return a lower validation score if * successful. */ private void addLinesWithMissingDataToColumns() { ArrayList<Cell> cellsWithMissingDataAdded = new ArrayList<Cell>(); ArrayList<Column2> newDataInColumns = new ArrayList<Column2>(); for (Line line : linesWithMissingData) { ArrayList<ArrayList<Element>> clusters = line.getClusters(); for (ArrayList<Element> cluster : clusters) { for (Column2 column : dataInColumns) { if (column.fitsInColumn(Line.getClusterX1(cluster), Line.getClusterX2(cluster)) && column.columnFitsIn(Line.getClusterX1(cluster), Line.getClusterX2(cluster))) { // then we need to add this cluster to that column: newDataInColumns.remove(column); column.addCell(cluster); newDataInColumns.add(column); Cell cell = new Cell(cluster, 3, line.getLineNumber()); cellsWithMissingDataAdded.add(cell); } else if (column.fitsInColumn(Line.getClusterX1(cluster), Line.getClusterX2(cluster)) || column.columnFitsIn(Line.getClusterX1(cluster), Line.getClusterX2(cluster))) { // then we need to add this cluster to that column: newDataInColumns.remove(column); column.addCell(cluster); newDataInColumns.add(column); Cell cell = new Cell(cluster, 2, line.getLineNumber()); cellsWithMissingDataAdded.add(cell); } else if (column.touchesColumn(Line.getClusterX1(cluster), Line.getClusterX2(cluster))) { // System.out.println("CLUSTER: " + cluster + " touches: " + // column); newDataInColumns.remove(column); column.addCell(cluster); newDataInColumns.add(column); Cell cell = new Cell(cluster, 1, line.getLineNumber()); cellsWithMissingDataAdded.add(cell); } } } } validation.setCellsWithMissingDataAdded(cellsWithMissingDataAdded.size()); 
validation.setCellsWithMissingDataAddedScores(cellsWithMissingDataAdded); }
/** * This method creates a list of cell objects derived from the partitions in this line. * * @return A list containing the cell objects in this line. */ public ArrayList<Cell> getCellObjects() { ArrayList<Cell> cellObjects = new ArrayList<Cell>(); for (ArrayList<Element> cell : cells) { // Now we need to find out in which line this cell is: String pos; String[] positions; pos = cell.get(0).attr("title"); positions = pos.split("\\s+"); int cellY1 = Integer.parseInt(positions[4]); int lineNumber = 0; for (Line line : data) { if (line.getAverageY1() > cellY1) { break; } lineNumber = line.getLineNumber(); } Cell currentCell = new Cell(cell, lineNumber); cellObjects.add(currentCell); } return cellObjects; }