/** * Sorts the given pageLines vertically (y coordinate) in page space. . * * @param pageLines page lines to sort, not directly sorted, new array is created for sorted data. * @return new array of sorted pages lines */ private ArrayList<LineText> sortLinesVertically(ArrayList<LineText> pageLines) { ArrayList<LineText> sortedPageLines = new ArrayList<LineText>(64); // move over all for (LineText pageLine : pageLines) { // all page words will be on one line java.util.List<WordText> words = pageLine.getWords(); if (words != null && words.size() > 0) { // break the words into lines on every change of y double lastY = Math.round(words.get(0).getTextExtractionBounds().y); int start = 0, end = 0; double currentY, diff; for (WordText word : words) { currentY = Math.round(word.getTextExtractionBounds().getY()); // little bit of tolerance for detecting a line, basically anything that is // > then half the current word height / 2 will be marked as a break. // this works well enough sub and super script and inconsistencies // on table base text. diff = Math.abs(currentY - lastY); if (diff != 0 && diff > word.getTextExtractionBounds().getHeight() / 2) { LineText lineText = new LineText(); lineText.addAll(words.subList(start, end)); sortedPageLines.add(lineText); start = end; } end++; lastY = currentY; } if (start < end) { LineText lineText = new LineText(); lineText.addAll(words.subList(start, end)); sortedPageLines.add(lineText); } } } return sortedPageLines; }