/** * Takes the raw page lines represented as one continuous line and sorts the text by the y access * of the word bounds. The words are then sliced into separate lines base on y changes. And * finally each newly sorted line is sorted once more by each words x coordinate. */ public void sortAndFormatText() { ArrayList<LineText> visiblePageLines = new ArrayList<LineText>(pageLines); // create new array for storing the sorted lines ArrayList<LineText> sortedPageLines = sortLinesVertically(visiblePageLines); // try and insert the option words on existing lines if (sortedPageLines.size() == 0) { sortedPageLines = getVisiblePageLines(true); } else { insertOptionalLines(sortedPageLines); } // sort again sortedPageLines = sortLinesVertically(sortedPageLines); // do a rough check for duplicate strings that are sometimes generated // by Chrystal Reports. Enable with // -Dorg.icepdf.core.views.page.text.trim.duplicates=true if (checkForDuplicates) { for (final LineText lineText : sortedPageLines) { final List<WordText> words = lineText.getWords(); if (words.size() > 0) { final List<WordText> trimmedWords = new ArrayList<WordText>(); final Set<String> refs = new HashSet<String>(); for (final WordText wordText : words) { // use regular rectangle so get a little rounding. final String key = wordText.getText() + wordText.getBounds().getBounds(); if (refs.add(key)) { trimmedWords.add(wordText); } } lineText.setWords(trimmedWords); } } } // sort each line by x coordinate. if (sortedPageLines.size() > 0) { for (LineText lineText : sortedPageLines) { Collections.sort(lineText.getWords(), new WordPositionComparator()); } } // recalculate the line bounds. if (sortedPageLines.size() > 0) { for (LineText lineText : sortedPageLines) { lineText.getBounds(); } } // sort the lines if (sortedPageLines.size() > 0 && !preserveColumns) { Collections.sort(sortedPageLines, new LinePositionComparator()); } // assign back the sorted lines. this.sortedPageLines = sortedPageLines; }
/**
 * Builds a plain-text rendering of the page from {@code pageLines}.
 * <p>
 * The text of every word in a line is appended back to back, with no separator added
 * between words, and each line is terminated with a {@code '\n'} character. Note that
 * this reads {@code pageLines}, not the lines produced by {@link #sortAndFormatText()}.
 *
 * @return the concatenated word text of all page lines, one line of output per page line
 */
@Override
public String toString() {
    StringBuilder extractedText = new StringBuilder();
    for (LineText lineText : pageLines) {
        for (WordText wordText : lineText.getWords()) {
            extractedText.append(wordText.getText());
        }
        // terminate every page line, including the last one
        extractedText.append('\n');
    }
    return extractedText.toString();
}