Beispiel #1
0
  /**
   * Takes the raw page lines represented as one continuous line and sorts the text by the y access
   * of the word bounds. The words are then sliced into separate lines base on y changes. And
   * finally each newly sorted line is sorted once more by each words x coordinate.
   */
  public void sortAndFormatText() {
    ArrayList<LineText> visiblePageLines = new ArrayList<LineText>(pageLines);
    // create new array for storing the sorted lines
    ArrayList<LineText> sortedPageLines = sortLinesVertically(visiblePageLines);
    // try and insert the option words on existing lines
    if (sortedPageLines.size() == 0) {
      sortedPageLines = getVisiblePageLines(true);
    } else {
      insertOptionalLines(sortedPageLines);
    }

    // sort again
    sortedPageLines = sortLinesVertically(sortedPageLines);

    // do a rough check for duplicate strings that are sometimes generated
    // by Chrystal Reports.  Enable with
    // -Dorg.icepdf.core.views.page.text.trim.duplicates=true
    if (checkForDuplicates) {
      for (final LineText lineText : sortedPageLines) {
        final List<WordText> words = lineText.getWords();
        if (words.size() > 0) {
          final List<WordText> trimmedWords = new ArrayList<WordText>();
          final Set<String> refs = new HashSet<String>();
          for (final WordText wordText : words) {
            // use regular rectangle so get a little rounding.
            final String key = wordText.getText() + wordText.getBounds().getBounds();
            if (refs.add(key)) {
              trimmedWords.add(wordText);
            }
          }
          lineText.setWords(trimmedWords);
        }
      }
    }

    // sort each line by x coordinate.
    if (sortedPageLines.size() > 0) {
      for (LineText lineText : sortedPageLines) {
        Collections.sort(lineText.getWords(), new WordPositionComparator());
      }
    }

    // recalculate the line bounds.
    if (sortedPageLines.size() > 0) {
      for (LineText lineText : sortedPageLines) {
        lineText.getBounds();
      }
    }

    // sort the lines
    if (sortedPageLines.size() > 0 && !preserveColumns) {
      Collections.sort(sortedPageLines, new LinePositionComparator());
    }
    // assign back the sorted lines.
    this.sortedPageLines = sortedPageLines;
  }