Java LineText.getWords Examples

Programming Language: Java

Namespace/Package Name: java.util

Class/Type: LineText

Method/Function: getWords

Examples at hotexamples.com: 7

Java LineText.getWords - 7 examples found. These are the top-rated real-world Java examples of java.util.LineText.getWords extracted from open source projects. You can rate examples to help us improve their quality.

Frequently Used Methods

Show Hide

getWords(7)

getBounds(3)

addAll(3)

clearSelected(2)

addText(1)

clearBounds(1)

clearCurrentWord(1)

clearHighlighted(1)

getSelected(1)

selectAll(1)

setWords(1)

Example #1

0

Show file

File: PageText.java Project: svn2github/icepdf

  /**
   * Rebuilds {@code sortedPageLines} from the raw page lines.
   *
   * <p>The raw lines are regrouped with {@code sortLinesVertically}, visible optional content is
   * merged in (or used on its own when the regrouping produced nothing), and the result is
   * regrouped a second time. Duplicate words are optionally trimmed, the words of every line are
   * ordered with {@code WordPositionComparator}, the bounds of every line are requested again,
   * and, unless {@code preserveColumns} is set, the lines themselves are ordered with
   * {@code LinePositionComparator}.
   */
  public void sortAndFormatText() {
    // start from a copy of the raw page lines and regroup them into lines
    ArrayList<LineText> rawLines = new ArrayList<LineText>(pageLines);
    ArrayList<LineText> lines = sortLinesVertically(rawLines);

    // merge the optional content into the existing lines; when there are no
    // lines at all the optional content becomes the whole line set.
    if (lines.isEmpty()) {
      lines = getVisiblePageLines(true);
    } else {
      insertOptionalLines(lines);
    }

    // second pass, now that the optional content is part of the set
    lines = sortLinesVertically(lines);

    // rough check for duplicate strings that are sometimes generated by
    // Crystal Reports.  Enable with
    // -Dorg.icepdf.core.views.page.text.trim.duplicates=true
    if (checkForDuplicates) {
      for (LineText line : lines) {
        List<WordText> candidates = line.getWords();
        if (candidates.isEmpty()) {
          continue;
        }
        Set<String> seen = new HashSet<String>();
        List<WordText> unique = new ArrayList<WordText>();
        for (WordText candidate : candidates) {
          // key on the text plus the integer rectangle so the bounds get a little rounding.
          String signature = candidate.getText() + candidate.getBounds().getBounds();
          if (seen.add(signature)) {
            unique.add(candidate);
          }
        }
        line.setWords(unique);
      }
    }

    // order the words of each line (by x coordinate).
    for (LineText line : lines) {
      Collections.sort(line.getWords(), new WordPositionComparator());
    }

    // recalculate the line bounds.
    for (LineText line : lines) {
      line.getBounds();
    }

    // order the lines themselves, unless column layout has to be preserved
    if (!preserveColumns && !lines.isEmpty()) {
      Collections.sort(lines, new LinePositionComparator());
    }

    // publish the result.
    this.sortedPageLines = lines;
  }

Example #2

0

Show file

File: PageText.java Project: svn2github/icepdf

 /**
  * Merges the visible optional content lines into the given list of lines. An optional line whose
  * y position lies within one line-height of an existing line has its words added to that line
  * (the first such line wins); otherwise the optional line is appended to the list as a new entry.
  *
  * @param sortedPageLines List of LineText to add visible optional content to; modified in place.
  */
 private void insertOptionalLines(ArrayList<LineText> sortedPageLines) {
   ArrayList<LineText> optionalPageLines = getVisiblePageLines(true);
   if (optionalPageLines == null) {
     return;
   }
   for (LineText candidate : optionalPageLines) {
     float candidateY = candidate.getBounds().y;
     // look for the first existing line that is vertically close enough to host the words
     LineText host = null;
     for (LineText existing : sortedPageLines) {
       Rectangle existingBounds = existing.getBounds().getBounds();
       float lineHeight = existingBounds.height;
       float distance = Math.abs(candidateY - (float) existingBounds.y);
       // corner case inclusion of a word and a space which is out of order from the
       // rest of the text in the document.
       if (distance < lineHeight) {
         host = existing;
         break;
       }
     }
     if (host != null) {
       host.addAll(candidate.getWords());
     } else {
       sortedPageLines.add(candidate);
     }
   }
 }

Example #3

0

Show file

File: PageText.java Project: svn2github/icepdf

 /**
  * Starts a new current line and appends it to the page lines, unless the existing current line
  * has no words yet, in which case nothing happens.
  */
 public void newLine() {
   // an empty current line is reused rather than followed by another empty line.
   boolean currentLineIsEmpty = currentLine != null && currentLine.getWords().isEmpty();
   if (!currentLineIsEmpty) {
     currentLine = new LineText();
     pageLines.add(currentLine);
   }
 }

Example #4

0

Show file

File: PageText.java Project: svn2github/icepdf

  /**
   * Concatenates the text of every word on every page line, writing a newline character after
   * each line.
   *
   * @return the text of the page lines, one line of output per page line.
   */
  public String toString() {
    final StringBuilder buffer = new StringBuilder();
    for (LineText line : pageLines) {
      // words are appended back to back; no separator is added between them here.
      for (WordText word : line.getWords()) {
        buffer.append(word.getText());
      }
      buffer.append('\n');
    }
    return buffer.toString();
  }

Example #5

0

Show file

File: PageText.java Project: svn2github/icepdf

 /**
  * Normalizes text that was created in an Xform content stream: the bounds of every page line and
  * of every word are cleared, and each glyph is passed the given transform through
  * {@code normalizeToUserSpace}. Intended for the content parser's handling of the 'Do' token.
  *
  * @param transform do matrix transform
  */
 public void applyXObjectTransform(AffineTransform transform) {
   for (LineText line : pageLines) {
     // bounds are cleared before the glyphs underneath are transformed
     line.clearBounds();
     for (WordText word : line.getWords()) {
       word.clearBounds();
       for (GlyphText glyphText : word.getGlyphs()) {
         glyphText.normalizeToUserSpace(transform, null);
       }
     }
   }
 }

Example #6

0

Show file

File: PageText.java Project: svn2github/icepdf

 /**
  * Builds a list holding a copy of the page lines and, when optional content is present, one
  * extra line that collects the words of all visible optional content lines.
  *
  * @return new list of lines; the extra line is added whenever the optional content map is
  *     non-null, even if it ends up with no words.
  */
 private ArrayList<LineText> getAllPageLines() {
   ArrayList<LineText> allLines = new ArrayList<LineText>(pageLines);
   if (optionalPageLines == null) {
     return allLines;
   }
   // walk the optional content keys and pull the text out of the visible groups
   Set<OptionalContents> contentKeys = optionalPageLines.keySet();
   LineText optionalWords = new LineText();
   allLines.add(optionalWords);
   for (OptionalContents contentKey : contentKeys) {
     if (contentKey == null) {
       continue;
     }
     for (LineText visibleLine : optionalPageLines.get(contentKey).getVisiblePageLines(true)) {
       optionalWords.addAll(visibleLine.getWords());
     }
   }
   // recalculate the bounds now that all words are in place.
   optionalWords.getBounds();
   return allLines;
 }

Example #7

0

Show file

File: PageText.java Project: svn2github/icepdf

 /**
  * Regroups the words of the given page lines into new lines based on their y coordinate in page
  * space. Words are kept in the order they are encountered; a new line is started whenever the
  * rounded y coordinate of a word differs from that of the previous word by more than half the
  * word's height.
  *
  * @param pageLines page lines to regroup, not modified, new array is created for the result.
  * @return new array of regrouped page lines
  */
 private ArrayList<LineText> sortLinesVertically(ArrayList<LineText> pageLines) {
   ArrayList<LineText> regrouped = new ArrayList<LineText>(64);
   for (LineText source : pageLines) {
     // all page words may be on one line
     java.util.List<WordText> words = source.getWords();
     if (words == null || words.isEmpty()) {
       continue;
     }
     // [lineStart, cursor) is the run of words collected for the line being built
     int lineStart = 0;
     int cursor = 0;
     double previousY = Math.round(words.get(0).getTextExtractionBounds().y);
     for (WordText word : words) {
       double wordY = Math.round(word.getTextExtractionBounds().getY());
       double delta = Math.abs(wordY - previousY);
       // a little bit of tolerance for detecting a line: anything greater than half
       // the current word height is treated as a break. This works well enough for
       // sub and super script and for inconsistencies in table based text.
       boolean yChanged = delta != 0;
       if (yChanged && delta > word.getTextExtractionBounds().getHeight() / 2) {
         LineText completed = new LineText();
         completed.addAll(words.subList(lineStart, cursor));
         regrouped.add(completed);
         lineStart = cursor;
       }
       cursor++;
       previousY = wordY;
     }
     // flush whatever is left after the last break
     if (lineStart < cursor) {
       LineText remainder = new LineText();
       remainder.addAll(words.subList(lineStart, cursor));
       regrouped.add(remainder);
     }
   }
   return regrouped;
 }