Java LineText.getWords示例

编程语言: Java

命名空间/包名称: java.util

类/类型: LineText

方法/功能: getWords

hotexamples.com的示例: 7

Java LineText.getWords - 共找到 7 个示例。以下是从开源项目中提取的、评分最高的 java.util.LineText.getWords 真实 Java 示例。您可以为示例评分，以帮助我们提高示例质量。

常用方法

显示/隐藏

getWords(7)

getBounds(3)

addAll(3)

clearSelected(2)

addText(1)

clearBounds(1)

clearCurrentWord(1)

clearHighlighted(1)

getSelected(1)

selectAll(1)

setWords(1)

示例#1

0

显示文件

文件： PageText.java 项目： svn2github/icepdf

  /**
   * Rebuilds the sorted representation of the page text. The raw page lines are sliced into
   * separate lines based on changes in the y coordinate of the words, visible optional content
   * is merged in, and the result is sliced once more. Duplicate words are optionally trimmed,
   * the words of every line are ordered with {@code WordPositionComparator}, the line bounds are
   * recalculated and, unless columns are being preserved, the lines themselves are ordered with
   * {@code LinePositionComparator}. The outcome is assigned to {@code this.sortedPageLines}.
   */
  public void sortAndFormatText() {
    // first pass: slice a copy of the raw page lines into lines by y position.
    ArrayList<LineText> lines = sortLinesVertically(new ArrayList<LineText>(pageLines));

    // try to place the optional words on existing lines; when there are no
    // existing lines the visible optional lines are used as-is.
    if (lines.isEmpty()) {
      lines = getVisiblePageLines(true);
    } else {
      insertOptionalLines(lines);
    }

    // second pass, now that the optional content has been merged in.
    lines = sortLinesVertically(lines);

    // rough check for duplicate strings that are sometimes generated
    // by Crystal Reports.  Enable with
    // -Dorg.icepdf.core.views.page.text.trim.duplicates=true
    if (checkForDuplicates) {
      for (final LineText line : lines) {
        final List<WordText> lineWords = line.getWords();
        if (lineWords.isEmpty()) {
          continue;
        }
        final Set<String> seenKeys = new HashSet<String>();
        final List<WordText> uniqueWords = new ArrayList<WordText>();
        for (final WordText word : lineWords) {
          // key on the text plus the regular (integer) rectangle so we get a little rounding.
          if (seenKeys.add(word.getText() + word.getBounds().getBounds())) {
            uniqueWords.add(word);
          }
        }
        line.setWords(uniqueWords);
      }
    }

    // order the words of each line by x coordinate.
    for (LineText line : lines) {
      Collections.sort(line.getWords(), new WordPositionComparator());
    }

    // getBounds() is called only to recalculate the line bounds; the result is not needed here.
    for (LineText line : lines) {
      line.getBounds();
    }

    // order the lines themselves, unless the column layout must be kept.
    if (!lines.isEmpty() && !preserveColumns) {
      Collections.sort(lines, new LinePositionComparator());
    }

    // publish the sorted lines.
    this.sortedPageLines = lines;
  }

示例#2

0

显示文件

文件： PageText.java 项目： svn2github/icepdf

 /**
  * Consolidates the visible optional content with the main lines. An optional line whose y
  * position is less than one line height away from an existing line has its words appended to
  * that line (the first such line wins); an optional line with no match is appended to the list
  * as a new entry.
  *
  * @param sortedPageLines List of LineText that receives the visible optional content.
  */
 private void insertOptionalLines(ArrayList<LineText> sortedPageLines) {
   ArrayList<LineText> optionalLines = getVisiblePageLines(true);
   if (optionalLines == null) {
     return;
   }
   for (LineText optionalLine : optionalLines) {
     float optionalY = optionalLine.getBounds().y;
     // first existing line that sits at roughly the same height, if any.
     LineText host = null;
     for (LineText candidate : sortedPageLines) {
       Rectangle candidateBounds = candidate.getBounds().getBounds();
       // corner case inclusion of a word and a space which is out of order from the
       // rest of the text in the document.
       if (Math.abs(optionalY - candidateBounds.y) < candidateBounds.height) {
         host = candidate;
         break;
       }
     }
     if (host != null) {
       host.addAll(optionalLine.getWords());
     } else {
       sortedPageLines.add(optionalLine);
     }
   }
 }

示例#3

0

显示文件

文件： PageText.java 项目： svn2github/icepdf

 /**
  * Starts a new current line and registers it in {@code pageLines}. Nothing happens when a
  * current line already exists and does not contain any words yet.
  */
 public void newLine() {
   // an existing line without words is reused rather than followed by another empty line.
   boolean currentLineIsEmpty = currentLine != null && currentLine.getWords().isEmpty();
   if (!currentLineIsEmpty) {
     currentLine = new LineText();
     pageLines.add(currentLine);
   }
 }

示例#4

0

显示文件

文件： PageText.java 项目： svn2github/icepdf

  /**
   * Builds a plain-text rendering of the page lines. For every entry of {@code pageLines}, in
   * stored order, the text of each of its words is concatenated without any added separator and
   * the line is terminated with a newline character.
   *
   * @return the concatenated text, one newline-terminated row per stored line.
   */
  @Override
  public String toString() {
    StringBuilder extractedText = new StringBuilder();
    for (LineText lineText : pageLines) {
      for (WordText wordText : lineText.getWords()) {
        extractedText.append(wordText.getText());
      }
      // one output row per stored line, including lines that have no words.
      extractedText.append('\n');
    }
    return extractedText.toString();
  }

示例#5

0

显示文件

文件： PageText.java 项目： svn2github/icepdf

 /**
  * Utility method that normalizes text created in an Xform content stream; it is only meant to
  * be called from the content parser when a 'Do' token is parsed. The cached bounds of every
  * line and word in {@code pageLines} are cleared and each glyph is normalized to user space
  * with the given transform.
  *
  * @param transform matrix transform of the 'Do' operation.
  */
 public void applyXObjectTransform(AffineTransform transform) {
   for (LineText line : pageLines) {
     // bounds are cleared before the glyphs of the line are touched.
     line.clearBounds();
     for (WordText word : line.getWords()) {
       word.clearBounds();
       for (GlyphText glyphText : word.getGlyphs()) {
         glyphText.normalizeToUserSpace(transform, null);
       }
     }
   }
 }

示例#6

0

显示文件

文件： PageText.java 项目： svn2github/icepdf

 /**
  * Returns a copy of {@code pageLines}. When optional content is present, one additional line
  * is appended that collects the words of the visible lines of every non-null optional
  * content key.
  *
  * @return new list holding the page lines plus, if optional content exists, the collecting
  *     line.
  */
 private ArrayList<LineText> getAllPageLines() {
   ArrayList<LineText> allLines = new ArrayList<LineText>(pageLines);
   if (optionalPageLines == null) {
     // no optional content, the copy of the page lines is all there is.
     return allLines;
   }
   // single line that gathers the words of all visible optional content.
   LineText optionalWordsLine = new LineText();
   allLines.add(optionalWordsLine);
   // iterate over optional content keys and extract text from visible groups
   for (OptionalContents contentKey : optionalPageLines.keySet()) {
     if (contentKey == null) {
       continue;
     }
     ArrayList<LineText> visibleLines =
         optionalPageLines.get(contentKey).getVisiblePageLines(true);
     for (LineText visibleLine : visibleLines) {
       optionalWordsLine.addAll(visibleLine.getWords());
     }
   }
   // recalculate the bounds.
   optionalWordsLine.getBounds();
   return allLines;
 }

示例#7

0

显示文件

文件： PageText.java 项目： svn2github/icepdf

 /**
  * Slices the words of the given page lines into new lines wherever the y coordinate of
  * consecutive words changes noticeably. Words are kept in their original order; they are only
  * regrouped.
  *
  * @param pageLines page lines to slice; the list itself is not modified, a new list is created
  *     for the result.
  * @return new list of lines, one per detected run of words at a similar y position.
  */
 private ArrayList<LineText> sortLinesVertically(ArrayList<LineText> pageLines) {
   ArrayList<LineText> slicedLines = new ArrayList<LineText>(64);
   for (LineText sourceLine : pageLines) {
     // all page words may sit on one line at this point.
     java.util.List<WordText> words = sourceLine.getWords();
     if (words == null || words.size() == 0) {
       continue;
     }
     final int wordCount = words.size();
     // index of the first word of the line currently being collected.
     int lineStart = 0;
     double previousY = Math.round(words.get(0).getTextExtractionBounds().y);
     for (int index = 0; index < wordCount; index++) {
       WordText word = words.get(index);
       double wordY = Math.round(word.getTextExtractionBounds().getY());
       // a little tolerance when detecting a line: only a non-zero jump in y that is
       // larger than half the height of the current word counts as a break. This works
       // well enough for sub and super script and inconsistencies in table based text.
       double jump = Math.abs(wordY - previousY);
       if (jump != 0 && jump > word.getTextExtractionBounds().getHeight() / 2) {
         // close the line collected so far; the current word opens the next one.
         LineText completedLine = new LineText();
         completedLine.addAll(words.subList(lineStart, index));
         slicedLines.add(completedLine);
         lineStart = index;
       }
       previousY = wordY;
     }
     // whatever remains after the last break forms the final line.
     if (lineStart < wordCount) {
       LineText lastLine = new LineText();
       lastLine.addAll(words.subList(lineStart, wordCount));
       slicedLines.add(lastLine);
     }
   }
   return slicedLines;
 }