Beispiel #1
0
  /**
   * Returns true, if the given character should be considered on extraction.
   *
   * <p>A character is rejected if it is null, is a diacritic, has a color for which
   * {@code isWhite(0.05f)} holds, is a "|" with a font size below 6.5, has an orientation other
   * than 0, or has a rectangle with non-positive width or height. Of the remaining characters,
   * those with an encoding are always accepted; the others are accepted only if their unicode
   * text is not blank.
   *
   * @param character the character to check; may be null.
   * @return true if the character should be considered on extraction, false otherwise.
   */
  public static boolean considerPdfCharacter(PdfCharacter character) {
    // Nothing to consider if no character is given.
    if (character == null) {
      return false;
    }

    // Don't consider the character, if it is a diacritic.
    if (character.isDiacritic()) {
      return false;
    }

    // Don't consider the character, if its color is white.
    PdfColor color = character.getColor();
    if (color != null && color.isWhite(0.05f)) {
      return false;
    }

    // Read and trim the text once; a missing unicode value is treated like empty content
    // instead of raising a NullPointerException.
    String unicode = character.getUnicode();
    String text = (unicode == null) ? "" : unicode.trim();

    // Don't consider a small "|" because it is used as PARA_ADDENDUM in
    // tex-paragraph-parser.
    if ("|".equals(text) && character.getFontsize() < 6.5f) { // TODO
      return false;
    }

    // Don't consider the character, if its orientation isn't 0.
    if (character.getOrientation() != 0) {
      return false;
    }

    // Don't consider the character, if it doesn't have a proper rectangle, i.e. if its
    // width or height is not positive.
    if (character.getRectangle().getWidth() <= 0 || character.getRectangle().getHeight() <= 0) {
      return false;
    }

    // If the character has a valid encoding, we have to consider it, even if its text is blank.
    if (character.hasEncoding()) {
      return true;
    }

    // Otherwise, consider the character only if it holds any content.
    return !text.isEmpty();
  }
Beispiel #2
0
  /**
   * Returns true, if the given shape should be considered on extraction.
   *
   * <p>A shape is rejected if it is null, has no color, or has a color for which
   * {@code isWhite()} holds. If a clipping path is given, the shape is additionally required to
   * overlap it.
   *
   * @param shape the shape to check; may be null.
   * @param clippingPath the clipping path to test the shape against; may be null, in which case
   *     no clipping test is done.
   * @return true if the shape should be considered on extraction, false otherwise.
   */
  public static boolean considerPdfShape(PdfShape shape, Rectangle clippingPath) {
    // A missing shape has no color either, so both cases are handled by the check below.
    PdfColor shapeColor = (shape == null) ? null : shape.getColor();

    // Don't consider the shape, if it has no color or if its color is white.
    if (shapeColor == null || shapeColor.isWhite()) {
      return false;
    }

    // Without a clipping path there is nothing more to check; otherwise the clipping path
    // has to overlap the rectangle of the shape.
    return clippingPath == null || clippingPath.overlaps(shape.getRectangle());
  }