/** Returns true, if the given character should be considered on extraction. */ public static boolean considerPdfCharacter(PdfCharacter character) { // Don't consider the character, if it is a diacritic. if (character.isDiacritic()) { return false; } PdfColor color = character.getColor(); // Don't consider the character, if its color is white. if (color != null && color.isWhite(0.05f)) { return false; } // Don't consider the character "|" because it is used as PARA_ADDENDUM in // tex-paragraph-parser. if ("|".equals(character.getUnicode().trim()) && character.getFontsize() < 6.5f) { // TODO return false; } // Don't consider the character, if its orientation isn't 0. if (character.getOrientation() != 0) { return false; } // Don't consider the character, if its a proper rectangle. if (character.getRectangle().getWidth() <= 0 || character.getRectangle().getHeight() <= 0) { return false; } // If the character has a valid encoding, we have to consider it. if (character.hasEncoding()) { return true; } // Don't consider the character, if its doesn't hold any content. if (character.getUnicode().trim().isEmpty()) { return false; } return true; }
/** Returns true, if the given shape should be considered on extraction. */ public static boolean considerPdfShape(PdfShape shape, Rectangle clippingPath) { if (shape == null) { return false; } PdfColor color = shape.getColor(); if (color == null) { return false; } if (color.isWhite()) { return false; } // If the clipping path is given, it must contain the shape. if (clippingPath != null) { return clippingPath.overlaps(shape.getRectangle()); } return true; }