Example #1
0
  /**
   * Serializes the given color into a single line of tab-separated fields.
   *
   * <p>The fields are, in order: the literal feature name {@code color}, the id of the color and
   * its R, G and B values.
   *
   * @param color the color to serialize; may be null.
   * @return the serialized color, or null if the color is null or has no id.
   */
  protected String serializeColor(PdfColor color) {
    // Nothing is serialized for a missing color or a color without an id.
    String colorId = color != null ? color.getId() : null;
    if (colorId == null) {
      return null;
    }

    List<String> columns = new ArrayList<>();

    // Feature name, followed by the id.
    columns.add("color");
    columns.add(colorId);

    // The three color components.
    columns.add(String.valueOf(color.getR()));
    columns.add(String.valueOf(color.getG()));
    columns.add(String.valueOf(color.getB()));

    return CollectionUtils.join(columns, "\t");
  }
Example #2
0
  /**
   * Serializes the given (non-text) element into a single line of tab-separated fields.
   *
   * <p>The fields are, in order: the field name of the feature, the bounding box (min x, min y,
   * max x, max y), the id of the color and the name of the role. If the feature, rectangle, color
   * or role of the element is null, the related field(s) are left empty.
   *
   * @param element the element to serialize; may be null.
   * @return the serialized element, or null if the element is null.
   */
  protected String serializeNonTextElement(PdfElement element) {
    if (element == null) {
      return null;
    }

    List<String> columns = new ArrayList<>();

    // Feature name.
    PdfFeature elementFeature = element.getFeature();
    if (elementFeature != null) {
      columns.add(elementFeature.getFieldName());
    } else {
      columns.add("");
    }

    // Bounding box: always four fields, all empty if there is no rectangle.
    Rectangle box = element.getRectangle();
    if (box != null) {
      columns.add(String.valueOf(box.getMinX()));
      columns.add(String.valueOf(box.getMinY()));
      columns.add(String.valueOf(box.getMaxX()));
      columns.add(String.valueOf(box.getMaxY()));
    } else {
      for (int i = 0; i < 4; i++) {
        columns.add("");
      }
    }

    // Color id.
    PdfColor elementColor = element.getColor();
    if (elementColor != null) {
      columns.add(elementColor.getId());
    } else {
      columns.add("");
    }

    // Role name.
    PdfRole elementRole = element.getRole();
    if (elementRole != null) {
      columns.add(elementRole.name);
    } else {
      columns.add("");
    }

    return CollectionUtils.join(columns, "\t");
  }
Example #3
0
  /**
   * Returns true, if the given character should be considered on extraction.
   *
   * <p>A character is not considered if it is null, is a diacritic, has a color that is white
   * (checked with a tolerance of 0.05), is a "|" with a font size below 6.5, has an orientation
   * other than 0, or has no rectangle or a rectangle with non-positive width or height. A
   * character passing all these checks is considered if it has an encoding or if its trimmed
   * unicode is not empty.
   *
   * @param character the character to check; may be null.
   * @return true if the character should be considered on extraction, false otherwise.
   */
  public static boolean considerPdfCharacter(PdfCharacter character) {
    // Don't consider a missing character (same contract as considerPdfShape for null shapes).
    if (character == null) {
      return false;
    }

    // Don't consider the character, if it is a diacritic.
    if (character.isDiacritic()) {
      return false;
    }

    PdfColor color = character.getColor();

    // Don't consider the character, if its color is white.
    if (color != null && color.isWhite(0.05f)) {
      return false;
    }

    // Trim the unicode once; a missing unicode is treated like an empty one instead of
    // raising a NullPointerException.
    String unicode = character.getUnicode();
    String content = unicode != null ? unicode.trim() : "";

    // Don't consider the character "|" because it is used as PARA_ADDENDUM in
    // tex-paragraph-parser.
    if ("|".equals(content) && character.getFontsize() < 6.5f) { // TODO
      return false;
    }

    // Don't consider the character, if its orientation isn't 0.
    if (character.getOrientation() != 0) {
      return false;
    }

    // Don't consider the character, if it has no proper rectangle, i.e. the rectangle is
    // missing or has no positive width and height.
    if (character.getRectangle() == null
        || character.getRectangle().getWidth() <= 0
        || character.getRectangle().getHeight() <= 0) {
      return false;
    }

    // If the character has a valid encoding, we have to consider it.
    if (character.hasEncoding()) {
      return true;
    }

    // Otherwise consider the character only if it holds any content.
    return !content.isEmpty();
  }
Example #4
0
  /**
   * Returns true, if the given shape should be considered on extraction.
   *
   * <p>A shape is not considered if it is null, has no color or has a white color. If a clipping
   * path is given, the shape is considered only if the clipping path overlaps the rectangle of
   * the shape; without a clipping path, every remaining shape is considered.
   *
   * @param shape the shape to check; may be null.
   * @param clippingPath the clipping path to check the shape against; may be null.
   * @return true if the shape should be considered on extraction, false otherwise.
   */
  public static boolean considerPdfShape(PdfShape shape, Rectangle clippingPath) {
    if (shape == null) {
      return false;
    }

    // Shapes without a color and white shapes are ignored.
    PdfColor shapeColor = shape.getColor();
    if (shapeColor == null || shapeColor.isWhite()) {
      return false;
    }

    // Without a clipping path there is nothing more to check; otherwise the clipping path
    // must overlap the rectangle of the shape.
    return clippingPath == null || clippingPath.overlaps(shape.getRectangle());
  }
Example #5
0
  /**
   * Serializes the given text element into a single line of tab-separated fields.
   *
   * <p>The fields are, in order: the field name of the feature, the text (with tabs replaced by
   * spaces), the page number, the bounding box (min x, min y, max x, max y), the id of the font,
   * the font size, the id of the color, the font id / font size / color id of the first
   * character, the font id / font size / color id of the last character and the name of the
   * role. "First" and "last" refer to the order defined by {@code Comparators.MinXComparator}.
   * If the feature, page, rectangle, a font, a color or the role is null, the related field(s)
   * are left empty; the font size of a missing first/last character is serialized as 0.
   *
   * <p>The list returned by {@code element.getTextCharacters()} is not modified.
   *
   * @param element the text element to serialize; may be null.
   * @return the serialized element, or null if the element is null or its text is null or
   *     consists only of whitespace.
   */
  protected String serializeTextElement(PdfTextElement element) {
    if (element == null) {
      return null;
    }

    String text =
        element.getText(
            getSerializePunctuationMarks(), getSerializeSubscripts(), getSerializeSuperscripts());

    if (text == null || text.trim().isEmpty()) {
      return null;
    }

    List<String> fields = new ArrayList<>();

    // Add feature name.
    PdfFeature feature = element.getFeature();
    fields.add(feature != null ? feature.getFieldName() : "");

    // Add text. Tabs are replaced because the tab is the field separator.
    fields.add(text.replace('\t', ' '));

    // Add page.
    PdfPage page = element.getPage();
    fields.add(page != null ? String.valueOf(page.getPageNumber()) : "");

    // Add bounding box.
    Rectangle rect = element.getRectangle();
    fields.add(rect != null ? String.valueOf(rect.getMinX()) : "");
    fields.add(rect != null ? String.valueOf(rect.getMinY()) : "");
    fields.add(rect != null ? String.valueOf(rect.getMaxX()) : "");
    fields.add(rect != null ? String.valueOf(rect.getMaxY()) : "");

    // Add most common font.
    PdfFont font = element.getFont();
    fields.add(font != null ? font.getId() : "");
    fields.add(String.valueOf(element.getFontsize()));

    // Add most common color.
    PdfColor color = element.getColor();
    fields.add(color != null ? color.getId() : "");

    // Determine font and color of first and last character.
    PdfFont firstCharacterFont = null;
    PdfFont lastCharacterFont = null;
    PdfColor firstCharacterColor = null;
    PdfColor lastCharacterColor = null;
    float firstCharacterFontsize = 0;
    float lastCharacterFontsize = 0;
    List<PdfCharacter> characters = element.getTextCharacters();
    if (characters != null && !characters.isEmpty()) {
      // Sort a copy: the returned list may be the element's own (or an unmodifiable) list, and
      // serializing an element must not reorder its characters as a side effect.
      List<PdfCharacter> sortedCharacters = new ArrayList<>(characters);
      Collections.sort(sortedCharacters, new Comparators.MinXComparator());

      PdfCharacter firstCharacter = sortedCharacters.get(0);
      PdfCharacter lastCharacter = sortedCharacters.get(sortedCharacters.size() - 1);

      if (firstCharacter != null) {
        firstCharacterFont = firstCharacter.getFont();
        firstCharacterFontsize = firstCharacter.getFontsize();
        firstCharacterColor = firstCharacter.getColor();
      }
      if (lastCharacter != null) {
        lastCharacterFont = lastCharacter.getFont();
        lastCharacterFontsize = lastCharacter.getFontsize();
        lastCharacterColor = lastCharacter.getColor();
      }
    }
    // Append font and color of first character.
    fields.add(firstCharacterFont != null ? firstCharacterFont.getId() : "");
    fields.add(String.valueOf(firstCharacterFontsize));
    fields.add(firstCharacterColor != null ? firstCharacterColor.getId() : "");
    // Append font and color of last character.
    fields.add(lastCharacterFont != null ? lastCharacterFont.getId() : "");
    fields.add(String.valueOf(lastCharacterFontsize));
    fields.add(lastCharacterColor != null ? lastCharacterColor.getId() : "");

    // Add role.
    PdfRole role = element.getRole();
    fields.add(role != null ? role.name : "");

    return CollectionUtils.join(fields, "\t");
  }