예제 #1
0
  /**
   * Serializes the given page into a list of lines.
   *
   * <p>For every feature returned by {@code getFeaturesToSerialize()}, the elements of the page
   * that belong to this feature are serialized via {@code serializeElementsWithRespectToRoles}
   * and the non-empty result is appended to the output. The special feature
   * {@code paragraphs_with_lines} is handled separately: each paragraph is serialized, directly
   * followed by the text lines of that paragraph.
   *
   * @param page the page to serialize; may be {@code null}.
   * @return the serialized lines (possibly empty), or {@code null} if the given page is
   *     {@code null}.
   */
  protected List<String> serializePage(PdfPage page) {
    if (page == null) {
      return null;
    }

    List<String> lines = new ArrayList<>();

    // Obtain the features to serialize.
    List<PdfFeature> features = getFeaturesToSerialize();
    if (features == null) {
      return lines;
    }

    for (PdfFeature f : features) {
      // This feature was added for David. He needs the paragraphs with the associated lines.
      // Remove it if it is not needed anymore.
      if (f == PdfFeature.paragraphs_with_lines) {
        List<PdfTextParagraph> paragraphs = page.getParagraphs();
        if (paragraphs != null) {
          for (PdfTextParagraph para : paragraphs) {
            if (para == null) {
              continue;
            }
            // Skip null serializations so that the output never contains null entries
            // (the generic path below filters null/empty results as well).
            String serializedPara = serializeElement(para);
            if (serializedPara != null) {
              lines.add(serializedPara);
            }

            List<PdfTextLine> textLines = para.getTextLines();
            if (textLines == null) {
              continue;
            }
            for (PdfTextLine line : textLines) {
              String serializedLine = serializeElement(line);
              if (serializedLine != null) {
                lines.add(serializedLine);
              }
            }
          }
        }
        continue;
      }

      // Obtain the elements by feature.
      List<? extends PdfElement> elements = page.getElementsByFeature(f);

      List<String> serialized = serializeElementsWithRespectToRoles(elements);
      if (serialized != null && !serialized.isEmpty()) {
        lines.addAll(serialized);
      }
    }

    return lines;
  }
예제 #2
0
  /**
   * Serializes the given text element into a single tab-separated line.
   *
   * <p>The fields are written in the following order: feature field name, text (with tabs
   * replaced by spaces), page number, bounding box (minX, minY, maxX, maxY), font id, font size,
   * color id, font id / font size / color id of the first character, font id / font size / color
   * id of the last character, and role name. Values that are not available are written as empty
   * strings (font sizes of missing first/last characters are written as {@code 0.0}).
   *
   * <p>"First" and "last" character refer to the characters with the smallest and largest
   * position according to {@code Comparators.MinXComparator}. The character list of the element
   * itself is left untouched; sorting is done on a copy.
   *
   * @param element the text element to serialize; may be {@code null}.
   * @return the serialized line, or {@code null} if the element is {@code null} or its text is
   *     {@code null} or blank.
   */
  protected String serializeTextElement(PdfTextElement element) {
    if (element == null) {
      return null;
    }

    String text =
        element.getText(
            getSerializePunctuationMarks(), getSerializeSubscripts(), getSerializeSuperscripts());

    if (text == null || text.trim().isEmpty()) {
      return null;
    }

    List<String> fields = new ArrayList<>();

    // Add feature name.
    PdfFeature feature = element.getFeature();
    fields.add(feature != null ? feature.getFieldName() : "");

    // Add text. Tabs are the field separator, so they must not occur within the text.
    fields.add(text.replace('\t', ' '));

    // Add page.
    PdfPage page = element.getPage();
    fields.add(page != null ? String.valueOf(page.getPageNumber()) : "");

    // Add bounding box.
    Rectangle rect = element.getRectangle();
    fields.add(rect != null ? String.valueOf(rect.getMinX()) : "");
    fields.add(rect != null ? String.valueOf(rect.getMinY()) : "");
    fields.add(rect != null ? String.valueOf(rect.getMaxX()) : "");
    fields.add(rect != null ? String.valueOf(rect.getMaxY()) : "");

    // Add font and font size of the element.
    PdfFont font = element.getFont();
    fields.add(font != null ? font.getId() : "");
    fields.add(String.valueOf(element.getFontsize()));

    // Add color of the element.
    PdfColor color = element.getColor();
    fields.add(color != null ? color.getId() : "");

    // Determine font, font size and color of the first and the last character.
    PdfFont firstCharacterFont = null;
    PdfFont lastCharacterFont = null;
    PdfColor firstCharacterColor = null;
    PdfColor lastCharacterColor = null;
    float firstCharacterFontsize = 0;
    float lastCharacterFontsize = 0;
    List<PdfCharacter> characters = element.getTextCharacters();
    if (characters != null && !characters.isEmpty()) {
      // Sort a copy: sorting the returned list in place would reorder the characters of the
      // element as a side effect of serializing it (and fail on an unmodifiable list).
      List<PdfCharacter> sorted = new ArrayList<>(characters);
      Collections.sort(sorted, new Comparators.MinXComparator());

      PdfCharacter firstCharacter = sorted.get(0);
      PdfCharacter lastCharacter = sorted.get(sorted.size() - 1);

      if (firstCharacter != null) {
        firstCharacterFont = firstCharacter.getFont();
        firstCharacterFontsize = firstCharacter.getFontsize();
        firstCharacterColor = firstCharacter.getColor();
      }
      if (lastCharacter != null) {
        lastCharacterFont = lastCharacter.getFont();
        lastCharacterFontsize = lastCharacter.getFontsize();
        lastCharacterColor = lastCharacter.getColor();
      }
    }
    // Append font, font size and color of first character.
    fields.add(firstCharacterFont != null ? firstCharacterFont.getId() : "");
    fields.add(String.valueOf(firstCharacterFontsize));
    fields.add(firstCharacterColor != null ? firstCharacterColor.getId() : "");
    // Append font, font size and color of last character.
    fields.add(lastCharacterFont != null ? lastCharacterFont.getId() : "");
    fields.add(String.valueOf(lastCharacterFontsize));
    fields.add(lastCharacterColor != null ? lastCharacterColor.getId() : "");

    // Add role.
    PdfRole role = element.getRole();
    fields.add(role != null ? role.name : "");

    return CollectionUtils.join(fields, "\t");
  }