Example #1
0
  /**
   * Serializes the given (non-text) element into a single line of fields.
   *
   * <p>The fields are, in this order: the field name of the element's feature, the minX, minY,
   * maxX and maxY values of the element's rectangle, the id of the element's color and the name
   * of the element's role. A field whose source object is null is written as an empty string.
   * The fields are joined via {@code CollectionUtils.join} using a tab as separator.
   *
   * @param element The element to serialize.
   * @return The joined fields, or null if the given element is null.
   */
  protected String serializeNonTextElement(PdfElement element) {
    if (element == null) {
      return null;
    }

    List<String> columns = new ArrayList<>();

    // Column 1: the name of the feature.
    PdfFeature feature = element.getFeature();
    if (feature == null) {
      columns.add("");
    } else {
      columns.add(feature.getFieldName());
    }

    // Columns 2-5: the bounding box (four empty columns if there is no rectangle).
    Rectangle box = element.getRectangle();
    if (box == null) {
      for (int i = 0; i < 4; i++) {
        columns.add("");
      }
    } else {
      columns.add(String.valueOf(box.getMinX()));
      columns.add(String.valueOf(box.getMinY()));
      columns.add(String.valueOf(box.getMaxX()));
      columns.add(String.valueOf(box.getMaxY()));
    }

    // Column 6: the color.
    PdfColor color = element.getColor();
    if (color == null) {
      columns.add("");
    } else {
      columns.add(color.getId());
    }

    // Column 7: the role.
    PdfRole role = element.getRole();
    if (role == null) {
      columns.add("");
    } else {
      columns.add(role.name);
    }

    return CollectionUtils.join(columns, "\t");
  }
Example #2
0
  /**
   * Serializes the given elements, restricted to the roles returned by
   * {@code getRolesToSerialize()}.
   *
   * <p>An element contributes a line only if its role is contained in the roles to serialize and
   * {@code serializeElement} yields a non-null, non-empty string for it. Null entries in the
   * given list are skipped.
   *
   * @param elements The elements to serialize.
   * @return The serialized lines (possibly empty), or null if the given list is null.
   */
  protected List<String> serializeElementsWithRespectToRoles(List<? extends PdfElement> elements) {
    if (elements == null) {
      return null;
    }

    List<String> lines = new ArrayList<>();
    for (PdfElement element : elements) {
      // Skip null entries; dereferencing them for the role lookup below would throw an NPE.
      if (element == null) {
        continue;
      }

      // Serialize the element only if its role is among the roles to serialize.
      if (!getRolesToSerialize().contains(element.getRole())) {
        continue;
      }

      String serialized = serializeElement(element);
      if (serialized != null && !serialized.isEmpty()) {
        lines.add(serialized);
      }
    }
    return lines;
  }
Example #3
0
  /**
   * Serializes the given page.
   *
   * <p>For each feature returned by {@code getFeaturesToSerialize()}, the elements of the page
   * belonging to that feature are serialized via {@code serializeElementsWithRespectToRoles}.
   * The feature {@code PdfFeature.paragraphs_with_lines} is handled separately: each paragraph
   * is serialized, directly followed by its text lines. That branch calls
   * {@code serializeElement} directly and therefore does not apply the role filter.
   *
   * <p>Null or empty serializations are never added to the result.
   *
   * @param page The page to serialize.
   * @return The serialized lines (possibly empty), or null if the given page is null.
   */
  protected List<String> serializePage(PdfPage page) {
    if (page == null) {
      return null;
    }

    List<String> lines = new ArrayList<>();

    // Obtain the features to serialize.
    List<PdfFeature> features = getFeaturesToSerialize();
    if (features == null) {
      return lines;
    }

    for (PdfFeature f : features) {
      // This feature was added for David. He needs the paragraphs with the associated lines.
      // Remove it if it is not needed anymore.
      if (f == PdfFeature.paragraphs_with_lines) {
        List<PdfTextParagraph> paragraphs = page.getParagraphs();
        if (paragraphs == null) {
          continue;
        }

        for (PdfElement para : paragraphs) {
          if (para == null) {
            continue;
          }

          // serializeElement returns null for ignored elements; don't emit such entries.
          String serializedPara = serializeElement(para);
          if (serializedPara != null && !serializedPara.isEmpty()) {
            lines.add(serializedPara);
          }

          for (PdfTextLine line : para.getTextLines()) {
            String serializedLine = serializeElement(line);
            if (serializedLine != null && !serializedLine.isEmpty()) {
              lines.add(serializedLine);
            }
          }
        }
        continue;
      }

      // Obtain the elements by feature.
      List<? extends PdfElement> elements = page.getElementsByFeature(f);

      List<String> serialized = serializeElementsWithRespectToRoles(elements);
      if (serialized != null && !serialized.isEmpty()) {
        lines.addAll(serialized);
      }
    }

    return lines;
  }
Example #4
0
  /**
   * Serializes the given element.
   *
   * <p>Instances of {@code PdfTextElement} are passed to {@code serializeTextElement}; all other
   * elements are passed to {@code serializeNonTextElement}.
   *
   * @param element The element to serialize.
   * @return The serialization of the element, or null if the element is null or if its
   *     {@code ignore()} method returns true.
   */
  protected String serializeElement(PdfElement element) {
    // TODO: Get rid of ignore method. Is still needed for dehyphenation.
    if (element == null || element.ignore()) {
      return null;
    }

    // Dispatch on the kind of element.
    return element instanceof PdfTextElement
        ? serializeTextElement((PdfTextElement) element)
        : serializeNonTextElement(element);
  }