/** Serializes the given (non-text) element. */ protected String serializeNonTextElement(PdfElement element) { if (element == null) { return null; } List<String> fields = new ArrayList<>(); // Add feature name. PdfFeature feature = element.getFeature(); fields.add(feature != null ? feature.getFieldName() : ""); // Add bounding box. Rectangle rect = element.getRectangle(); fields.add(rect != null ? String.valueOf(rect.getMinX()) : ""); fields.add(rect != null ? String.valueOf(rect.getMinY()) : ""); fields.add(rect != null ? String.valueOf(rect.getMaxX()) : ""); fields.add(rect != null ? String.valueOf(rect.getMaxY()) : ""); // Add color. PdfColor color = element.getColor(); fields.add(color != null ? color.getId() : ""); // Add role. PdfRole role = element.getRole(); fields.add(role != null ? role.name : ""); return CollectionUtils.join(fields, "\t"); }
/** Serializes the given elements with respect to the given roles. */ protected List<String> serializeElementsWithRespectToRoles(List<? extends PdfElement> elements) { if (elements == null) { return null; } List<String> lines = new ArrayList<>(); for (PdfElement element : elements) { // Serialize the element only if its role is included in the given roles. if (getRolesToSerialize().contains(element.getRole())) { String serialized = serializeElement(element); if (serialized != null && !serialized.isEmpty()) { lines.add(serialized); } } } return lines; }
/** Serializes the given page. */ protected List<String> serializePage(PdfPage page) { if (page == null) { return null; } List<String> lines = new ArrayList<>(); // Obtain the features to serialize. List<PdfFeature> features = getFeaturesToSerialize(); if (features != null) { for (PdfFeature f : features) { /** * This feature was added for David. He needs the paragraphs with the associated lines. * Remove it if it not needed anymore. */ if (f == PdfFeature.paragraphs_with_lines) { List<PdfTextParagraph> paragraphs = page.getParagraphs(); for (PdfElement para : paragraphs) { lines.add(serializeElement(para)); for (PdfTextLine line : para.getTextLines()) { lines.add(serializeElement(line)); } } continue; } // Obtain the elements by feature. List<? extends PdfElement> elements = page.getElementsByFeature(f); List<String> serialized = serializeElementsWithRespectToRoles(elements); if (serialized != null && !serialized.isEmpty()) { lines.addAll(serialized); } } } return lines; }
/** Serializes the given element. */ protected String serializeElement(PdfElement element) { if (element == null) { return null; } // TODO: Get rid of ignore method. Is still needed for dehyphenation. if (element.ignore()) { return null; } if (element instanceof PdfTextElement) { return serializeTextElement((PdfTextElement) element); } else { return serializeNonTextElement(element); } }