/** * Searches for a tag in a page. * * @param tag the name of the tag * @param object an identifier to find the marked content * @param page a page dictionary * @throws IOException */ public void parseTag(String tag, PdfObject object, PdfDictionary page) throws IOException { PRStream stream = (PRStream) page.getAsStream(PdfName.CONTENTS); // if the identifier is a number, we can extract the content right away if (object instanceof PdfNumber) { PdfNumber mcid = (PdfNumber) object; RenderFilter filter = new MarkedContentRenderFilter(mcid.intValue()); TextExtractionStrategy strategy = new SimpleTextExtractionStrategy(); FilteredTextRenderListener listener = new FilteredTextRenderListener(strategy, filter); PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener); processor.processContent(PdfReader.getStreamBytes(stream), page.getAsDict(PdfName.RESOURCES)); out.print(SimpleXMLParser.escapeXML(listener.getResultantText(), true)); } // if the identifier is an array, we call the parseTag method // recursively else if (object instanceof PdfArray) { PdfArray arr = (PdfArray) object; int n = arr.size(); for (int i = 0; i < n; i++) { parseTag(tag, arr.getPdfObject(i), page); if (i < n - 1) out.println(); } } // if the identifier is a dictionary, we get the resources from the // dictionary else if (object instanceof PdfDictionary) { PdfDictionary mcr = (PdfDictionary) object; parseTag(tag, mcr.getDirectObject(PdfName.MCID), mcr.getAsDict(PdfName.PG)); } }
/** * Parses a string with structured content. * * @param reader the PdfReader that has access to the PDF file * @param os the OutputStream to which the resulting xml will be written * @param charset the charset to encode the data * @since 5.0.5 */ public void convertToXml(PdfReader reader, OutputStream os, String charset) throws IOException { this.reader = reader; OutputStreamWriter outs = new OutputStreamWriter(os, charset); out = new PrintWriter(outs); // get the StructTreeRoot from the root object PdfDictionary catalog = reader.getCatalog(); PdfDictionary struct = catalog.getAsDict(PdfName.STRUCTTREEROOT); // Inspect the child or children of the StructTreeRoot inspectChild(struct.getDirectObject(PdfName.K)); out.flush(); out.close(); }
/**
 * Inspects a dictionary found among the children of a structure element.
 *
 * <p>If the dictionary has an S (structure type) entry, an opening tag derived from that name
 * is printed, followed by the marked content of the element (only when the dictionary has a
 * Pg entry), the output of inspecting its K entry, and the matching closing tag. Without an
 * S entry no tag is printed and only the K entry is inspected.
 *
 * @param k the child dictionary to inspect; {@code null} is ignored
 * @throws IOException if reading the PDF content fails
 */
public void inspectChildDictionary(PdfDictionary k) throws IOException {
    if (k == null) {
        return;
    }
    // Resolve the K entry once and hand the resolved object to both parseTag and
    // inspectChild, so that a K entry stored as an indirect reference is treated the same
    // way in both calls (convertToXml also resolves K before calling inspectChild).
    PdfObject kids = k.getDirectObject(PdfName.K);
    PdfName s = k.getAsName(PdfName.S);
    if (s == null) {
        // not a tagged element itself: just descend
        inspectChild(kids);
        return;
    }
    String tagN = PdfName.decodeName(s.toString());
    String tag = fixTagName(tagN);
    out.print("<");
    out.print(tag);
    out.print(">");
    PdfDictionary dict = k.getAsDict(PdfName.PG);
    if (dict != null) {
        parseTag(tagN, kids, dict);
    }
    inspectChild(kids);
    out.print("</");
    out.print(tag);
    out.println(">");
}