/** * Searches for a tag in a page. * * @param tag the name of the tag * @param object an identifier to find the marked content * @param page a page dictionary * @throws IOException */ public void parseTag(String tag, PdfObject object, PdfDictionary page) throws IOException { PRStream stream = (PRStream) page.getAsStream(PdfName.CONTENTS); // if the identifier is a number, we can extract the content right away if (object instanceof PdfNumber) { PdfNumber mcid = (PdfNumber) object; RenderFilter filter = new MarkedContentRenderFilter(mcid.intValue()); TextExtractionStrategy strategy = new SimpleTextExtractionStrategy(); FilteredTextRenderListener listener = new FilteredTextRenderListener(strategy, filter); PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener); processor.processContent(PdfReader.getStreamBytes(stream), page.getAsDict(PdfName.RESOURCES)); out.print(SimpleXMLParser.escapeXML(listener.getResultantText(), true)); } // if the identifier is an array, we call the parseTag method // recursively else if (object instanceof PdfArray) { PdfArray arr = (PdfArray) object; int n = arr.size(); for (int i = 0; i < n; i++) { parseTag(tag, arr.getPdfObject(i), page); if (i < n - 1) out.println(); } } // if the identifier is a dictionary, we get the resources from the // dictionary else if (object instanceof PdfDictionary) { PdfDictionary mcr = (PdfDictionary) object; parseTag(tag, mcr.getDirectObject(PdfName.MCID), mcr.getAsDict(PdfName.PG)); } }
/**
 * Feeds the content of one page through a content stream processor that
 * reports to the given listener.
 *
 * @param <E> the concrete listener type, returned as-is so calls can be chained
 * @param pageNumber the page number to process
 * @param renderListener the listener that will receive render callbacks
 * @return the same {@code renderListener} instance that was passed in
 * @throws IOException if operations on the reader fail
 */
public <E extends RenderListener> E processContent(int pageNumber, E renderListener) throws IOException {
    // Look up the page and its resources before any content is parsed.
    final PdfDictionary page = reader.getPageN(pageNumber);
    final PdfDictionary resources = page.getAsDict(PdfName.RESOURCES);

    final PdfContentStreamProcessor contentProcessor = new PdfContentStreamProcessor(renderListener);
    final byte[] contentBytes = ContentByteUtils.getContentBytesForPage(reader, pageNumber);
    contentProcessor.processContent(contentBytes, resources);

    return renderListener;
}