/**
  * Extracts the text of the marked content identified by {@code object} and prints it,
  * XML-escaped, to the output writer.
  *
  * <p>The identifier may be a number (a marked-content id that is looked up in the content of
  * {@code page}), an array (every element is handled recursively, separated by a line break) or
  * a dictionary (its {@code /MCID} entry is handled recursively). Any other identifier,
  * including {@code null}, is ignored.
  *
  * @param tag the name of the tag (only passed along on recursion)
  * @param object an identifier to find the marked content
  * @param page a page dictionary; may be {@code null}, in which case number identifiers are
  *     skipped because there is no content to search
  * @throws IOException if reading or processing the page content fails
  */
 public void parseTag(String tag, PdfObject object, PdfDictionary page) throws IOException {
   // if the identifier is a number, we can extract the content right away
   if (object instanceof PdfNumber) {
     // The page is only needed (and only dereferenced) in this branch, so identifiers that
     // carry no page information no longer fail before their type is even looked at.
     if (page == null) {
       return;
     }
     PdfNumber mcid = (PdfNumber) object;
     RenderFilter filter = new MarkedContentRenderFilter(mcid.intValue());
     TextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
     FilteredTextRenderListener listener = new FilteredTextRenderListener(strategy, filter);
     PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
     processor.processContent(pageContentBytes(page), page.getAsDict(PdfName.RESOURCES));
     out.print(SimpleXMLParser.escapeXML(listener.getResultantText(), true));
   }
   // if the identifier is an array, we call the parseTag method
   // recursively
   else if (object instanceof PdfArray) {
     PdfArray arr = (PdfArray) object;
     int n = arr.size();
     for (int i = 0; i < n; i++) {
       parseTag(tag, arr.getPdfObject(i), page);
       // separate the elements, but do not add a trailing line break
       if (i < n - 1) {
         out.println();
       }
     }
   }
   // if the identifier is a dictionary, we take the marked content id (and, when present,
   // the page) from the dictionary
   else if (object instanceof PdfDictionary) {
     PdfDictionary mcr = (PdfDictionary) object;
     // The /Pg entry is optional; without it the content lives on the page we already have.
     PdfDictionary mcrPage = mcr.getAsDict(PdfName.PG);
     parseTag(tag, mcr.getDirectObject(PdfName.MCID), mcrPage != null ? mcrPage : page);
   }
 }

 /**
  * Returns the decoded content of a page. The {@code /Contents} entry may be a single stream or
  * an array of streams; the streams of an array are concatenated, each followed by a line
  * break so that tokens of adjacent streams cannot run together.
  *
  * @param page a page dictionary, not {@code null}
  * @return the content bytes, or an empty array if the page has no readable content stream
  * @throws IOException if a stream cannot be read
  */
 private byte[] pageContentBytes(PdfDictionary page) throws IOException {
   PdfObject contents = page.getDirectObject(PdfName.CONTENTS);
   if (contents instanceof PRStream) {
     return PdfReader.getStreamBytes((PRStream) contents);
   }
   if (contents instanceof PdfArray) {
     PdfArray parts = (PdfArray) contents;
     java.io.ByteArrayOutputStream joined = new java.io.ByteArrayOutputStream();
     int n = parts.size();
     for (int i = 0; i < n; i++) {
       PdfObject part = parts.getDirectObject(i);
       // skip entries that are missing or are not streams read from a file
       if (part instanceof PRStream) {
         joined.write(PdfReader.getStreamBytes((PRStream) part));
         joined.write('\n');
       }
     }
     return joined.toByteArray();
   }
   return new byte[0];
 }
 /**
  * Converts the structure tree of a tagged PDF to XML and writes the result to a stream.
  *
  * <p>The output stream is flushed and closed before this method returns, also when the
  * conversion fails.
  *
  * @param reader the PdfReader that has access to the PDF file
  * @param os the OutputStream to which the resulting xml will be written
  * @param charset the charset to encode the data
  * @throws IOException if the charset is not supported, if the document has no StructTreeRoot
  *     (i.e. it is not a tagged PDF), or if inspecting the structure tree fails
  * @since 5.0.5
  */
 public void convertToXml(PdfReader reader, OutputStream os, String charset) throws IOException {
   this.reader = reader;
   OutputStreamWriter outs = new OutputStreamWriter(os, charset);
   out = new PrintWriter(outs);
   try {
     // get the StructTreeRoot from the root object
     PdfDictionary catalog = reader.getCatalog();
     PdfDictionary struct = catalog.getAsDict(PdfName.STRUCTTREEROOT);
     // An untagged document has no structure tree; report that instead of failing with a
     // NullPointerException on the lookup below.
     if (struct == null) {
       throw new IOException("No StructTreeRoot found: the document is not a tagged PDF");
     }
     // Inspect the child or children of the StructTreeRoot
     inspectChild(struct.getDirectObject(PdfName.K));
     out.flush();
   } finally {
     // close() also flushes, so partial output is not lost when the traversal fails
     out.close();
   }
 }
 /**
  * Inspects a structure element dictionary. If the dictionary has a structure type
  * ({@code /S}), an XML tag named after that type is written around the marked content of the
  * element (only looked up when the element has a {@code /Pg} page entry) and around whatever
  * is written for its children; otherwise only the children are inspected.
  *
  * @param k the child dictionary to inspect; {@code null} is ignored
  * @throws IOException if extracting the marked content or inspecting the children fails
  */
 public void inspectChildDictionary(PdfDictionary k) throws IOException {
   if (k == null) {
     return;
   }
   // Resolve the /K entry once. The children may be stored as an indirect reference, so they
   // are looked up with getDirectObject, the same way the StructTreeRoot's children are.
   PdfObject kids = k.getDirectObject(PdfName.K);
   PdfName s = k.getAsName(PdfName.S);
   if (s == null) {
     // no structure type: nothing to draw, just descend
     inspectChild(kids);
     return;
   }
   String tagN = PdfName.decodeName(s.toString());
   String tag = fixTagName(tagN);
   out.print("<");
   out.print(tag);
   out.print(">");
   PdfDictionary dict = k.getAsDict(PdfName.PG);
   if (dict != null) {
     parseTag(tagN, kids, dict);
   }
   inspectChild(kids);
   out.print("</");
   out.print(tag);
   out.println(">");
 }