/**
  * Extracts the document level attachments of a PDF file and writes each one to disk.
  *
  * <p>The {@code Names} array of the catalog's {@code EmbeddedFiles} dictionary is read as
  * alternating name / file specification pairs. For every key of a file specification's
  * {@code EF} dictionary, the referenced stream is written to the location obtained by formatting
  * {@code PATH} with the file name stored under the same key in the file specification.
  *
  * <p>A document without an {@code EmbeddedFiles} name array is treated as having nothing to
  * extract. Only a {@code Names} array placed directly on the {@code EmbeddedFiles} dictionary is
  * read.
  *
  * @param filename a file from which document level attachments will be extracted
  * @throws IOException if the PDF cannot be read or an attachment cannot be written
  */
 public void extractDocLevelAttachments(String filename) throws IOException {
   PdfReader reader = new PdfReader(filename);
   try {
     PdfDictionary root = reader.getCatalog();
     PdfDictionary documentnames = root.getAsDict(PdfName.NAMES);
     PdfDictionary embeddedfiles =
         documentnames == null ? null : documentnames.getAsDict(PdfName.EMBEDDEDFILES);
     PdfArray filespecs = embeddedfiles == null ? null : embeddedfiles.getAsArray(PdfName.NAMES);
     if (filespecs == null) {
       return; // no document level attachments to extract
     }
     // Entries come in (name, file specification) pairs; only the specification is needed.
     // The "i + 1" bound skips a dangling name at the end of an odd-length array.
     for (int i = 0; i + 1 < filespecs.size(); i += 2) {
       PdfDictionary filespec = filespecs.getAsDict(i + 1);
       PdfDictionary refs = filespec == null ? null : filespec.getAsDict(PdfName.EF);
       if (refs == null) {
         continue; // no embedded file streams in this entry
       }
       for (PdfName key : refs.getKeys()) {
         if (filespec.getAsString(key) == null) {
           continue; // no file name stored for this key
         }
         PRStream stream = (PRStream) PdfReader.getPdfObject(refs.getAsIndirectObject(key));
         if (stream == null) {
           continue; // key does not resolve to an embedded stream
         }
         // Keep only the last path segment so a name such as "../x" inside the PDF cannot
         // redirect the output to another directory.
         String name = new java.io.File(filespec.getAsString(key).toString()).getName();
         // Read the bytes before opening the target so a read failure leaves no empty file.
         byte[] content = PdfReader.getStreamBytes(stream);
         FileOutputStream fos = new FileOutputStream(String.format(PATH, name));
         try {
           fos.write(content);
           fos.flush();
         } finally {
           fos.close();
         }
       }
     }
   } finally {
     reader.close();
   }
 }
 /**
  * Searches for a tag in a page and prints the text of the matching marked content to
  * {@code out}.
  *
  * <p>The identifier decides what happens:
  *
  * <ul>
  *   <li>a number is used as marked-content id: the page content is processed with a filter on
  *       that id and the resulting text is printed XML-escaped;
  *   <li>an array is handled element by element, with a line break printed between elements;
  *   <li>a dictionary is followed through its {@code MCID} entry, using the dictionary's own
  *       {@code Pg} entry as page when it has one and the current page otherwise;
  *   <li>anything else (including {@code null}) is ignored.
  * </ul>
  *
  * @param tag the name of the tag; only handed on to recursive calls
  * @param object an identifier to find the marked content
  * @param page a page dictionary
  * @throws IOException
  */
 public void parseTag(String tag, PdfObject object, PdfDictionary page) throws IOException {
   // if the identifier is a number, we can extract the content right away
   if (object instanceof PdfNumber) {
     PdfNumber mcid = (PdfNumber) object;
     // The content stream is only needed in this branch, so it is looked up here rather than
     // up front; the other branches must not fail just because they were given no usable page.
     PRStream stream = (PRStream) page.getAsStream(PdfName.CONTENTS);
     RenderFilter filter = new MarkedContentRenderFilter(mcid.intValue());
     TextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
     FilteredTextRenderListener listener = new FilteredTextRenderListener(strategy, filter);
     PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
     processor.processContent(PdfReader.getStreamBytes(stream), page.getAsDict(PdfName.RESOURCES));
     out.print(SimpleXMLParser.escapeXML(listener.getResultantText(), true));
   }
   // if the identifier is an array, we call the parseTag method
   // recursively
   else if (object instanceof PdfArray) {
     PdfArray arr = (PdfArray) object;
     int n = arr.size();
     for (int i = 0; i < n; i++) {
       parseTag(tag, arr.getPdfObject(i), page);
       if (i < n - 1) out.println();
     }
   }
   // if the identifier is a dictionary, we get the resources from the
   // dictionary
   else if (object instanceof PdfDictionary) {
     PdfDictionary mcr = (PdfDictionary) object;
     // Pg is optional on the dictionary; without it the content lives on the page we already
     // have, so fall back to that instead of passing null down.
     PdfDictionary mcrPage = mcr.getAsDict(PdfName.PG);
     parseTag(tag, mcr.getDirectObject(PdfName.MCID), mcrPage != null ? mcrPage : page);
   }
 }
Example #3
0
 /**
  * Registers the collected form fields in the catalog.
  *
  * <p>Does nothing when {@code fieldArray} is {@code null}. Otherwise a new AcroForm dictionary
  * with the fields and the default appearance string is put into the catalog. When field
  * templates exist, their resources are merged into a default resources dictionary, and font
  * entries for {@code Helv} and {@code ZaDb} are added to it unless already present.
  *
  * @param catalog the catalog dictionary that receives the AcroForm entry
  * @throws IOException passed on from writing the font dictionaries to the body
  */
 private void addFieldResources(PdfDictionary catalog) throws IOException {
   if (fieldArray == null) {
     return;
   }

   PdfDictionary form = new PdfDictionary();
   catalog.put(PdfName.ACROFORM, form);
   form.put(PdfName.FIELDS, fieldArray);
   form.put(PdfName.DA, new PdfString("/Helv 0 Tf 0 g "));

   // Without templates there are no resources to publish.
   if (fieldTemplates.isEmpty()) {
     return;
   }

   PdfDictionary defaultResources = new PdfDictionary();
   form.put(PdfName.DR, defaultResources);
   for (PdfTemplate fieldTemplate : fieldTemplates) {
     PdfDictionary templateResources = (PdfDictionary) fieldTemplate.getResources();
     PdfFormField.mergeResources(defaultResources, templateResources);
   }

   // Reuse the merged font dictionary when there is one, otherwise start an empty one.
   PdfDictionary fontResources = defaultResources.getAsDict(PdfName.FONT);
   if (fontResources == null) {
     fontResources = new PdfDictionary();
     defaultResources.put(PdfName.FONT, fontResources);
   }

   boolean hasHelvetica = fontResources.contains(PdfName.HELV);
   if (!hasHelvetica) {
     PdfDictionary helvetica = new PdfDictionary(PdfName.FONT);
     helvetica.put(PdfName.BASEFONT, PdfName.HELVETICA);
     helvetica.put(PdfName.ENCODING, PdfName.WIN_ANSI_ENCODING);
     helvetica.put(PdfName.NAME, PdfName.HELV);
     helvetica.put(PdfName.SUBTYPE, PdfName.TYPE1);
     fontResources.put(PdfName.HELV, addToBody(helvetica).getIndirectReference());
   }

   boolean hasZapfDingbats = fontResources.contains(PdfName.ZADB);
   if (!hasZapfDingbats) {
     // Unlike the Helvetica entry, this one is written without an Encoding entry.
     PdfDictionary zapfDingbats = new PdfDictionary(PdfName.FONT);
     zapfDingbats.put(PdfName.BASEFONT, PdfName.ZAPFDINGBATS);
     zapfDingbats.put(PdfName.NAME, PdfName.ZADB);
     zapfDingbats.put(PdfName.SUBTYPE, PdfName.TYPE1);
     fontResources.put(PdfName.ZADB, addToBody(zapfDingbats).getIndirectReference());
   }
 }
  /**
   * Runs the content of one page through a content stream processor that reports to the given
   * listener.
   *
   * @param <E> the concrete listener type, so the same instance can be returned for chaining
   * @param pageNumber the number of the page whose content is processed
   * @param renderListener the listener that receives the render callbacks
   * @return the {@code renderListener} that was passed in
   * @throws IOException if operations on the reader fail
   */
  public <E extends RenderListener> E processContent(int pageNumber, E renderListener)
      throws IOException {
    // The resources are taken from the page dictionary itself.
    PdfDictionary resources = reader.getPageN(pageNumber).getAsDict(PdfName.RESOURCES);
    PdfContentStreamProcessor contentProcessor = new PdfContentStreamProcessor(renderListener);
    byte[] pageContent = ContentByteUtils.getContentBytesForPage(reader, pageNumber);
    contentProcessor.processContent(pageContent, resources);
    return renderListener;
  }
Example #5
0
 /**
  * Returns the content object kept in {@code over}, creating it on first use.
  *
  * <p>On creation, {@code pageResources} is initialised from the {@code Resources} entry of
  * {@code pageN} if it has not been set up yet.
  *
  * @return the content object kept in {@code over}
  */
 public PdfContentByte getOverContent() {
   if (over != null) {
     return over;
   }
   if (pageResources == null) {
     pageResources = new PageResources();
     pageResources.setOriginalResources(pageN.getAsDict(PdfName.RESOURCES), cstp.namePtr);
   }
   over = new PdfCopy.StampContent(cstp, pageResources);
   return over;
 }
  /**
   * Processes the content of page 1 of the given PDF with a {@code TextRenderListener} and
   * returns what the listener collected.
   *
   * @param pdfAsByteArray the PDF document as bytes
   * @return the value of the listener's {@code getSb()} after processing page 1
   * @throws IOException if the PDF cannot be read or its content cannot be processed
   */
  private String extractTextFromPdf(byte[] pdfAsByteArray) throws IOException {
    PdfReader reader = new PdfReader(pdfAsByteArray);
    try {
      TextRenderListener listener = new TextRenderListener();
      PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
      PdfDictionary pageDic = reader.getPageN(1);
      PdfDictionary resourcesDic = pageDic.getAsDict(PdfName.RESOURCES);

      byte[] array = ContentByteUtils.getContentBytesForPage(reader, 1);
      processor.processContent(array, resourcesDic);

      return listener.getSb();
    } finally {
      // Release the reader on success and on failure alike.
      reader.close();
    }
  }
 /**
  * Writes the structure tree of a PDF document as XML.
  *
  * <p>The {@code StructTreeRoot} is looked up in the document catalog and its {@code K} entry is
  * handed to {@code inspectChild}, which produces the output. The writer wrapped around
  * {@code os} is always closed before this method returns, also when it fails.
  *
  * @param reader the PdfReader that has access to the PDF file
  * @param os the OutputStream to which the resulting xml will be written
  * @param charset the charset to encode the data
  * @throws IOException if the catalog has no {@code StructTreeRoot}, the charset is not
  *     supported, or inspecting the structure fails
  * @since 5.0.5
  */
 public void convertToXml(PdfReader reader, OutputStream os, String charset) throws IOException {
   this.reader = reader;
   OutputStreamWriter outs = new OutputStreamWriter(os, charset);
   out = new PrintWriter(outs);
   try {
     // get the StructTreeRoot from the root object
     PdfDictionary catalog = reader.getCatalog();
     PdfDictionary struct = catalog.getAsDict(PdfName.STRUCTTREEROOT);
     if (struct == null) {
       // Without a structure tree there is nothing to convert; say so instead of failing
       // with a NullPointerException on the next line.
       throw new IOException("No StructTreeRoot found in the document catalog.");
     }
     // Inspect the child or children of the StructTreeRoot
     inspectChild(struct.getDirectObject(PdfName.K));
     out.flush();
   } finally {
     out.close();
   }
 }
 /**
  * Inspects a dictionary that is a child of a structure element.
  *
  * <p>When the dictionary has an {@code S} name, an opening and closing tag derived from that
  * name is printed around its content; if it also has a {@code Pg} dictionary, its {@code K}
  * entry is first passed to {@code parseTag}. In every case the {@code K} entry is handed to
  * {@code inspectChild}.
  *
  * @param k the child dictionary to inspect; {@code null} is ignored
  */
 public void inspectChildDictionary(PdfDictionary k) throws IOException {
   if (k == null) {
     return;
   }

   PdfName structureType = k.getAsName(PdfName.S);
   if (structureType == null) {
     // No name to print: just descend.
     inspectChild(k.get(PdfName.K));
     return;
   }

   String decodedName = PdfName.decodeName(structureType.toString());
   String xmlName = fixTagName(decodedName);

   out.print("<" + xmlName + ">");
   PdfDictionary page = k.getAsDict(PdfName.PG);
   if (page != null) {
     parseTag(decodedName, k.getDirectObject(PdfName.K), page);
   }
   inspectChild(k.get(PdfName.K));
   out.println("</" + xmlName + ">");
 }
 /**
  * Appends the entries of the reader's AcroForm {@code CO} array to {@code calculationOrder}.
  *
  * <p>Only indirect entries are considered. An entry is added, prefixed with a dot, when the
  * reader's fields contain an item for its name and the prefixed name is not in the list yet.
  *
  * @param reader the document whose {@code CO} array is read
  * @since 2.1.5; before 2.1.5 the method was private
  */
 protected void updateCalculationOrder(PdfReader reader) {
   PdfDictionary acroForm = reader.getCatalog().getAsDict(PdfName.ACROFORM);
   if (acroForm == null) {
     return;
   }
   PdfArray order = acroForm.getAsArray(PdfName.CO);
   if (order == null || order.size() == 0) {
     return;
   }

   AcroFields fields = reader.getAcroFields();
   for (int idx = 0; idx < order.size(); ++idx) {
     PdfObject entry = order.getPdfObject(idx);
     if (entry != null && entry.isIndirect()) {
       String fieldName = getCOName(reader, (PRIndirectReference) entry);
       if (fields.getFieldItem(fieldName) != null) {
         String dottedName = "." + fieldName;
         if (!calculationOrder.contains(dottedName)) {
           calculationOrder.add(dottedName);
         }
       }
     }
   }
 }
Example #10
0
 /**
  * Fills the encoding tables, widths and metrics of this font from its {@code font} dictionary.
  *
  * <p>Steps, in order:
  *
  * <ol>
  *   <li>Encoding: with no {@code Encoding} entry the default encoding is filled and the reverse
  *       of the ToUnicode mapping (if any) is added to {@code uni2byte}. A name is passed to
  *       {@code fillEncoding}. A dictionary contributes its {@code BaseEncoding} and its
  *       {@code Differences} array.
  *   <li>If {@code fontName} is one of {@code BuiltinFonts14}, widths and font descriptor values
  *       are taken from the corresponding font created by {@code BaseFont.createFont}.
  *   <li>If {@code FirstChar}, {@code LastChar} and {@code Widths} are all present, the widths
  *       from the dictionary are stored starting at {@code FirstChar}.
  *   <li>The {@code FontDescriptor} dictionary is passed to {@code fillFontDesc}.
  * </ol>
  */
 private void doType1TT() {
   CMapToUnicode toUnicode = null;
   PdfObject enc = PdfReader.getPdfObject(font.get(PdfName.ENCODING));
   if (enc == null) {
     fillEncoding(null);
     try {
       toUnicode = processToUnicode();
       if (toUnicode != null) {
         Map<Integer, Integer> rm = toUnicode.createReverseMapping();
         for (Map.Entry<Integer, Integer> kv : rm.entrySet()) {
           uni2byte.put(kv.getKey().intValue(), kv.getValue().intValue());
         }
       }
     } catch (Exception ex) {
       throw new ExceptionConverter(ex);
     }
   } else {
     if (enc.isName()) fillEncoding((PdfName) enc);
     else if (enc.isDictionary()) {
       PdfDictionary encDic = (PdfDictionary) enc;
       enc = PdfReader.getPdfObject(encDic.get(PdfName.BASEENCODING));
       if (enc == null) fillEncoding(null);
       else fillEncoding((PdfName) enc);
       PdfArray diffs = encDic.getAsArray(PdfName.DIFFERENCES);
       if (diffs != null) {
         diffmap = new IntHashtable();
         // A number in the array sets the current code; every other entry is treated as a
         // glyph name for the current code, which is then incremented.
         int currentNumber = 0;
         for (int k = 0; k < diffs.size(); ++k) {
           PdfObject obj = diffs.getPdfObject(k);
           if (obj.isNumber()) currentNumber = ((PdfNumber) obj).intValue();
           else {
             int c[] = GlyphList.nameToUnicode(PdfName.decodeName(((PdfName) obj).toString()));
             if (c != null && c.length > 0) {
               uni2byte.put(c[0], currentNumber);
               diffmap.put(c[0], currentNumber);
             } else {
               // The glyph list gave nothing for this name: try the ToUnicode mapping for the
               // current code instead. It is loaded once and replaced by an empty mapping
               // when the font has none.
               if (toUnicode == null) {
                 toUnicode = processToUnicode();
                 if (toUnicode == null) {
                   toUnicode = new CMapToUnicode();
                 }
               }
               final String unicode = toUnicode.lookup(new byte[] {(byte) currentNumber}, 0, 1);
               // Only results of exactly one char are used.
               if ((unicode != null) && (unicode.length() == 1)) {
                 this.uni2byte.put(unicode.charAt(0), currentNumber);
                 this.diffmap.put(unicode.charAt(0), currentNumber);
               }
             }
             ++currentNumber;
           }
         }
       }
     }
   }
   PdfArray newWidths = font.getAsArray(PdfName.WIDTHS);
   PdfNumber first = font.getAsNumber(PdfName.FIRSTCHAR);
   // LastChar is only checked for presence below; its value is not read.
   PdfNumber last = font.getAsNumber(PdfName.LASTCHAR);
   if (BuiltinFonts14.containsKey(fontName)) {
     BaseFont bf;
     try {
       bf = BaseFont.createFont(fontName, WINANSI, false);
     } catch (Exception e) {
       throw new ExceptionConverter(e);
     }
     int e[] = uni2byte.toOrderedKeys();
     for (int k = 0; k < e.length; ++k) {
       int n = uni2byte.get(e[k]);
       widths[n] = bf.getRawWidth(n, GlyphList.unicodeToName(e[k]));
     }
     if (diffmap != null) { // widths for diffmap must override existing ones
       e = diffmap.toOrderedKeys();
       for (int k = 0; k < e.length; ++k) {
         int n = diffmap.get(e[k]);
         widths[n] = bf.getRawWidth(n, GlyphList.unicodeToName(e[k]));
       }
       diffmap = null;
     }
     ascender = bf.getFontDescriptor(ASCENT, 1000);
     capHeight = bf.getFontDescriptor(CAPHEIGHT, 1000);
     descender = bf.getFontDescriptor(DESCENT, 1000);
     italicAngle = bf.getFontDescriptor(ITALICANGLE, 1000);
     fontWeight = bf.getFontDescriptor(FONT_WEIGHT, 1000);
     llx = bf.getFontDescriptor(BBOXLLX, 1000);
     lly = bf.getFontDescriptor(BBOXLLY, 1000);
     urx = bf.getFontDescriptor(BBOXURX, 1000);
     ury = bf.getFontDescriptor(BBOXURY, 1000);
   }
   if (first != null && last != null && newWidths != null) {
     int f = first.intValue();
     int nSize = f + newWidths.size();
     if (widths.length < nSize) {
       int[] tmp = new int[nSize];
       // Keep the entries below FirstChar; everything from FirstChar on is overwritten by the
       // loop below. FirstChar may lie beyond the end of the current array, and asking
       // System.arraycopy for more elements than the source holds throws, so the length is
       // limited to what exists.
       System.arraycopy(widths, 0, tmp, 0, Math.min(f, widths.length));
       widths = tmp;
     }
     for (int k = 0; k < newWidths.size(); ++k) {
       widths[f + k] = newWidths.getAsNumber(k).intValue();
     }
   }
   fillFontDesc(font.getAsDict(PdfName.FONTDESCRIPTOR));
 }