Ejemplo n.º 1
0
  /**
   * Reads a PDF file and shows a summary of its metadata in {@code label}.
   *
   * <p>The summary is an HTML string that lists the crop-box size of the first page, the PDF
   * version, the number of pages and PDF objects, the file length, the encryption flag and, when
   * present, the Title, Author, Subject, Producer, ModDate and CreationDate entries of the
   * document information dictionary.
   *
   * <p>If the file does not exist the label is left untouched. If reading the file fails with an
   * {@link IOException} the label text is cleared.
   *
   * @param file the PDF file to inspect
   */
  public void createTextFromPDF(File file) {
    if (!file.exists()) {
      return;
    }

    // The crop box of the first page is used as the representative page size.
    final int page = 1;
    PdfReader reader = null;

    try {
      reader = new PdfReader(new RandomAccessFileOrArray(file.getAbsolutePath()), null);
      HashMap<String, String> pdfinfo = reader.getInfo();

      // NOTE(review): the info values come straight from the PDF and are inserted into HTML
      // unescaped; consider escaping them if untrusted documents are displayed.
      StringBuilder sb = new StringBuilder();
      sb.append("<html>=== Document Information ===<p>");
      sb.append(
          reader.getCropBox(page).getHeight() + "*" + reader.getCropBox(page).getWidth() + "<p>");
      sb.append("PDF Version: " + reader.getPdfVersion() + "<p>");
      sb.append("Number of pages: " + reader.getNumberOfPages() + "<p>");
      sb.append("Number of PDF objects: " + reader.getXrefSize() + "<p>");
      sb.append("File length: " + reader.getFileLength() + "<p>");
      sb.append("Encrypted= " + reader.isEncrypted() + "<p>");
      appendInfoEntry(sb, "Title= ", pdfinfo.get("Title"));
      appendInfoEntry(sb, "Author= ", pdfinfo.get("Author"));
      appendInfoEntry(sb, "Subject= ", pdfinfo.get("Subject"));
      appendInfoEntry(sb, "Producer= ", pdfinfo.get("Producer"));
      appendInfoDate(sb, "ModDate= ", pdfinfo.get("ModDate"));
      appendInfoDate(sb, "CreationDate= ", pdfinfo.get("CreationDate"));
      sb.append("</html>");
      label.setText(sb.toString());
    } catch (IOException ex) {
      // Best effort: an unreadable file simply results in an empty label.
      label.setText("");
    } finally {
      // Release the underlying file handle; the original code never closed the reader.
      if (reader != null) {
        reader.close();
      }
    }
  }

  /** Appends {@code prefix + value + "<p>"} to the buffer when the value is present. */
  private void appendInfoEntry(StringBuilder sb, String prefix, String value) {
    if (value != null) {
      sb.append(prefix).append(value).append("<p>");
    }
  }

  /**
   * Appends a PDF date entry, decoded to a {@link java.util.Date}, when the entry is present. If
   * the raw string cannot be decoded, the raw value is shown instead of failing.
   */
  private void appendInfoDate(StringBuilder sb, String prefix, String rawPdfDate) {
    if (rawPdfDate == null) {
      return;
    }
    // PdfDate.decode is documented to return null when the string is not a valid PDF date.
    java.util.Calendar decoded = PdfDate.decode(rawPdfDate);
    if (decoded != null) {
      sb.append(prefix).append(decoded.getTime()).append("<p>");
    } else {
      sb.append(prefix).append(rawPdfDate).append("<p>");
    }
  }
Ejemplo n.º 2
0
  /**
   * Searches all objects of the currently processed PDF file for image streams and writes each of
   * them to disk, either into the given export directory or into the directory where the original
   * PDF file is stored.
   *
   * <p>The file name of each image is built from the original PDF file name (without extension),
   * a page placeholder (currently always {@code p000}), the object's cross-reference number and,
   * if available, the internal image name. The stream bytes (decoded for FlateDecode streams, raw
   * otherwise) are always written to a file with a {@code .jpg} extension, regardless of the
   * actual image encoding.
   *
   * <p>For DeviceRGB/FlateDecode and DeviceGray/RunLengthDecode images whose width, height and
   * bits-per-component are known, an additional {@code ..._imageIO.png} file is written via
   * {@link ImageIO}.
   *
   * @param fullExportDirectoryPath The optional full export path where the images should be stored.
   *     If not given, the location of the original PDF file is used.
   * @throws Exception if a stream cannot be read from the PDF or an image file cannot be written
   */
  private void imageExtractor(String fullExportDirectoryPath) throws Exception {
    if (fullExportDirectoryPath != null) {
      // NOTE(review): checkDirectoryPath presumably normalises the path so that a file name can be
      // appended directly (see the concatenation below) - confirm against GlobalTools.
      fullExportDirectoryPath = GlobalTools.checkDirectoryPath(fullExportDirectoryPath);
      File exportDirectory = new File(fullExportDirectoryPath);
      if (!exportDirectory.exists()) {
        exportDirectory.mkdirs();
      }
    }

    int totalNumberOfPDFObjects = pdfReader.getXrefSize();
    for (int pdfObjectCounter = 0; pdfObjectCounter < totalNumberOfPDFObjects; pdfObjectCounter++) {
      PdfObject pdfObject = pdfReader.getPdfObject(pdfObjectCounter);
      if (pdfObject == null || !pdfObject.isStream()) {
        continue;
      }

      // Only streams whose subtype is /Image are of interest.
      PdfStream pdfStream = (PdfStream) pdfObject;
      PdfObject pdfObjectSubType = pdfStream.get(PdfName.SUBTYPE);
      if (pdfObjectSubType == null
          || !pdfObjectSubType.toString().equals(PdfName.IMAGE.toString())) {
        continue;
      }

      // FlateDecode streams are decoded; every other filter is not supported yet, so the raw
      // stream bytes are exported unchanged.
      PdfName filter = pdfStream.getAsName(PdfName.FILTER);
      byte[] byteArrayImage;
      if (PdfName.FLATEDECODE.equals(filter)) {
        byteArrayImage = PdfReader.getStreamBytes((PRStream) pdfStream);
      } else {
        byteArrayImage = PdfReader.getStreamBytesRaw((PRStream) pdfStream);
      }

      // Extract the internal image name; PDF names are rendered with a leading slash.
      PdfObject nameObject = pdfStream.get(PdfName.NAME);
      String streamImageName = (nameObject == null ? null : nameObject.toString());
      if (streamImageName != null
          && streamImageName.length() > 1
          && streamImageName.startsWith("/")) {
        streamImageName = streamImageName.substring(1);
      } else {
        streamImageName = null;
      }

      String exportFileWithoutExtension =
          (fullExportDirectoryPath != null ? fullExportDirectoryPath : this.fullPDFDirectoryPath)
              + GlobalTools.getFileNameWithoutExtension(this.fullPDFFilePath)
              + "_("
              + "p000"
              + "_ref"
              + REF_NUMBER_FORMAT.format(pdfObjectCounter)
              + (streamImageName == null ? "_unk" : "_" + streamImageName)
              + ")";

      // Dump the stream bytes; the stream is closed even when writing fails.
      FileOutputStream fileOutputStream = new FileOutputStream(exportFileWithoutExtension + ".jpg");
      try {
        fileOutputStream.write(byteArrayImage);
        fileOutputStream.flush();
      } finally {
        fileOutputStream.close();
      }

      // Image details; -1 marks an entry that is missing or not an integer.
      int pdfImageBitsPerComponent = readIntEntry(pdfStream, PdfName.BITSPERCOMPONENT);
      int pdfImageHeight = readIntEntry(pdfStream, PdfName.HEIGHT);
      int pdfImageWidth = readIntEntry(pdfStream, PdfName.WIDTH);

      logger.debug("Height..........:" + pdfImageHeight);
      logger.debug("Width...........:" + pdfImageWidth);
      logger.debug("BitsPerComponent:" + pdfImageBitsPerComponent);

      if (pdfImageBitsPerComponent <= 0 || pdfImageWidth <= 0 || pdfImageHeight <= 0) {
        continue;
      }

      // Additionally render the pixel data as PNG for the two supported combinations.
      PdfName colorSpace = pdfStream.getAsName(PdfName.COLORSPACE);
      BufferedImage bufferedImage = null;
      if (PdfName.DEVICERGB.equals(colorSpace) && PdfName.FLATEDECODE.equals(filter)) {
        bufferedImage =
            ImageProcessingTools.toBufferedImage(
                byteArrayImage, pdfImageWidth, pdfImageHeight, pdfImageBitsPerComponent);
      } else if (PdfName.DEVICEGRAY.equals(colorSpace)
          && PdfName.RUNLENGTHDECODE.equals(filter)) {
        // Uses the image's real height; the previous code passed a hard-coded 2233 here.
        bufferedImage =
            ImageProcessingTools.toBufferedImage(
                ImageProcessingTools.runLengthDecode(byteArrayImage),
                pdfImageWidth,
                pdfImageHeight,
                pdfImageBitsPerComponent);
      }

      if (bufferedImage != null) {
        // Writing to a File lets ImageIO open and close the underlying stream itself.
        ImageIO.write(
            bufferedImage, "PNG", new File(exportFileWithoutExtension + "_imageIO" + ".png"));
      }
    }
  }

  /**
   * Reads an integer entry from the dictionary of the given stream.
   *
   * @param stream the stream whose dictionary is queried
   * @param key the dictionary key to read
   * @return the integer value, or {@code -1} if the entry is missing, is not a number, or its
   *     textual form is not a valid integer
   */
  private int readIntEntry(PdfStream stream, PdfName key) {
    PdfObject value = stream.get(key);
    if (value == null || !value.isNumber()) {
      return -1;
    }
    try {
      return Integer.parseInt(value.toString());
    } catch (NumberFormatException ex) {
      // e.g. a real number such as "8.0"; treated as unknown.
      return -1;
    }
  }
Ejemplo n.º 3
0
 /**
  * Sets up a factory for the indirect objects of a PDF document.
  *
  * <p>The number of objects is taken from the cross-reference table size of the given reader, and
  * the current position is initialised to {@code -1}.
  *
  * @param reader the reader that will read the PDF document
  */
 public IndirectObjectFactory(PdfReader reader) {
   this.n = reader.getXrefSize();
   this.current = -1;
   this.reader = reader;
 }