/**
 * Parses a single-page PDF file and feeds its page through a {@code PageDrawer} into a
 * {@code GraphicsProcessor} built on {@code target}; afterwards the page's media box is stored
 * as {@code target.bounds}.
 *
 * <p>The loaded document is always closed before this method returns or throws, so the
 * rejection paths (encrypted / wrong page count) no longer leak the open document.
 *
 * @param file the PDF file to load
 * @param maxPaths forwarded unchanged to the {@code GraphicsProcessor}
 * @param monitor progress monitor; a task is begun at entry and finished on success
 * @throws Exception if the document is encrypted, does not have exactly one page, or cannot be
 *     loaded or drawn
 */
public void parse(File file, int maxPaths, ProgressMonitor monitor) throws Exception {
    // NOTE(review): the literal 1 is passed to tr(), not to beginTask() — confirm this is intended.
    monitor.beginTask(tr("Parsing PDF", 1));

    PDDocument document = PDDocument.load(file);
    try {
      if (document.isEncrypted()) {
        throw new Exception(tr("Encrypted documents not supported."));
      }

      List<?> allPages = document.getDocumentCatalog().getAllPages();

      if (allPages.size() != 1) {
        throw new Exception(tr("The PDF file must have exactly one page."));
      }

      PDPage page = (PDPage) allPages.get(0);
      PDRectangle pageSize = page.findMediaBox();
      // getRotation() may return null; treat that as "no rotation".
      Integer rotationVal = page.getRotation();
      int rotation = 0;
      if (rotationVal != null) {
        rotation = rotationVal.intValue();
      }

      GraphicsProcessor p = new GraphicsProcessor(target, rotation, maxPaths, monitor);
      PageDrawer drawer = new PageDrawer();
      drawer.drawPage(p, page);
      this.target.bounds =
          new Rectangle2D.Double(
              pageSize.getLowerLeftX(),
              pageSize.getLowerLeftY(),
              pageSize.getWidth(),
              pageSize.getHeight());
    } finally {
      // Release the document's resources on every path, including the early throws above.
      document.close();
    }

    monitor.finishTask();
  }
  /**
   * Inserts table-of-contents (outline) information into a PDF.
   *
   * <p>A top-level outline item titled "All Pages" is created and, for every page index of the
   * document, one bookmark is appended beneath it for each chapter whose page number equals that
   * index. The document is then saved back to the same file name. Any exception raised while
   * building or saving the outline is printed via {@code printStackTrace()} and not rethrown.
   *
   * @param chapterList list of table-of-contents entries
   * @param destinationFileName file name of the PDF that receives the outline
   * @throws Exception if the PDF cannot be loaded or closed
   */
  public void createIndex(List<ChapterModel> chapterList, String destinationFileName)
      throws Exception {
    PDDocument pdf = PDDocument.load(destinationFileName);
    try {
      PDDocumentOutline rootOutline = new PDDocumentOutline();
      pdf.getDocumentCatalog().setDocumentOutline(rootOutline);

      PDOutlineItem allPagesItem = new PDOutlineItem();
      allPagesItem.setTitle("All Pages");
      rootOutline.appendChild(allPagesItem);

      List pageList = pdf.getDocumentCatalog().getAllPages();
      // Pages form the outer loop so bookmarks come out ordered by page, then by chapter order.
      for (int pageIndex = 0; pageIndex < pageList.size(); pageIndex++) {
        for (ChapterModel chapter : chapterList) {
          if (pageIndex != chapter.getPageNum()) {
            continue;
          }
          PDPageFitWidthDestination target = new PDPageFitWidthDestination();
          target.setPage((PDPage) pageList.get(pageIndex));

          PDOutlineItem chapterBookmark = new PDOutlineItem();
          chapterBookmark.setDestination(target);
          chapterBookmark.setTitle(chapter.getTitle());
          allPagesItem.appendChild(chapterBookmark);
        }
      }
      allPagesItem.openNode();
      rootOutline.openNode();

      pdf.save(destinationFileName);
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      pdf.close();
    }
  }
  /**
   * Hands the embedded-file name maps of a document to {@code processEmbeddedDocNames}.
   *
   * <p>The names stored directly on the embedded-files tree root are used when present;
   * otherwise the name maps of the root's immediate kids are processed one by one. Deeper
   * levels of the tree are not searched.
   *
   * @param document the PDF whose embedded files are examined
   */
  private void extractEmbeddedDocuments(PDDocument document)
      throws IOException, SAXException, TikaException {
    PDEmbeddedFilesNameTreeNode embeddedFilesRoot =
        new PDDocumentNameDictionary(document.getDocumentCatalog()).getEmbeddedFiles();
    if (embeddedFilesRoot == null) {
      return;
    }

    // Same approach as pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java: look at the root and,
    // failing that, its kids. A fully recursive search for a non-null name map could be added
    // if the need arises.
    Map<String, PDComplexFileSpecification> rootNames = embeddedFilesRoot.getNames();
    if (rootNames != null) {
      processEmbeddedDocNames(rootNames);
      return;
    }

    List<PDNameTreeNode<PDComplexFileSpecification>> children = embeddedFilesRoot.getKids();
    if (children == null) {
      return;
    }
    for (PDNameTreeNode<PDComplexFileSpecification> child : children) {
      Map<String, PDComplexFileSpecification> childNames = child.getNames();
      if (childNames != null) {
        processEmbeddedDocNames(childNames);
      }
    }
  }
 /**
  * Collects the URI link annotations of every page of a PDF.
  *
  * @param pdf the document to parse
  * @return an array holding one collection per page, in page order; a page without URI links
  *     (or whose annotations failed to load) contributes whatever was gathered so far for it,
  *     possibly an empty collection
  */
 private Collection<AnchorURL>[] extractPdfLinks(final PDDocument pdf) {
   @SuppressWarnings("unchecked")
   List<PDPage> pageList = pdf.getDocumentCatalog().getAllPages();
   @SuppressWarnings("unchecked")
   Collection<AnchorURL>[] linksPerPage =
       (Collection<AnchorURL>[]) new Collection<?>[pageList.size()];
   int pageIndex = 0;
   for (PDPage page : pageList) {
     final Collection<AnchorURL> pageLinks = new ArrayList<AnchorURL>();
     try {
       List<PDAnnotation> annotations = page.getAnnotations();
       if (annotations != null) {
         for (PDAnnotation annotation : annotations) {
           if (!(annotation instanceof PDAnnotationLink)) {
             continue;
           }
           PDAction action = ((PDAnnotationLink) annotation).getAction();
           // instanceof is false for null, so no separate null check is needed.
           if (!(action instanceof PDActionURI)) {
             continue;
           }
           pageLinks.add(new AnchorURL(((PDActionURI) action).getURI()));
         }
       }
     } catch (IOException ex) {
       // Intentionally ignored: the links collected before the failure are still returned.
     }
     linksPerPage[pageIndex] = pageLinks;
     pageIndex++;
   }
   return linksPerPage;
 }
  /**
   * Generates the thumbnail and/or preview images of a PDF read from a stream.
   *
   * <p>When a thumbnail is wanted it is rendered from the first page (with index argument 0).
   * When previews are wanted every page is rendered with a 1-based index; otherwise the loop
   * stops after the first page. The loaded document is closed before returning.
   *
   * @param fileVersion the file version the images belong to
   * @param inputStream stream containing the PDF
   * @throws Exception if loading or rendering fails
   */
  private void _generateImagesPB(FileVersion fileVersion, InputStream inputStream)
      throws Exception {

    boolean previewWanted = _isGeneratePreview(fileVersion);
    boolean thumbnailWanted = _isGenerateThumbnail(fileVersion);

    PDDocument pdDocument = null;

    try {
      pdDocument = PDDocument.load(inputStream);

      List<PDPage> pdPages = pdDocument.getDocumentCatalog().getAllPages();

      // 1-based number of the page currently being handled.
      int pageNumber = 0;

      for (PDPage pdPage : pdPages) {
        pageNumber++;

        if (thumbnailWanted && (pageNumber == 1)) {
          _generateImagesPB(
              fileVersion,
              pdPage,
              PropsValues.DL_FILE_ENTRY_THUMBNAIL_DPI,
              PropsValues.DL_FILE_ENTRY_THUMBNAIL_HEIGHT,
              PropsValues.DL_FILE_ENTRY_THUMBNAIL_WIDTH,
              true,
              0);

          if (_log.isInfoEnabled()) {
            _log.info("PDFBox generated a thumbnail for " + fileVersion.getFileVersionId());
          }
        }

        // Without previews only the first page matters.
        if (!previewWanted) {
          break;
        }

        _generateImagesPB(
            fileVersion,
            pdPage,
            PropsValues.DL_FILE_ENTRY_PREVIEW_DOCUMENT_DPI,
            PropsValues.DL_FILE_ENTRY_PREVIEW_DOCUMENT_HEIGHT,
            PropsValues.DL_FILE_ENTRY_PREVIEW_DOCUMENT_WIDTH,
            false,
            pageNumber);
      }

      if (_log.isInfoEnabled() && previewWanted) {
        _log.info(
            "PDFBox generated "
                + getPreviewFileCount(fileVersion)
                + " preview pages for "
                + fileVersion.getFileVersionId());
      }
    } finally {
      if (pdDocument != null) {
        pdDocument.close();
      }
    }
  }
Example #6
0
 /**
  * Extracts the text of a PDF one page at a time and returns the concatenation.
  *
  * <p>Both the low-level and the high-level document objects are closed afterwards; failures
  * while closing are ignored.
  *
  * @param src stream containing the PDF
  * @return the text of all pages, in page order
  * @throws IOException if parsing or text extraction fails
  */
 public static String extractText(InputStream src) throws IOException {
   StringBuilder extracted = new StringBuilder();
   COSDocument cosDocument = null;
   PDDocument pdDocument = null;
   try {
     PDFParser pdfParser = new PDFParser(src);
     pdfParser.parse();
     cosDocument = pdfParser.getDocument();
     PDFTextStripper textStripper = new PDFTextStripper();
     pdDocument = new PDDocument(cosDocument);
     int pageTotal = pdDocument.getDocumentCatalog().getPages().getCount();
     // The stripper's page range is 1-based; restrict it to a single page per pass.
     for (int pageNumber = 1; pageNumber <= pageTotal; pageNumber++) {
       textStripper.setStartPage(pageNumber);
       textStripper.setEndPage(pageNumber);
       extracted.append(textStripper.getText(pdDocument));
     }
   } finally {
     if (cosDocument != null) {
       try {
         cosDocument.close();
       } catch (IOException e) {
         // Nothing useful can be done about a failed close.
       }
     }
     if (pdDocument != null) {
       try {
         pdDocument.close();
       } catch (IOException e) {
         // Nothing useful can be done about a failed close.
       }
     }
   }
   return extracted.toString();
 }
Example #7
0
 /**
  * Loads the PDF named by the single command line argument and runs every page that has
  * contents through a {@code PrintTextLocations} processor.
  *
  * <p>An encrypted document is decrypted with the empty password; if that password is rejected
  * an error is printed and the JVM exits with status 1.
  *
  * @param args The command line arguments; exactly one is expected, otherwise usage is shown.
  * @throws Exception If there is an error parsing the document.
  */
 public static void main(String[] args) throws Exception {
   if (args.length != 1) {
     usage();
     return;
   }

   PDDocument document = null;
   try {
     document = PDDocument.load(args[0]);
     if (document.isEncrypted()) {
       try {
         document.decrypt("");
       } catch (InvalidPasswordException e) {
         System.err.println("Error: Document is encrypted with a password.");
         System.exit(1);
       }
     }
     PrintTextLocations printer = new PrintTextLocations();
     List allPages = document.getDocumentCatalog().getAllPages();
     int pageIndex = 0;
     for (Object pageObject : allPages) {
       PDPage page = (PDPage) pageObject;
       // The reported number is the zero-based page index.
       System.out.println("Processing page: " + pageIndex);
       PDStream contents = page.getContents();
       if (contents != null) {
         printer.processStream(page, page.findResources(), contents.getStream());
       }
       pageIndex++;
     }
   } finally {
     if (document != null) {
       document.close();
     }
   }
 }
  /**
   * Closes the cursor's current content stream (if any), appends a new page to the document and
   * points the cursor at it.
   *
   * <p>Without a template resource a blank page of the configured size is added. With a template
   * the template PDF is loaded, handed to the cursor via {@code cacheTempalte}, and its first
   * page is imported. The cursor's page number is incremented, a new appending content stream
   * is opened on that page, and the cursor position is reset to the page's start coordinates.
   *
   * @param document the document receiving the new page
   * @param cursor print state that is updated to refer to the new page
   * @param printData page configuration and optional template resource
   * @throws IOException if closing the previous stream, loading the template, or opening the
   *     new content stream fails
   */
  private void breakPage(PDDocument document, PrintCursor cursor, PrintData printData)
      throws IOException {
    if (cursor.currentStream != null) {
      cursor.currentStream.close();
    }

    if (printData.templateResource == null) {
      document.addPage(new PDPage(printData.pageConfig.getPageSize()));
    } else {
      // PDDocument.load copies the stream's content but does not close it, so close it here;
      // previously one stream was leaked per page break.
      java.io.InputStream templateStream = printData.templateResource.getInputStream();
      PDDocument templateDoc;
      try {
        templateDoc = PDDocument.load(templateStream);
      } finally {
        templateStream.close();
      }
      // The template document is passed to the cursor (presumably so it can be closed once the
      // imported page is no longer needed — confirm in PrintCursor).
      cursor.cacheTempalte(templateDoc);
      PDPage templatePage = templateDoc.getDocumentCatalog().getPages().get(0);
      document.importPage(templatePage);
    }
    PDPage currPage = document.getDocumentCatalog().getPages().get(++cursor.currentPageNumber);
    cursor.currentStream =
        new PDPageContentStream(document, currPage, PDPageContentStream.AppendMode.APPEND, false);
    cursor.yPos = printData.pageConfig.getStartY(cursor.currentPageNumber);
    cursor.xPos = printData.pageConfig.getStartX();
  }
Example #9
0
  /**
   * Debug helper that dumps image data of the PDF in {@code iconFile} as hex to standard output.
   *
   * <p>First every COS stream whose subtype is {@code Image} has its filter printed followed by
   * its still-filtered bytes; then every XObject found in each page's resources has its stream
   * bytes printed. The document and the filtered streams are closed on every path, and pages
   * that carry no resources or no XObject map are skipped instead of causing a
   * {@code NullPointerException}.
   *
   * @param args unused
   * @throws IOException if the document cannot be loaded or a stream cannot be read
   */
  @SuppressWarnings("unchecked")
  public static void main_3(String[] args) throws IOException {

    PDDocument doc = PDDocument.load(iconFile);
    try {
      List<PDPage> pages = doc.getDocumentCatalog().getAllPages();

      List<COSObject> objects = doc.getDocument().getObjects();

      for (COSObject cosObject : objects) {
        COSBase cosbase = cosObject.getObject();
        if (!(cosbase instanceof COSStream)) {
          continue;
        }

        COSStream cosstream = (COSStream) cosbase;
        COSBase filter = cosstream.getDictionaryObject(COSName.FILTER);
        COSBase subtype = cosstream.getDictionaryObject(COSName.SUBTYPE);

        if (subtype != null && subtype.equals(COSName.IMAGE)) {
          System.out.println(filter);

          InputStream filtered = cosstream.getFilteredStream();
          try {
            System.out.println(Hex.encodeHex(IOUtils.toByteArray(filtered)));
          } finally {
            filtered.close();
          }
        }
      }

      for (PDPage pdPage : pages) {
        PDResources resources = pdPage.getResources();
        if (resources == null) {
          // The page dictionary has no resources entry of its own.
          continue;
        }

        Map<String, PDXObject> images = resources.getXObjects();
        if (images == null) {
          continue;
        }

        for (PDXObject image : images.values()) {
          byte[] imgData = image.getPDStream().getByteArray();

          System.out.println(Hex.encodeHex(imgData));
        }
      }
    } finally {
      doc.close();
    }
  }
Example #10
0
  /**
   * Writes the form content of a PDF to the XHTML output.
   *
   * <p>If the form carries XFA data, that is extracted first and the method returns on success.
   * When there is no XFA data, or its extraction fails with an XML or I/O error, the
   * traditional AcroForm fields are emitted as an ordered list inside a
   * {@code <div class="acroform">}.
   *
   * @param pdf the document whose form is extracted
   */
  void extractAcroForm(PDDocument pdf) throws IOException, SAXException {
    // Derived from Ben Litchfield's org.apache.pdfbox.examples.fdf.PrintFields — thank you, Ben.
    PDDocumentCatalog catalog = pdf.getDocumentCatalog();
    if (catalog == null) {
      return;
    }

    PDAcroForm acroForm = catalog.getAcroForm();
    if (acroForm == null) {
      return;
    }

    // Prefer XFA when it exists; on an exception fall back to the classic AcroForm below.
    PDXFAResource xfa = acroForm.getXFA();
    if (xfa != null) {
      XFAExtractor xfaExtractor = new XFAExtractor();
      try (InputStream xfaStream =
          new BufferedInputStream(new ByteArrayInputStream(xfa.getBytes()))) {
        xfaExtractor.extract(xfaStream, xhtml, metadata, context);
        return;
      } catch (XMLStreamException | IOException e) {
        // The XFA could not be read or parsed; continue with the AcroForm fields.
      }
    }

    List<?> formFields = acroForm.getFields();
    if (formFields == null) {
      return;
    }

    xhtml.startElement("div", "class", "acroform");
    xhtml.startElement("ol");
    for (Object candidate : formFields) {
      if (candidate instanceof PDField) {
        processAcroField((PDField) candidate, 0);
      }
    }
    xhtml.endElement("ol");
    xhtml.endElement("div");
  }
Example #11
0
 /**
  * Opens a PDF file in the viewer and shows its first page.
  *
  * <p>A previously opened document is closed and the document panel cleared first. The page
  * list, the page count shown in the GUI, the current file name and the current page are then
  * refreshed from the new document.
  *
  * @param file path of the PDF file to open
  * @throws Exception if the file cannot be opened or parsed
  */
 private void openPDFFile(String file) throws Exception {
   if (document != null) {
     document.close();
     documentPanel.removeAll();
   }
   File pdfFile = new File(file);
   // NOTE(review): this stream is not closed here — confirm whether parseDocument() takes care
   // of it.
   InputStream input = new FileInputStream(pdfFile);
   document = parseDocument(input);
   pages = document.getDocumentCatalog().getAllPages();
   numberOfPages = pages.size();
   // AH* Page count for the GUI:
   sumPan.Sidantal.setText("" + 1 + " Av " + numberOfPages);
   sumPan.sidnrantal = numberOfPages;
   // AH* Removed in a later version.
   currentFilename = pdfFile.getAbsolutePath();
   currentPage = 0;
   updateTitle();
   showPage(0);
 }
  /*
   * The following methods are overridden from PDFTextStripper.
   */
  /**
   * Resets the engine and processes every page of the given document.
   *
   * <p>When additional formatting is enabled, the paragraph end, page start, article start and
   * article end markers are all set to the line separator before processing begins. Any
   * {@code Exception} or {@code Error} raised along the way is printed via
   * {@code printStackTrace()} and not propagated.
   *
   * @param pdf the document to process; it is also stored as the current document
   * @throws IOException declared for callers; failures inside the body are caught and printed
   */
  public void initialize(final PDDocument pdf) throws IOException {
    try {
      resetEngine();
      document = pdf;
      textCache = new TextCache();

      if (getAddMoreFormatting()) {
        // Use the line separator for all four structural markers.
        setParagraphEnd(getLineSeparator());
        setPageStart(getLineSeparator());
        setArticleStart(getLineSeparator());
        setArticleEnd(getLineSeparator());
      }

      startDocument(pdf);
      processPages(pdf.getDocumentCatalog().getAllPages());
      endDocument(pdf);
    } catch (Exception | Error failure) {
      failure.printStackTrace();
    }
  }
Example #13
0
  /**
   * Parses a PDF from a stream and prints information from its XML metadata.
   *
   * <p>When the document catalog has metadata, the text of the first {@code foaf:cuit} element
   * inside the first {@code foaf:SEmployee} element is printed, followed by a separator and the
   * raw metadata. If either element is missing a message is written to standard error instead
   * of failing with a {@code NullPointerException}. The metadata is parsed with DOCTYPE
   * declarations disallowed, because it comes from the (untrusted) PDF, and the metadata stream
   * is closed after parsing.
   *
   * @param in stream containing the PDF
   * @throws Exception if the PDF or its metadata cannot be parsed
   */
  private static void extract(InputStream in) throws Exception {
    PDDocument document = null;
    try {
      PDFParser parser = new PDFParser(in);
      parser.parse();
      document = parser.getPDDocument();
      if (document.isEncrypted()) {
        System.err.println("Document is Encrypted!");
      }
      PDDocumentCatalog cat = document.getDocumentCatalog();
      PDMetadata metadata = cat.getMetadata();
      if (metadata != null) {
        // Load the metadata as a DOM tree. DOCTYPEs are rejected to prevent XXE.
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
        dbFactory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
        Document doc;
        InputStream metadataStream = metadata.createInputStream();
        try {
          doc = dBuilder.parse(metadataStream);
        } finally {
          metadataStream.close();
        }

        // Look up the SEmployee tag and, inside it, the CUIT element.
        NodeList employees = doc.getElementsByTagName("foaf:SEmployee");
        Element employee = (Element) employees.item(0);
        NodeList cuits = (employee == null) ? null : employee.getElementsByTagName("foaf:cuit");
        if (cuits != null && cuits.getLength() > 0) {
          System.out.println(cuits.item(0).getTextContent());
        } else {
          System.err.println("No foaf:SEmployee/foaf:cuit element found in the metadata.");
        }

        System.out.println("---");
        System.out.println(metadata.getInputStreamAsString());
      }
    } finally {
      if (document != null) {
        try {
          document.close();
        } catch (Throwable ignored) {
          // Best effort: a failure while closing must not hide the outcome of the extraction.
        }
      }
    }
  }
Example #14
0
 /**
  * Renders a range of pages of a PDF document to image files.
  *
  * <p>Each file name is {@code outputPrefix} followed by the 1-based page number. Every page in
  * the range is attempted even if an earlier one could not be written.
  *
  * @param document the PDF document
  * @param imageFormat the target format (ex. "png")
  * @param password the password (needed if the PDF is encrypted); not read by this method
  * @param startPage the start page (1 is the first page)
  * @param endPage the end page (set to Integer.MAX_VALUE for all pages)
  * @param outputPrefix used to construct the filename for the individual images
  * @param imageType the image type (see {@link BufferedImage}.TYPE_*)
  * @param resolution the resolution in dpi (dots per inch)
  * @return true if every image was written, false if at least one write reported failure
  * @throws IOException if an I/O error occurs
  */
 public boolean writeImage(
     PDDocument document,
     String imageFormat,
     String password,
     int startPage,
     int endPage,
     String outputPrefix,
     int imageType,
     int resolution)
     throws IOException {
   List<PDPage> allPages = document.getDocumentCatalog().getAllPages();
   // Exclusive upper bound: whichever comes first, the requested end page or the last page.
   int lastPageExclusive = Math.min(endPage, allPages.size());
   boolean allWritten = true;
   for (int pageIndex = startPage - 1; pageIndex < lastPageExclusive; pageIndex++) {
     BufferedImage rendered = allPages.get(pageIndex).convertToImage(imageType, resolution);
     String fileName = outputPrefix + (pageIndex + 1);
     LOG.info("Writing: " + fileName + "." + imageFormat);
     if (!ImageIOUtil.writeImage(rendered, imageFormat, fileName, imageType, resolution)) {
       allWritten = false;
     }
   }
   return allWritten;
 }
Example #15
0
  /**
   * Adds a DSS dictionary to a PDF as an incremental update and copies the result to the given
   * output stream.
   *
   * <p>The input is materialised as a temporary file and loaded; when callbacks are supplied a
   * {@code DSS} entry built from them is put into the document catalog. The document is then
   * saved incrementally into a second temporary file whose content is copied to
   * {@code outpuStream}. Both temporary files are deleted and all streams opened here —
   * including the output stream on the signed file, which was previously never closed by this
   * method — are closed in every case.
   *
   * @param inputStream the PDF to extend
   * @param outpuStream receives the extended PDF
   * @param callbacks sources for the DSS dictionary content; may be empty or null
   * @throws DSSException wrapping any failure
   */
  @Override
  public void addDssDictionary(
      InputStream inputStream, OutputStream outpuStream, List<DSSDictionaryCallback> callbacks) {
    File toSignFile = null;
    File signedFile = null;
    FileInputStream fis = null;
    FileOutputStream fileOutputStream = null;
    PDDocument pdDocument = null;
    try {
      toSignFile = DSSPDFUtils.getFileFromPdfData(inputStream);
      pdDocument = PDDocument.load(toSignFile);

      signedFile = File.createTempFile("sd-dss-", "-signed.pdf");

      fileOutputStream = DSSPDFUtils.getFileOutputStream(toSignFile, signedFile);

      if (CollectionUtils.isNotEmpty(callbacks)) {
        final COSDictionary cosDictionary = pdDocument.getDocumentCatalog().getCOSDictionary();
        cosDictionary.setItem("DSS", buildDSSDictionary(callbacks));
        cosDictionary.setNeedToBeUpdate(true);
      }

      // An incremental save requires a document id.
      if (pdDocument.getDocumentId() == null) {
        pdDocument.setDocumentId(0L);
      }
      pdDocument.saveIncremental(inputStream, fileOutputStream);

      fis = new FileInputStream(signedFile);
      IOUtils.copy(fis, outpuStream);
    } catch (Exception e) {
      throw new DSSException(e);
    } finally {
      IOUtils.closeQuietly(pdDocument);
      // Close the handle on signedFile before deleting it; closing an already closed stream is
      // harmless.
      IOUtils.closeQuietly(fileOutputStream);
      IOUtils.closeQuietly(fis);
      DSSUtils.delete(toSignFile);
      DSSUtils.delete(signedFile);
    }
  }
Example #16
0
  /**
   * Tells whether the PDF revision contained in the given bytes carries a DSS dictionary.
   *
   * <p>The dictionary is only looked for when the revision has at least one signature
   * dictionary. Any failure is logged as a warning and reported as {@code false}.
   *
   * @param originalBytes bytes of the revision to inspect
   * @return {@code true} if a DSS dictionary could be extracted, {@code false} otherwise
   */
  private boolean isDSSDictionaryPresentInPreviousRevision(byte[] originalBytes) {
    boolean dssFound = false;
    ByteArrayInputStream revisionStream = null;
    PDDocument revision = null;
    try {
      revisionStream = new ByteArrayInputStream(originalBytes);
      revision = PDDocument.load(revisionStream);
      List<PDSignature> signatureDictionaries = revision.getSignatureDictionaries();
      if (CollectionUtils.isNotEmpty(signatureDictionaries)) {
        PdfDict catalog =
            new PdfBoxDict(revision.getDocumentCatalog().getCOSDictionary(), revision);
        dssFound = PdfDssDict.extract(catalog) != null;
      }
    } catch (Exception e) {
      logger.warn(
          "Cannot check in previous revisions if DSS dictionary already exist : " + e.getMessage(),
          e);
    } finally {
      IOUtils.closeQuietly(revisionStream);
      IOUtils.closeQuietly(revision);
    }

    return dssFound;
  }
  /**
   * Prints the elements of a report into the given PdfBox document, starting new pages and
   * splitting elements where they do not fit, then prints the report's static elements and the
   * collected images.
   *
   * @param pageConfig page config
   * @param templateResource template resource stored in the print data that is handed to
   *     {@code breakPage} on every page break; may be null
   * @param report the report to print
   * @param document the PdfBox document to print into; the same instance is returned
   * @return the printed PdfBox document
   * @throws java.io.IOException
   */
  public PDDocument generate(
      PdfPageLayout pageConfig,
      Resource templateResource,
      PdfReportStructure report,
      PDDocument document)
      throws IOException {

    PrintData printData = new PrintData(templateResource, pageConfig);
    PrintCursor cursor = new PrintCursor();

    // Open the first page before anything is printed.
    breakPage(document, cursor, printData);
    float maxWidth = pageConfig.getUsableWidth();

    int reportElementIndex = 0;
    ReportElement currentReportElement =
        report.getElements().isEmpty() ? null : report.getElements().get(reportElementIndex);
    // Holds the second half of a split element; it is printed before the next report element.
    ReportElement nextReportElement = null;

    while (currentReportElement != null) {
      boolean forceBreak = false;
      // currentReportElement.setFontLib(fontLibrary);
      float height = currentReportElement.getHeight(maxWidth);
      if (cursor.yPos - height < pageConfig.getLastY(cursor.currentPageNumber)) {
        // out of bounds
        if (currentReportElement.isSplitable()
            && currentReportElement instanceof ReportTable
            && (cursor.yPos - currentReportElement.getFirstSegmentHeight(maxWidth))
                >= pageConfig.getLastY(cursor.currentPageNumber)) {
          // it's a Table out of bounds, so we also do a height split
          ReportElement[] twoElements =
              currentReportElement.split(
                  maxWidth, cursor.yPos - pageConfig.getLastY(cursor.currentPageNumber));
          if (twoElements.length != 2) {
            throw new IllegalStateException("The split method should always two parts.");
          }
          currentReportElement = twoElements[0];
          nextReportElement = twoElements[1];
          if (((ReportTable) currentReportElement).getExtraSplitting()) {
            forceBreak = true;
          }
        } else if (currentReportElement.isSplitable()
            && (cursor.yPos - currentReportElement.getFirstSegmentHeight(maxWidth)
                >= pageConfig.getLastY(cursor.currentPageNumber))) {
          // Splittable non-table element whose first segment still fits: split by width only.
          ReportElement[] twoElements = currentReportElement.split(maxWidth);
          if (twoElements.length != 2) {
            throw new IllegalStateException("The split method should always two parts.");
          }
          currentReportElement = twoElements[0];
          nextReportElement = twoElements[1];
        } else {
          // Nothing of the element fits here: start a new page and retry the same element.
          // NOTE(review): an element that does not fit on a fresh page either would reach this
          // branch again on every iteration — confirm element heights are bounded by the page.
          breakPage(document, cursor, printData);
          continue;
        }
      }

      // without this block pdfbox 2.0.2 does not render properly
      // TODO: find a more elegant solution
      cursor.currentStream.close();
      PDPageTree pageTree = document.getDocumentCatalog().getPages();
      PDPage currPage = pageTree.get(pageTree.getCount() - 1);
      cursor.currentStream =
          new PDPageContentStream(document, currPage, PDPageContentStream.AppendMode.APPEND, false);
      // ---

      float nextY =
          currentReportElement.print(
              document,
              cursor.currentStream,
              cursor.currentPageNumber,
              cursor.xPos,
              cursor.yPos,
              maxWidth);
      // Leave the configured line distance below the element just printed.
      nextY -= pageConfig.getLineDistance();
      cursor.imageList.addAll(currentReportElement.getImageIntents());

      // Continue with the pending split remainder, or else with the next report element.
      currentReportElement = nextReportElement;
      nextReportElement = null;
      if (currentReportElement == null && reportElementIndex + 1 < report.getElements().size()) {
        currentReportElement = report.getElements().get(++reportElementIndex);
      }
      cursor.yPos = nextY;
      if (forceBreak) {
        breakPage(document, cursor, printData);
      }
    }
    cursor.currentStream.close();

    // currentPageNumber is incremented by breakPage; +1 is passed as the page count here.
    report.expandPagesStaticElements(cursor.currentPageNumber + 1);

    // NOTE(review): static elements are printed with a null stream and zeroed position/width —
    // presumably they position themselves; confirm in ReportElementStatic.
    for (ReportElementStatic staticElem : report.getStaticElements()) {
      staticElem.print(document, null, 0, 0, 0, 0);
    }

    printImages(document, cursor);

    return document;
  }
  /**
   * Starts the text extraction.
   *
   * <p>Parses the command line options, loads the PDF with the given password, checks that
   * content extraction is permitted, and writes the extracted text (plain text or HTML) to the
   * console or to an output file. Text of embedded files whose subtype is
   * {@code application/pdf} is appended to the same output.
   *
   * @param args the commandline arguments.
   * @throws IOException if there is an error reading the document or extracting the text.
   */
  public void startExtraction(String[] args) throws IOException {
    boolean toConsole = false;
    boolean toHTML = false;
    boolean sort = false;
    boolean separateBeads = true;
    String password = "";
    String encoding = "UTF-8";
    String pdfFile = null;
    String outputFile = null;
    // Defaults to text files
    String ext = ".txt";
    int startPage = 1;
    int endPage = Integer.MAX_VALUE;
    // NOTE(review): for options that take a value, usage() is called when the value is missing
    // and args[i] is read right afterwards — presumably usage() terminates the program; confirm.
    for (int i = 0; i < args.length; i++) {
      switch (args[i]) {
        case PASSWORD:
          i++;
          if (i >= args.length) {
            usage();
          }
          password = args[i];
          break;
        case ENCODING:
          i++;
          if (i >= args.length) {
            usage();
          }
          encoding = args[i];
          break;
        case START_PAGE:
          i++;
          if (i >= args.length) {
            usage();
          }
          startPage = Integer.parseInt(args[i]);
          break;
        case HTML:
          toHTML = true;
          ext = ".html";
          break;
        case SORT:
          sort = true;
          break;
        case IGNORE_BEADS:
          separateBeads = false;
          break;
        case DEBUG:
          debug = true;
          break;
        case END_PAGE:
          i++;
          if (i >= args.length) {
            usage();
          }
          endPage = Integer.parseInt(args[i]);
          break;
        case CONSOLE:
          toConsole = true;
          break;
        default:
          // First positional argument is the PDF; any later one becomes the output file.
          if (pdfFile == null) {
            pdfFile = args[i];
          } else {
            outputFile = args[i];
          }
          break;
      }
    }

    if (pdfFile == null) {
      usage();
    } else {

      Writer output = null;
      PDDocument document = null;
      try {
        long startTime = startProcessing("Loading PDF " + pdfFile);
        // Derive the output name by replacing the last four characters of the PDF name with ext.
        if (outputFile == null && pdfFile.length() > 4) {
          outputFile = new File(pdfFile.substring(0, pdfFile.length() - 4) + ext).getAbsolutePath();
        }
        document = PDDocument.load(new File(pdfFile), password);

        AccessPermission ap = document.getCurrentAccessPermission();
        if (!ap.canExtractContent()) {
          throw new IOException("You do not have permission to extract text");
        }

        stopProcessing("Time for loading: ", startTime);

        if (toConsole) {
          output = new OutputStreamWriter(System.out, encoding);
        } else {
          // NOTE(review): outputFile is still null here when no output file was given and the
          // PDF name has four characters or fewer — confirm whether that case needs handling.
          output = new OutputStreamWriter(new FileOutputStream(outputFile), encoding);
        }

        PDFTextStripper stripper;
        if (toHTML) {
          stripper = new PDFText2HTML();
        } else {
          stripper = new PDFTextStripper();
        }
        stripper.setSortByPosition(sort);
        stripper.setShouldSeparateByBeads(separateBeads);
        stripper.setStartPage(startPage);
        stripper.setEndPage(endPage);

        startTime = startProcessing("Starting text extraction");
        if (debug) {
          System.err.println("Writing to " + outputFile);
        }

        // Extract text for main document:
        stripper.writeText(document, output);

        // ... also for any embedded PDFs:
        // Only the names stored directly on the embedded-files node are visited here.
        PDDocumentCatalog catalog = document.getDocumentCatalog();
        PDDocumentNameDictionary names = catalog.getNames();
        if (names != null) {
          PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
          if (embeddedFiles != null) {
            Map<String, PDComplexFileSpecification> embeddedFileNames = embeddedFiles.getNames();
            if (embeddedFileNames != null) {
              for (Map.Entry<String, PDComplexFileSpecification> ent :
                  embeddedFileNames.entrySet()) {
                if (debug) {
                  System.err.println("Processing embedded file " + ent.getKey() + ":");
                }
                PDComplexFileSpecification spec = ent.getValue();
                PDEmbeddedFile file = spec.getEmbeddedFile();
                if (file != null && "application/pdf".equals(file.getSubtype())) {
                  if (debug) {
                    System.err.println("  is PDF (size=" + file.getSize() + ")");
                  }
                  // The stream is closed as soon as the sub-document has been loaded from it.
                  InputStream fis = file.createInputStream();
                  PDDocument subDoc = null;
                  try {
                    subDoc = PDDocument.load(fis);
                  } finally {
                    fis.close();
                  }
                  // The same stripper (and therefore the same page range) is reused.
                  try {
                    stripper.writeText(subDoc, output);
                  } finally {
                    IOUtils.closeQuietly(subDoc);
                  }
                }
              }
            }
          }
        }
        stopProcessing("Time for extraction: ", startTime);
      } finally {
        IOUtils.closeQuietly(output);
        IOUtils.closeQuietly(document);
      }
    }
  }
Example #19
0
  /**
   * Reads all signatures and document timestamps from a PDF.
   *
   * <p>Signature dictionaries with an empty sub-filter or empty CMS content are skipped with a
   * warning. Entries whose sub-filter is the RFC 3161 document-timestamp filter become
   * {@code PdfBoxDocTimestampInfo} objects (flagged as archive timestamps when a DSS dictionary
   * exists both in the document and in the revision the timestamp covers); all others become
   * {@code PdfBoxSignatureInfo} objects. The result is sorted, linked and logged. Any failure is
   * logged as a warning and whatever has been collected so far is returned.
   *
   * @param validationCertPool certificate pool handed to every created info object
   * @param originalBytes the complete bytes of the PDF
   * @return the signatures found; never null, possibly empty
   */
  private List<PdfSignatureOrDocTimestampInfo> getSignatures(
      CertificatePool validationCertPool, byte[] originalBytes) {
    List<PdfSignatureOrDocTimestampInfo> result =
        new ArrayList<PdfSignatureOrDocTimestampInfo>();
    ByteArrayInputStream pdfStream = null;
    PDDocument pdf = null;
    try {

      pdfStream = new ByteArrayInputStream(originalBytes);
      pdf = PDDocument.load(pdfStream);

      List<PDSignature> signatureDictionaries = pdf.getSignatureDictionaries();
      if (CollectionUtils.isNotEmpty(signatureDictionaries)) {
        logger.debug("{} signature(s) found", signatureDictionaries.size());

        PdfDict catalog = new PdfBoxDict(pdf.getDocumentCatalog().getCOSDictionary(), pdf);
        PdfDssDict dssDictionary = PdfDssDict.extract(catalog);

        for (PDSignature signature : signatureDictionaries) {
          String subFilter = signature.getSubFilter();
          byte[] cms = signature.getContents(originalBytes);

          if (StringUtils.isEmpty(subFilter) || ArrayUtils.isEmpty(cms)) {
            logger.warn("Wrong signature with empty subfilter or cms.");
            continue;
          }

          byte[] signedContent = signature.getSignedContent(originalBytes);
          int[] byteRange = signature.getByteRange();

          boolean isDocTimestamp =
              PdfBoxDocTimeStampService.SUB_FILTER_ETSI_RFC3161.getName().equals(subFilter);
          if (isDocTimestamp) {
            // LT or LTA: it is an archive timestamp when a DSS dictionary exists now and was
            // already present in the revision covered by this timestamp.
            boolean isArchiveTimestamp =
                dssDictionary != null
                    && isDSSDictionaryPresentInPreviousRevision(
                        getOriginalBytes(byteRange, signedContent));

            result.add(
                new PdfBoxDocTimestampInfo(
                    validationCertPool,
                    signature,
                    dssDictionary,
                    cms,
                    signedContent,
                    isArchiveTimestamp));
          } else {
            result.add(
                new PdfBoxSignatureInfo(
                    validationCertPool, signature, dssDictionary, cms, signedContent));
          }
        }
        Collections.sort(result, new PdfSignatureOrDocTimestampInfoComparator());
        linkSignatures(result);

        for (PdfSignatureOrDocTimestampInfo found : result) {
          logger.debug(
              "Signature "
                  + found.uniqueId()
                  + " found with byteRange "
                  + Arrays.toString(found.getSignatureByteRange())
                  + " ("
                  + found.getSubFilter()
                  + ")");
        }
      }

    } catch (Exception e) {
      logger.warn("Cannot analyze signatures : " + e.getMessage(), e);
    } finally {
      IOUtils.closeQuietly(pdfStream);
      IOUtils.closeQuietly(pdf);
    }

    return result;
  }