Пример #1
0
  /**
   * Locates the embedded-files name tree of {@code document} and hands every names map found in
   * it to {@link #processEmbeddedDocNames}.
   *
   * <p>The tree is walked depth-first, in kids order, starting at the root. A node that exposes
   * a non-null names map is processed and not descended into; a node without one has its kids
   * examined instead. Earlier revisions only looked at the root and its direct kids, so file
   * specifications stored below an intermediate node were silently skipped.
   *
   * <p>Does nothing when the document has no embedded-files name tree.
   *
   * @param document the PDF document whose embedded files should be extracted
   * @throws IOException declared by the PDFBox accessors and by {@code processEmbeddedDocNames}
   * @throws SAXException propagated from {@code processEmbeddedDocNames}
   * @throws TikaException propagated from {@code processEmbeddedDocNames}
   */
  private void extractEmbeddedDocuments(PDDocument document)
      throws IOException, SAXException, TikaException {
    PDDocumentNameDictionary namesDictionary =
        new PDDocumentNameDictionary(document.getDocumentCatalog());
    PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles();
    if (efTree == null) {
      return;
    }

    // Identity set of the COS objects backing the nodes already handled. A malformed PDF can
    // make a node list one of its ancestors as a kid; without this guard the walk below would
    // never terminate. Identity (not equals) is used on the assumption that getKids() wraps the
    // same underlying COS object each time it is reached -- NOTE(review): confirm for the
    // PDFBox version in use.
    java.util.Set<Object> visited =
        java.util.Collections.newSetFromMap(new java.util.IdentityHashMap<Object, Boolean>());

    // Explicit stack instead of recursion so a very deep tree cannot overflow the call stack.
    java.util.Deque<PDNameTreeNode<PDComplexFileSpecification>> pending =
        new java.util.ArrayDeque<PDNameTreeNode<PDComplexFileSpecification>>();
    pending.push(efTree);

    while (!pending.isEmpty()) {
      PDNameTreeNode<PDComplexFileSpecification> node = pending.pop();
      if (!visited.add(node.getCOSObject())) {
        // Already handled: either a cycle or a node referenced more than once.
        continue;
      }

      Map<String, PDComplexFileSpecification> embeddedFileNames = node.getNames();
      if (embeddedFileNames != null) {
        processEmbeddedDocNames(embeddedFileNames);
        continue;
      }

      List<PDNameTreeNode<PDComplexFileSpecification>> kids = node.getKids();
      if (kids == null) {
        continue;
      }
      // Push in reverse so that kids are popped, and therefore processed, in their listed order.
      for (int i = kids.size() - 1; i >= 0; i--) {
        PDNameTreeNode<PDComplexFileSpecification> kid = kids.get(i);
        if (kid != null) {
          pending.push(kid);
        }
      }
    }
  }