/**
   * Hands the embedded-file name maps of a document to {@code processEmbeddedDocNames}.
   *
   * <p>The root of the embedded-files name tree is consulted first. Only when the root has no
   * names map are its direct kids inspected, and each kid that has a names map is processed in
   * turn. Deeper levels of the tree are not searched; this mirrors
   * pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java.
   *
   * @param document the document whose embedded files should be processed
   * @throws IOException declared for the name tree lookups and the processing call
   * @throws SAXException declared for the processing call
   * @throws TikaException declared for the processing call
   */
  private void extractEmbeddedDocuments(PDDocument document)
      throws IOException, SAXException, TikaException {
    PDEmbeddedFilesNameTreeNode root =
        new PDDocumentNameDictionary(document.getDocumentCatalog()).getEmbeddedFiles();
    if (root == null) {
      // No embedded-files tree at all.
      return;
    }

    Map<String, PDComplexFileSpecification> rootNames = root.getNames();
    if (rootNames != null) {
      // The root carries the names itself; its kids are intentionally not consulted.
      processEmbeddedDocNames(rootNames);
      return;
    }

    List<PDNameTreeNode<PDComplexFileSpecification>> children = root.getKids();
    if (children == null) {
      return;
    }
    for (PDNameTreeNode<PDComplexFileSpecification> child : children) {
      Map<String, PDComplexFileSpecification> childNames = child.getNames();
      if (childNames == null) {
        // Intermediate node without names of its own; a fully recursive search is not done here.
        continue;
      }
      processEmbeddedDocNames(childNames);
    }
  }
 /**
  * Collects the file specifications stored directly in an embedded-files name tree node.
  *
  * <p>For every entry of the node's names map, a {@code PBCosFileSpecification} wrapping the
  * entry's COS object is appended to {@code files}. Nothing is added when {@code buffer} is
  * {@code null} or when the node has no names map. Entries reachable only through kid nodes are
  * not visited.
  *
  * @param files list that receives the wrapped file specifications
  * @param buffer dictionary backing the name tree node, may be {@code null}
  * @throws IOException declared for the name tree lookup
  */
 private void getNamesEmbeddedFiles(List<Object> files, COSDictionary buffer) throws IOException {
   if (buffer == null) {
     return;
   }
   PDEmbeddedFilesNameTreeNode root = new PDEmbeddedFilesNameTreeNode(buffer);
   Map<String, PDComplexFileSpecification> names = root.getNames();
   if (names == null) {
     // getNames() yields null for a node without a names map (for example one that only has
     // kids); dereferencing it unconditionally used to raise a NullPointerException.
     return;
   }
   for (PDComplexFileSpecification specification : names.values()) {
     files.add(new PBCosFileSpecification(specification.getCOSObject()));
   }
 }
  /**
   * Starts the text extraction.
   *
   * <p>Parses the command line, loads the PDF with the given password, and writes its text either
   * to the console or to an output file. When no output file is named, one is derived from the
   * input path and the extension of the chosen output format. After the main document, every
   * embedded file listed directly in the root of the embedded-files name tree whose subtype is
   * {@code application/pdf} is loaded and written with the same stripper and writer.
   *
   * @param args the commandline arguments.
   * @throws IOException if there is an error reading the document or extracting the text, or if
   *     the document does not permit content extraction.
   */
  public void startExtraction(String[] args) throws IOException {
    boolean toConsole = false;
    boolean toHTML = false;
    boolean sort = false;
    boolean separateBeads = true;
    String password = "";
    String encoding = "UTF-8";
    String pdfFile = null;
    String outputFile = null;
    // Defaults to text files
    String ext = ".txt";
    int startPage = 1;
    int endPage = Integer.MAX_VALUE;
    for (int i = 0; i < args.length; i++) {
      switch (args[i]) {
        case PASSWORD:
          i++;
          if (i >= args.length) {
            usage();
            // NOTE(review): usage() presumably terminates the program; returning here keeps the
            // read of args[i] below from running past the end of the array if it does not.
            return;
          }
          password = args[i];
          break;
        case ENCODING:
          i++;
          if (i >= args.length) {
            usage();
            return;
          }
          encoding = args[i];
          break;
        case START_PAGE:
          i++;
          if (i >= args.length) {
            usage();
            return;
          }
          startPage = Integer.parseInt(args[i]);
          break;
        case HTML:
          toHTML = true;
          ext = ".html";
          break;
        case SORT:
          sort = true;
          break;
        case IGNORE_BEADS:
          separateBeads = false;
          break;
        case DEBUG:
          debug = true;
          break;
        case END_PAGE:
          i++;
          if (i >= args.length) {
            usage();
            return;
          }
          endPage = Integer.parseInt(args[i]);
          break;
        case CONSOLE:
          toConsole = true;
          break;
        default:
          // First positional argument is the input PDF, any later one the output file.
          if (pdfFile == null) {
            pdfFile = args[i];
          } else {
            outputFile = args[i];
          }
          break;
      }
    }

    if (pdfFile == null) {
      usage();
    } else {

      Writer output = null;
      PDDocument document = null;
      try {
        long startTime = startProcessing("Loading PDF " + pdfFile);
        if (outputFile == null) {
          // Replace the last four characters (assumed to be an extension such as ".pdf") with
          // the output extension. Names too short to carry one just get the extension appended,
          // so outputFile is never null when the FileOutputStream below is opened.
          String baseName =
              pdfFile.length() > 4 ? pdfFile.substring(0, pdfFile.length() - 4) : pdfFile;
          outputFile = new File(baseName + ext).getAbsolutePath();
        }
        document = PDDocument.load(new File(pdfFile), password);

        AccessPermission ap = document.getCurrentAccessPermission();
        if (!ap.canExtractContent()) {
          throw new IOException("You do not have permission to extract text");
        }

        stopProcessing("Time for loading: ", startTime);

        if (toConsole) {
          output = new OutputStreamWriter(System.out, encoding);
        } else {
          output = new OutputStreamWriter(new FileOutputStream(outputFile), encoding);
        }

        PDFTextStripper stripper;
        if (toHTML) {
          stripper = new PDFText2HTML();
        } else {
          stripper = new PDFTextStripper();
        }
        stripper.setSortByPosition(sort);
        stripper.setShouldSeparateByBeads(separateBeads);
        stripper.setStartPage(startPage);
        stripper.setEndPage(endPage);

        startTime = startProcessing("Starting text extraction");
        if (debug) {
          System.err.println("Writing to " + outputFile);
        }

        // Extract text for main document:
        stripper.writeText(document, output);

        // ... also for any embedded PDFs:
        PDDocumentCatalog catalog = document.getDocumentCatalog();
        PDDocumentNameDictionary names = catalog.getNames();
        if (names != null) {
          PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
          if (embeddedFiles != null) {
            // Only the names held directly by the root node are visited; kids are not walked.
            Map<String, PDComplexFileSpecification> embeddedFileNames = embeddedFiles.getNames();
            if (embeddedFileNames != null) {
              for (Map.Entry<String, PDComplexFileSpecification> ent :
                  embeddedFileNames.entrySet()) {
                if (debug) {
                  System.err.println("Processing embedded file " + ent.getKey() + ":");
                }
                PDComplexFileSpecification spec = ent.getValue();
                PDEmbeddedFile file = spec.getEmbeddedFile();
                if (file != null && "application/pdf".equals(file.getSubtype())) {
                  if (debug) {
                    System.err.println("  is PDF (size=" + file.getSize() + ")");
                  }
                  PDDocument subDoc;
                  // The stream is only needed while loading; close it before extraction starts.
                  try (InputStream fis = file.createInputStream()) {
                    subDoc = PDDocument.load(fis);
                  }
                  try {
                    stripper.writeText(subDoc, output);
                  } finally {
                    IOUtils.closeQuietly(subDoc);
                  }
                }
              }
            }
          }
        }
        stopProcessing("Time for extraction: ", startTime);
      } finally {
        // Both may still be null if loading failed early.
        IOUtils.closeQuietly(output);
        IOUtils.closeQuietly(document);
      }
    }
  }