Example #1
1
  /**
   * Extracts the text and metadata of a PDF and adds them to the lucene document.
   *
   * <p>The PDF is loaded from the stream and, when it reports itself as encrypted, decrypted with
   * the empty default password. The extracted text is added as the {@code Indexer.contents} field;
   * when document information is available, author, creation date, keywords, modification date,
   * subject and title are added as well. The first 500 characters of the text (or all of it, if
   * shorter) are stored as the unindexed {@code Indexer.summary} field.
   *
   * <p>Text extraction is best effort: a failure inside the stripper is reported on standard output
   * and indexing continues with whatever text had been written up to that point.
   *
   * @param document The document to add the contents to.
   * @param is The stream to get the contents from.
   * @param documentLocation The location of the document, used just for debug messages.
   * @throws IOException If there is an error parsing the document, or if it cannot be decrypted
   *     with the default password; in the latter case the decryption failure is attached as the
   *     cause.
   */
  private void addContent(Document document, InputStream is, String documentLocation)
      throws IOException {
    PDDocument pdfDocument = null;
    try {
      pdfDocument = PDDocument.load(is);
      if (pdfDocument.isEncrypted()) {
        // Just try using the default password and move on
        pdfDocument.decrypt("");
      }

      // Collect the extracted text in memory; it is used both for the contents
      // field and for the summary below.
      StringWriter writer = new StringWriter();
      PDFTextStripper stripper = new PDFTextStripper();
      try {
        stripper.writeText(pdfDocument, writer);
      } catch (Exception e) {
        // Deliberately best effort: keep indexing with the text written so far.
        System.out.println("Error in stripper.writeText()");
      }
      String contents = writer.getBuffer().toString();

      StringReader reader = new StringReader(contents);
      addTextField(document, Indexer.contents, reader);
      PDDocumentInformation info = pdfDocument.getDocumentInformation();
      if (info != null) {
        addTextField(document, Indexer.Author, info.getAuthor());
        try {
          addTextField(document, Indexer.created, info.getCreationDate());
        } catch (IOException ignored) {
          // ignore, bad date but continue with indexing
        }

        addTextField(document, Indexer.keywords, info.getKeywords());
        try {
          addTextField(document, Indexer.modified, info.getModificationDate());
        } catch (IOException ignored) {
          // ignore, bad date but continue with indexing
        }
        addTextField(document, "Subject", info.getSubject());
        addTextField(document, Indexer.Title, info.getTitle());
      }
      int summarySize = Math.min(contents.length(), 500);
      String summary = contents.substring(0, summarySize);
      // Add the summary as an UnIndexed field, so that it is stored and
      // returned with hit documents for display.
      addUnindexedField(document, Indexer.summary, summary);
    } catch (CryptographyException e) {
      // Keep the original failure as the cause so its stack trace is not lost.
      throw new IOException("Error decrypting document(" + documentLocation + "): " + e, e);
    } catch (InvalidPasswordException e) {
      // they didn't supply a password and the default of "" was wrong.
      throw new IOException(
          "Error: The document(" + documentLocation + ") is encrypted and will not be indexed.",
          e);
    } finally {
      if (pdfDocument != null) {
        pdfDocument.close();
      }
    }
  }
Example #2
0
 /**
  * Prints the text locations of every page of the PDF named on the command line.
  *
  * <p>Exactly one argument is expected; with any other argument count {@code usage()} is invoked
  * instead. A document that reports itself as encrypted is decrypted with the empty password; if
  * that password is rejected, an error is written to standard error and the JVM exits with
  * status 1. Pages without a content stream are skipped.
  *
  * @param args The command line arguments; the single entry is passed to {@code PDDocument.load}.
  * @throws Exception If there is an error parsing the document.
  */
 public static void main(String[] args) throws Exception {
   if (args.length != 1) {
     usage();
   } else {
     PDDocument document = null;
     try {
       document = PDDocument.load(args[0]);
       if (document.isEncrypted()) {
         try {
           document.decrypt("");
         } catch (InvalidPasswordException e) {
           System.err.println("Error: Document is encrypted with a password.");
           System.exit(1);
         }
       }
       PrintTextLocations printer = new PrintTextLocations();
       // The page list is untyped here, hence the wildcard and the cast below.
       List<?> allPages = document.getDocumentCatalog().getAllPages();
       for (int i = 0; i < allPages.size(); i++) {
         PDPage page = (PDPage) allPages.get(i);
         System.out.println("Processing page: " + i);
         PDStream contents = page.getContents();
         if (contents != null) {
           // Reuse the stream that was just null-checked instead of fetching it again.
           printer.processStream(page, page.findResources(), contents.getStream());
         }
       }
     } finally {
       if (document != null) {
         document.close();
       }
     }
   }
 }
Example #3
0
  /**
   * Loads a PDF from the stream and, if it is encrypted, tries to decrypt it with the empty
   * password.
   *
   * <p>Decryption is best effort: a rejected password is reported on standard error, any other
   * cryptography failure has its stack trace printed, and in both cases the loaded document is
   * still returned. If any other exception escapes after the document has been loaded, the
   * document is closed before the exception propagates, so it is not leaked.
   *
   * @param input The input stream for the document.
   * @return The document; the caller is responsible for closing it.
   * @throws IOException If there is an error parsing the document.
   */
  private static PDDocument parseDocument(InputStream input) throws IOException {
    PDDocument document = PDDocument.load(input);
    boolean handedOver = false;
    try {
      if (document.isEncrypted()) {
        try {
          document.decrypt("");
        } catch (org.apache.pdfbox.exceptions.InvalidPasswordException e) {
          System.err.println("Error: The document is encrypted.");
        } catch (org.apache.pdfbox.exceptions.CryptographyException e) {
          e.printStackTrace();
        }
      }
      handedOver = true;
      return document;
    } finally {
      if (!handedOver) {
        // Something escaped above; the caller never receives the document, so release it here.
        try {
          document.close();
        } catch (IOException ignored) {
          // Do not let a failure while closing mask the exception that is already propagating.
        }
      }
    }
  }
Example #4
0
  /**
   * Parses the command line, splits the named PDF and writes each part to its own file.
   *
   * <p>The {@code PASSWORD}, {@code SPLIT}, {@code START_PAGE} and {@code END_PAGE} options each
   * consume the argument that follows them; {@code NONSEQ} selects {@code PDDocument.loadNonSeq}
   * for loading. The first argument that is not an option is taken as the PDF file; any further
   * non-option arguments are ignored.
   *
   * <p>The split point is chosen as follows: an explicit split size always wins; otherwise, if an
   * end page was given the split happens at the end page; otherwise, if only a start page was
   * given, at the document's page count; and with none of these options the document is split
   * into single pages.
   *
   * <p>Each part is written to the input file name with its last four characters removed,
   * followed by {@code -<index>.pdf}.
   *
   * @param args The command line arguments.
   * @throws Exception If the document cannot be loaded, decrypted, split or written, or if a page
   *     option is not a valid integer.
   */
  private void split(String[] args) throws Exception {
    String password = "";
    String split = null;
    String startPage = null;
    String endPage = null;
    boolean useNonSeqParser = false;
    Splitter splitter = new Splitter();
    String pdfFile = null;
    // NOTE(review): each option below reads args[i] right after usage(); this presumably relies
    // on usage() terminating the program - confirm.
    for (int i = 0; i < args.length; i++) {
      if (args[i].equals(PASSWORD)) {
        i++;
        if (i >= args.length) {
          usage();
        }
        password = args[i];
      } else if (args[i].equals(SPLIT)) {
        i++;
        if (i >= args.length) {
          usage();
        }
        split = args[i];
      } else if (args[i].equals(START_PAGE)) {
        i++;
        if (i >= args.length) {
          usage();
        }
        startPage = args[i];
      } else if (args[i].equals(END_PAGE)) {
        i++;
        if (i >= args.length) {
          usage();
        }
        endPage = args[i];
      } else if (args[i].equals(NONSEQ)) {
        useNonSeqParser = true;
      } else {
        if (pdfFile == null) {
          pdfFile = args[i];
        }
      }
    }

    if (pdfFile == null) {
      usage();
    } else {
      PDDocument document = null;
      List<PDDocument> documents = null;
      try {
        if (useNonSeqParser) {
          document = PDDocument.loadNonSeq(new File(pdfFile), null, password);
        } else {
          document = PDDocument.load(pdfFile);
          if (document.isEncrypted()) {
            try {
              document.decrypt(password);
            } catch (InvalidPasswordException e) {
              // NOTE(review): the argument count is used as a stand-in for "a password was
              // supplied"; other option combinations can also yield four arguments - confirm
              // this heuristic is intended.
              if (args.length == 4) // they supplied the wrong password
              {
                System.err.println("Error: The supplied password is incorrect.");
                System.exit(2);
              } else {
                // they didn't supply a password and the default of "" was wrong.
                System.err.println("Error: The document is encrypted.");
                usage();
              }
            }
          }
        }

        int numberOfPages = document.getNumberOfPages();
        // Records that a page range was requested, so the single-page default below
        // does not overwrite the split point derived from that range.
        boolean startEndPageSet = false;
        if (startPage != null) {
          splitter.setStartPage(Integer.parseInt(startPage));
          startEndPageSet = true;
          if (split == null) {
            splitter.setSplitAtPage(numberOfPages);
          }
        }
        if (endPage != null) {
          splitter.setEndPage(Integer.parseInt(endPage));
          startEndPageSet = true;
          if (split == null) {
            splitter.setSplitAtPage(Integer.parseInt(endPage));
          }
        }
        if (split != null) {
          splitter.setSplitAtPage(Integer.parseInt(split));
        } else {
          if (!startEndPageSet) {
            // No split size and no page range: fall back to one page per output file.
            splitter.setSplitAtPage(1);
          }
        }

        documents = splitter.split(document);
        for (int i = 0; i < documents.size(); i++) {
          PDDocument doc = documents.get(i);
          // Assumes the input name ends in a four-character extension such as ".pdf".
          String fileName = pdfFile.substring(0, pdfFile.length() - 4) + "-" + i + ".pdf";
          writeDocument(doc, fileName);
          doc.close();
        }

      } finally {
        if (document != null) {
          document.close();
        }
        // Parts already written were closed in the loop above; this pass also covers the ones
        // that were not reached because an exception interrupted the loop.
        if (documents != null) {
          for (PDDocument doc : documents) {
            doc.close();
          }
        }
      }
    }
  }