/** * This will add the contents to the lucene document. * * @param document The document to add the contents to. * @param is The stream to get the contents from. * @param documentLocation The location of the document, used just for debug messages. * @throws IOException If there is an error parsing the document. */ private void addContent(Document document, InputStream is, String documentLocation) throws IOException { PDDocument pdfDocument = null; PDFTextStripper stripper; try { pdfDocument = PDDocument.load(is); if (pdfDocument.isEncrypted()) { // Just try using the default password and move on pdfDocument.decrypt(""); } // create a writer where to append the text content. StringWriter writer = new StringWriter(); stripper = new PDFTextStripper(); try { stripper.writeText(pdfDocument, writer); } catch (Exception e) { System.out.println("Error in stripper.writeText()"); } String contents = writer.getBuffer().toString(); StringReader reader = new StringReader(contents); addTextField(document, Indexer.contents, reader); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null) { addTextField(document, Indexer.Author, info.getAuthor()); try { addTextField(document, Indexer.created, info.getCreationDate()); } catch (IOException io) { // ignore, bad date but continue with indexing } addTextField(document, Indexer.keywords, info.getKeywords()); try { addTextField(document, Indexer.modified, info.getModificationDate()); } catch (IOException io) { // ignore, bad date but continue with indexing } addTextField(document, "Subject", info.getSubject()); addTextField(document, Indexer.Title, info.getTitle()); } int summarySize = Math.min(contents.length(), 500); String summary = contents.substring(0, summarySize); // Add the summary as an UnIndexed field, so that it is stored and // returned // with hit documents for display. 
addUnindexedField(document, Indexer.summary, summary); } catch (CryptographyException e) { throw new IOException("Error decrypting document(" + documentLocation + "): " + e); } catch (InvalidPasswordException e) { // they didn't suppply a password and the default of "" was wrong. throw new IOException( "Error: The document(" + documentLocation + ") is encrypted and will not be indexed."); } finally { if (pdfDocument != null) { pdfDocument.close(); } } }
/**
 * This will print the documents data.
 *
 * Loads the PDF named by the single command line argument, decrypts it with the
 * empty password if it is encrypted, and feeds the content stream of every page
 * that has one to a {@code PrintTextLocations} instance. The page index printed
 * before each page is zero-based.
 *
 * @param args The command line arguments; exactly one is expected, otherwise
 *        {@code usage()} is called and nothing is processed.
 * @throws Exception If there is an error parsing the document.
 */
public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        usage();
        return;
    }

    PDDocument document = null;
    try {
        document = PDDocument.load(args[0]);
        if (document.isEncrypted()) {
            try {
                document.decrypt("");
            } catch (InvalidPasswordException e) {
                System.err.println("Error: Document is encrypted with a password.");
                // System.exit terminates the JVM here, so the finally block below does not run.
                System.exit(1);
            }
        }

        PrintTextLocations printer = new PrintTextLocations();
        List<?> allPages = document.getDocumentCatalog().getAllPages();
        for (int i = 0; i < allPages.size(); i++) {
            PDPage page = (PDPage) allPages.get(i);
            System.out.println("Processing page: " + i);
            PDStream contents = page.getContents();
            if (contents != null) {
                // Use the instance that was just null-checked instead of looking
                // the contents up a second time and dereferencing it unguarded.
                printer.processStream(page, page.findResources(), contents.getStream());
            }
        }
    } finally {
        if (document != null) {
            document.close();
        }
    }
}
/**
 * This will parse a document.
 *
 * If the document is encrypted, decryption with the empty password is attempted.
 * A failed decryption is only reported on {@code System.err}; the document is
 * still returned in that case, so the caller may receive a document that has
 * not been decrypted.
 *
 * @param input The input stream for the document.
 * @return The document. The caller is responsible for closing it.
 * @throws IOException If there is an error parsing the document.
 */
private static PDDocument parseDocument(InputStream input) throws IOException {
    PDDocument document = PDDocument.load(input);
    boolean returned = false;
    try {
        if (document.isEncrypted()) {
            try {
                document.decrypt("");
            } catch (org.apache.pdfbox.exceptions.InvalidPasswordException e) {
                System.err.println("Error: The document is encrypted.");
            } catch (org.apache.pdfbox.exceptions.CryptographyException e) {
                e.printStackTrace();
            }
        }
        returned = true;
        return document;
    } finally {
        if (!returned) {
            // Some other exception is propagating, so the caller never gets the
            // document and cannot close it; release it here.
            try {
                document.close();
            } catch (IOException ignored) {
                // The exception already propagating is the one worth reporting.
            }
        }
    }
}
private void split(String[] args) throws Exception { String password = ""; String split = null; String startPage = null; String endPage = null; boolean useNonSeqParser = false; Splitter splitter = new Splitter(); String pdfFile = null; for (int i = 0; i < args.length; i++) { if (args[i].equals(PASSWORD)) { i++; if (i >= args.length) { usage(); } password = args[i]; } else if (args[i].equals(SPLIT)) { i++; if (i >= args.length) { usage(); } split = args[i]; } else if (args[i].equals(START_PAGE)) { i++; if (i >= args.length) { usage(); } startPage = args[i]; } else if (args[i].equals(END_PAGE)) { i++; if (i >= args.length) { usage(); } endPage = args[i]; } else if (args[i].equals(NONSEQ)) { useNonSeqParser = true; } else { if (pdfFile == null) { pdfFile = args[i]; } } } if (pdfFile == null) { usage(); } else { PDDocument document = null; List<PDDocument> documents = null; try { if (useNonSeqParser) { document = PDDocument.loadNonSeq(new File(pdfFile), null, password); } else { document = PDDocument.load(pdfFile); if (document.isEncrypted()) { try { document.decrypt(password); } catch (InvalidPasswordException e) { if (args.length == 4) // they supplied the wrong password { System.err.println("Error: The supplied password is incorrect."); System.exit(2); } else { // they didn't supply a password and the default of "" was wrong. 
System.err.println("Error: The document is encrypted."); usage(); } } } } int numberOfPages = document.getNumberOfPages(); boolean startEndPageSet = false; if (startPage != null) { splitter.setStartPage(Integer.parseInt(startPage)); if (split == null) { splitter.setSplitAtPage(numberOfPages); } } if (endPage != null) { splitter.setEndPage(Integer.parseInt(endPage)); if (split == null) { splitter.setSplitAtPage(Integer.parseInt(endPage)); } } if (split != null) { splitter.setSplitAtPage(Integer.parseInt(split)); } else { if (!startEndPageSet) { splitter.setSplitAtPage(1); } } documents = splitter.split(document); for (int i = 0; i < documents.size(); i++) { PDDocument doc = documents.get(i); String fileName = pdfFile.substring(0, pdfFile.length() - 4) + "-" + i + ".pdf"; writeDocument(doc, fileName); doc.close(); } } finally { if (document != null) { document.close(); } for (int i = 0; documents != null && i < documents.size(); i++) { PDDocument doc = (PDDocument) documents.get(i); doc.close(); } } } }