/**
 * Parses a single-page PDF file, draws its only page through a {@code GraphicsProcessor} bound
 * to {@code target}, and stores the page's media box in {@code target.bounds}.
 *
 * <p>The loaded document is always closed before this method returns, whether it completes
 * normally or throws.
 *
 * @param file the PDF file to load
 * @param maxPaths passed through to the {@code GraphicsProcessor} constructor
 * @param monitor progress monitor; a task is begun on entry and finished on success
 * @throws Exception if the document is encrypted, does not contain exactly one page, or cannot
 *     be read or rendered
 */
public void parse(File file, int maxPaths, ProgressMonitor monitor) throws Exception {
  // NOTE(review): the literal 1 is an argument of tr(), not of beginTask(); possibly
  // beginTask(tr("Parsing PDF"), 1) was intended -- confirm before changing.
  monitor.beginTask(tr("Parsing PDF", 1));

  PDDocument document = PDDocument.load(file);
  try {
    if (document.isEncrypted()) {
      throw new Exception(tr("Encrypted documents not supported."));
    }

    List<?> allPages = document.getDocumentCatalog().getAllPages();
    if (allPages.size() != 1) {
      throw new Exception(tr("The PDF file must have exactly one page."));
    }

    PDPage page = (PDPage) allPages.get(0);
    PDRectangle pageSize = page.findMediaBox();

    // A missing rotation entry is treated as "no rotation".
    Integer rotationVal = page.getRotation();
    int rotation = (rotationVal != null) ? rotationVal.intValue() : 0;

    GraphicsProcessor p = new GraphicsProcessor(target, rotation, maxPaths, monitor);
    PageDrawer drawer = new PageDrawer();
    drawer.drawPage(p, page);

    this.target.bounds =
        new Rectangle2D.Double(
            pageSize.getLowerLeftX(),
            pageSize.getLowerLeftY(),
            pageSize.getWidth(),
            pageSize.getHeight());
  } finally {
    // Previously the document was never closed, leaking its underlying resources.
    document.close();
  }

  monitor.finishTask();
}
public void saveDocumentIncrementally( PAdESSignatureParameters parameters, File signedFile, FileOutputStream fileOutputStream, PDDocument pdDocument) throws DSSException { FileInputStream signedFileInputStream = null; try { signedFileInputStream = new FileInputStream(signedFile); // the document needs to have an ID, if not a ID based on the current system time is used, and // then the // digest of the signed data is // different if (pdDocument.getDocumentId() == null) { final byte[] documentIdBytes = DSSUtils.digest( DigestAlgorithm.MD5, parameters.bLevel().getSigningDate().toString().getBytes()); pdDocument.setDocumentId(DSSUtils.toLong(documentIdBytes)); pdDocument.setDocumentId(0L); } pdDocument.saveIncremental(signedFileInputStream, fileOutputStream); } catch (IOException e) { throw new DSSException(e); } catch (COSVisitorException e) { throw new DSSException(e); } finally { IOUtils.closeQuietly(signedFileInputStream); } }
public static String extractText(InputStream src) throws IOException { StringBuilder text = new StringBuilder(); COSDocument cosDoc = null; PDDocument pdDoc = null; try { PDFParser parser = new PDFParser(src); parser.parse(); cosDoc = parser.getDocument(); PDFTextStripper stripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); int nbPages = pdDoc.getDocumentCatalog().getPages().getCount(); for (int i = 0; i < nbPages; i++) { stripper.setStartPage(i + 1); stripper.setEndPage(i + 1); text.append(stripper.getText(pdDoc)); } } finally { try { if (cosDoc != null) { cosDoc.close(); } } catch (IOException e) { // Do nada } try { if (pdDoc != null) { pdDoc.close(); } } catch (IOException e) { // Do nada } } return text.toString(); }
/**
 * Returns the textual content of a file, chosen by its extension: {@code doc} files are read
 * with {@code WordExtractor}, {@code pdf} files with {@code PDFTextStripper}, and names that
 * {@code StringHelper.isHTML} accepts are loaded via {@code loadStringFromFile}.
 *
 * <p>Errors while reading are logged and result in an empty string (best effort). A {@code null}
 * file is logged and also yields an empty string.
 *
 * @param file the file to read, may be {@code null}
 * @return the extracted text, or an empty string if the type is unsupported or reading failed
 * @throws FileNotFoundException declared for compatibility with existing callers
 * @throws IOException declared for compatibility with existing callers
 */
public static String getFileContent(File file) throws FileNotFoundException, IOException {
  // The null check has to happen before file.getName() is called; previously it came too late
  // and a null file caused a NullPointerException instead of the warning.
  if (file == null) {
    logger.warning("file not found : " + file);
    return "";
  }

  String ext = FilenameUtils.getExtension(file.getName());
  String outContent = "";
  try {
    if ("doc".equalsIgnoreCase(ext)) {
      FileInputStream in = new FileInputStream(file);
      try {
        WordExtractor we = new WordExtractor(in);
        outContent = we.getText();
      } finally {
        // The stream was previously never closed.
        in.close();
      }
    } else if ("pdf".equalsIgnoreCase(ext)) {
      PDDocument doc = PDDocument.load(file);
      try {
        PDFTextStripper text = new PDFTextStripper();
        outContent = text.getText(doc);
      } finally {
        // Close even when text extraction fails.
        doc.close();
      }
    } else if (StringHelper.isHTML(file.getName())) {
      return loadStringFromFile(file);
    }
  } catch (Throwable t) {
    logger.warning("error when read : " + file + "+ [" + t.getMessage() + "]");
    t.printStackTrace();
  }
  return outContent;
}
/**
 * Loads a PDF from {@code inputStream} and generates the thumbnail (from the first page) and/or
 * the per-page previews for {@code fileVersion}, depending on what
 * {@code _isGenerateThumbnail} and {@code _isGeneratePreview} report.
 *
 * @param fileVersion the file version the images belong to
 * @param inputStream stream providing the PDF content
 * @throws Exception if loading the document or generating an image fails
 */
private void _generateImagesPB(FileVersion fileVersion, InputStream inputStream)
    throws Exception {
  final boolean wantPreview = _isGeneratePreview(fileVersion);
  final boolean wantThumbnail = _isGenerateThumbnail(fileVersion);

  PDDocument pdDocument = null;
  try {
    pdDocument = PDDocument.load(inputStream);
    List<PDPage> pages = pdDocument.getDocumentCatalog().getAllPages();

    // The thumbnail is always rendered from the first page, before any preview.
    if (wantThumbnail && !pages.isEmpty()) {
      _generateImagesPB(
          fileVersion,
          pages.get(0),
          PropsValues.DL_FILE_ENTRY_THUMBNAIL_DPI,
          PropsValues.DL_FILE_ENTRY_THUMBNAIL_HEIGHT,
          PropsValues.DL_FILE_ENTRY_THUMBNAIL_WIDTH,
          true,
          0);
      if (_log.isInfoEnabled()) {
        _log.info("PDFBox generated a thumbnail for " + fileVersion.getFileVersionId());
      }
    }

    if (wantPreview) {
      // Preview indices are 1-based.
      int previewIndex = 1;
      for (PDPage page : pages) {
        _generateImagesPB(
            fileVersion,
            page,
            PropsValues.DL_FILE_ENTRY_PREVIEW_DOCUMENT_DPI,
            PropsValues.DL_FILE_ENTRY_PREVIEW_DOCUMENT_HEIGHT,
            PropsValues.DL_FILE_ENTRY_PREVIEW_DOCUMENT_WIDTH,
            false,
            previewIndex);
        previewIndex++;
      }
    }

    if (_log.isInfoEnabled() && wantPreview) {
      _log.info(
          "PDFBox generated "
              + getPreviewFileCount(fileVersion)
              + " preview pages for "
              + fileVersion.getFileVersionId());
    }
  } finally {
    if (pdDocument != null) {
      pdDocument.close();
    }
  }
}
public static void main(String[] args) throws IOException { try (PDDocument doc = new PDDocument()) { PDPage page = new PDPage(); // Create a landscape page // page.setMediaBox(new PDRectangle(PDRectangle.A4.getHeight(), // PDRectangle.A4.getWidth())); doc.addPage(page); // Initialize table float margin = 10; float tableWidth = page.getMediaBox().getWidth() - (2 * margin); float yStartNewPage = page.getMediaBox().getHeight() - (2 * margin); float yStart = yStartNewPage; float bottomMargin = 0; // Create the data List<List> data = new ArrayList<>(); data.add(new ArrayList<>(Arrays.asList("Key", "Value"))); for (int i = 1; i <= 5; i++) { data.add(new ArrayList<>(Arrays.asList(String.valueOf(i), "value:" + i))); } BaseTable dataTable = new BaseTable( yStart, yStartNewPage, bottomMargin, tableWidth, margin, doc, page, true, true); DataTable t = new DataTable(dataTable, page); t.addListToTable(data, DataTable.HASHEADER); dataTable.draw(); File file = new File("box.pdf"); System.out.println("Sample file saved at : " + file.getAbsolutePath()); doc.save(file); } }
private String pdfToText(InputStream in) { PDFParser parser = null; PDDocument pdDoc = null; COSDocument cosDoc = null; PDFTextStripper pdfStripper; try { parser = new PDFParser(in); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); return pdfStripper.getText(pdDoc); // System.out.println(parsedText.replaceAll("[^A-Za-z0-9. ]+", "")); } catch (Exception e) { e.printStackTrace(); try { if (cosDoc != null) cosDoc.close(); if (pdDoc != null) pdDoc.close(); } catch (Exception e1) { e.printStackTrace(); } } return null; }
/**
 * Extracts the text of a PDF and returns it as a character array. Takes an input stream rather
 * than a file name. Position-based sorting of the text is disabled.
 *
 * @param filesInputStream an input stream providing the content of a PDF file
 * @return the extracted text
 * @throws IOException if the PDF cannot be loaded or its text cannot be extracted
 */
private static char[] loadPDF(InputStream filesInputStream) throws IOException {
  PDDocument doc = PDDocument.load(filesInputStream);
  try {
    PDFTextStripper pdfStripper = new PDFTextStripper();
    pdfStripper.setSortByPosition(false);
    return pdfStripper.getText(doc).toCharArray();
  } finally {
    // Close even when extraction throws; previously the document leaked in that case.
    doc.close();
  }
}
/**
 * Decrypts the given document. The document is remembered in {@code this.document}, then its
 * encryption dictionary and document ID are handed to {@code prepareForDecryption} before
 * {@code proceedDecryption} runs.
 *
 * @param doc the document to decrypt
 * @param decryptionMaterial the data used to decrypt the document
 * @throws CryptographyException if there is an error during decryption
 * @throws IOException if there is an error accessing data
 */
public void decryptDocument(PDDocument doc, DecryptionMaterial decryptionMaterial)
    throws CryptographyException, IOException {
  this.document = doc;

  final PDEncryptionDictionary encryptionDictionary = doc.getEncryptionDictionary();
  prepareForDecryption(
      encryptionDictionary, doc.getDocument().getDocumentID(), decryptionMaterial);

  proceedDecryption();
}
/**
 * Runs the given parser and returns the text of the parsed PDF.
 *
 * <p>Both the low-level and the high-level document are closed before returning, including when
 * text extraction fails.
 *
 * @param parser a parser that has not been run yet
 * @return the text of the parsed document
 * @throws IOException if parsing, text extraction or closing fails
 */
public static String getContent(PDFParser parser) throws IOException {
  parser.parse();
  COSDocument cosDoc = parser.getDocument();
  try {
    PDFTextStripper pdfStripper = new PDFTextStripper();
    PDDocument pdDoc = new PDDocument(cosDoc);
    try {
      return pdfStripper.getText(pdDoc);
    } finally {
      pdDoc.close();
    }
  } finally {
    // Also covers the case where constructing the stripper or the PDDocument failed.
    cosDoc.close();
  }
}
/**
 * Generates the report document via {@code generate(...)}, writes it to {@code stream} and
 * closes the generated document.
 *
 * @param pageConfig page layout passed to {@code generate}
 * @param templateResource template passed to {@code generate}
 * @param report report structure passed to {@code generate}
 * @param stream destination of the saved PDF; not closed by this method
 * @param document document passed to {@code generate}
 * @throws IOException if generating, saving or closing fails
 */
public void printToStream(
    PdfPageLayout pageConfig,
    Resource templateResource,
    PdfReportStructure report,
    OutputStream stream,
    PDDocument document)
    throws IOException {
  PDDocument page = generate(pageConfig, templateResource, report, document);
  try {
    page.save(stream);
  } finally {
    // Close even when saving fails; previously the document leaked in that case.
    page.close();
  }
}
/** * This will add the contents to the lucene document. * * @param document The document to add the contents to. * @param is The stream to get the contents from. * @param documentLocation The location of the document, used just for debug messages. * @throws IOException If there is an error parsing the document. */ private void addContent(Document document, InputStream is, String documentLocation) throws IOException { PDDocument pdfDocument = null; try { pdfDocument = PDDocument.load(is, ""); // create a writer where to append the text content. StringWriter writer = new StringWriter(); if (stripper == null) { stripper = new PDFTextStripper(); } stripper.writeText(pdfDocument, writer); // Note: the buffer to string operation is costless; // the char array value of the writer buffer and the content string // is shared as long as the buffer content is not modified, which will // not occur here. String contents = writer.getBuffer().toString(); StringReader reader = new StringReader(contents); // Add the tag-stripped contents as a Reader-valued Text field so it will // get tokenized and indexed. addTextField(document, "contents", reader); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null) { addTextField(document, "Author", info.getAuthor()); addTextField(document, "CreationDate", info.getCreationDate()); addTextField(document, "Creator", info.getCreator()); addTextField(document, "Keywords", info.getKeywords()); addTextField(document, "ModificationDate", info.getModificationDate()); addTextField(document, "Producer", info.getProducer()); addTextField(document, "Subject", info.getSubject()); addTextField(document, "Title", info.getTitle()); addTextField(document, "Trapped", info.getTrapped()); } int summarySize = Math.min(contents.length(), 500); String summary = contents.substring(0, summarySize); // Add the summary as an UnIndexed field, so that it is stored and returned // with hit documents for display. 
addUnindexedField(document, "summary", summary); } catch (InvalidPasswordException e) { // they didn't suppply a password and the default of "" was wrong. throw new IOException( "Error: The document(" + documentLocation + ") is encrypted and will not be indexed.", e); } finally { if (pdfDocument != null) { pdfDocument.close(); } } }
public static String loadPdfToString(String filename) throws IOException { PDDocument document = PDDocument.load(new File(filename)); PDFTextStripper reader = new PDFTextStripper(); String pdfText = reader.getText(document); document.close(); // writing pdf text to a text file // try (FileWriter fw = new FileWriter("data/sigmod.txt")) { // fw.write(pdfText); // } return pdfText; }
@SuppressWarnings("unchecked") public static void main_3(String[] args) throws IOException { PDDocument doc = PDDocument.load(iconFile); List<PDPage> pages = doc.getDocumentCatalog().getAllPages(); List<COSObject> objects = doc.getDocument().getObjects(); for (COSObject cosObject : objects) { COSBase cosbase = cosObject.getObject(); if (cosObject.getObject() instanceof COSStream) { COSStream cosstream = (COSStream) cosbase; COSBase filter = cosstream.getDictionaryObject(COSName.FILTER); COSBase subtype = cosstream.getDictionaryObject(COSName.SUBTYPE); if (subtype != null && subtype.equals(COSName.IMAGE)) { System.out.println(filter); InputStream filtered = cosstream.getFilteredStream(); // PDStream stream = new PDStream(costream); System.out.println(Hex.encodeHex(IOUtils.toByteArray(filtered))); } } } for (PDPage pdPage : pages) { PDResources resources = pdPage.getResources(); Map<String, PDXObject> images = resources.getXObjects(); Set<String> keys = images.keySet(); for (String key : keys) { PDXObject image = images.get(key); byte[] imgData = image.getPDStream().getByteArray(); System.out.println(Hex.encodeHex(imgData)); } } }
/**
 * Loads a PDF document from a stream. If the document is encrypted, decryption with an empty
 * password is attempted; a failure of that attempt is only reported on the error stream and the
 * document is returned regardless.
 *
 * @param input the input stream for the document
 * @return the loaded document
 * @throws IOException if there is an error parsing the document
 */
private static PDDocument parseDocument(InputStream input) throws IOException {
  final PDDocument loaded = PDDocument.load(input);
  if (!loaded.isEncrypted()) {
    return loaded;
  }

  try {
    loaded.decrypt("");
  } catch (org.apache.pdfbox.exceptions.InvalidPasswordException wrongPassword) {
    System.err.println("Error: The document is encrypted.");
  } catch (org.apache.pdfbox.exceptions.CryptographyException cryptoFailure) {
    cryptoFailure.printStackTrace();
  }
  return loaded;
}
/**
 * Extracts the text of all pages of a PDF file, sorted by position.
 *
 * <p>The file stream and both document objects are closed on every path, including when parsing
 * or text extraction throws.
 *
 * @param pdfFile the PDF file to read; checked with {@code checkNotNull}
 * @return the extracted text
 * @throws IOException if the file cannot be read, parsed or closed
 */
public static String extract(File pdfFile) throws IOException {
  checkNotNull(pdfFile, "pdfFile");

  FileInputStream in = new FileInputStream(pdfFile);
  try {
    PDFParser parser = new PDFParser(in);
    parser.parse();
    COSDocument cosDoc = parser.getDocument();
    try {
      PDFTextStripper pdfStripper = new PDFTextStripper();
      PDDocument pdDoc = new PDDocument(cosDoc);
      try {
        pdfStripper.setStartPage(1);
        pdfStripper.setEndPage(pdDoc.getNumberOfPages());
        pdfStripper.setSortByPosition(true);
        return pdfStripper.getText(pdDoc);
      } finally {
        pdDoc.close();
      }
    } finally {
      cosDoc.close();
    }
  } finally {
    // Closing an already-closed FileInputStream is harmless, so this is safe even if the parser
    // closed the stream itself.
    in.close();
  }
}
/** * Returns the given page as an RGB image at the given scale. * * @param pageIndex the zero-based index of the page to be converted * @param scale the scaling factor, where 1 = 72 DPI * @param config the bitmap config to create * @return the rendered page image * @throws IOException if the PDF cannot be read */ public Bitmap renderImage(int pageIndex, float scale, Bitmap.Config config) throws IOException { PDPage page = document.getPage(pageIndex); PDRectangle cropbBox = page.getCropBox(); float widthPt = cropbBox.getWidth(); float heightPt = cropbBox.getHeight(); int widthPx = Math.round(widthPt * scale); int heightPx = Math.round(heightPt * scale); int rotationAngle = page.getRotation(); // swap width and height Bitmap image; if (rotationAngle == 90 || rotationAngle == 270) { image = Bitmap.createBitmap(heightPx, widthPx, config); } else { image = Bitmap.createBitmap(widthPx, heightPx, config); } // use a transparent background if the imageType supports alpha Paint paint = new Paint(); Canvas canvas = new Canvas(image); if (config != Bitmap.Config.ARGB_8888) { paint.setColor(Color.WHITE); paint.setStyle(Paint.Style.FILL); canvas.drawRect(0, 0, image.getWidth(), image.getHeight(), paint); paint.reset(); } renderPage(page, paint, canvas, image.getWidth(), image.getHeight(), scale, scale); return image; }
/** * Returns the given page as an RGB or ARGB image at the given scale. * * @param pageIndex the zero-based index of the page to be converted * @param scale the scaling factor, where 1 = 72 DPI * @param imageType the type of image to return * @return the rendered page image * @throws IOException if the PDF cannot be read */ public BufferedImage renderImage(int pageIndex, float scale, ImageType imageType) throws IOException { PDPage page = document.getPage(pageIndex); PDRectangle cropbBox = page.getCropBox(); float widthPt = cropbBox.getWidth(); float heightPt = cropbBox.getHeight(); int widthPx = Math.round(widthPt * scale); int heightPx = Math.round(heightPt * scale); int rotationAngle = page.getRotation(); // swap width and height BufferedImage image; if (rotationAngle == 90 || rotationAngle == 270) { image = new BufferedImage(heightPx, widthPx, imageType.toBufferedImageType()); } else { image = new BufferedImage(widthPx, heightPx, imageType.toBufferedImageType()); } // use a transparent background if the imageType supports alpha Graphics2D g = image.createGraphics(); if (imageType == ImageType.ARGB) { g.setBackground(new Color(0, 0, 0, 0)); } else { g.setBackground(Color.WHITE); } renderPage(page, g, image.getWidth(), image.getHeight(), scale, scale); g.dispose(); return image; }
private void extractEmbeddedDocuments(PDDocument document) throws IOException, SAXException, TikaException { PDDocumentNameDictionary namesDictionary = new PDDocumentNameDictionary(document.getDocumentCatalog()); PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles(); if (efTree == null) { return; } Map<String, PDComplexFileSpecification> embeddedFileNames = efTree.getNames(); // For now, try to get the embeddedFileNames out of embeddedFiles or its kids. // This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java // If there is a need we could add a fully recursive search to find a non-null // Map<String, COSObjectable> that contains the doc info. if (embeddedFileNames != null) { processEmbeddedDocNames(embeddedFileNames); } else { List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids(); if (kids == null) { return; } for (PDNameTreeNode<PDComplexFileSpecification> node : kids) { embeddedFileNames = node.getNames(); if (embeddedFileNames != null) { processEmbeddedDocNames(embeddedFileNames); } } } }
/**
 * Extracts the clickable URI links of a PDF, page by page.
 *
 * @param pdf the document to parse
 * @return an array with one collection per page, in page order, holding the links detected on
 *     that page; a page's collection may be empty or incomplete if an {@code IOException}
 *     occurred while its annotations were processed
 */
private Collection<AnchorURL>[] extractPdfLinks(final PDDocument pdf) {
  @SuppressWarnings("unchecked")
  List<PDPage> pages = pdf.getDocumentCatalog().getAllPages();
  @SuppressWarnings("unchecked")
  Collection<AnchorURL>[] linksPerPage =
      (Collection<AnchorURL>[]) new Collection<?>[pages.size()];

  int pageIndex = 0;
  for (PDPage page : pages) {
    final Collection<AnchorURL> pageLinks = new ArrayList<AnchorURL>();
    try {
      List<PDAnnotation> annotations = page.getAnnotations();
      if (annotations != null) {
        for (PDAnnotation annotation : annotations) {
          if (!(annotation instanceof PDAnnotationLink)) {
            continue;
          }
          // instanceof is false for null, so a missing action is skipped as well.
          PDAction action = ((PDAnnotationLink) annotation).getAction();
          if (!(action instanceof PDActionURI)) {
            continue;
          }
          String uri = ((PDActionURI) action).getURI();
          pageLinks.add(new AnchorURL(uri));
        }
      }
    } catch (IOException ignored) {
      // Best effort: the links collected so far for this page are kept, the rest of the page is
      // skipped.
    }
    linksPerPage[pageIndex] = pageLinks;
    pageIndex++;
  }
  return linksPerPage;
}
/** * Constructor sets the input and output file and convert the pdf, docx and doc files to text . * * @param infile,outfile * @return */ public TextConvertor(String infile, String outfile) { try { File input = new File(infile); // The file from where you would like to extract FileInputStream fis = new FileInputStream(input.getAbsolutePath()); int x = fis.read(); int y = fis.read(); fis = new FileInputStream(input.getAbsolutePath()); if (x == 37 && y == 80) { filetype = "pdf"; pd = PDDocument.load(input); PDF2Text(outfile); } else if (x == 80 && y == 75) { filetype = "docx"; dx = new XWPFDocument(fis); DOCX2Text(outfile); } else if (x == 208 && y == 207) { filetype = "doc"; dc = new HWPFDocument(fis); DOC2Text(outfile); } } catch (Exception e) { e.printStackTrace(); } }
/**
 * Loads a PDF from the given stream and writes its text via {@code writeText(PDDocument)},
 * after resetting {@code cas} and {@code text}. The document is always closed afterwards.
 *
 * @param aCas the CAS assigned to {@code cas} before the text is written
 * @param aIs stream providing the PDF content
 * @throws IOException if the document cannot be loaded or processed, or if it is encrypted
 */
public void writeText(final CAS aCas, final InputStream aIs) throws IOException {
  final PDDocument pdf = PDDocument.load(aIs);
  try {
    final boolean encrypted = pdf.isEncrypted();
    if (encrypted) {
      throw new IOException("Encrypted documents currently not supported");
    }

    // State is only reset once the document is known to be readable.
    cas = aCas;
    text = new StringBuilder();
    writeText(pdf);
  } finally {
    pdf.close();
  }
}
/**
 * Saves the report document to the given file.
 *
 * @param fileName path of the file to write
 * @throws ReportException if saving fails; the message contains the underlying error message
 */
@Override
public void outputReportToFile(String fileName) throws ReportException {
  try {
    doc.save(fileName);
  } catch (IOException saveFailure) {
    // NOTE(review): only the message of the original exception is preserved, not the exception
    // itself; chain it as the cause if ReportException offers a suitable constructor.
    final String reason = "Error in report save to file: " + saveFailure.getMessage();
    throw new ReportException(reason);
  }
}
/**
 * Closes the superclass first and then the underlying document. A failure to close the document
 * is printed to the error stream and not propagated.
 */
@Override
public void close() {
  super.close();

  try {
    doc.close();
  } catch (IOException closeFailure) {
    closeFailure.printStackTrace();
  }
}
/**
 * Reads the content of the PDF at {@code filePath}. The primary path uses {@code PdfReader} and
 * {@code getContents()}; if that fails with any exception, the text is extracted with PDFBox
 * instead and stored in {@code fileContents}.
 *
 * @param filePath path of the PDF file
 */
ExtractPageContent(String filePath) {
  this.filePath = filePath;
  try {
    reader = new PdfReader(filePath);
    parser = new PdfReaderContentParser(reader);
    getContents();
  } catch (Exception e) {
    // Fallback: extract the text with PDFBox.
    try {
      PDDocument doc = PDDocument.load(filePath);
      try {
        PDFTextStripper stripper = new PDFTextStripper();
        this.fileContents = stripper.getText(doc);
      } finally {
        // Close even when extraction throws; previously the document leaked in that case.
        doc.close();
      }
    } catch (IOException ignored) {
      // Best effort: if the fallback fails as well, fileContents is simply not set here.
    }
  }
}
/** * Método para la indexación individual de cada fichero PDF * * @param f el fichero PDF * @param writer el IndexWriter * @throws IOException */ public static void indexFile(File f, IndexWriter writer) throws IOException { // Cargamos el fichero mediante PDFBox PDDocument pddDocument = PDDocument.load(f.getAbsolutePath()); PDFTextStripper textStripper = new PDFTextStripper(); int numPages = pddDocument.getNumberOfPages(); String pageContent; // Declaramos un Field propio FieldType fieldText = new FieldType(); fieldText.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); fieldText.setStored(false); fieldText.setStoreTermVectorOffsets(true); fieldText.setStoreTermVectorPositions(true); fieldText.setStoreTermVectors(true); // Recorremos e indexamos cada una de las páginas del fichero, almacenando el número de página y // el título del fichero, e indexando el contenido for (int i = 0; i < numPages; i++) { if (i == 0) { i++; } textStripper.setStartPage(i); textStripper.setEndPage(i); // coger una página pageContent = textStripper.getText(pddDocument); if (pageContent != null && !pageContent.isEmpty()) { pageContent = pageContent.toLowerCase(); } if (pageContent != null) { // Declaramos el documento a indexar para esa página // Número de página // Contenido de la página // Título del fichero // Añadimos el documento } } // Cerramos el fichero PDF }
/**
 * Returns a mapping with computed page labels as keys and the corresponding 0-based page
 * indices as values. The returned map contains at most as many entries as the document has
 * pages.
 *
 * <p><strong>NOTE:</strong> If the document contains duplicate page labels, the returned map
 * contains <em>fewer</em> entries than the document has pages, because a later label reported
 * by {@code computeLabels} replaces an earlier identical one.
 *
 * @return a mapping from labels to 0-based page indices
 */
public Map<String, Integer> getPageIndicesByLabels() {
  final int pageCount = doc.getNumberOfPages();
  final Map<String, Integer> indicesByLabel = new HashMap<String, Integer>(pageCount);

  // Records every label reported by computeLabels; duplicates overwrite earlier entries.
  final LabelHandler collector =
      new LabelHandler() {
        public void newLabel(int pageIndex, String label) {
          indicesByLabel.put(label, pageIndex);
        }
      };
  computeLabels(collector);

  return indicesByLabel;
}
private void openPDFFile(String file) throws Exception { if (document != null) { document.close(); documentPanel.removeAll(); } InputStream input = null; File f = new File(file); input = new FileInputStream(f); document = parseDocument(input); pages = document.getDocumentCatalog().getAllPages(); numberOfPages = pages.size(); // AH* Sidantal till GUI: sumPan.Sidantal.setText("" + 1 + " Av " + numberOfPages); sumPan.sidnrantal = numberOfPages; currentFilename = f.getAbsolutePath(); // AH* Borttagen i senare version. currentPage = 0; updateTitle(); showPage(0); }
/**
 * Sends a one-page PDF containing {@code "Test"} together with the text {@code "Append"} to
 * {@code direct:start} and expects a two-page PDF containing both texts on the result endpoint.
 */
@Test
public void testAppend() throws Exception {
  final String originalText = "Test";
  final String textToAppend = "Append";

  // Build the input document: a single A4 page containing the original text.
  PDDocument document = new PDDocument();
  PDPage page = new PDPage(PDPage.PAGE_SIZE_A4);
  document.addPage(page);
  PDPageContentStream contentStream = new PDPageContentStream(document, page);
  contentStream.setFont(PDType1Font.HELVETICA, 12);
  contentStream.beginText();
  contentStream.moveTextPositionByAmount(20, 400);
  contentStream.drawString(originalText);
  contentStream.endText();
  contentStream.close();

  template.sendBodyAndHeader(
      "direct:start", textToAppend, PdfHeaderConstants.PDF_DOCUMENT_HEADER_NAME, document);

  resultEndpoint.setExpectedMessageCount(1);
  resultEndpoint.expectedMessagesMatches(
      new Predicate() {
        @Override
        public boolean matches(Exchange exchange) {
          Object body = exchange.getIn().getBody();
          assertThat(body, instanceOf(ByteArrayOutputStream.class));
          try {
            PDDocument doc =
                PDDocument.load(
                    new ByteArrayInputStream(((ByteArrayOutputStream) body).toByteArray()));
            try {
              PDFTextStripper pdfTextStripper = new PDFTextStripper();
              String text = pdfTextStripper.getText(doc);
              assertEquals(2, doc.getNumberOfPages());
              assertThat(text, containsString(originalText));
              assertThat(text, containsString(textToAppend));
            } finally {
              // The document loaded for verification was previously never closed.
              doc.close();
            }
          } catch (IOException e) {
            throw new RuntimeException(e);
          }
          return true;
        }
      });
  resultEndpoint.assertIsSatisfied();
}
/**
 * Regression test: extracting the first page of {@code labor.pdf} must not fail with a
 * {@code NullPointerException}.
 */
@Test
public void testDontThrowNPEInShfill() throws IOException {
  PDDocument pdf_document = PDDocument.load("src/test/resources/technology/tabula/labor.pdf");
  try {
    ObjectExtractor oe = new ObjectExtractor(pdf_document);
    PageIterator pi = oe.extract();
    try {
      pi.next();
    } catch (NullPointerException e) {
      fail("NPE in ObjectExtractor " + e.toString());
    }
  } finally {
    // The document was previously never closed.
    pdf_document.close();
  }
}