public static String extractText(InputStream src) throws IOException { StringBuilder text = new StringBuilder(); COSDocument cosDoc = null; PDDocument pdDoc = null; try { PDFParser parser = new PDFParser(src); parser.parse(); cosDoc = parser.getDocument(); PDFTextStripper stripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); int nbPages = pdDoc.getDocumentCatalog().getPages().getCount(); for (int i = 0; i < nbPages; i++) { stripper.setStartPage(i + 1); stripper.setEndPage(i + 1); text.append(stripper.getText(pdDoc)); } } finally { try { if (cosDoc != null) { cosDoc.close(); } } catch (IOException e) { // Do nada } try { if (pdDoc != null) { pdDoc.close(); } } catch (IOException e) { // Do nada } } return text.toString(); }
private String pdfToText(InputStream in) { PDFParser parser = null; PDDocument pdDoc = null; COSDocument cosDoc = null; PDFTextStripper pdfStripper; try { parser = new PDFParser(in); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); return pdfStripper.getText(pdDoc); // System.out.println(parsedText.replaceAll("[^A-Za-z0-9. ]+", "")); } catch (Exception e) { e.printStackTrace(); try { if (cosDoc != null) cosDoc.close(); if (pdDoc != null) pdDoc.close(); } catch (Exception e1) { e.printStackTrace(); } } return null; }
/**
 * Runs {@code parser} and returns the text of the parsed document.
 *
 * <p>The COS document obtained from the parser, and the PD document wrapped
 * around it, are closed before this method returns or throws. If both the
 * extraction and a {@code close()} call fail, the exception from
 * {@code close()} is the one that propagates.
 *
 * @param parser a parser on which {@code parse()} has not yet been called
 * @return the extracted text
 * @throws IOException if parsing, text extraction, or closing fails
 */
public static String getContent(PDFParser parser) throws IOException {
    parser.parse();
    COSDocument cosDoc = parser.getDocument();
    PDDocument pdDoc = null;
    try {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        return pdfStripper.getText(pdDoc);
    } finally {
        // Nested finally so the PD document is still closed when closing the
        // COS document throws.
        try {
            cosDoc.close();
        } finally {
            if (pdDoc != null) {
                pdDoc.close();
            }
        }
    }
}
/**
 * Extracts the textual contents from a PDF file as one long string.
 *
 * <p>The file stream is closed as soon as parsing has finished (or failed),
 * and the parsed COS document is closed once the text has been extracted
 * (or extraction has failed).
 *
 * @param f the PDF file to read
 * @return the extracted text
 * @throws IOException if the file cannot be opened, parsed, or stripped
 */
public String extractPDFContents(File f) throws IOException {
    PDFParser parser;
    FileInputStream fi = new FileInputStream(f);
    try {
        parser = new PDFParser(fi);
        parser.parse();
    } finally {
        // Release the file handle even when the parser rejects the input.
        fi.close();
    }

    COSDocument cd = parser.getDocument();
    try {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(new PDDocument(cd));
    } finally {
        cd.close();
    }
}
/**
 * Extracts the text of every page of {@code pdfFile}, with the stripper's
 * sort-by-position mode enabled.
 *
 * <p>The file stream, the PD document and the COS document are all closed
 * before this method returns or throws.
 *
 * @param pdfFile the PDF file to read; checked with {@code checkNotNull}
 * @return the extracted text
 * @throws IOException if the file cannot be opened, parsed, or stripped
 */
public static String extract(File pdfFile) throws IOException {
    checkNotNull(pdfFile, "pdfFile");
    FileInputStream in = new FileInputStream(pdfFile);
    try {
        PDFParser parser = new PDFParser(in);
        parser.parse();
        COSDocument cosDoc = parser.getDocument();
        PDDocument pdDoc = null;
        try {
            PDFTextStripper pdfStripper = new PDFTextStripper();
            pdDoc = new PDDocument(cosDoc);
            pdfStripper.setStartPage(1);
            pdfStripper.setEndPage(pdDoc.getNumberOfPages());
            pdfStripper.setSortByPosition(true);
            return pdfStripper.getText(pdDoc);
        } finally {
            // Nested finally so the COS document is still closed when closing
            // the PD document throws.
            try {
                if (pdDoc != null) {
                    pdDoc.close();
                }
            } finally {
                cosDoc.close();
            }
        }
    } finally {
        in.close();
    }
}
/**
 * Reads the PDF at {@code fileName} and returns its text, reporting progress
 * and failures on standard output.
 *
 * <p>Intermediate objects are stored in the {@code parser}, {@code cosDoc},
 * {@code pdfStripper}, {@code pdDoc} and {@code parsedText} fields. The file
 * stream and both documents are closed before this method returns, on
 * success as well as on failure, so after the call {@code cosDoc} and
 * {@code pdDoc} refer to closed documents. Each close is attempted
 * independently and close failures are printed, not thrown.
 *
 * @param fileName path of the PDF file to read
 * @return the extracted text, or {@code null} if the path is not a regular
 *         file, the parser cannot be created, or parsing/stripping fails
 */
String pdftoText(String fileName) {
    System.out.println("Parsing text from PDF file " + fileName + "....");
    File f = new File(fileName);
    if (!f.isFile()) {
        System.out.println("File " + fileName + " does not exist.");
        return null;
    }

    FileInputStream in = null;
    try {
        in = new FileInputStream(f);
        parser = new PDFParser(in);
    } catch (Exception e) {
        System.out.println("Unable to open PDF Parser.");
        // The stream may have been opened before the parser constructor failed.
        if (in != null) {
            try {
                in.close();
            } catch (Exception closeError) {
                closeError.printStackTrace();
            }
        }
        return null;
    }

    try {
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        parsedText = pdfStripper.getText(pdDoc);
    } catch (Exception e) {
        System.out.println("An exception occured in parsing the PDF Document.");
        e.printStackTrace();
        return null;
    } finally {
        // parsedText is already a plain String at this point, so the
        // documents and the stream can be released on every path.
        if (cosDoc != null) {
            try {
                cosDoc.close();
            } catch (Exception closeError) {
                closeError.printStackTrace();
            }
        }
        if (pdDoc != null) {
            try {
                pdDoc.close();
            } catch (Exception closeError) {
                closeError.printStackTrace();
            }
        }
        try {
            in.close();
        } catch (Exception closeError) {
            closeError.printStackTrace();
        }
    }

    System.out.println("Done.");
    return parsedText;
}