private String pdfToText(InputStream in) {
    PDFParser parser = null;
    PDDocument pdDoc = null;
    COSDocument cosDoc = null;
    PDFTextStripper pdfStripper;

    try {
      parser = new PDFParser(in);
      parser.parse();
      cosDoc = parser.getDocument();
      pdfStripper = new PDFTextStripper();
      pdDoc = new PDDocument(cosDoc);
      return pdfStripper.getText(pdDoc);
      // System.out.println(parsedText.replaceAll("[^A-Za-z0-9. ]+", ""));
    } catch (Exception e) {
      e.printStackTrace();
      try {
        if (cosDoc != null) cosDoc.close();
        if (pdDoc != null) pdDoc.close();
      } catch (Exception e1) {
        e.printStackTrace();
      }
    }
    return null;
  }
Esempio n. 2
0
 public static String extractText(InputStream src) throws IOException {
   StringBuilder text = new StringBuilder();
   COSDocument cosDoc = null;
   PDDocument pdDoc = null;
   try {
     PDFParser parser = new PDFParser(src);
     parser.parse();
     cosDoc = parser.getDocument();
     PDFTextStripper stripper = new PDFTextStripper();
     pdDoc = new PDDocument(cosDoc);
     int nbPages = pdDoc.getDocumentCatalog().getPages().getCount();
     for (int i = 0; i < nbPages; i++) {
       stripper.setStartPage(i + 1);
       stripper.setEndPage(i + 1);
       text.append(stripper.getText(pdDoc));
     }
   } finally {
     try {
       if (cosDoc != null) {
         cosDoc.close();
       }
     } catch (IOException e) {
       // Do nada
     }
     try {
       if (pdDoc != null) {
         pdDoc.close();
       }
     } catch (IOException e) {
       // Do nada
     }
   }
   return text.toString();
 }
 /** link to cross reference table properties */
 private List<? extends Object> getXRef() {
   List<CosXRef> xref = new ArrayList<>();
   final COSDocument document = (COSDocument) this.baseObject;
   xref.add(
       new PBCosXRef(document.isXRefSpacingsCompliesPDFA(), document.isXRefEOLCompliesPDFA()));
   return xref;
 }
Esempio n. 4
0
 public static String getContent(PDFParser parser) throws IOException {
   parser.parse();
   COSDocument cosDoc = parser.getDocument();
   PDFTextStripper pdfStripper = new PDFTextStripper();
   PDDocument pdDoc = new PDDocument(cosDoc);
   String content = pdfStripper.getText(pdDoc);
   cosDoc.close();
   pdDoc.close();
   return content;
 }
Esempio n. 5
0
 /** Extracts the textual contents from a PDF file as one long string. */
 public String extractPDFContents(File f) throws IOException {
   FileInputStream fi = new FileInputStream(f);
   PDFParser parser = new PDFParser(fi);
   parser.parse();
   fi.close();
   COSDocument cd = parser.getDocument();
   PDFTextStripper stripper = new PDFTextStripper();
   String result = stripper.getText(new PDDocument(cd));
   cd.close();
   return result;
 }
Esempio n. 6
0
 public static String extract(File pdfFile) throws IOException {
   checkNotNull(pdfFile, "pdfFile");
   PDFParser parser = new PDFParser(new FileInputStream(pdfFile));
   parser.parse();
   COSDocument cosDoc = parser.getDocument();
   PDFTextStripper pdfStripper = new PDFTextStripper();
   PDDocument pdDoc = new PDDocument(cosDoc);
   pdfStripper.setStartPage(1);
   pdfStripper.setEndPage(pdDoc.getNumberOfPages());
   pdfStripper.setSortByPosition(true);
   String pdfText = pdfStripper.getText(pdDoc);
   pdDoc.close();
   cosDoc.close();
   return pdfText;
 }
Esempio n. 7
0
  String pdftoText(String fileName) {

    System.out.println("Parsing text from PDF file " + fileName + "....");
    File f = new File(fileName);

    if (!f.isFile()) {
      System.out.println("File " + fileName + " does not exist.");
      return null;
    }

    try {
      parser = new PDFParser(new FileInputStream(f));
    } catch (Exception e) {
      System.out.println("Unable to open PDF Parser.");
      return null;
    }

    try {
      parser.parse();
      cosDoc = parser.getDocument();
      pdfStripper = new PDFTextStripper();
      pdDoc = new PDDocument(cosDoc);
      parsedText = pdfStripper.getText(pdDoc);
    } catch (Exception e) {
      System.out.println("An exception occured in parsing the PDF Document.");
      e.printStackTrace();
      try {
        if (cosDoc != null) cosDoc.close();
        if (pdDoc != null) pdDoc.close();
      } catch (Exception e1) {
        e.printStackTrace();
      }
      return null;
    }
    System.out.println("Done.");
    return parsedText;
  }
 /** @return true if the current document is linearized */
 public Boolean getisLinearized() {
   COSDocument document = (COSDocument) this.baseObject;
   boolean res = document.getTrailer() != document.getLastTrailer() && document.isLinearized();
   return Boolean.valueOf(res);
 }