Java PDFParserConfig Examples

Programming Language: Java

Class/Type: PDFParserConfig

Examples at hotexamples.com: 4

Java PDFParserConfig - 4 examples found. These are the top-rated real-world Java examples of PDFParserConfig, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

getOCRStrategy(2)

getExtractAcroFormContent(1)

getExtractAnnotationText(1)

getOCRDPI(1)

getOCRImageFormatName(1)

getOCRImageType(1)

isCatchIntermediateIOExceptions(1)

Example #1

Show file

File: AbstractPDF2XHTML.java Project: Zarana-Parekh/tika

  /**
   * Renders the current page ({@code pageIndex}) to an image, writes it to a temporary file and
   * feeds that file to Tesseract, which emits the recognised text inline into {@code xhtml}.
   *
   * <p>Does nothing when the configured OCR strategy is {@code NO_OCR}.
   *
   * <p>The page is rendered at the DPI taken from {@code config.getOCRDPI()}, the same value that
   * is passed to {@code ImageIOUtil.writeImage}, so the rendered resolution and the resolution
   * handed to the image writer agree.
   *
   * @throws IOException if an I/O failure occurs and {@code handleCatchableIOE} rethrows it, or
   *     if a {@link SAXException} is raised while emitting OCR content (wrapped in an
   *     {@code IOExceptionWithCause})
   * @throws TikaException if Tesseract is not available for the active configuration
   * @throws SAXException declared for callers; SAX failures raised inside the try block are
   *     wrapped in an {@code IOExceptionWithCause} instead
   */
  void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
    if (config.getOCRStrategy().equals(NO_OCR)) {
      return;
    }
    // Prefer a per-parse Tesseract configuration from the context; fall back to the default.
    TesseractOCRConfig tesseractConfig =
        context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG);

    TesseractOCRParser tesseractOCRParser = new TesseractOCRParser();
    if (!tesseractOCRParser.hasTesseract(tesseractConfig)) {
      throw new TikaException(
          "Tesseract is not available. "
              + "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly");
    }

    PDFRenderer renderer = new PDFRenderer(pdDocument);
    TemporaryResources tmp = new TemporaryResources();
    try {
      // Render at the configured DPI rather than a fixed scale factor, so the OCR DPI setting
      // actually controls the resolution of the image given to Tesseract.
      BufferedImage image =
          renderer.renderImageWithDPI(pageIndex, config.getOCRDPI(), config.getOCRImageType());
      Path tmpFile = tmp.createTempFile();
      try (OutputStream os = Files.newOutputStream(tmpFile)) {
        // TODO: get output format from TesseractConfig
        ImageIOUtil.writeImage(image, config.getOCRImageFormatName(), os, config.getOCRDPI());
      }
      try (InputStream is = TikaInputStream.get(tmpFile)) {
        tesseractOCRParser.parseInline(is, xhtml, tesseractConfig);
      }
    } catch (IOException e) {
      // Record or rethrow, depending on the parser configuration.
      handleCatchableIOE(e);
    } catch (SAXException e) {
      throw new IOExceptionWithCause("error writing OCR content from PDF", e);
    } finally {
      // Always remove the temporary image file.
      tmp.dispose();
    }
  }

Example #2

Show file

File: AbstractPDF2XHTML.java Project: Zarana-Parekh/tika

  /**
   * Finishes the document: emits bookmark text, then embedded documents, then (when enabled in
   * the configuration) AcroForm content, and finally closes the XHTML document.
   *
   * <p>I/O failures from the embedded-document and AcroForm steps are passed to
   * {@code handleCatchableIOE}; Tika and SAX failures abort the method and are rethrown wrapped
   * in an {@code IOExceptionWithCause}.
   *
   * @param pdf the PDF document being finished
   * @throws IOException on a wrapped Tika/SAX failure, or when {@code handleCatchableIOE}
   *     rethrows an I/O failure
   */
  @Override
  protected void endDocument(PDDocument pdf) throws IOException {
    try {
      // Bookmark text comes first.
      extractBookmarkText();

      try {
        extractEmbeddedDocuments(pdf);
      } catch (IOException embeddedFailure) {
        handleCatchableIOE(embeddedFailure);
      }

      // AcroForm data is written at the very end of the document, if requested.
      if (config.getExtractAcroFormContent()) {
        try {
          extractAcroForm(pdf);
        } catch (IOException acroFormFailure) {
          handleCatchableIOE(acroFormFailure);
        }
      }

      xhtml.endDocument();
    } catch (TikaException | SAXException failure) {
      throw new IOExceptionWithCause("Unable to end a document", failure);
    }
  }

Example #3

Show file

File: AbstractPDF2XHTML.java Project: Zarana-Parekh/tika

 /**
  * Applies the configured policy for an intermediate {@link IOException}.
  *
  * <p>When {@code config.isCatchIntermediateIOExceptions()} is false the exception is rethrown
  * unchanged. Otherwise its message (or a fixed placeholder when the message is {@code null}) is
  * added to the metadata under {@code TikaCoreProperties.TIKA_META_EXCEPTION_WARNING} and the
  * exception itself is appended to {@code exceptions}.
  *
  * @param e the exception to record or rethrow
  * @throws IOException {@code e} itself, when intermediate exceptions are not being caught
  */
 void handleCatchableIOE(IOException e) throws IOException {
   if (!config.isCatchIntermediateIOExceptions()) {
     throw e;
   }

   final String rawMessage = e.getMessage();
   final String warning = (rawMessage != null) ? rawMessage : "IOException, no message";

   metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, warning);
   exceptions.add(e);
 }

Example #4

Show file

File: AbstractPDF2XHTML.java Project: Zarana-Parekh/tika

  /**
   * Finishes the current page: extracts files attached through annotations, optionally writes
   * link and markup annotation text into the XHTML output, optionally runs OCR on the page, and
   * closes the page's {@code div}.
   *
   * <p>Error handling: SAX and Tika failures are rethrown wrapped in an
   * {@code IOExceptionWithCause}. Any {@link IOException} that reaches the outer handler is
   * passed to {@code handleCatchableIOE}, so it is recorded or rethrown according to
   * {@code config.isCatchIntermediateIOExceptions()}, the same policy the other extraction steps
   * use, instead of being unconditionally swallowed. {@code pageIndex} is incremented whether
   * or not the page completed normally.
   *
   * @param page the page that has just been processed
   * @throws IOException on a wrapped SAX/Tika failure, or when {@code handleCatchableIOE}
   *     rethrows an I/O failure
   */
  @Override
  protected void endPage(PDPage page) throws IOException {

    try {
      EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
      for (PDAnnotation annotation : page.getAnnotations()) {

        if (annotation instanceof PDAnnotationFileAttachment) {
          PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
          // NOTE(review): unchecked cast — assumes getFile() yields a
          // PDComplexFileSpecification here; confirm against the PDFBox API.
          PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
          try {
            extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, extractor);
          } catch (SAXException e) {
            throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
          } catch (TikaException e) {
            throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
          } catch (IOException e) {
            handleCatchableIOE(e);
          }
        }
        // TODO: remove once PDFBOX-1143 is fixed:
        if (config.getExtractAnnotationText()) {
          if (annotation instanceof PDAnnotationLink) {
            PDAnnotationLink annotationlink = (PDAnnotationLink) annotation;
            // instanceof is false for null, so no separate null check is needed.
            PDAction action = annotationlink.getAction();
            if (action instanceof PDActionURI) {
              // can't currently associate link to text.
              // for now, extract link and repeat the link as if it
              // were the visible text
              PDActionURI uri = (PDActionURI) action;
              String link = uri.getURI();
              if (link != null && !link.trim().isEmpty()) {
                xhtml.startElement("div", "class", "annotation");
                xhtml.startElement("a", "href", link);
                xhtml.characters(link);
                xhtml.endElement("a");
                xhtml.endElement("div");
              }
            }
          }

          if (annotation instanceof PDAnnotationMarkup) {
            PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
            String title = annotationMarkup.getTitlePopup();
            String subject = annotationMarkup.getSubject();
            String contents = annotationMarkup.getContents();
            // TODO: maybe also annotationMarkup.getRichContents()?
            if (title != null || subject != null || contents != null) {
              // Wrapper div is only emitted when at least one part is present.
              xhtml.startElement("div", "class", "annotation");

              if (title != null) {
                xhtml.startElement("div", "class", "annotationTitle");
                xhtml.characters(title);
                xhtml.endElement("div");
              }

              if (subject != null) {
                xhtml.startElement("div", "class", "annotationSubject");
                xhtml.characters(subject);
                xhtml.endElement("div");
              }

              if (contents != null) {
                xhtml.startElement("div", "class", "annotationContents");
                xhtml.characters(contents);
                xhtml.endElement("div");
              }

              xhtml.endElement("div");
            }
          }
        }
      }
      if (config.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
        doOCROnCurrentPage();
      }
      // Closes the page-level div.
      xhtml.endElement("div");
    } catch (SAXException | TikaException e) {
      throw new IOExceptionWithCause("Unable to end a page", e);
    } catch (IOException e) {
      // Honour the catch-intermediate-IOExceptions setting instead of always swallowing:
      // this also covers exceptions rethrown by the inner handleCatchableIOE call and the
      // IOExceptionWithCause wrappers created above.
      handleCatchableIOE(e);
    } finally {
      pageIndex++;
    }
  }