예제 #1
0
  /**
   * Runs OCR on the page at {@code pageIndex} and writes the recognized content inline into
   * {@code xhtml}.
   *
   * <p>The page is rendered to an image, written to a temporary file in the configured OCR image
   * format, and then handed to a {@link TesseractOCRParser}. The method is a no-op when the
   * configured OCR strategy is {@code NO_OCR}.
   *
   * @throws IOException if an I/O failure is rethrown by {@code handleCatchableIOE}, or wrapping a
   *     {@link SAXException} raised while writing the OCR content
   * @throws TikaException if Tesseract is not available for the effective configuration
   * @throws SAXException declared for callers; SAX failures inside the OCR step itself are wrapped
   *     in an {@code IOExceptionWithCause}
   */
  void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
    if (config.getOCRStrategy().equals(NO_OCR)) {
      // OCR has been switched off; leave the page untouched.
      return;
    }

    // A Tesseract configuration supplied through the parse context wins over the default one.
    TesseractOCRConfig ocrConfig = context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG);
    TesseractOCRParser ocrParser = new TesseractOCRParser();
    boolean tesseractAvailable = ocrParser.hasTesseract(ocrConfig);
    if (!tesseractAvailable) {
      throw new TikaException(
          "Tesseract is not available. "
              + "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly");
    }

    PDFRenderer pageRenderer = new PDFRenderer(pdDocument);
    TemporaryResources tempResources = new TemporaryResources();
    try {
      BufferedImage pageImage =
          pageRenderer.renderImage(pageIndex, 2.0f, config.getOCRImageType());
      Path imagePath = tempResources.createTempFile();

      // Step 1: persist the rendered page so it can be re-read as a stream.
      try (OutputStream imageOut = Files.newOutputStream(imagePath)) {
        // TODO: get output format from TesseractConfig
        ImageIOUtil.writeImage(
            pageImage, config.getOCRImageFormatName(), imageOut, config.getOCRDPI());
      }

      // Step 2: feed the image file to Tesseract, emitting results into the XHTML handler.
      try (InputStream imageIn = TikaInputStream.get(imagePath)) {
        ocrParser.parseInline(imageIn, xhtml, ocrConfig);
      }
    } catch (IOException ioe) {
      // Recorded or rethrown depending on configuration.
      handleCatchableIOE(ioe);
    } catch (SAXException saxe) {
      throw new IOExceptionWithCause("error writing OCR content from PDF", saxe);
    } finally {
      // Always release the temporary file, whatever happened above.
      tempResources.dispose();
    }
  }
예제 #2
0
  /**
   * Finishes the document: extracts bookmark text, embedded documents and (when enabled in the
   * configuration) AcroForm content, then closes the XHTML document.
   *
   * <p>I/O failures from embedded-document and AcroForm extraction are passed to
   * {@code handleCatchableIOE}, which either records or rethrows them.
   *
   * @param pdf the document being finished
   * @throws IOException if {@code handleCatchableIOE} rethrows an I/O failure, or wrapping any
   *     {@link TikaException} or {@link SAXException} raised while ending the document
   */
  @Override
  protected void endDocument(PDDocument pdf) throws IOException {
    try {
      // Extract text for any bookmarks:
      extractBookmarkText();
      try {
        extractEmbeddedDocuments(pdf);
      } catch (IOException e) {
        handleCatchableIOE(e);
      }

      // extract acroform data at end of doc
      if (config.getExtractAcroFormContent()) {
        try {
          extractAcroForm(pdf);
        } catch (IOException e) {
          handleCatchableIOE(e);
        }
      }
      xhtml.endDocument();
    } catch (TikaException | SAXException e) {
      // Both failure types are reported to the caller the same way.
      throw new IOExceptionWithCause("Unable to end a document", e);
    }
  }
예제 #3
0
 /**
  * Records or rethrows an {@link IOException} depending on configuration.
  *
  * <p>When {@code config.isCatchIntermediateIOExceptions()} is true, the exception's message (or
  * a placeholder if it has none) is added to the metadata as a warning and the exception is
  * appended to {@code exceptions}. Otherwise the exception is rethrown unchanged.
  *
  * @param e the exception to record or rethrow
  * @throws IOException {@code e} itself, when intermediate exceptions are not being caught
  */
 void handleCatchableIOE(IOException e) throws IOException {
   if (!config.isCatchIntermediateIOExceptions()) {
     throw e;
   }

   final String rawMessage = e.getMessage();
   final String warning = (rawMessage == null) ? "IOException, no message" : rawMessage;
   metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, warning);
   exceptions.add(e);
 }
예제 #4
0
  /**
   * Finishes a page: extracts files attached through annotations, optionally writes link and
   * markup annotation text into the XHTML output, optionally runs OCR on the page, and closes the
   * page's {@code div}. {@code pageIndex} is incremented whether or not the page ended cleanly.
   *
   * <p>I/O failures are passed to {@code handleCatchableIOE}, so they are recorded as warnings
   * when intermediate exceptions are being caught and propagated to the caller otherwise.
   *
   * @param page the page that has just been processed
   * @throws IOException if {@code handleCatchableIOE} rethrows an I/O failure, or wrapping any
   *     {@link SAXException} or {@link TikaException} raised while ending the page
   */
  @Override
  protected void endPage(PDPage page) throws IOException {
    try {
      EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
      for (PDAnnotation annotation : page.getAnnotations()) {

        if (annotation instanceof PDAnnotationFileAttachment) {
          PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
          // NOTE(review): unchecked cast; confirm getFile() cannot return another
          // file-specification subtype here, otherwise this raises ClassCastException.
          PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
          try {
            extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, extractor);
          } catch (SAXException e) {
            throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
          } catch (TikaException e) {
            throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
          } catch (IOException e) {
            handleCatchableIOE(e);
          }
        }
        // TODO: remove once PDFBOX-1143 is fixed:
        if (config.getExtractAnnotationText()) {
          if (annotation instanceof PDAnnotationLink) {
            PDAnnotationLink annotationlink = (PDAnnotationLink) annotation;
            // instanceof is false for null, so no separate null check is needed.
            PDAction action = annotationlink.getAction();
            if (action instanceof PDActionURI) {
              // can't currently associate link to text.
              // for now, extract link and repeat the link as if it
              // were the visible text
              PDActionURI uri = (PDActionURI) action;
              String link = uri.getURI();
              if (link != null && !link.trim().isEmpty()) {
                xhtml.startElement("div", "class", "annotation");
                xhtml.startElement("a", "href", link);
                xhtml.characters(link);
                xhtml.endElement("a");
                xhtml.endElement("div");
              }
            }
          }

          if (annotation instanceof PDAnnotationMarkup) {
            PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
            String title = annotationMarkup.getTitlePopup();
            String subject = annotationMarkup.getSubject();
            String contents = annotationMarkup.getContents();
            // TODO: maybe also annotationMarkup.getRichContents()?
            if (title != null || subject != null || contents != null) {
              xhtml.startElement("div", "class", "annotation");

              if (title != null) {
                xhtml.startElement("div", "class", "annotationTitle");
                xhtml.characters(title);
                xhtml.endElement("div");
              }

              if (subject != null) {
                xhtml.startElement("div", "class", "annotationSubject");
                xhtml.characters(subject);
                xhtml.endElement("div");
              }

              if (contents != null) {
                xhtml.startElement("div", "class", "annotationContents");
                xhtml.characters(contents);
                xhtml.endElement("div");
              }

              xhtml.endElement("div");
            }
          }
        }
      }
      if (config.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
        doOCROnCurrentPage();
      }
      xhtml.endElement("div");
    } catch (SAXException | TikaException e) {
      throw new IOExceptionWithCause("Unable to end a page", e);
    } catch (IOException e) {
      // Every IOException raised in the try body ends up here, including ones that
      // handleCatchableIOE rethrew because intermediate exceptions are not being caught.
      // Adding them to the list unconditionally would silently override that setting,
      // so defer to handleCatchableIOE: it records the exception when catching is enabled
      // and rethrows it otherwise.
      handleCatchableIOE(e);
    } finally {
      pageIndex++;
    }
  }