Ejemplo n.º 1
0
 /**
  * Routes the parse request to the component parser selected for the given metadata and context.
  *
  * <p>Before parsing starts, the class name of the selected parser (or, when the selection is a
  * {@link ParserDecorator}, the class name of the parser it wraps) is added to the metadata under
  * the {@code X-Parsed-By} key.
  *
  * <p>To better honor the {@link Parser} contract, every {@link RuntimeException} thrown by the
  * component parser is wrapped in a {@link TikaException}; the same happens to any {@link
  * IOException} or {@link SAXException} that is unrelated to the given input stream or content
  * handler. Temporary resources created for the call are disposed of before returning.
  *
  * @param stream the document stream to parse
  * @param handler receiver of the parse events; may be {@code null}
  * @param metadata document metadata, updated with the {@code X-Parsed-By} entry
  * @param context the parse context handed on to the component parser
  * @throws IOException if the failure is attributable to the given stream
  * @throws SAXException if the failure is attributable to the given handler
  * @throws TikaException for any other failure raised by the component parser
  */
 public void parse(
     InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
     throws IOException, SAXException, TikaException {
   Parser delegate = getParser(metadata, context);
   TemporaryResources resources = new TemporaryResources();
   try {
     TikaInputStream wrappedStream = TikaInputStream.get(stream, resources);

     // A null handler is passed through to the delegate as null rather than being wrapped.
     TaggedContentHandler wrappedHandler = null;
     if (handler != null) {
       wrappedHandler = new TaggedContentHandler(handler);
     }

     // Report the decorated parser rather than the decorator itself.
     Parser reported = delegate;
     if (delegate instanceof ParserDecorator) {
       reported = ((ParserDecorator) delegate).getWrappedParser();
     }
     metadata.add("X-Parsed-By", reported.getClass().getName());

     try {
       delegate.parse(wrappedStream, wrappedHandler, metadata, context);
     } catch (RuntimeException failure) {
       throw new TikaException("Unexpected RuntimeException from " + delegate, failure);
     } catch (IOException failure) {
       // Let the tagged stream rethrow failures it caused; everything else gets wrapped.
       wrappedStream.throwIfCauseOf(failure);
       throw new TikaException("TIKA-198: Illegal IOException from " + delegate, failure);
     } catch (SAXException failure) {
       // Same for the tagged handler, when there is one.
       if (wrappedHandler != null) {
         wrappedHandler.throwIfCauseOf(failure);
       }
       throw new TikaException("TIKA-237: Illegal SAXException from " + delegate, failure);
     }
   } finally {
     resources.dispose();
   }
 }
Ejemplo n.º 2
0
  /**
   * Runs OCR on the current page and writes the recognised content to {@code xhtml}.
   *
   * <p>Does nothing when the configured OCR strategy equals {@code NO_OCR}. Otherwise the page at
   * {@code pageIndex} is rendered to an image, saved to a temporary file in the configured image
   * format, and handed to a {@link TesseractOCRParser} for inline parsing. The temporary
   * resources are disposed of in all cases.
   *
   * <p>The page is rendered at the configured OCR DPI, the same value that is passed to the image
   * writer, so the pixel resolution of the image matches the resolution it is written with.
   *
   * @throws IOException if writing the OCR content fails with a {@link SAXException} (wrapped in
   *     an {@code IOExceptionWithCause}), or if {@code handleCatchableIOE} chooses to propagate an
   *     I/O failure
   * @throws TikaException if Tesseract is not available for the effective configuration
   * @throws SAXException declared for callers; SAX failures raised while parsing the rendered
   *     image are rethrown as {@link IOException}s by this method
   */
  void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
    if (config.getOCRStrategy().equals(NO_OCR)) {
      return;
    }
    TesseractOCRConfig tesseractConfig =
        context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG);

    TesseractOCRParser tesseractOCRParser = new TesseractOCRParser();
    if (!tesseractOCRParser.hasTesseract(tesseractConfig)) {
      throw new TikaException(
          "Tesseract is not available. "
              + "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly");
    }

    PDFRenderer renderer = new PDFRenderer(pdDocument);
    TemporaryResources tmp = new TemporaryResources();
    try {
      // Render at the configured DPI instead of a fixed 2.0 scale (144 DPI), so the rendered
      // pixels agree with the DPI that is passed to the image writer below.
      int dpi = config.getOCRDPI();
      BufferedImage image = renderer.renderImageWithDPI(pageIndex, dpi, config.getOCRImageType());
      Path tmpFile = tmp.createTempFile();
      try (OutputStream os = Files.newOutputStream(tmpFile)) {
        // TODO: get output format from TesseractConfig
        ImageIOUtil.writeImage(image, config.getOCRImageFormatName(), os, dpi);
      }
      try (InputStream is = TikaInputStream.get(tmpFile)) {
        tesseractOCRParser.parseInline(is, xhtml, tesseractConfig);
      }
    } catch (IOException e) {
      // Whether this is recorded or rethrown is decided by handleCatchableIOE.
      handleCatchableIOE(e);
    } catch (SAXException e) {
      throw new IOExceptionWithCause("error writing OCR content from PDF", e);
    } finally {
      tmp.dispose();
    }
  }