/**
 * Hands the document over to the component parser selected by
 * {@code getParser(metadata, context)}.
 *
 * <p>Before delegating, the class name of the selected parser is appended to the
 * {@code X-Parsed-By} metadata field; for a {@link ParserDecorator} the class of the wrapped
 * parser is recorded instead of the decorator's.
 *
 * <p>Failures raised by the delegate are translated as follows:
 *
 * <ul>
 *   <li>any {@link RuntimeException} is wrapped into a {@link TikaException};
 *   <li>an {@link IOException} is first offered to the tagged input stream so that failures
 *       originating from the caller's stream are rethrown; anything else is wrapped into a
 *       {@link TikaException};
 *   <li>a {@link SAXException} is first offered to the tagged content handler (when a handler
 *       was supplied) so that failures originating from the caller's handler are rethrown;
 *       anything else is wrapped into a {@link TikaException}.
 * </ul>
 *
 * <p>Temporary resources created while wrapping the stream are disposed of before returning,
 * whether or not parsing succeeded.
 *
 * @param stream the document stream to parse
 * @param handler receiver of the parse events; may be {@code null}
 * @param metadata document metadata, used for parser selection and updated with
 *     {@code X-Parsed-By}
 * @param context the parse context
 * @throws IOException if the failure is attributed to the given stream
 * @throws SAXException if the failure is attributed to the given handler
 * @throws TikaException for every other failure raised by the component parser
 */
public void parse(
        InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    final Parser delegate = getParser(metadata, context);
    final TemporaryResources resources = new TemporaryResources();
    try {
        final TikaInputStream input = TikaInputStream.get(stream, resources);

        // Only wrap the handler when one was supplied; a null handler is passed through as is.
        final TaggedContentHandler output;
        if (handler == null) {
            output = null;
        } else {
            output = new TaggedContentHandler(handler);
        }

        // Report the real parser class rather than the decorator that wraps it.
        final String parsedBy =
                (delegate instanceof ParserDecorator)
                        ? ((ParserDecorator) delegate).getWrappedParser().getClass().getName()
                        : delegate.getClass().getName();
        metadata.add("X-Parsed-By", parsedBy);

        try {
            delegate.parse(input, output, metadata, context);
        } catch (RuntimeException failure) {
            throw new TikaException("Unexpected RuntimeException from " + delegate, failure);
        } catch (IOException failure) {
            // Rethrows the exception unchanged when the caller's stream caused it.
            input.throwIfCauseOf(failure);
            throw new TikaException("TIKA-198: Illegal IOException from " + delegate, failure);
        } catch (SAXException failure) {
            // Rethrows the exception unchanged when the caller's handler caused it.
            if (output != null) {
                output.throwIfCauseOf(failure);
            }
            throw new TikaException("TIKA-237: Illegal SAXException from " + delegate, failure);
        }
    } finally {
        resources.dispose();
    }
}
void doOCROnCurrentPage() throws IOException, TikaException, SAXException { if (config.getOCRStrategy().equals(NO_OCR)) { return; } TesseractOCRConfig tesseractConfig = context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG); TesseractOCRParser tesseractOCRParser = new TesseractOCRParser(); if (!tesseractOCRParser.hasTesseract(tesseractConfig)) { throw new TikaException( "Tesseract is not available. " + "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly"); } PDFRenderer renderer = new PDFRenderer(pdDocument); TemporaryResources tmp = new TemporaryResources(); try { BufferedImage image = renderer.renderImage(pageIndex, 2.0f, config.getOCRImageType()); Path tmpFile = tmp.createTempFile(); try (OutputStream os = Files.newOutputStream(tmpFile)) { // TODO: get output format from TesseractConfig ImageIOUtil.writeImage(image, config.getOCRImageFormatName(), os, config.getOCRDPI()); } try (InputStream is = TikaInputStream.get(tmpFile)) { tesseractOCRParser.parseInline(is, xhtml, tesseractConfig); } } catch (IOException e) { handleCatchableIOE(e); } catch (SAXException e) { throw new IOExceptionWithCause("error writing OCR content from PDF", e); } finally { tmp.dispose(); } }