void doOCROnCurrentPage() throws IOException, TikaException, SAXException { if (config.getOCRStrategy().equals(NO_OCR)) { return; } TesseractOCRConfig tesseractConfig = context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG); TesseractOCRParser tesseractOCRParser = new TesseractOCRParser(); if (!tesseractOCRParser.hasTesseract(tesseractConfig)) { throw new TikaException( "Tesseract is not available. " + "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly"); } PDFRenderer renderer = new PDFRenderer(pdDocument); TemporaryResources tmp = new TemporaryResources(); try { BufferedImage image = renderer.renderImage(pageIndex, 2.0f, config.getOCRImageType()); Path tmpFile = tmp.createTempFile(); try (OutputStream os = Files.newOutputStream(tmpFile)) { // TODO: get output format from TesseractConfig ImageIOUtil.writeImage(image, config.getOCRImageFormatName(), os, config.getOCRDPI()); } try (InputStream is = TikaInputStream.get(tmpFile)) { tesseractOCRParser.parseInline(is, xhtml, tesseractConfig); } } catch (IOException e) { handleCatchableIOE(e); } catch (SAXException e) { throw new IOExceptionWithCause("error writing OCR content from PDF", e); } finally { tmp.dispose(); } }
/*
 * Loads the test Tesseract properties, builds the ImageMagick command from the
 * configured path plus the program name reported by TesseractOCRParser, and uses the
 * result of ExternalParser.check on that command as a JUnit assumption. No further
 * assertions follow the assumption.
 */
@Test
public void testImageMagick() throws Exception {
    InputStream propertiesStream =
            TesseractOCRConfig.class.getResourceAsStream("/test-properties/TesseractOCR.properties");
    TesseractOCRConfig ocrConfig = new TesseractOCRConfig(propertiesStream);

    String imageMagickExecutable =
            ocrConfig.getImageMagickPath() + TesseractOCRParser.getImageMagickProg();
    String[] checkCommand = new String[] {imageMagickExecutable};

    assumeTrue(ExternalParser.check(checkCommand));
}
/* If Tesseract is found, test we retrieve the proper number of supporting Parsers. */ @Test public void offersTypesIfFound() throws Exception { TesseractOCRParser parser = new TesseractOCRParser(); DefaultParser defaultParser = new DefaultParser(); ParseContext parseContext = new ParseContext(); MediaType png = MediaType.image("png"); // Assuming that Tesseract is on the path, we should find 5 Parsers that support PNG. assumeTrue(canRun()); assertEquals(5, parser.getSupportedTypes(parseContext).size()); assertTrue(parser.getSupportedTypes(parseContext).contains(png)); // DefaultParser will now select the TesseractOCRParser. assertEquals( TesseractOCRParser.class, defaultParser.getParsers(parseContext).get(png).getClass()); }
/* Check that if Tesseract is not found, the TesseractOCRParser claims to not support any file types. So, the standard image parser is called instead. */ @Test public void offersNoTypesIfNotFound() throws Exception { TesseractOCRParser parser = new TesseractOCRParser(); DefaultParser defaultParser = new DefaultParser(); MediaType png = MediaType.image("png"); // With an invalid path, will offer no types TesseractOCRConfig invalidConfig = new TesseractOCRConfig(); invalidConfig.setTesseractPath("/made/up/path"); ParseContext parseContext = new ParseContext(); parseContext.set(TesseractOCRConfig.class, invalidConfig); // No types offered assertEquals(0, parser.getSupportedTypes(parseContext).size()); // And DefaultParser won't use us assertEquals(ImageParser.class, defaultParser.getParsers(parseContext).get(png).getClass()); }