/** * Test of TessBaseAPIGetHOCRText method, of class TessAPI1. * * @throws Exception while getting ocr text from image. */ @Test public void testTessBaseAPIGetHOCRText() throws Exception { logger.info("TessBaseAPIGetHOCRText"); File tiff = new File(this.testResourcesDataPath, "eurotext.tif"); BufferedImage image = ImageIO.read(new FileInputStream(tiff)); // require jai-imageio lib to read TIFF ByteBuffer buf = ImageIOHelper.convertImageData(image); int bpp = image.getColorModel().getPixelSize(); int bytespp = bpp / 8; int bytespl = (int) Math.ceil(image.getWidth() * bpp / 8.0); TessAPI1.TessBaseAPISetPageSegMode(handle, TessPageSegMode.PSM_AUTO); TessAPI1.TessBaseAPIInit3(handle, datapath, language); TessAPI1.TessBaseAPISetImage( handle, buf, image.getWidth(), image.getHeight(), bytespp, bytespl); int page_number = 0; Pointer utf8Text = TessAPI1.TessBaseAPIGetHOCRText(handle, page_number); String result = utf8Text.getString(0); TessAPI1.TessDeleteText(utf8Text); assertTrue(result.contains("<div class='ocr_page'")); }