Esempio n. 1
0
  /**
   * Runs OCR over the current page, unless the configured OCR strategy is {@code NO_OCR}.
   *
   * <p>The page at {@code pageIndex} is rendered to an image, written to a temporary file in the
   * configured image format, and that file is then handed to
   * {@link TesseractOCRParser#parseInline} with {@code xhtml} as the output handler.
   *
   * @throws IOException if a {@link SAXException} is raised while emitting the OCR content (it is
   *     wrapped); other I/O failures are passed to {@code handleCatchableIOE}
   * @throws TikaException if Tesseract is not available for the effective configuration
   * @throws SAXException declared by the signature
   */
  void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
    if (config.getOCRStrategy().equals(NO_OCR)) {
      return;
    }

    // Prefer a Tesseract config supplied through the parse context, else use the default.
    TesseractOCRConfig ocrConfig = context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG);
    TesseractOCRParser ocrParser = new TesseractOCRParser();

    boolean tesseractAvailable = ocrParser.hasTesseract(ocrConfig);
    if (!tesseractAvailable) {
      String message =
          "Tesseract is not available. "
              + "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly";
      throw new TikaException(message);
    }

    PDFRenderer pageRenderer = new PDFRenderer(pdDocument);
    TemporaryResources tempResources = new TemporaryResources();
    try {
      final float renderScale = 2.0f;
      BufferedImage pageImage =
          pageRenderer.renderImage(pageIndex, renderScale, config.getOCRImageType());

      // Persist the rendered page so it can be re-read as a stream for the OCR parser.
      Path imageFile = tempResources.createTempFile();
      try (OutputStream out = Files.newOutputStream(imageFile)) {
        // TODO: get output format from TesseractConfig
        String formatName = config.getOCRImageFormatName();
        ImageIOUtil.writeImage(pageImage, formatName, out, config.getOCRDPI());
      }

      try (InputStream in = TikaInputStream.get(imageFile)) {
        ocrParser.parseInline(in, xhtml, ocrConfig);
      }
    } catch (SAXException saxFailure) {
      throw new IOExceptionWithCause("error writing OCR content from PDF", saxFailure);
    } catch (IOException ioFailure) {
      handleCatchableIOE(ioFailure);
    } finally {
      // Always release the temporary file, whatever happened above.
      tempResources.dispose();
    }
  }
 /**
  * Builds a {@link TesseractOCRConfig} from the bundled test properties and assumes that the
  * ImageMagick command it points to passes {@code ExternalParser.check}; when the check fails the
  * assumption fails rather than the test.
  *
  * @throws Exception if loading the configuration or running the check fails
  */
 @Test
 public void testImageMagick() throws Exception {
   TesseractOCRConfig config;
   // Scope the properties stream so this test closes what it opens, without relying on the
   // config constructor to do so.
   try (InputStream stream =
       TesseractOCRConfig.class.getResourceAsStream("/test-properties/TesseractOCR.properties")) {
     config = new TesseractOCRConfig(stream);
   }
   String[] checkCmd = {config.getImageMagickPath() + TesseractOCRParser.getImageMagickProg()};
   assumeTrue(ExternalParser.check(checkCmd));
 }
  /**
   * When the environment check passes, the Tesseract parser should advertise five supported
   * media types, PNG among them, and {@link DefaultParser} should map PNG to
   * {@link TesseractOCRParser}.
   */
  @Test
  public void offersTypesIfFound() throws Exception {
    TesseractOCRParser ocrParser = new TesseractOCRParser();
    DefaultParser compositeParser = new DefaultParser();

    ParseContext ctx = new ParseContext();
    MediaType pngType = MediaType.image("png");

    // Skip the remaining checks unless canRun() reports a usable setup
    // (presumably that Tesseract is on the path).
    assumeTrue(canRun());

    assertEquals(5, ocrParser.getSupportedTypes(ctx).size());
    assertTrue(ocrParser.getSupportedTypes(ctx).contains(pngType));

    // DefaultParser is expected to pick the TesseractOCRParser for PNG.
    Class<?> selectedForPng = compositeParser.getParsers(ctx).get(pngType).getClass();
    assertEquals(TesseractOCRParser.class, selectedForPng);
  }
  /**
   * With a Tesseract path that does not exist, {@link TesseractOCRParser} should report no
   * supported types, and {@link DefaultParser} should map PNG to {@link ImageParser} instead.
   */
  @Test
  public void offersNoTypesIfNotFound() throws Exception {
    TesseractOCRParser ocrParser = new TesseractOCRParser();
    DefaultParser compositeParser = new DefaultParser();
    MediaType pngType = MediaType.image("png");

    // Point the config at a made-up location and hand it over through the parse context.
    TesseractOCRConfig brokenConfig = new TesseractOCRConfig();
    brokenConfig.setTesseractPath("/made/up/path");
    ParseContext ctx = new ParseContext();
    ctx.set(TesseractOCRConfig.class, brokenConfig);

    // The OCR parser must not claim any media type.
    assertEquals(0, ocrParser.getSupportedTypes(ctx).size());

    // PNG should therefore be handled by the regular image parser.
    Class<?> selectedForPng = compositeParser.getParsers(ctx).get(pngType).getClass();
    assertEquals(ImageParser.class, selectedForPng);
  }