/**
 * Resolves the configured input file and checks that it is present on disk.
 *
 * <p>The value of the {@code INPUT_FILE_PARAM} configuration parameter is parsed as a URI and
 * turned into the {@link File} stored in the {@code file} field. Any exception raised while
 * doing so is handed to {@code ExceptionHandler.logAndRethrow}; a file that does not exist is
 * reported through {@code ExceptionHandler.logAndThrow}.
 *
 * @throws ResourceInitializationException declared by the overridden method; may also be
 *     raised by {@code super.initialize()}
 */
@Override
public void initialize() throws ResourceInitializationException {
    super.initialize();
    logger = getLogger();

    final String inputLocation = (String) getConfigParameterValue(INPUT_FILE_PARAM);
    try {
        // The parameter must be a URI string accepted by File(URI), not a bare file-system path.
        final URI inputUri = URI.create(inputLocation);
        file = new File(inputUri);
    } catch (Exception failure) {
        ExceptionHandler.logAndRethrow(logger, failure);
    }

    if (file.exists()) {
        return;
    }
    ExceptionHandler.logAndThrow(logger, "Input file does not exists");
}
/**
 * Extracts text and language from the input file and stores them in the given CAS.
 *
 * <p>A {@code TikaProcessor} is created for the {@code file} field. The extracted text must be
 * non-empty and the detected language must contain {@code "ru"}; otherwise the problem is
 * reported through {@code ExceptionHandler.logAndThrow}. Both checks run before the CAS is
 * touched, so a rejected document does not leave the CAS partially populated.
 *
 * @param aCAS the CAS that receives the document text and document language
 * @throws IOException declared by the overridden method
 * @throws CollectionException declared by the overridden method
 */
@Override
public void getNext(final CAS aCAS) throws IOException, CollectionException {
    // NOTE(review): the placeholder instance keeps the variable definitely assigned for the
    // compiler, which cannot see whether logAndRethrow always throws. It is kept so behaviour
    // stays unchanged should logAndRethrow ever return normally - confirm and simplify.
    TikaProcessor processor = new TikaProcessor();
    try {
        processor = TikaProcessor.newInstance(file);
    } catch (Exception e) {
        ExceptionHandler.logAndRethrow(logger, "TikaProcessor: ", e);
    }

    final String documentText = processor.getText();
    if (documentText == null || documentText.isEmpty()) {
        ExceptionHandler.logAndThrow(logger, "Document text is null or empty");
    }

    // A null language (nothing detected) is reported like any other non-Russian document
    // instead of failing with a bare NullPointerException on contains().
    final String textLanguage = processor.getLanguage();
    if (textLanguage == null || !textLanguage.contains("ru")) {
        ExceptionHandler.logAndThrow(logger, "Document language is not russian");
    }

    // Write to the CAS only after every validation step has run.
    aCAS.setDocumentText(documentText);
    aCAS.setDocumentLanguage(textLanguage);
}