/**
 * Finishes a document by copying the text accumulated in {@code text} into the CAS.
 *
 * <p>When trace logging is enabled, a closing {@code </document>} marker is logged afterwards.
 *
 * @param aPdf the document being finished; this method does not read it
 * @throws IOException declared in the signature; nothing in this body throws it explicitly
 */
@Override
protected void endDocument(final PDDocument aPdf) throws IOException {
    final String collectedText = text.toString();
    cas.setDocumentText(collectedText);

    // Nothing more to do unless trace output was requested.
    if (!log.isTraceEnabled()) {
        return;
    }
    log.trace("</document>");
}
/**
 * Fills the given CAS with the text and language that a {@code TikaProcessor} reports for
 * {@code file}.
 *
 * <p>Problems are reported through {@code ExceptionHandler}: a failure to create the processor,
 * a null or empty document text, and a language that is missing or does not contain
 * {@code "ru"}. The handler methods are presumably expected to throw; their implementation is
 * not visible here.
 *
 * @param aCAS the CAS that receives the document text and document language
 * @throws IOException declared in the signature
 * @throws CollectionException declared in the signature
 */
@Override
public void getNext(final CAS aCAS) throws IOException, CollectionException {
    // The initial instance is only a placeholder that is replaced on success; it keeps the
    // variable definitely assigned for the code that follows the catch block.
    TikaProcessor processor = new TikaProcessor();
    try {
        processor = TikaProcessor.newInstance(file);
    } catch (Exception e) {
        ExceptionHandler.logAndRethrow(logger, "TikaProcessor: ", e);
    }

    String documentText = processor.getText();
    if (documentText == null || documentText.isEmpty()) {
        ExceptionHandler.logAndThrow(logger, "Document text is null or empty");
    }
    aCAS.setDocumentText(documentText);

    String textLanguage = processor.getLanguage();
    // A missing language is reported the same way as a non-matching one instead of failing
    // with a bare NullPointerException on the contains() call.
    if (textLanguage == null || !textLanguage.contains("ru")) {
        ExceptionHandler.logAndThrow(logger, "Document language is not russian");
    }
    aCAS.setDocumentLanguage(textLanguage);
}
public void run() throws Exception { // create Asynchronous Client API and initialize it uimaAsEngine = new BaseUIMAAsynchronousEngine_impl(); // callback // uimaAsEngine.addStatusCallbackListener(new StatusCallbackListener()); initializeUimaAsEngine(uimaAsEngine); String filePath = "C:\\WebScience\\Progetti\\K-People\\OntologyController_UIMA\\apache-uima\\examples\\src\\it\\webscience\\uima\\event-2031.xml"; String xml = readFile(filePath); // get an empty CAS from the Cas pool CAS cas = uimaAsEngine.getCAS(); // Initialize it with input data cas.setDocumentText(xml); // Send Cas to service for processing uimaAsEngine.sendCAS(cas); }
@Override public void getNext(CAS cas) throws IOException, CollectionException { this.cumulatedLength += currentDoc.getText().length(); logger.info( "[Stream {}] Processing document {}: {} (total length processed: {})", this.streamName, this.mCurrentIndex, this.currentDoc.getUri(), this.cumulatedLength); SourceDocumentInformation sdi; try { sdi = new SourceDocumentInformation(cas.getJCas()); sdi.setUri(currentDoc.getUri()); cas.setDocumentLanguage(mLanguage.getCode()); cas.setDocumentText(currentDoc.getText()); sdi.setDocumentSize(currentDoc.getText().length()); sdi.setCumulatedDocumentSize(this.cumulatedLength); sdi.setBegin(0); sdi.setEnd(currentDoc.getText().length()); sdi.setOffsetInSource(0); sdi.setDocumentIndex(mCurrentIndex); /* * Cannot be known in case of streaming */ sdi.setCorpusSize(-1); sdi.setNbDocuments(-1); // Cannot know if this is the last sdi.setLastSegment(false); sdi.addToIndexes(); this.mCurrentIndex++; } catch (CASException e) { throw new CollectionException(e); } }
public static void main(String[] args) throws Exception { String sLine; long startTime = System.currentTimeMillis(); URL descUrl = VectorSpaceRetrieval.class.getResource( "/descriptors/retrievalsystem/VectorSpaceRetrieval.xml"); if (descUrl == null) { throw new IllegalArgumentException("Error opening VectorSpaceRetrieval.xml"); } // create AnalysisEngine XMLInputSource input = new XMLInputSource(descUrl); AnalysisEngineDescription desc = UIMAFramework.getXMLParser().parseAnalysisEngineDescription(input); AnalysisEngine anAnalysisEngine = UIMAFramework.produceAnalysisEngine(desc); CAS aCas = anAnalysisEngine.newCAS(); URL docUrl = VectorSpaceRetrieval.class.getResource("/data/documents.txt"); if (docUrl == null) { throw new IllegalArgumentException("Error opening data/documents.txt"); } BufferedReader br = new BufferedReader(new InputStreamReader(docUrl.openStream())); while ((sLine = br.readLine()) != null) { aCas.setDocumentText(sLine); anAnalysisEngine.process(aCas); aCas.reset(); } br.close(); br = null; anAnalysisEngine.collectionProcessComplete(); anAnalysisEngine.destroy(); long endTime = System.currentTimeMillis(); double totalTime = (endTime - startTime) / 1000.0; System.out.println("Total time taken: " + totalTime); }
/**
 * Runs the HtmlAnnotator and then the HtmlConverter over a small XML-like input, with
 * {@code child1}, {@code child2} and {@code child3} configured as gap-inducing tags and
 * {@code "$"} as the gap text.
 *
 * <p>Checks the text of the converter's default modified view and the covered text of the four
 * TAG annotations found in that view.
 *
 * @throws Exception if loading the descriptors, creating the engines or processing fails
 */
@Test
public void test() throws Exception {
    final String html =
            "<Parent>\n"
                    + "<Child1>Some content</Child1>\n"
                    + "<Child2 attribute=“someValue” />\n"
                    + "<Child3>More content.</Child3>\n"
                    + "</Parent>\n";

    // Each descriptor is looked up by its short name first, then by its full package path.
    final ClassLoader loader = HtmlAnnotator.class.getClassLoader();
    URL annotatorUrl = loader.getResource("HtmlAnnotator.xml");
    if (annotatorUrl == null) {
        annotatorUrl = loader.getResource("org/apache/uima/ruta/engine/HtmlAnnotator.xml");
    }
    URL converterUrl = loader.getResource("HtmlConverter.xml");
    if (converterUrl == null) {
        converterUrl = loader.getResource("org/apache/uima/ruta/engine/HtmlConverter.xml");
    }

    // Annotator engine.
    final ResourceSpecifier annotatorSpec =
            UIMAFramework.getXMLParser().parseResourceSpecifier(new XMLInputSource(annotatorUrl));
    final AnalysisEngine annotator = UIMAFramework.produceAnalysisEngine(annotatorSpec);
    annotator.setConfigParameterValue(HtmlAnnotator.PARAM_ONLY_CONTENT, false);
    annotator.reconfigure();

    // Converter engine that inserts "$" for every gap-inducing tag.
    final ResourceSpecifier converterSpec =
            UIMAFramework.getXMLParser().parseResourceSpecifier(new XMLInputSource(converterUrl));
    final AnalysisEngine converter = UIMAFramework.produceAnalysisEngine(converterSpec);
    converter.setConfigParameterValue(HtmlConverter.PARAM_SKIP_WHITESPACES, false);
    converter.setConfigParameterValue(HtmlConverter.PARAM_PROCESS_ALL, true);
    converter.setConfigParameterValue(
            HtmlConverter.PARAM_GAP_INDUCING_TAGS, new String[] {"child1", "child2", "child3"});
    converter.setConfigParameterValue(HtmlConverter.PARAM_GAP_TEXT, "$");
    converter.reconfigure();

    final CAS cas = annotator.newCAS();
    final Type tagType = cas.getTypeSystem().getType(HtmlAnnotator.NAMESPACE + "TAG");

    cas.setDocumentText(html);
    annotator.process(cas);
    converter.process(cas);

    final CAS plainTextCas = cas.getView(HtmlConverter.DEFAULT_MODIFIED_VIEW);
    assertEquals("$Some content$$More content.", plainTextCas.getDocumentText());

    final AnnotationIndex<AnnotationFS> tagIndex = plainTextCas.getAnnotationIndex(tagType);
    final FSIterator<AnnotationFS> tags = tagIndex.iterator();
    assertEquals(4, tagIndex.size());

    // Covered texts in index iteration order.
    final String[] expectedCoveredTexts = {
        "$Some content$$More content.", "$Some content", "$", "$More content."
    };
    for (String expected : expectedCoveredTexts) {
        assertEquals(expected, tags.next().getCoveredText());
    }

    cas.release();
}
@Test public void testExpandOffsets() throws Exception { String html = "<Parent>\n"; html += "<Child1>Some content</Child1>\n"; html += "<Child2 attribute=“someValue” />\n"; html += "<Child3>More content.</Child3>\n"; html += "</Parent>\n"; URL urlA = HtmlAnnotator.class.getClassLoader().getResource("HtmlAnnotator.xml"); if (urlA == null) { urlA = HtmlAnnotator.class .getClassLoader() .getResource("org/apache/uima/ruta/engine/HtmlAnnotator.xml"); } URL urlC = HtmlAnnotator.class.getClassLoader().getResource("HtmlConverter.xml"); if (urlC == null) { urlC = HtmlAnnotator.class .getClassLoader() .getResource("org/apache/uima/ruta/engine/HtmlConverter.xml"); } XMLInputSource inA = new XMLInputSource(urlA); ResourceSpecifier specifierA = UIMAFramework.getXMLParser().parseResourceSpecifier(inA); AnalysisEngine aeA = UIMAFramework.produceAnalysisEngine(specifierA); aeA.setConfigParameterValue(HtmlAnnotator.PARAM_ONLY_CONTENT, false); aeA.reconfigure(); XMLInputSource inC = new XMLInputSource(urlC); ResourceSpecifier specifierC = UIMAFramework.getXMLParser().parseResourceSpecifier(inC); AnalysisEngine aeC = UIMAFramework.produceAnalysisEngine(specifierC); aeC.setConfigParameterValue(HtmlConverter.PARAM_SKIP_WHITESPACES, false); aeC.setConfigParameterValue(HtmlConverter.PARAM_PROCESS_ALL, true); aeC.setConfigParameterValue(HtmlConverter.PARAM_EXPAND_OFFSETS, true); aeC.reconfigure(); CAS cas = aeA.newCAS(); Type tagType = cas.getTypeSystem().getType(HtmlAnnotator.NAMESPACE + "TAG"); Feature expandedFeature = tagType.getFeatureByBaseName("expandedOffsets"); AnnotationIndex<AnnotationFS> ai = null; FSIterator<AnnotationFS> iterator = null; cas.setDocumentText(html); aeA.process(cas); aeC.process(cas); CAS plainTextCas = cas.getView(HtmlConverter.DEFAULT_MODIFIED_VIEW); assertEquals("Some contentMore content.", plainTextCas.getDocumentText()); ai = plainTextCas.getAnnotationIndex(tagType); iterator = ai.iterator(); assertEquals(4, ai.size()); AnnotationFS next = null; next = 
iterator.next(); assertEquals(false, next.getBooleanValue(expandedFeature)); assertEquals("Some contentMore content.", next.getCoveredText()); next = iterator.next(); assertEquals(false, next.getBooleanValue(expandedFeature)); assertEquals("Some content", next.getCoveredText()); next = iterator.next(); boolean b1 = next.getBooleanValue(expandedFeature); assertEquals("More content.", next.getCoveredText()); next = iterator.next(); boolean b2 = next.getBooleanValue(expandedFeature); assertEquals("More content.", next.getCoveredText()); // for one of these two annotation (with same offsets) the feature must be set to true assertEquals(true, b1 || b2); cas.release(); }