/**
 * Loads the same corpus document for two different annotators ("1" and "5") into freshly
 * created CASes and checks the document text, the total number of annotations and the number of
 * {@code Weapon} annotations in each.
 */
@Test
public void testGetDocumentCas()
    throws ResourceInitializationException,
        IOException,
        SAXException,
        URISyntaxException,
        ParserConfigurationException {
  final URI documentUri = new URI("62007.txt");
  final String weaponTypeName = "ru.kfu.itis.issst.evex.Weapon";
  final String expectedSnippet = "РИА Новости";

  // --- annotator "1" ---
  CAS firstCas =
      CasCreationUtils.createCas(
          XmiFileTreeCorpusDAO.getTypeSystem(corpusPathString), null, null, null);
  corpusDAO.getDocumentCas(documentUri, "1", firstCas);
  assertThat(firstCas.getDocumentText(), containsString(expectedSnippet));
  assertEquals(6, CasUtil.selectAll(firstCas).size());
  assertEquals(
      1,
      CasUtil.select(firstCas, CasUtil.getAnnotationType(firstCas, weaponTypeName)).size());

  // --- annotator "5" --- (a brand-new CAS, nothing is carried over from the first run)
  CAS secondCas =
      CasCreationUtils.createCas(
          XmiFileTreeCorpusDAO.getTypeSystem(corpusPathString), null, null, null);
  corpusDAO.getDocumentCas(documentUri, "5", secondCas);
  assertThat(secondCas.getDocumentText(), containsString(expectedSnippet));
  assertThat(CasUtil.selectAll(secondCas).size(), equalTo(5));
  assertEquals(
      0,
      CasUtil.select(secondCas, CasUtil.getAnnotationType(secondCas, weaponTypeName)).size());
}
/**
 * Callback invoked once the processing of a single document has finished.
 *
 * <p>If the status reports an exception, every recorded exception has its stack trace printed
 * and the statistics are left untouched. Otherwise the processed-entity counter is incremented
 * and the length of the document text (when there is one) is added to the running size total.
 *
 * @param aCas CAS corresponding to the completed processing
 * @param aStatus EntityProcessStatus that holds the status of all the events for aEntity
 */
public void entityProcessComplete(CAS aCas, EntityProcessStatus aStatus) {
  if (aStatus.isException()) {
    for (Exception failure : aStatus.getExceptions()) {
      failure.printStackTrace();
    }
    // failed entities do not contribute to the statistics
    return;
  }

  entityCount++;

  final String text = aCas.getDocumentText();
  if (text == null) {
    return;
  }
  size += text.length();
}
/**
 * Runs the HtmlAnnotator followed by the HtmlConverter over a small XML-like document with
 * {@code child1}, {@code child2} and {@code child3} configured as gap-inducing tags and
 * {@code "$"} as gap text, then checks the converted view's text and the covered text of the
 * four TAG annotations found in it.
 */
@Test
public void test() throws Exception {
  final String html =
      "<Parent>\n"
          + "<Child1>Some content</Child1>\n"
          + "<Child2 attribute=“someValue” />\n"
          + "<Child3>More content.</Child3>\n"
          + "</Parent>\n";

  // Locate both descriptors: first at the classpath root, then under the engine package.
  final ClassLoader loader = HtmlAnnotator.class.getClassLoader();
  URL annotatorUrl = loader.getResource("HtmlAnnotator.xml");
  if (annotatorUrl == null) {
    annotatorUrl = loader.getResource("org/apache/uima/ruta/engine/HtmlAnnotator.xml");
  }
  URL converterUrl = loader.getResource("HtmlConverter.xml");
  if (converterUrl == null) {
    converterUrl = loader.getResource("org/apache/uima/ruta/engine/HtmlConverter.xml");
  }

  // Annotator engine.
  ResourceSpecifier annotatorSpec =
      UIMAFramework.getXMLParser().parseResourceSpecifier(new XMLInputSource(annotatorUrl));
  AnalysisEngine annotator = UIMAFramework.produceAnalysisEngine(annotatorSpec);
  annotator.setConfigParameterValue(HtmlAnnotator.PARAM_ONLY_CONTENT, false);
  annotator.reconfigure();

  // Converter engine.
  ResourceSpecifier converterSpec =
      UIMAFramework.getXMLParser().parseResourceSpecifier(new XMLInputSource(converterUrl));
  AnalysisEngine converter = UIMAFramework.produceAnalysisEngine(converterSpec);
  converter.setConfigParameterValue(HtmlConverter.PARAM_SKIP_WHITESPACES, false);
  converter.setConfigParameterValue(HtmlConverter.PARAM_PROCESS_ALL, true);
  converter.setConfigParameterValue(
      HtmlConverter.PARAM_GAP_INDUCING_TAGS, new String[] {"child1", "child2", "child3"});
  converter.setConfigParameterValue(HtmlConverter.PARAM_GAP_TEXT, "$");
  converter.reconfigure();

  CAS cas = annotator.newCAS();
  Type tagType = cas.getTypeSystem().getType(HtmlAnnotator.NAMESPACE + "TAG");

  cas.setDocumentText(html);
  annotator.process(cas);
  converter.process(cas);

  CAS plainTextCas = cas.getView(HtmlConverter.DEFAULT_MODIFIED_VIEW);
  assertEquals("$Some content$$More content.", plainTextCas.getDocumentText());

  AnnotationIndex<AnnotationFS> tagIndex = plainTextCas.getAnnotationIndex(tagType);
  FSIterator<AnnotationFS> tags = tagIndex.iterator();
  assertEquals(4, tagIndex.size());

  // Expected covered text of the TAG annotations, in index iteration order.
  final String[] expectedCoveredTexts = {
    "$Some content$$More content.", "$Some content", "$", "$More content."
  };
  for (String expected : expectedCoveredTexts) {
    assertEquals(expected, tags.next().getCoveredText());
  }

  cas.release();
}
@Test public void testExpandOffsets() throws Exception { String html = "<Parent>\n"; html += "<Child1>Some content</Child1>\n"; html += "<Child2 attribute=“someValue” />\n"; html += "<Child3>More content.</Child3>\n"; html += "</Parent>\n"; URL urlA = HtmlAnnotator.class.getClassLoader().getResource("HtmlAnnotator.xml"); if (urlA == null) { urlA = HtmlAnnotator.class .getClassLoader() .getResource("org/apache/uima/ruta/engine/HtmlAnnotator.xml"); } URL urlC = HtmlAnnotator.class.getClassLoader().getResource("HtmlConverter.xml"); if (urlC == null) { urlC = HtmlAnnotator.class .getClassLoader() .getResource("org/apache/uima/ruta/engine/HtmlConverter.xml"); } XMLInputSource inA = new XMLInputSource(urlA); ResourceSpecifier specifierA = UIMAFramework.getXMLParser().parseResourceSpecifier(inA); AnalysisEngine aeA = UIMAFramework.produceAnalysisEngine(specifierA); aeA.setConfigParameterValue(HtmlAnnotator.PARAM_ONLY_CONTENT, false); aeA.reconfigure(); XMLInputSource inC = new XMLInputSource(urlC); ResourceSpecifier specifierC = UIMAFramework.getXMLParser().parseResourceSpecifier(inC); AnalysisEngine aeC = UIMAFramework.produceAnalysisEngine(specifierC); aeC.setConfigParameterValue(HtmlConverter.PARAM_SKIP_WHITESPACES, false); aeC.setConfigParameterValue(HtmlConverter.PARAM_PROCESS_ALL, true); aeC.setConfigParameterValue(HtmlConverter.PARAM_EXPAND_OFFSETS, true); aeC.reconfigure(); CAS cas = aeA.newCAS(); Type tagType = cas.getTypeSystem().getType(HtmlAnnotator.NAMESPACE + "TAG"); Feature expandedFeature = tagType.getFeatureByBaseName("expandedOffsets"); AnnotationIndex<AnnotationFS> ai = null; FSIterator<AnnotationFS> iterator = null; cas.setDocumentText(html); aeA.process(cas); aeC.process(cas); CAS plainTextCas = cas.getView(HtmlConverter.DEFAULT_MODIFIED_VIEW); assertEquals("Some contentMore content.", plainTextCas.getDocumentText()); ai = plainTextCas.getAnnotationIndex(tagType); iterator = ai.iterator(); assertEquals(4, ai.size()); AnnotationFS next = null; next = 
iterator.next(); assertEquals(false, next.getBooleanValue(expandedFeature)); assertEquals("Some contentMore content.", next.getCoveredText()); next = iterator.next(); assertEquals(false, next.getBooleanValue(expandedFeature)); assertEquals("Some content", next.getCoveredText()); next = iterator.next(); boolean b1 = next.getBooleanValue(expandedFeature); assertEquals("More content.", next.getCoveredText()); next = iterator.next(); boolean b2 = next.getBooleanValue(expandedFeature); assertEquals("More content.", next.getCoveredText()); // for one of these two annotation (with same offsets) the feature must be set to true assertEquals(true, b1 || b2); cas.release(); }
/** Copied and modified from {@link org.apache.uima.util.CasToInlineXml} */ private static String toXML(CAS cas, AnnotationsToElements converter) throws SAXException { ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); XMLSerializer sax2xml = new XMLSerializer(byteArrayOutputStream, false); // get document text String docText = cas.getDocumentText(); char[] docCharArray = docText.toCharArray(); // get iterator over annotations sorted by increasing start position and // decreasing end position FSIterator<AnnotationFS> iterator = cas.getAnnotationIndex().iterator(); // This is basically a recursive algorithm that has had the recursion // removed through the use of an explicit Stack. We iterate over the // annotations, and if an annotation contains other annotations, we // push the parent annotation on the stack, process the children, and // then come back to the parent later. List<AnnotationFS> stack = new ArrayList<AnnotationFS>(); int pos = 0; ContentHandler handler = sax2xml.getContentHandler(); handler.startDocument(); // write the start tag converter.startRootElement(handler); // now use null is a placeholder for this artificial Document annotation AnnotationFS curAnnot = null; while (iterator.isValid()) { AnnotationFS nextAnnot = iterator.get(); if (curAnnot == null || nextAnnot.getBegin() < curAnnot.getEnd()) { // nextAnnot's start point is within the span of curAnnot if (curAnnot == null || nextAnnot.getEnd() <= curAnnot.getEnd()) // crossover span check { // nextAnnot is contained within curAnnot // write text between current pos and beginning of nextAnnot try { handler.characters(docCharArray, pos, nextAnnot.getBegin() - pos); pos = nextAnnot.getBegin(); converter.startAnnotationElement(nextAnnot, handler); // push parent annotation on stack stack.add(curAnnot); // move on to next annotation curAnnot = nextAnnot; } catch (StringIndexOutOfBoundsException e) { System.err.println( "Invalid annotation range: " + nextAnnot.getBegin() + 
"," + nextAnnot.getEnd() + " in document of length " + docText.length()); } } iterator.moveToNext(); } else { // nextAnnot begins after curAnnot ends // write text between current pos and end of curAnnot try { handler.characters(docCharArray, pos, curAnnot.getEnd() - pos); pos = curAnnot.getEnd(); } catch (StringIndexOutOfBoundsException e) { System.err.println( "Invalid annotation range: " + curAnnot.getBegin() + "," + curAnnot.getEnd() + " in document of length " + docText.length()); } converter.endAnnotationElement(curAnnot, handler); // pop next containing annotation off stack curAnnot = stack.remove(stack.size() - 1); } } // finished writing all start tags, now finish up if (curAnnot != null) { try { handler.characters(docCharArray, pos, curAnnot.getEnd() - pos); pos = curAnnot.getEnd(); } catch (StringIndexOutOfBoundsException e) { System.err.println( "Invalid annotation range: " + curAnnot.getBegin() + "," + curAnnot.getEnd() + "in document of length " + docText.length()); } converter.endAnnotationElement(curAnnot, handler); while (!stack.isEmpty()) { curAnnot = stack.remove(stack.size() - 1); // pop if (curAnnot == null) { break; } try { handler.characters(docCharArray, pos, curAnnot.getEnd() - pos); pos = curAnnot.getEnd(); } catch (StringIndexOutOfBoundsException e) { System.err.println( "Invalid annotation range: " + curAnnot.getBegin() + "," + curAnnot.getEnd() + "in document of length " + docText.length()); } converter.endAnnotationElement(curAnnot, handler); } } if (pos < docCharArray.length) { handler.characters(docCharArray, pos, docCharArray.length - pos); } converter.endRootElement(handler); handler.endDocument(); // return XML string return new String(byteArrayOutputStream.toByteArray()); }
/** * Called when the processing of a Document is completed. <br> * The process status can be looked at and corresponding actions taken. * * @param aCas CAS corresponding to the completed processing * @param aStatus EntityProcessStatus that holds the status of all the events for aEntity */ public void entityProcessComplete(CAS aCas, EntityProcessStatus aStatus) { if (aStatus != null) { if (aStatus.isException()) { System.err.println("Error on process CAS call to remote service:"); List exceptions = aStatus.getExceptions(); for (int i = 0; i < exceptions.size(); i++) { ((Throwable) exceptions.get(i)).printStackTrace(); } if (!ignoreErrors) { System.err.println("Terminating Client..."); stop(); } } if (logCas) { String ip = "no IP"; List eList = aStatus.getProcessTrace().getEventsByComponentName("UimaEE", false); for (int e = 0; e < eList.size(); e++) { ProcessTraceEvent event = (ProcessTraceEvent) eList.get(e); if (event.getDescription().equals("Service IP")) { ip = event.getResultMessage(); } } String casId = ((UimaASProcessStatus) aStatus).getCasReferenceId(); if (casId != null) { long current = System.nanoTime() / 1000000 - mStartTime; if (casMap.containsKey(casId)) { Object value = casMap.get(casId); if (value != null && value instanceof Long) { long start = ((Long) value).longValue(); System.out.println(ip + "\t" + start + "\t" + (current - start)); } } } } else { System.out.print("."); if (0 == (entityCount + 1) % 50) { System.out.print((entityCount + 1) + " processed\n"); } } } // if output dir specified, dump CAS to XMI if (outputDir != null) { // try to retrieve the filename of the input file from the CAS File outFile = null; Type srcDocInfoType = aCas.getTypeSystem().getType("org.apache.uima.examples.SourceDocumentInformation"); if (srcDocInfoType != null) { FSIterator it = aCas.getIndexRepository().getAllIndexedFS(srcDocInfoType); if (it.hasNext()) { FeatureStructure srcDocInfoFs = it.get(); Feature uriFeat = srcDocInfoType.getFeatureByBaseName("uri"); 
Feature offsetInSourceFeat = srcDocInfoType.getFeatureByBaseName("offsetInSource"); String uri = srcDocInfoFs.getStringValue(uriFeat); int offsetInSource = srcDocInfoFs.getIntValue(offsetInSourceFeat); File inFile; try { inFile = new File(new URL(uri).getPath()); String outFileName = inFile.getName(); if (offsetInSource > 0) { outFileName += ("_" + offsetInSource); } outFileName += ".xmi"; outFile = new File((String) outputDir, outFileName); } catch (MalformedURLException e1) { // invalid URI, use default processing below } } } if (outFile == null) { outFile = new File((String) outputDir, "doc" + entityCount); } try { FileOutputStream outStream = new FileOutputStream(outFile); try { XmiCasSerializer.serialize(aCas, outStream); } finally { outStream.close(); } } catch (Exception e) { System.err.println("Could not save CAS to XMI file"); e.printStackTrace(); } } // update stats entityCount++; String docText = aCas.getDocumentText(); if (docText != null) { size += docText.length(); } // Called just before sendCas with next CAS from collection reader }