@Test
public void testGetDocumentCas() throws ResourceInitializationException, IOException,
    SAXException, URISyntaxException, ParserConfigurationException {
  CAS aCAS = CasCreationUtils.createCas(
      XmiFileTreeCorpusDAO.getTypeSystem(corpusPathString), null, null, null);
  corpusDAO.getDocumentCas(new URI("62007.txt"), "1", aCAS);
  assertThat(aCAS.getDocumentText(), containsString("РИА Новости"));
  assertEquals(6, CasUtil.selectAll(aCAS).size());
  assertEquals(
      1,
      CasUtil.select(aCAS, CasUtil.getAnnotationType(aCAS, "ru.kfu.itis.issst.evex.Weapon"))
          .size());

  aCAS = CasCreationUtils.createCas(
      XmiFileTreeCorpusDAO.getTypeSystem(corpusPathString), null, null, null);
  corpusDAO.getDocumentCas(new URI("62007.txt"), "5", aCAS);
  assertThat(aCAS.getDocumentText(), containsString("РИА Новости"));
  assertThat(CasUtil.selectAll(aCAS).size(), equalTo(5));
  assertEquals(
      0,
      CasUtil.select(aCAS, CasUtil.getAnnotationType(aCAS, "ru.kfu.itis.issst.evex.Weapon"))
          .size());
}
/** Performs name finding on the given CAS object. */
public final void process(CAS cas) {
  if (isRemoveExistingAnnotations) {
    final AnnotationComboIterator sentenceNameCombo =
        new AnnotationComboIterator(cas, mSentenceType, mNameType);
    List<AnnotationFS> removeAnnotations = new LinkedList<AnnotationFS>();
    for (AnnotationIteratorPair annotationIteratorPair : sentenceNameCombo) {
      for (AnnotationFS nameAnnotation : annotationIteratorPair.getSubIterator()) {
        removeAnnotations.add(nameAnnotation);
      }
    }
    for (AnnotationFS annotation : removeAnnotations) {
      cas.removeFsFromIndexes(annotation);
    }
  }

  final AnnotationComboIterator sentenceTokenCombo =
      new AnnotationComboIterator(cas, mSentenceType, mTokenType);
  for (AnnotationIteratorPair annotationIteratorPair : sentenceTokenCombo) {
    final List<AnnotationFS> sentenceTokenAnnotationList = new LinkedList<AnnotationFS>();
    final List<String> sentenceTokenList = new LinkedList<String>();
    for (AnnotationFS tokenAnnotation : annotationIteratorPair.getSubIterator()) {
      sentenceTokenAnnotationList.add(tokenAnnotation);
      sentenceTokenList.add(tokenAnnotation.getCoveredText());
    }

    Span[] names = find(cas, sentenceTokenList.toArray(new String[sentenceTokenList.size()]));

    AnnotationFS[] nameAnnotations = new AnnotationFS[names.length];
    for (int i = 0; i < names.length; i++) {
      int startIndex = sentenceTokenAnnotationList.get(names[i].getStart()).getBegin();
      int endIndex = sentenceTokenAnnotationList.get(names[i].getEnd() - 1).getEnd();
      nameAnnotations[i] = cas.createAnnotation(mNameType, startIndex, endIndex);
      cas.getIndexRepository().addFS(nameAnnotations[i]);
    }
    postProcessAnnotations(names, nameAnnotations);
  }
  documentDone(cas);
}
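// A minimal sketch of the find(...) hook invoked above, assuming the subclass
// wraps an OpenNLP NameFinderME (the field name mNameFinder is hypothetical;
// the actual implementation may also use the CAS to supply prior context):
protected Span[] find(CAS cas, String[] tokens) {
  return mNameFinder.find(tokens);
}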
@Test
public void test() {
  CAS cas = RutaTestUtils.processTestScript(this.getClass());
  RutaTestUtils.assertAnnotationsEquals(cas, 1, 1, "A b");
  RutaTestUtils.assertAnnotationsEquals(cas, 2, 1, "b A");
  RutaTestUtils.assertAnnotationsEquals(cas, 3, 1, "A b A");
  cas.release();
}
@Override
public synchronized String nextSentence() {
  if (sentences == null || !sentences.hasNext()) {
    try {
      if (getReader().hasNext()) {
        CAS cas = resource.retrieve();
        try {
          getReader().getNext(cas);
        } catch (Exception e) {
          log.warn("Done iterating; returning an empty string", e);
          return "";
        }
        resource.getAnalysisEngine().process(cas);

        List<String> list = new ArrayList<>();
        for (Sentence sentence : JCasUtil.select(cas.getJCas(), Sentence.class)) {
          list.add(sentence.getCoveredText());
        }
        sentences = list.iterator();

        // the current CAS yielded no sentences; advance to the next CAS until
        // one yields a sentence or the reader is exhausted
        while (!sentences.hasNext()) {
          if (getReader().hasNext()) {
            cas.reset();
            getReader().getNext(cas);
            resource.getAnalysisEngine().process(cas);
            for (Sentence sentence : JCasUtil.select(cas.getJCas(), Sentence.class)) {
              list.add(sentence.getCoveredText());
            }
            sentences = list.iterator();
          } else {
            return null;
          }
        }

        String ret = sentences.next();
        if (this.getPreProcessor() != null) {
          ret = this.getPreProcessor().preProcess(ret);
        }
        return ret;
      }
      return null;
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  } else {
    String ret = sentences.next();
    if (this.getPreProcessor() != null) {
      ret = this.getPreProcessor().preProcess(ret);
    }
    return ret;
  }
}
/** @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS) */
public void getNext(CAS aCAS) throws IOException, CollectionException {
  JCas jcas;
  try {
    jcas = aCAS.getJCas();
  } catch (CASException e) {
    throw new CollectionException(e);
  }

  // read the next file
  File file = (File) mFiles.get(mCurrentIndex++);
  String text = FileUtils.file2String(file, mEncoding);

  // put the document text in the CAS
  jcas.setDocumentText(text);

  // set language if it was explicitly specified as a configuration parameter
  if (mLanguage != null) {
    jcas.setDocumentLanguage(mLanguage);
  }

  // Also store the location of the source document in the CAS. This information is
  // critical if CAS consumers need to know where the original document contents are
  // located. For example, the Semantic Search CAS Indexer writes this information into
  // the search index that it creates, which allows applications that use the search
  // index to locate the documents that satisfy their semantic queries.
  SourceDocumentInformation srcDocInfo = new SourceDocumentInformation(jcas);
  srcDocInfo.setUri(file.getAbsoluteFile().toURI().toURL().toString());
  srcDocInfo.setOffsetInSource(0);
  srcDocInfo.setDocumentSize((int) file.length());
  srcDocInfo.setLastSegment(mCurrentIndex == mFiles.size());
  srcDocInfo.addToIndexes();
}
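// Hedged usage sketch (separate from the reader above): a downstream CAS
// consumer could recover the recorded source location via uimaFIT's JCasUtil;
// here jcas stands for any JCas that this reader has populated.
for (SourceDocumentInformation info :
    JCasUtil.select(jcas, SourceDocumentInformation.class)) {
  System.out.println(
      "Source URI: " + info.getUri() + ", last segment: " + info.getLastSegment());
}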
private Type getInputType(CAS cas, String typeName) {
  Type type = cas.getTypeSystem().getType(typeName);
  if (type == null) {
    throw new IllegalStateException("Type [" + typeName + "] not found in type system");
  }
  return type;
}
@Override
protected void endDocument(final PDDocument aPdf) throws IOException {
  cas.setDocumentText(text.toString());
  if (log.isTraceEnabled()) {
    log.trace("</document>");
  }
}
@Override
protected void generateAnnotations(JCas jcas)
    throws AnalysisEngineProcessException, FeaturePathException {
  // the CAS is necessary to retrieve values
  CAS currCAS = jcas.getCas();

  for (String path : paths) {
    // separate the type name from the feature path
    String[] segments = path.split("/", 2);
    String typeName = segments[0];

    // try to get the type from the type system of the CAS
    Type t = currCAS.getTypeSystem().getType(typeName);
    if (t == null) {
      throw new IllegalStateException("Type [" + typeName + "] not found in type system");
    }

    // initialize the FeaturePathInfo with the corresponding part
    initializeFeaturePathInfoFrom(fp, segments);

    // get the annotations
    AnnotationIndex<?> idx = currCAS.getAnnotationIndex(t);
    FSIterator<?> iterator = idx.iterator();
    while (iterator.hasNext()) {
      AnnotationFS fs = (AnnotationFS) iterator.next();
      try {
        if (this.filterFeaturePath != null) {
          // check the annotation filter condition
          if (this.filterFeaturePathInfo.match(fs, this.filterCondition)) {
            createStemAnnotation(jcas, fs);
          }
        } else {
          // no annotation filter specified
          createStemAnnotation(jcas, fs);
        }
      } catch (AnalysisEngineProcessException e) {
        throw new IllegalStateException("error occurred while creating a stem annotation", e);
      }
    }
  }
}
/**
 * Removes all annotations of type removeAnnotationType which are contained by the given
 * container annotation.
 *
 * @param cas the CAS to modify
 * @param containerAnnotation the annotation whose covered annotations are removed
 * @param removeAnnotationType the type of the annotations to remove
 */
public static void removeAnnotations(
    CAS cas, AnnotationFS containerAnnotation, Type removeAnnotationType) {
  FSIndex<AnnotationFS> allRemoveAnnotations = cas.getAnnotationIndex(removeAnnotationType);
  ContainingConstraint containingConstraint = new ContainingConstraint(containerAnnotation);
  Iterator<AnnotationFS> containingTokens =
      cas.createFilteredIterator(allRemoveAnnotations.iterator(), containingConstraint);

  // collect first, then remove, to avoid modifying the index while iterating over it
  Collection<AnnotationFS> removeAnnotations = new LinkedList<AnnotationFS>();
  while (containingTokens.hasNext()) {
    removeAnnotations.add(containingTokens.next());
  }
  for (AnnotationFS annotation : removeAnnotations) {
    cas.removeFsFromIndexes(annotation);
  }
}
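// Hedged usage example for the utility above (the type name and the variable
// sentenceAnnotation are illustrative, not from the original source): remove
// every token annotation covered by one given sentence annotation.
Type tokenType = cas.getTypeSystem().getType("opennlp.uima.Token");
removeAnnotations(cas, sentenceAnnotation, tokenType);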
@Override
public void getNext(final CAS aCAS) throws IOException, CollectionException {
  TikaProcessor processor = new TikaProcessor();
  try {
    processor = TikaProcessor.newInstance(file);
  } catch (Exception e) {
    ExceptionHandler.logAndRethrow(logger, "TikaProcessor: ", e);
  }

  String documentText = processor.getText();
  if (documentText == null || documentText.isEmpty()) {
    ExceptionHandler.logAndThrow(logger, "Document text is null or empty");
  }
  aCAS.setDocumentText(documentText);

  String textLanguage = processor.getLanguage();
  if (!textLanguage.contains("ru")) {
    ExceptionHandler.logAndThrow(logger, "Document language is not Russian");
  }
  aCAS.setDocumentLanguage(textLanguage);
}
@Override
public void getNext(CAS aCAS) throws IOException, CollectionException {
  JCas jcas;
  try {
    jcas = aCAS.getJCas();
  } catch (CASException e) {
    throw new CollectionException(e);
  }

  try {
    // parse the XML file
    File xmlFile = GlobalFileStorage.getInstance().poll();
    System.out.println("Process file: " + xmlFile.getName());

    SAXParserFactory spf = SAXParserFactory.newInstance();
    SAXParser sp = spf.newSAXParser();
    XMLReader xr = sp.getXMLReader();

    LinkedList<String[]> textElements = new LinkedList<>();
    FragmentContentHandler fch = new FragmentContentHandler(xr, textElements);
    xr.setContentHandler(fch);
    xr.parse(new InputSource(new FileInputStream(xmlFile)));

    StringBuilder docText = new StringBuilder();
    for (String[] element : textElements) {
      int start = docText.length();
      int end = start + element[1].length();
      docText.append(element[1]).append("\n\n");

      Section section = new Section(jcas, start, end);
      section.setValue(element[0]);
      section.addToIndexes();
    }
    jcas.setDocumentText(docText.toString().trim());
    jcas.setDocumentLanguage(language);

    DocumentMetaData docMetaData = DocumentMetaData.create(aCAS);
    docMetaData.setDocumentTitle(xmlFile.getName());
    docMetaData.setDocumentId(xmlFile.getAbsolutePath());
    docMetaData.setDocumentBaseUri("file:" + xmlFile.getParentFile().getAbsolutePath());
    docMetaData.setDocumentUri("file:" + xmlFile.getAbsolutePath());
  } catch (Exception e) {
    throw new CollectionException(e);
  }
}
/**
 * Reads all serialized CASes in the specified folder.
 *
 * @param path the folder containing the serialized CASes
 * @return a list of JCases
 */
public List<JCas> read(String path) throws Exception {
  List<JCas> jCases = new ArrayList<JCas>();
  System.out.println("--- READING ---");

  @SuppressWarnings("deprecation")
  CollectionReader reader = CollectionReaderFactory.createReader(
      BinaryCasReader.class,
      ResourceCollectionReaderBase.PARAM_PATH, path,
      ResourceCollectionReaderBase.PARAM_PATTERNS,
      new String[] {ResourceCollectionReaderBase.INCLUDE_PREFIX + "*.bin"});

  while (reader.hasNext()) {
    CAS cas = CasCreationUtils.createCas(createTypeSystemDescription(), null, null);
    reader.getNext(cas);
    System.out.println(cas.getJCas().getDocumentText());
    jCases.add(cas.getJCas());
  }
  return jCases;
}
public void run() throws Exception {
  // create the asynchronous client API and initialize it
  uimaAsEngine = new BaseUIMAAsynchronousEngine_impl();
  // callback
  // uimaAsEngine.addStatusCallbackListener(new StatusCallbackListener());
  initializeUimaAsEngine(uimaAsEngine);

  String filePath =
      "C:\\WebScience\\Progetti\\K-People\\OntologyController_UIMA\\apache-uima\\examples\\src\\it\\webscience\\uima\\event-2031.xml";
  String xml = readFile(filePath);

  // get an empty CAS from the CAS pool
  CAS cas = uimaAsEngine.getCAS();
  // initialize it with the input data
  cas.setDocumentText(xml);
  // send the CAS to the service for processing
  uimaAsEngine.sendCAS(cas);
}
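// Hedged follow-up sketch (not part of the original run() above): once all
// CASes have been sent, a UIMA-AS client typically waits for the outstanding
// work to finish and then releases its resources:
uimaAsEngine.collectionProcessingComplete();
uimaAsEngine.stop();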
@Override
public void getNext(CAS cas) throws IOException, CollectionException {
  this.cumulatedLength += currentDoc.getText().length();
  logger.info(
      "[Stream {}] Processing document {}: {} (total length processed: {})",
      this.streamName,
      this.mCurrentIndex,
      this.currentDoc.getUri(),
      this.cumulatedLength);

  SourceDocumentInformation sdi;
  try {
    sdi = new SourceDocumentInformation(cas.getJCas());
    sdi.setUri(currentDoc.getUri());
    cas.setDocumentLanguage(mLanguage.getCode());
    cas.setDocumentText(currentDoc.getText());
    sdi.setDocumentSize(currentDoc.getText().length());
    sdi.setCumulatedDocumentSize(this.cumulatedLength);
    sdi.setBegin(0);
    sdi.setEnd(currentDoc.getText().length());
    sdi.setOffsetInSource(0);
    sdi.setDocumentIndex(mCurrentIndex);

    // corpus size and document count cannot be known when streaming
    sdi.setCorpusSize(-1);
    sdi.setNbDocuments(-1);

    // cannot know whether this is the last segment
    sdi.setLastSegment(false);

    sdi.addToIndexes();
    this.mCurrentIndex++;
  } catch (CASException e) {
    throw new CollectionException(e);
  }
}
private TypeSystem createTypeSystem() throws IOException, UIMAException {
  TypeSystemDescription tsDesc = null;
  if (typeSystemDescPaths != null && typeSystemDescPaths.length > 0) {
    tsDesc = createTypeSystemDescriptionFromPath(typeSystemDescPaths);
  }
  if (typeSystemDescNames != null && typeSystemDescNames.length > 0) {
    TypeSystemDescription tsDescFromNames = createTypeSystemDescription(typeSystemDescNames);
    if (tsDesc != null) {
      tsDesc = mergeTypeSystems(asList(tsDesc, tsDescFromNames));
    } else {
      tsDesc = tsDescFromNames;
    }
  }
  if (tsDesc == null) {
    log.info("TypeSystemDescription will be created using the uimaFIT discovery");
    tsDesc = TypeSystemDescriptionFactory.createTypeSystemDescription();
  }

  // create a throwaway CAS just to materialize the merged type system
  CAS dummyCas = CasCreationUtils.createCas(tsDesc, null, null);
  return dummyCas.getTypeSystem();
}
/**
 * Constructs the global word dictionaries and keeps the word frequency for each sentence.
 *
 * <p>Creates two dictionaries, queryDictionary and answerDictionary.
 *
 * <p>queryDictionary is a list of maps whose keys are the words in the question and whose
 * values are the counts of each word in the question sentence. Similarly, answerDictionary is
 * a list of maps whose keys are the words in the answer and whose values are the counts of
 * each word in the answer sentence.
 */
@Override
public void processCas(CAS aCas) throws ResourceProcessException {
  JCas jcas;
  try {
    jcas = aCas.getJCas();
  } catch (CASException e) {
    throw new ResourceProcessException(e);
  }

  FSIterator it = jcas.getAnnotationIndex(Document.type).iterator();
  if (it.hasNext()) {
    Document doc = (Document) it.next();

    // make sure that the previous annotators have populated this in the CAS
    FSList fsTokenList = doc.getTokenList();
    ArrayList<Token> tokenList = Utils.fromFSListToCollection(fsTokenList, Token.class);

    HashMap<String, Integer> myMap = new HashMap<String, Integer>();
    HashMap<String, Integer> myMap2 = new HashMap<String, Integer>();

    // a relevance value of 99 marks a question: fill QuesqIdList, QuesrelList & queryDictionary
    if (doc.getRelevanceValue() == 99) {
      QuesqIdList.add(doc.getQueryID());
      QuesrelList.add(doc.getRelevanceValue());
      for (int k = 0; k < tokenList.size(); k++) {
        myMap.put(tokenList.get(k).getText(), tokenList.get(k).getFrequency());
      }
      queryDictionary.add(myMap);
    }
    // otherwise it is an answer: fill AnsqIdList, AnsrelList & answerDictionary
    else {
      AnsqIdList.add(doc.getQueryID());
      AnsrelList.add(doc.getRelevanceValue());
      for (int k = 0; k < tokenList.size(); k++) {
        myMap2.put(tokenList.get(k).getText(), tokenList.get(k).getFrequency());
      }
      answerDictionary.add(myMap2);
      if (1 == doc.getRelevanceValue()) {
        GoldAnswerStringList.put(doc.getQueryID(), doc.getText());
      }
    }
  }
}
/**
 * Called when the processing of a document is completed. <br>
 * The process status can be looked at and corresponding actions taken.
 *
 * @param aCas CAS corresponding to the completed processing
 * @param aStatus EntityProcessStatus that holds the status of all the events for the entity
 */
public void entityProcessComplete(CAS aCas, EntityProcessStatus aStatus) {
  if (aStatus.isException()) {
    List<Exception> exceptions = aStatus.getExceptions();
    for (Exception exception : exceptions) {
      exception.printStackTrace();
    }
    return;
  }
  entityCount++;
  String docText = aCas.getDocumentText();
  if (docText != null) {
    size += docText.length();
  }
}
public void getNext(CAS aCAS) throws IOException, CollectionException {
  JCas jcas;
  try {
    jcas = aCAS.getJCas();
  } catch (CASException e) {
    throw new CollectionException(e);
  }
  fillJCas(jcas);
  // give an indicator that a file has been processed
  System.err.print(".");
}
/**
 * Serializes a CAS to a file in XCAS format.
 *
 * @param aCas CAS to serialize
 * @param name output file
 * @throws IOException if an I/O failure occurs
 * @throws SAXException if an error occurs generating the XML text
 */
private void writeXCas(CAS aCas, File name) throws IOException, SAXException {
  FileOutputStream out = null;
  try {
    out = new FileOutputStream(name);
    XCASSerializer ser = new XCASSerializer(aCas.getTypeSystem());
    XMLSerializer xmlSer = new XMLSerializer(out, true);
    ser.serialize(aCas, xmlSer.getContentHandler());
  } finally {
    if (out != null) {
      out.close();
    }
  }
}
public static void main(String[] args) throws Exception {
  String sLine;
  long startTime = System.currentTimeMillis();

  URL descUrl = VectorSpaceRetrieval.class.getResource(
      "/descriptors/retrievalsystem/VectorSpaceRetrieval.xml");
  if (descUrl == null) {
    throw new IllegalArgumentException("Error opening VectorSpaceRetrieval.xml");
  }

  // create the AnalysisEngine
  XMLInputSource input = new XMLInputSource(descUrl);
  AnalysisEngineDescription desc =
      UIMAFramework.getXMLParser().parseAnalysisEngineDescription(input);
  AnalysisEngine anAnalysisEngine = UIMAFramework.produceAnalysisEngine(desc);
  CAS aCas = anAnalysisEngine.newCAS();

  URL docUrl = VectorSpaceRetrieval.class.getResource("/data/documents.txt");
  if (docUrl == null) {
    throw new IllegalArgumentException("Error opening data/documents.txt");
  }

  BufferedReader br = new BufferedReader(new InputStreamReader(docUrl.openStream()));
  while ((sLine = br.readLine()) != null) {
    aCas.setDocumentText(sLine);
    anAnalysisEngine.process(aCas);
    aCas.reset();
  }
  br.close();
  br = null;

  anAnalysisEngine.collectionProcessComplete();
  anAnalysisEngine.destroy();

  long endTime = System.currentTimeMillis();
  double totalTime = (endTime - startTime) / 1000.0;
  System.out.println("Total time taken: " + totalTime);
}
/**
 * Serializes a CAS to a file in XMI format.
 *
 * @param aCas CAS to serialize
 * @param name output file
 * @param modelFileName name of the Ecore model file (unused in this method)
 * @throws IOException if an I/O failure occurs
 * @throws SAXException if an error occurs generating the XML text
 */
private void writeXmi(CAS aCas, File name, String modelFileName)
    throws IOException, SAXException {
  FileOutputStream out = null;
  try {
    // write XMI
    out = new FileOutputStream(name);
    XmiCasSerializer ser = new XmiCasSerializer(aCas.getTypeSystem());
    XMLSerializer xmlSer = new XMLSerializer(out, false);
    ser.serialize(aCas, xmlSer.getContentHandler());
  } finally {
    if (out != null) {
      out.close();
    }
  }
}
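// For comparison, a hedged sketch of the same XMI write via the newer
// convenience API (org.apache.uima.util.CasIOUtils, available since UIMA
// 2.9.0), assuming default serialization options are acceptable:
try (OutputStream out = new FileOutputStream(name)) {
  CasIOUtils.save(aCas, out, SerialFormat.XMI);
}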
@Override
public void getNext(CAS aCAS) throws IOException, CollectionException {
  JCas jcas;
  try {
    jcas = aCAS.getJCas();
  } catch (CASException e) {
    throw new CollectionException(e);
  }

  // fetch the next sentence and put it in the CAS as the document text
  String sentence = mSentences.get(mCurrentIndex++);
  jcas.setDocumentText(sentence);
}
/**
 * Gets the next sentence from the input file.
 *
 * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
 */
@Override
public void getNext(CAS aCas) throws IOException, CollectionException {
  JCas jcas = null;
  try {
    jcas = aCas.getJCas();
  } catch (CASException e) {
    throw new CollectionException(e);
  }

  // each line has the form "<sentenceId> <sentenceText>"
  String lineString = mBufferdReader.readLine().trim();
  String sentenceId = lineString.substring(0, lineString.indexOf(" "));
  String sentenceText = lineString.substring(lineString.indexOf(" ") + 1);

  jcas.setDocumentText(sentenceText);

  Sentence sentence = new Sentence(jcas);
  sentence.setSentenceId(sentenceId);
  sentence.addToIndexes();
}
@Override
public void getNext(CAS aCAS) throws IOException, CollectionException {
  super.getNext(aCAS);
  JCas jcas;
  try {
    jcas = aCAS.getJCas();
    // consider a tweet to be a sentence
    Sentence sentenceAnno = new Sentence(jcas);
    sentenceAnno.setBegin(0);
    sentenceAnno.setEnd(jcas.getDocumentText().length());
    sentenceAnno.addToIndexes();
  } catch (CASException e) {
    throw new CollectionException(e);
  }

  TextClassificationOutcome outcome = new TextClassificationOutcome(jcas);
  outcome.setOutcome(getTextClassificationOutcome(jcas));
  outcome.addToIndexes();
}
/**
 * Processes the CAS which was populated by the Text Analysis Engines. <br>
 * In this case, the CAS is converted to XMI and written into the output file.
 *
 * @param aCAS a CAS which has been populated by the TAEs
 * @throws ResourceProcessException if there is an error in processing the Resource
 * @see org.apache.uima.collection.base_cpm.CasObjectProcessor#processCas(org.apache.uima.cas.CAS)
 */
public void processCas(CAS aCAS) throws ResourceProcessException {
  String modelFileName = null;

  JCas jcas;
  try {
    jcas = aCAS.getJCas();
  } catch (CASException e) {
    throw new ResourceProcessException(e);
  }

  // retrieve the filename of the input file from the CAS
  FSIterator it = jcas.getAnnotationIndex(SourceDocumentInformation.type).iterator();
  File outFile = null;
  if (it.hasNext()) {
    SourceDocumentInformation fileLoc = (SourceDocumentInformation) it.next();
    File inFile;
    try {
      inFile = new File(new URL(fileLoc.getUri()).getPath());
      String outFileName = inFile.getName();
      if (fileLoc.getOffsetInSource() > 0) {
        outFileName += ("_" + fileLoc.getOffsetInSource());
      }
      outFileName += ".xmi";
      outFile = new File(mOutputDir, outFileName);
      modelFileName = mOutputDir.getAbsolutePath() + "/" + inFile.getName() + ".ecore";
    } catch (MalformedURLException e1) {
      // invalid URL, use the default processing below
    }
  }
  if (outFile == null) {
    outFile = new File(mOutputDir, "doc" + mDocNum++ + ".xmi");
  }

  // serialize the CAS to XMI and write it to the output file
  try {
    writeXmi(jcas.getCas(), outFile, modelFileName);
  } catch (IOException e) {
    throw new ResourceProcessException(e);
  } catch (SAXException e) {
    throw new ResourceProcessException(e);
  }
}
/** @see com.ibm.uima.collection.CollectionReader#getNext(com.ibm.uima.cas.CAS) */
public void getNext(CAS aCAS) throws IOException, CollectionException {
  JCas jcas;
  try {
    jcas = aCAS.getJCas();
  } catch (CASException e) {
    throw new CollectionException(e);
  }

  String name = fileIterator.next().getAbsolutePath();
  jcas.setDocumentText(ReadWriteTextFileWithEncoding.read(name, "UTF-8"));
  numberOfFilesProcessed++;

  try {
    name = filenameToIDTranslator.cleanItUp(name);
  } catch (StringCleanerException e) {
    e.printStackTrace();
  }

  StringArray s = new StringArray(jcas, 1);
  s.set(0, filenameToIDTranslator.getIdType() + name);
  ISI_UIMA_Util.setDocumentSecondaryIDs(jcas, s);
}
public void entityProcessComplete(CAS aCas, EntityProcessStatus aStatus) {
  if (aStatus != null) {
    if (aStatus.isException()) {
      System.err.println("Error on process CAS call to remote service:");
      List<Exception> exceptions = aStatus.getExceptions();
      for (Exception exception : exceptions) {
        exception.printStackTrace();
      }
    }
    try {
      JCas cas = aCas.getJCas();
      for (Token token : JCasUtil.select(cas, Token.class)) {
        System.out.println(token.getCoveredText() + " " + token.getPos().getPosValue());
      }
    } catch (CASException e) {
      e.printStackTrace();
    }
  }
}
@Override
protected void endRegion(final Style aStyle) throws IOException {
  if (log.isTraceEnabled()) {
    log.trace("</" + aStyle + ">");
  }

  if (regionText == null) {
    throw new IllegalStateException("No region started");
  }
  if (regionStyle != aStyle) {
    throw new IllegalStateException(
        "Current region has style " + regionStyle + ", but closing region has style " + aStyle);
  }

  // append text
  int begin = text.length();
  sanitize(regionText);
  text.append(regionText.toString());
  int end = text.length();
  text.append('\n');

  // add annotation
  switch (aStyle) {
    case HEADING:
      if (headingType != null) {
        Type t = cas.getTypeSystem().getType(headingType);
        AnnotationFS a = cas.createAnnotation(t, begin, end);
        cas.addFsToIndexes(a);
      }
      break;
    case PARAGRAPH:
      if (paragraphType != null) {
        Type t = cas.getTypeSystem().getType(paragraphType);
        AnnotationFS a = cas.createAnnotation(t, begin, end);
        cas.addFsToIndexes(a);
      }
      break;
    default:
      throw new IllegalStateException("Unknown region style: " + aStyle);
  }

  regionStyle = null;
  regionText = null;
}
@Override
public void process(CAS cas) throws AnalysisEngineProcessException {
  JCas textJCas;
  try {
    textJCas = cas.getJCas();
    setStream(textJCas);
  } catch (CASException e1) {
    throw new AnalysisEngineProcessException(e1);
  } catch (final IOException e2) {
    throw new AnalysisEngineProcessException(e2);
  }

  final FSIterator<Annotation> annotationIt = SemanticAnnotation.getIterator(textJCas);
  while (annotationIt.hasNext()) {
    final SemanticAnnotation ann = (SemanticAnnotation) annotationIt.next();
    final String text = replaceNewlines
        ? StringUtils.join(' ', ann.getCoveredText().split(LINEBREAK))
        : ann.getCoveredText();
    try {
      write(ann.getNamespace());
      write(fieldSeparator);
      write(ann.getIdentifier());
      write(fieldSeparator);
      write(ann.getOffset().toString());
      write(fieldSeparator);
      write(text);
      write(LINEBREAK);
    } catch (final IOException e) {
      throw new AnalysisEngineProcessException(e);
    }
  }

  try {
    unsetStream();
  } catch (final IOException e) {
    throw new AnalysisEngineProcessException(e);
  }
}
/*
 * (non-Javadoc)
 * @see org.apache.uima.analysis_component.CasAnnotator_ImplBase#process(org.apache.uima.cas.CAS)
 */
public void process(CAS aCas) throws AnalysisEngineProcessException {
  this.logger.logrb(
      Level.INFO,
      "WhitespaceTokenizer",
      "process",
      MESSAGE_BUNDLE,
      "whitespace_tokenizer_info_start_processing");

  ArrayList<CAS> casList = new ArrayList<CAS>();
  // check if sofa names are available
  if (this.sofaNames != null && this.sofaNames.length > 0) {
    // get sofa names
    for (int i = 0; i < this.sofaNames.length; i++) {
      Iterator it = aCas.getViewIterator(this.sofaNames[i]);
      while (it.hasNext()) {
        // add the sofas to the CAS list to process
        casList.add((CAS) it.next());
      }
    }
  } else {
    // use the default sofa for the processing
    casList.add(aCas);
  }

  for (int x = 0; x < casList.size(); x++) {
    this.cas = casList.get(x);

    // get the text content from the CAS
    char[] textContent = this.cas.getDocumentText().toCharArray();
    int tokenStart = UNDEFINED;
    int currentCharPos = 0;
    int sentenceStart = 0;
    int nextCharType = UNDEFINED;
    char nextChar = INVALID_CHAR;

    while (currentCharPos < textContent.length) {
      char currentChar = textContent[currentCharPos];
      int currentCharType = getCharacterType(currentChar);

      // get the character class for the current and the next character
      if ((currentCharPos + 1) < textContent.length) {
        nextChar = textContent[currentCharPos + 1];
        nextCharType = getCharacterType(nextChar);
      } else {
        nextCharType = UNDEFINED;
        nextChar = INVALID_CHAR;
      }

      // check if the current character is a letter or number
      if (currentCharType == CH_LETTER || currentCharType == CH_NUMBER) {
        // check if it is the first letter of a token
        if (tokenStart == UNDEFINED) {
          // start a new token here
          tokenStart = currentCharPos;
        }
      }
      // check if the current character is a whitespace character
      else if (currentCharType == CH_WHITESPACE) {
        // terminate the current token
        if (tokenStart != UNDEFINED) {
          // end of the current word
          createAnnotation(this.tokenType, tokenStart, currentCharPos);
          tokenStart = UNDEFINED;
        }
      }
      // check if the current character is a special character
      else if (currentCharType == CH_SPECIAL) {
        // terminate the current token
        if (tokenStart != UNDEFINED) {
          // end of the current word
          createAnnotation(this.tokenType, tokenStart, currentCharPos);
          tokenStart = UNDEFINED;
        }
        // create a token for the special character
        createAnnotation(this.tokenType, currentCharPos, currentCharPos + 1);
      }
      // check if the current character is a newline character
      else if (currentCharType == CH_NEWLINE) {
        // terminate the current token
        if (tokenStart != UNDEFINED) {
          // end of the current word
          createAnnotation(this.tokenType, tokenStart, currentCharPos);
          tokenStart = UNDEFINED;
        }
      }
      // check if the current character is a punctuation character
      else if (currentCharType == CH_PUNCTUATION) {
        // terminate the current token
        if (tokenStart != UNDEFINED) {
          createAnnotation(this.tokenType, tokenStart, currentCharPos);
          tokenStart = UNDEFINED;
        }
        // check the next character type to see if we have a sentence end
        if (((nextCharType == CH_WHITESPACE) || (nextCharType == CH_NEWLINE))
            && (punctuations.contains(new String(new char[] {currentChar})))) {
          // terminate the sentence
          createAnnotation(this.sentenceType, sentenceStart, currentCharPos + 1);
          sentenceStart = currentCharPos + 1;
        }
        // create a token for the punctuation character
        createAnnotation(this.tokenType, currentCharPos, currentCharPos + 1);
      }

      // go to the next character
      currentCharPos++;
    } // end of character loop

    // we are at the end of the text; terminate the open token annotation
    if (tokenStart != UNDEFINED) {
      createAnnotation(this.tokenType, tokenStart, currentCharPos);
      tokenStart = UNDEFINED;
    }
    // we are at the end of the text; terminate the open sentence annotation
    if (sentenceStart != UNDEFINED) {
      createAnnotation(this.sentenceType, sentenceStart, currentCharPos);
      sentenceStart = UNDEFINED;
    }
  }

  this.logger.logrb(
      Level.INFO,
      "WhitespaceTokenizer",
      "process",
      MESSAGE_BUNDLE,
      "whitespace_tokenizer_info_stop_processing");
}
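// The createAnnotation(...) helper called throughout the tokenizer above is
// not shown in this excerpt; a minimal sketch of what such a helper presumably
// does (an assumption, not the verified original implementation):
private void createAnnotation(Type annotationType, int begin, int end) {
  AnnotationFS annot = this.cas.createAnnotation(annotationType, begin, end);
  this.cas.addFsToIndexes(annot);
}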