/* (non-Javadoc) * @see org.apache.uima.analysis_component.CasAnnotator_ImplBase#process(org.apache.uima.cas.CAS) */ public void process(CAS aCas) throws AnalysisEngineProcessException { this.logger.logrb( Level.INFO, "WhitespaceTokenizer", "process", MESSAGE_BUNDLE, "whitespace_tokenizer_info_start_processing"); ArrayList<CAS> casList = new ArrayList<CAS>(); // check if sofa names are available if (this.sofaNames != null && this.sofaNames.length > 0) { // get sofa names for (int i = 0; i < this.sofaNames.length; i++) { Iterator it = aCas.getViewIterator(this.sofaNames[i]); while (it.hasNext()) { // add sofas to the cas List to process casList.add((CAS) it.next()); } } } else { // use default sofa for the processing casList.add(aCas); } for (int x = 0; x < casList.size(); x++) { this.cas = casList.get(x); // get text content from the CAS char[] textContent = this.cas.getDocumentText().toCharArray(); int tokenStart = UNDEFINED; int currentCharPos = 0; int sentenceStart = 0; int nextCharType = UNDEFINED; char nextChar = INVALID_CHAR; while (currentCharPos < textContent.length) { char currentChar = textContent[currentCharPos]; int currentCharType = getCharacterType(currentChar); // get character class for current and next character if ((currentCharPos + 1) < textContent.length) { nextChar = textContent[currentCharPos + 1]; nextCharType = getCharacterType(nextChar); } else { nextCharType = UNDEFINED; nextChar = INVALID_CHAR; } // check if current character is a letter or number if (currentCharType == CH_LETTER || currentCharType == CH_NUMBER) { // check if it is the first letter of a token if (tokenStart == UNDEFINED) { // start new token here tokenStart = currentCharPos; } } // check if current character is a whitespace character else if (currentCharType == CH_WHITESPACE) { // terminate current token if (tokenStart != UNDEFINED) { // end of current word createAnnotation(this.tokenType, tokenStart, currentCharPos); tokenStart = UNDEFINED; } } // check if current character is a special character else if (currentCharType == CH_SPECIAL) { // terminate current token if (tokenStart != UNDEFINED) { // end of current word createAnnotation(this.tokenType, tokenStart, currentCharPos); tokenStart = UNDEFINED; } // create token for special character createAnnotation(this.tokenType, currentCharPos, currentCharPos + 1); } // check if current character is new line character else if (currentCharType == CH_NEWLINE) { // terminate current token if (tokenStart != UNDEFINED) { // end of current word createAnnotation(this.tokenType, tokenStart, currentCharPos); tokenStart = UNDEFINED; } } // check if current character is new punctuation character else if (currentCharType == CH_PUNCTUATION) { // terminates the current token if (tokenStart != UNDEFINED) { createAnnotation(this.tokenType, tokenStart, currentCharPos); tokenStart = UNDEFINED; } // check next token type so see if we have a sentence end if (((nextCharType == CH_WHITESPACE) || (nextCharType == CH_NEWLINE)) && (punctuations.contains(new String(new char[] {currentChar})))) { // terminate sentence createAnnotation(this.sentenceType, sentenceStart, currentCharPos + 1); sentenceStart = currentCharPos + 1; } // create token for punctuation character createAnnotation(this.tokenType, currentCharPos, currentCharPos + 1); } // go to the next token currentCharPos++; } // end of character loop // we are at the end of the text terminate open token annotations if (tokenStart != UNDEFINED) { // end of current word createAnnotation(this.tokenType, tokenStart, currentCharPos); tokenStart = UNDEFINED; } // we are at the end of the text terminate open sentence annotations if (sentenceStart != UNDEFINED) { // end of current word createAnnotation(this.sentenceType, sentenceStart, currentCharPos); sentenceStart = UNDEFINED; } } this.logger.logrb( Level.INFO, "WhitespaceTokenizer", "process", MESSAGE_BUNDLE, "whitespace_tokenizer_info_stop_processing"); }