コード例 #1
0
ファイル: WhitespaceTokenizer.java プロジェクト: tteofili/cpt
  /* (non-Javadoc)
   * @see org.apache.uima.analysis_component.CasAnnotator_ImplBase#process(org.apache.uima.cas.CAS)
   */
  /**
   * Adds token and sentence annotations to the text of the selected CAS views.
   *
   * <p>When {@code sofaNames} is non-null and non-empty, every view returned by
   * {@code aCas.getViewIterator(name)} for each configured name is processed;
   * otherwise {@code aCas} itself is processed. Views whose document text is
   * {@code null} are skipped instead of failing with a
   * {@link NullPointerException}.
   *
   * <p>Side effect: {@code this.cas} is set to each processed view in turn and
   * is left pointing at the last view visited. A start and a stop message are
   * logged at INFO level around the processing.
   *
   * @param aCas the CAS handed to this annotator
   * @throws AnalysisEngineProcessException declared by the overridden method
   */
  public void process(CAS aCas) throws AnalysisEngineProcessException {

    this.logger.logrb(
        Level.INFO,
        "WhitespaceTokenizer",
        "process",
        MESSAGE_BUNDLE,
        "whitespace_tokenizer_info_start_processing");

    for (CAS view : selectViews(aCas)) {
      // createAnnotation(...) takes no CAS argument, so the view currently
      // being processed is published through the field before scanning.
      this.cas = view;

      String documentText = view.getDocumentText();
      if (documentText == null) {
        // nothing to tokenize in a view without document text
        continue;
      }
      annotateText(documentText.toCharArray());
    }

    this.logger.logrb(
        Level.INFO,
        "WhitespaceTokenizer",
        "process",
        MESSAGE_BUNDLE,
        "whitespace_tokenizer_info_stop_processing");
  }

  /**
   * Collects the views to process: all views matching the configured sofa
   * names, or just {@code aCas} when no sofa names are configured.
   */
  private ArrayList<CAS> selectViews(CAS aCas) {
    ArrayList<CAS> views = new ArrayList<CAS>();

    if (this.sofaNames == null || this.sofaNames.length == 0) {
      // use default sofa for the processing
      views.add(aCas);
      return views;
    }

    for (String sofaName : this.sofaNames) {
      Iterator<CAS> matchingViews = aCas.getViewIterator(sofaName);
      while (matchingViews.hasNext()) {
        views.add(matchingViews.next());
      }
    }
    return views;
  }

  /**
   * Scans {@code textContent} once, character by character, and creates token
   * and sentence annotations on the current view.
   *
   * <ul>
   *   <li>A letter or number starts a token if none is open.</li>
   *   <li>Whitespace and newline characters end the open token.</li>
   *   <li>Special and punctuation characters end the open token and become a
   *       one-character token themselves.</li>
   *   <li>A punctuation character that is contained in {@code punctuations}
   *       and is followed by whitespace or a newline also ends the current
   *       sentence.</li>
   *   <li>Characters of any other type neither start nor end a token.</li>
   * </ul>
   */
  private void annotateText(char[] textContent) throws AnalysisEngineProcessException {
    int tokenStart = UNDEFINED;
    int sentenceStart = 0;

    for (int pos = 0; pos < textContent.length; pos++) {
      char currentChar = textContent[pos];
      int charType = getCharacterType(currentChar);

      if (charType == CH_LETTER || charType == CH_NUMBER) {
        // first letter/number after a gap opens a new token
        if (tokenStart == UNDEFINED) {
          tokenStart = pos;
        }
      } else if (charType == CH_WHITESPACE || charType == CH_NEWLINE) {
        tokenStart = closeToken(tokenStart, pos);
      } else if (charType == CH_SPECIAL) {
        tokenStart = closeToken(tokenStart, pos);
        // the special character is a token of its own
        createAnnotation(this.tokenType, pos, pos + 1);
      } else if (charType == CH_PUNCTUATION) {
        tokenStart = closeToken(tokenStart, pos);

        // The look-ahead is only needed here; at the end of the text there is
        // no next character, so its type is UNDEFINED.
        int nextCharType =
            (pos + 1 < textContent.length) ? getCharacterType(textContent[pos + 1]) : UNDEFINED;

        if ((nextCharType == CH_WHITESPACE || nextCharType == CH_NEWLINE)
            && punctuations.contains(String.valueOf(currentChar))) {
          // the sentence includes the terminating punctuation character
          createAnnotation(this.sentenceType, sentenceStart, pos + 1);
          sentenceStart = pos + 1;
        }

        // the punctuation character is a token of its own
        createAnnotation(this.tokenType, pos, pos + 1);
      }
    }

    // end of text: terminate a token that is still open
    closeToken(tokenStart, textContent.length);

    // end of text: terminate the open sentence.
    // NOTE(review): this also emits a sentence when only whitespace (or
    // nothing at all) follows the last sentence end; kept unchanged so the
    // produced annotations stay the same as before.
    if (sentenceStart != UNDEFINED) {
      createAnnotation(this.sentenceType, sentenceStart, textContent.length);
    }
  }

  /**
   * Creates a token annotation for {@code [tokenStart, tokenEnd)} if a token
   * is open, and returns {@code UNDEFINED} as the new "no open token" state.
   */
  private int closeToken(int tokenStart, int tokenEnd) throws AnalysisEngineProcessException {
    if (tokenStart != UNDEFINED) {
      createAnnotation(this.tokenType, tokenStart, tokenEnd);
    }
    return UNDEFINED;
  }