private String tokenizerToString(Tokenizer tokenizer) throws Exception {
    OffsetAttribute extOffset = tokenizer.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncrAtt =
        tokenizer.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLengthAtt = tokenizer.addAttribute(PositionLengthAttribute.class);
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
    SemanticClassAttribute semanticClass = tokenizer.addAttribute(SemanticClassAttribute.class);
    PartOfSpeechAttribute pos = tokenizer.addAttribute(PartOfSpeechAttribute.class);

    StringBuilder result = new StringBuilder();
    tokenizer.reset();
    while (tokenizer.incrementToken() == true) {
      result.append(new String(term.buffer(), 0, term.length())).append(":");
      result.append(type.type()).append(":");
      result.append(pos.partOfSpeech()).append(":");
      result.append(semanticClass.semanticClass()).append(":");
      result.append(String.valueOf(posIncrAtt.getPositionIncrement())).append(":");
      result.append(String.valueOf(posLengthAtt.getPositionLength())).append(":");
      result.append(String.valueOf(extOffset.startOffset())).append(":");
      result.append(String.valueOf(extOffset.endOffset()));
      result.append(",");
    }
    tokenizer.end();
    return result.toString();
  }
Exemplo n.º 2
0
 @Override
 public final boolean incrementToken() throws IOException {
   while (true) {
     if (curTermBuffer == null) {
       if (!input.incrementToken()) {
         return false;
       } else {
         curTermBuffer = termAtt.buffer().clone();
         curTermLength = termAtt.length();
         curCodePointCount = charUtils.codePointCount(termAtt);
         curGramSize = minGram;
         tokStart = offsetAtt.startOffset();
         tokEnd = offsetAtt.endOffset();
         savePosIncr += posIncrAtt.getPositionIncrement();
         savePosLen = posLenAtt.getPositionLength();
       }
     }
     if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
       if (curGramSize
           <= curCodePointCount) { // if the remaining input is too short, we can't generate any
         // n-grams
         // grab gramSize chars from front or back
         clearAttributes();
         offsetAtt.setOffset(tokStart, tokEnd);
         // first ngram gets increment, others don't
         if (curGramSize == minGram) {
           posIncrAtt.setPositionIncrement(savePosIncr);
           savePosIncr = 0;
         } else {
           posIncrAtt.setPositionIncrement(0);
         }
         posLenAtt.setPositionLength(savePosLen);
         final int charLength =
             charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
         termAtt.copyBuffer(curTermBuffer, 0, charLength);
         curGramSize++;
         return true;
       }
     }
     curTermBuffer = null;
   }
 }