/**
 * Drains the given tokenizer and renders every token it produces as a compact string.
 *
 * <p>Each token is written as eight colon-separated fields followed by a comma:
 * {@code term:type:partOfSpeech:semanticClass:positionIncrement:positionLength:startOffset:endOffset,}
 *
 * <p>The method calls {@code reset()} before consuming and {@code end()} afterwards; it does not
 * call {@code close()} on the tokenizer.
 *
 * @param tokenizer the tokenizer to consume
 * @return the concatenated token descriptions, or an empty string when no token was produced
 * @throws Exception if the tokenizer fails while being reset, advanced or ended
 */
private String tokenizerToString(Tokenizer tokenizer) throws Exception {
  // Registration order is kept identical to the original helper.
  OffsetAttribute offsets = tokenizer.addAttribute(OffsetAttribute.class);
  PositionIncrementAttribute positionIncrement = tokenizer.addAttribute(PositionIncrementAttribute.class);
  PositionLengthAttribute positionLength = tokenizer.addAttribute(PositionLengthAttribute.class);
  CharTermAttribute termText = tokenizer.addAttribute(CharTermAttribute.class);
  TypeAttribute tokenType = tokenizer.addAttribute(TypeAttribute.class);
  SemanticClassAttribute semantics = tokenizer.addAttribute(SemanticClassAttribute.class);
  PartOfSpeechAttribute partOfSpeech = tokenizer.addAttribute(PartOfSpeechAttribute.class);

  StringBuilder out = new StringBuilder();
  tokenizer.reset();
  while (tokenizer.incrementToken()) {
    // Copy only the valid prefix of the term buffer; the backing array may be longer than the term.
    out.append(termText.buffer(), 0, termText.length());
    out.append(":");
    out.append(tokenType.type());
    out.append(":");
    out.append(partOfSpeech.partOfSpeech());
    out.append(":");
    out.append(semantics.semanticClass());
    out.append(":");
    out.append(positionIncrement.getPositionIncrement());
    out.append(":");
    out.append(positionLength.getPositionLength());
    out.append(":");
    out.append(offsets.startOffset());
    out.append(":");
    out.append(offsets.endOffset());
    out.append(",");
  }
  tokenizer.end();
  return out.toString();
}
@Override public final boolean incrementToken() throws IOException { while (true) { if (curTermBuffer == null) { if (!input.incrementToken()) { return false; } else { curTermBuffer = termAtt.buffer().clone(); curTermLength = termAtt.length(); curCodePointCount = charUtils.codePointCount(termAtt); curGramSize = minGram; tokStart = offsetAtt.startOffset(); tokEnd = offsetAtt.endOffset(); savePosIncr += posIncrAtt.getPositionIncrement(); savePosLen = posLenAtt.getPositionLength(); } } if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any // n-grams // grab gramSize chars from front or back clearAttributes(); offsetAtt.setOffset(tokStart, tokEnd); // first ngram gets increment, others don't if (curGramSize == minGram) { posIncrAtt.setPositionIncrement(savePosIncr); savePosIncr = 0; } else { posIncrAtt.setPositionIncrement(0); } posLenAtt.setPositionLength(savePosLen); final int charLength = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize); termAtt.copyBuffer(curTermBuffer, 0, charLength); curGramSize++; return true; } } curTermBuffer = null; } }