/*
 * (non-Javadoc)
 * @see org.apache.lucene.analysis.TokenStream#incrementToken()
 */
@Override
public boolean incrementToken() throws IOException {
    // Clear all token attributes left over from the previous token
    clearAttributes();
    skippedPositions = 0;
    Lexeme nextLexeme = _IKImplement.next();
    if (nextLexeme != null) {
        posIncrAtt.setPositionIncrement(skippedPositions + 1);
        // Copy the Lexeme into the token attributes:
        // set the token text
        termAtt.append(nextLexeme.getLexemeText());
        // set the token length
        termAtt.setLength(nextLexeme.getLength());
        // set the token offsets
        offsetAtt.setOffset(
                correctOffset(nextLexeme.getBeginPosition()),
                correctOffset(nextLexeme.getEndPosition()));
        // record the end position of the last token produced
        endPosition = nextLexeme.getEndPosition();
        // record the token type
        typeAtt.setType(nextLexeme.getLexemeTypeString());
        // return true to signal that another token is available
        return true;
    }
    // return false to signal that token output is finished
    return false;
}
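For context, here is a minimal consumption sketch showing how Lucene drives incrementToken(). It assumes this tokenizer is wrapped by the library's IKAnalyzer (the useSmart constructor flag is an assumption); the field name "content" and the sample text are placeholders.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class TokenStreamDemo {
    public static void main(String[] args) throws IOException {
        // Assumed: IKAnalyzer(boolean useSmart) wraps the tokenizer shown above
        Analyzer analyzer = new IKAnalyzer(true);
        try (TokenStream ts = analyzer.tokenStream("content", "这是一个中文分词的例子")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            ts.reset();                   // required before the first incrementToken()
            while (ts.incrementToken()) { // each call runs the method shown above
                System.out.println(term + " [" + offset.startOffset()
                        + "," + offset.endOffset() + "]");
            }
            ts.end();                     // finalizes offset state for the stream
        }
    }
}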
public static LinkedList<String> segment(String content, int skiplength) {
    StringReader text = new StringReader(content);
    // true enables IK's smart (coarse-grained) segmentation mode
    IKSegmenter segment = new IKSegmenter(text, true);
    LinkedList<String> result = new LinkedList<String>();
    try {
        for (Lexeme lexeme = segment.next(); lexeme != null; lexeme = segment.next()) {
            String word = lexeme.getLexemeText();
            // keep only Chinese terms
            if (!CharacterEncoding.isChinese(word)) {
                continue;
            }
            // drop terms of length skiplength or shorter
            if (skiplength >= word.length()) {
                continue;
            }
            result.add(word);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return result;
}
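A quick usage sketch; the exact terms returned depend on the dictionaries IK has loaded, so the output here is illustrative only:

// Hypothetical call: with skiplength = 1, single-character terms are dropped,
// and non-Chinese tokens are filtered by CharacterEncoding.isChinese().
LinkedList<String> words = segment("中华人民共和国成立了", 1);
System.out.println(words); // only Chinese terms of length >= 2 remain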
public static List<String> segQuery(String query) {
    List<String> keys = new ArrayList<String>();
    try {
        Reader r = new StringReader(query);
        // true enables IK's smart (coarse-grained) segmentation mode
        IKSegmenter seg = new IKSegmenter(r, true);
        Lexeme t = seg.next();
        while (t != null) {
            keys.add(t.getLexemeText());
            t = seg.next();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return keys;
}
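Note the design difference from segment() above: segQuery() keeps every lexeme, including non-Chinese tokens and single characters, which suits query-side analysis where dropping terms would hurt recall. Like segment(), it swallows exceptions and returns whatever was collected up to the failure point.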
@Override
public String nextToken() {
    try {
        Lexeme lexeme = aSeg.next();
        if (lexeme != null) {
            // Lexeme exposes its text via getLexemeText(), not lexemeText()
            return lexeme.getLexemeText();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    // null signals end of stream (and, less ideally, I/O failure)
    return null;
}
/*
 * (non-Javadoc)
 * @see org.apache.lucene.analysis.Tokenizer#reset()
 */
@Override
public void reset() throws IOException {
    super.reset();
    // re-point the IK engine at the tokenizer's current input reader
    _IKImplement.reset(input);
    skippedPositions = 0;
}
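Per the Lucene TokenStream contract, consumers must call reset() before the first incrementToken(); this override also rebinds the IK engine to the tokenizer's current input, so a single tokenizer instance can be reused across documents.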