/* (non-Javadoc) * @see org.apache.lucene.analysis.TokenStream#incrementToken() */ @Override public boolean incrementToken() throws IOException { // 清除所有的词元属性 clearAttributes(); skippedPositions = 0; Lexeme nextLexeme = _IKImplement.next(); if (nextLexeme != null) { posIncrAtt.setPositionIncrement(skippedPositions + 1); // 将Lexeme转成Attributes // 设置词元文本 termAtt.append(nextLexeme.getLexemeText()); // 设置词元长度 termAtt.setLength(nextLexeme.getLength()); // 设置词元位移 offsetAtt.setOffset( correctOffset(nextLexeme.getBeginPosition()), correctOffset(nextLexeme.getEndPosition())); // 记录分词的最后位置 endPosition = nextLexeme.getEndPosition(); // 记录词元分类 typeAtt.setType(nextLexeme.getLexemeTypeString()); // 返会true告知还有下个词元 return true; } // 返会false告知词元输出完毕 return false; }
/**
 * Folds decimal digits outside basic latin (per {@link Character#isDigit}) to their
 * ASCII '0'-'9' equivalents, editing the term buffer in place. Supplementary-plane
 * digits collapse from two code units to one, shrinking the term.
 */
@Override
public boolean incrementToken() throws IOException {
  if (input.incrementToken()) {
    char buffer[] = termAtt.buffer();
    int length = termAtt.length();
    for (int i = 0; i < length; i++) {
      int ch = Character.codePointAt(buffer, i, length);
      // look for digits outside of basic latin
      if (ch > 0x7F && Character.isDigit(ch)) {
        // replace with equivalent basic latin digit
        buffer[i] = (char) ('0' + Character.getNumericValue(ch));
        // if the original was supplementary, shrink the string
        // (the low surrogate at i+1 is removed; length is updated so the
        // loop bound stays correct)
        if (ch > 0xFFFF) {
          length = StemmerUtil.delete(buffer, i + 1, length);
          termAtt.setLength(length);
        }
      }
    }
    return true;
  } else {
    return false;
  }
}
/*
 * (non-Javadoc)
 *
 * Emits the next token from the JFlex scanner, skipping terms longer than
 * maxTokenLength; each skipped term contributes to the position increment of the
 * next emitted token.
 *
 * @see org.apache.lucene.analysis.TokenStream#next()
 */
@Override
public final boolean incrementToken() throws IOException {
  clearAttributes();
  skippedPositions = 0;
  while (true) {
    int tokenType = scanner.getNextToken();
    if (tokenType == StandardTokenizerInterface.YYEOF) {
      return false;
    }
    if (scanner.yylength() <= maxTokenLength) {
      // Position increment accounts for any over-length terms skipped above.
      posIncrAtt.setPositionIncrement(skippedPositions + 1);
      scanner.getText(termAtt);
      final int start = scanner.yychar();
      offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));
      // This 'if' should be removed in the next release. For now, it converts
      // invalid acronyms to HOST. When removed, only the 'else' part should
      // remain.
      if (tokenType == StandardTokenizer.ACRONYM_DEP) {
        typeAtt.setType(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST]);
        termAtt.setLength(termAtt.length() - 1); // remove extra '.'
      } else {
        typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
      }
      return true;
    } else
      // When we skip a too-long term, we still increment the
      // position increment
      skippedPositions++;
  }
}
/** Emits the single-character token "a" exactly {@code n} times, then exhausts. */
@Override
public boolean incrementToken() {
  if (index >= n) {
    return false;
  }
  clearAttributes();
  // clearAttributes() emptied the term; write the one-char token.
  termAtt.buffer()[0] = 'a';
  termAtt.setLength(1);
  index++;
  return true;
}
/**
 * Truncates to zero length any token whose first character is 't'; the
 * (now empty) token is still emitted.
 */
@Override
public boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  final boolean startsWithT = termAtt.length() > 0 && termAtt.buffer()[0] == 't';
  if (startsWithT) {
    termAtt.setLength(0);
  }
  return true;
}
/**
 * Stems the current term in place, unless it is flagged as a keyword
 * (keywords pass through unmodified).
 */
@Override
public boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  if (!keywordAttr.isKeyword()) {
    // Stemmer works in place on the term buffer and returns the new length.
    termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
  }
  return true;
}
/** Truncates each term to at most {@code size} characters. */
@Override
public final boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  if (termAttribute.length() > size) {
    termAttribute.setLength(size);
  }
  return true;
}
@Override public boolean incrementToken() throws IOException { if (input.incrementToken()) { if (!keywordAttr.isKeyword()) { // this stemmer increases word length by 1: worst case '*çom' -> '*ción' final int len = termAtt.length(); final int newlen = stemmer.stem(termAtt.resizeBuffer(len + 1), len); termAtt.setLength(newlen); } return true; } else { return false; } }
@Override public boolean incrementToken() throws IOException { if (!input.incrementToken()) { return false; } final char[] buffer = termAtt.buffer(); final int bufferLength = termAtt.length(); if (bufferLength >= 2 && buffer[bufferLength - 2] == '\'' && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S')) termAtt.setLength(bufferLength - 2); // Strip last 2 characters off return true; }
/** Writes the joined unhyphenated term */
private void unhyphenate() {
  // Capture the end offset of the CURRENT token before restoring the state that
  // was saved at the earlier, hyphen-terminated token — restoreState overwrites
  // the offset attribute.
  int endOffset = offsetAttribute.endOffset();
  restoreState(savedState);
  savedState = null;
  char term[] = termAttribute.buffer();
  int length = hyphenated.length();
  // Grow the term buffer if the joined word doesn't fit the restored buffer.
  if (length > termAttribute.length()) {
    term = termAttribute.resizeBuffer(length);
  }
  // Copy the accumulated unhyphenated characters into the term attribute.
  hyphenated.getChars(0, length, term, 0);
  termAttribute.setLength(length);
  // Offsets span from the restored (first) token's start to the current token's end.
  offsetAttribute.setOffset(offsetAttribute.startOffset(), endOffset);
  // Reset the accumulator for the next hyphenated sequence.
  hyphenated.setLength(0);
}
@Override public boolean incrementToken() throws IOException { if (input.incrementToken()) { final char[] buffer = termAtt.buffer(); final int length = termAtt.length(); for (int i = 0; i < length; i++) { if (buffer[i] == delimiter) { payAtt.setPayload(encoder.encode(buffer, i + 1, (length - (i + 1)))); termAtt.setLength(i); // simply set a new length return true; } } // we have not seen the delimiter payAtt.setPayload(null); return true; } else return false; }
/**
 * Replaces each term with its binary-encoded collation key, so that terms sort
 * in collator order when compared as indexed strings.
 */
@Override
public boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  // Compute the raw collation key for the current term text.
  final String text = new String(termAtt.buffer(), 0, termAtt.length());
  collator.getRawCollationKey(text, reusableKey);
  // Determine the encoded size and grow the term buffer if it won't fit.
  final int encodedLength =
      IndexableBinaryStringTools.getEncodedLength(reusableKey.bytes, 0, reusableKey.size);
  if (encodedLength > termAtt.buffer().length) {
    termAtt.resizeBuffer(encodedLength);
  }
  termAtt.setLength(encodedLength);
  // Encode the key bytes directly into the term buffer.
  IndexableBinaryStringTools.encode(
      reusableKey.bytes, 0, reusableKey.size, termAtt.buffer(), 0, encodedLength);
  return true;
}
/**
 * Emits the whole input as a single token: reads the reader to exhaustion,
 * growing the term buffer as needed. Every subsequent call returns false.
 */
@Override
public final boolean incrementToken() throws IOException {
  if (done) {
    return false;
  }
  clearAttributes();
  done = true;
  char[] buf = termAtt.buffer();
  int filled = 0;
  while (true) {
    final int read = input.read(buf, filled, buf.length - filled);
    if (read == -1) {
      break;
    }
    filled += read;
    // Buffer full: grow it so the next read has room.
    if (filled == buf.length) {
      buf = termAtt.resizeBuffer(1 + buf.length);
    }
  }
  termAtt.setLength(filled);
  finalOffset = correctOffset(filled);
  offsetAtt.setOffset(correctOffset(0), finalOffset);
  return true;
}
/**
 * Reads the entire input as one token, then rewrites it as a pinyin
 * transliteration: ASCII chars (< 128) are copied as-is; other chars are
 * converted via PinyinHelper, taking the first pinyin candidate. The
 * {@code first_letter} mode controls whether the initials string is prefixed,
 * appended, emitted alone, or omitted; {@code padding_char} separates syllables.
 * Emits at most one token per stream.
 */
@Override
public final boolean incrementToken() throws IOException {
  clearAttributes();
  if (!done) {
    done = true;
    // Phase 1: slurp the whole reader into the term buffer, growing as needed.
    int upto = 0;
    char[] buffer = termAtt.buffer();
    while (true) {
      final int length = input.read(buffer, upto, buffer.length - upto);
      if (length == -1) break;
      upto += length;
      if (upto == buffer.length) buffer = termAtt.resizeBuffer(1 + buffer.length);
    }
    termAtt.setLength(upto);
    // Phase 2: convert the captured text to pinyin, rebuilding the term.
    String str = termAtt.toString();
    termAtt.setEmpty();
    StringBuilder stringBuilder = new StringBuilder();
    StringBuilder firstLetters = new StringBuilder();
    for (int i = 0; i < str.length(); i++) {
      char c = str.charAt(i);
      if (c < 128) {
        // ASCII passes through untouched.
        stringBuilder.append(c);
      } else {
        try {
          String[] strs = PinyinHelper.toHanyuPinyinStringArray(c, format);
          if (strs != null) {
            // get first result by default
            String first_value = strs[0];
            // TODO more than one pinyin
            stringBuilder.append(first_value);
            if (this.padding_char.length() > 0) {
              stringBuilder.append(this.padding_char);
            }
            // Collect the initial of each syllable separately.
            firstLetters.append(first_value.charAt(0));
          }
        } catch (BadHanyuPinyinOutputFormatCombination badHanyuPinyinOutputFormatCombination) {
          // NOTE(review): best-effort — the offending char is silently dropped
          // and the error only printed; consider proper logging.
          badHanyuPinyinOutputFormatCombination.printStackTrace();
        }
      }
    }
    // let's join them — layout of initials vs full pinyin per first_letter mode.
    if (first_letter.equals("prefix")) {
      termAtt.append(firstLetters.toString());
      if (this.padding_char.length() > 0) {
        termAtt.append(this.padding_char); // TODO splitter
      }
      termAtt.append(stringBuilder.toString());
    } else if (first_letter.equals("append")) {
      termAtt.append(stringBuilder.toString());
      if (this.padding_char.length() > 0) {
        // Avoid a doubled padding char before the initials.
        if (!stringBuilder.toString().endsWith(this.padding_char)) {
          termAtt.append(this.padding_char);
        }
      }
      termAtt.append(firstLetters.toString());
    } else if (first_letter.equals("none")) {
      termAtt.append(stringBuilder.toString());
    } else if (first_letter.equals("only")) {
      termAtt.append(firstLetters.toString());
    }
    // Offsets reflect the raw characters read, not the transliterated length.
    finalOffset = correctOffset(upto);
    offsetAtt.setOffset(correctOffset(0), finalOffset);
    return true;
  }
  return false;
}
/**
 * Produces path-hierarchy tokens: each call extends the previously emitted path
 * (kept in {@code resultToken}) up to the next delimiter, replacing delimiters
 * with {@code replacement} and skipping the first {@code skip} path components.
 * The first token of a path gets position increment 1; subsequent extensions
 * get 0, since they start at the same position.
 */
@Override
public final boolean incrementToken() throws IOException {
  clearAttributes();
  // Start from the path prefix emitted so far.
  termAtt.append(resultToken);
  if (resultToken.length() == 0) {
    posAtt.setPositionIncrement(1);
  } else {
    posAtt.setPositionIncrement(0);
  }
  int length = 0;
  boolean added = false;
  // A delimiter seen at the end of the previous call is emitted now.
  if (endDelimiter) {
    termAtt.append(replacement);
    length++;
    endDelimiter = false;
    added = true;
  }
  while (true) {
    int c = input.read();
    if (c >= 0) {
      charsRead++;
    } else {
      // End of input: emit whatever was accumulated, if we are past the
      // skipped components and actually added something.
      if (skipped > skip) {
        length += resultToken.length();
        termAtt.setLength(length);
        offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition + length));
        if (added) {
          // Save the emitted term so the next call can extend it.
          resultToken.setLength(0);
          resultToken.append(termAtt.buffer(), 0, length);
        }
        return added;
      } else {
        return false;
      }
    }
    if (!added) {
      // First character consumed in this call starts a new component.
      added = true;
      skipped++;
      if (skipped > skip) {
        termAtt.append(c == delimiter ? replacement : (char) c);
        length++;
      } else {
        // Still inside a skipped component: only advance the start offset.
        startPosition++;
      }
    } else {
      if (c == delimiter) {
        if (skipped > skip) {
          // Delimiter ends this token; remember it for the next call.
          endDelimiter = true;
          break;
        }
        skipped++;
        if (skipped > skip) {
          termAtt.append(replacement);
          length++;
        } else {
          startPosition++;
        }
      } else {
        if (skipped > skip) {
          termAtt.append((char) c);
          length++;
        } else {
          startPosition++;
        }
      }
    }
  }
  // Normal (delimiter-terminated) emission path.
  length += resultToken.length();
  termAtt.setLength(length);
  offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition + length));
  // Save the emitted term so the next call can extend it.
  resultToken.setLength(0);
  resultToken.append(termAtt.buffer(), 0, length);
  return true;
}
/**
 * In Lucene 4.2.x, when data was present, jaso (Hangul letter) decomposition
 * returned true; here it returned false, so the loop condition was changed,
 * unavoidably, to key off the ioBuffer size state (CharacterUtils.fill).
 *
 * <p>Tokenizes the jaso-decomposed input character by character: consecutive
 * token chars are accumulated (normalized) into the term buffer, up to
 * MAX_WORD_LEN.
 *
 * @author 최일규
 * @since 2014-07-11
 */
@Override
public final boolean incrementToken() throws IOException {
  clearAttributes();
  int length = 0;
  int start = -1; // this variable is always initialized
  char[] buffer = termAtt.buffer();
  while (true) {
    if (bufferIndex >= dataLen) {
      // Refill the I/O buffer from the jaso-decomposed input.
      offset += dataLen;
      boolean isDecompose =
          charUtils.fill(ioBuffer, jasoDecompose(input, decomposeMode, typoMode));
      // Keep analyzing while the buffer has content (loop until nothing is left).
      if (ioBuffer.getLength() == 0) {
        dataLen = 0; // so next offset += dataLen won't decrement offset
        if (length > 0) {
          // Emit the token accumulated so far before ending the stream.
          break;
        } else {
          finalOffset = correctOffset(offset);
          return false;
        }
      }
      dataLen = ioBuffer.getLength();
      bufferIndex = 0;
    }
    // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based
    // methods are gone
    final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex, dataLen);
    bufferIndex += Character.charCount(c);
    // if it's a token char
    if (isTokenChar(c)) {
      // start of token
      if (length == 0) {
        assert start == -1;
        start = offset + bufferIndex - 1;
        // check if a supplementary could run out of bounds
      } else if (length >= buffer.length - 1) {
        // make sure a supplementary fits in the buffer
        buffer = termAtt.resizeBuffer(2 + length);
      }
      // buffer it, normalized
      length += Character.toChars(normalize(c), buffer, length);
      if (length >= MAX_WORD_LEN) {
        break;
      }
    } else if (length > 0) {
      // return 'em
      break;
    }
  }
  termAtt.setLength(length);
  assert start != -1;
  offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(start + length));
  return true;
}