@Override
public final boolean incrementToken() throws IOException {
  clearAttributes();
  if (delimitersCount == -1) {
    int length = 0;
    delimiterPositions.add(0);
    while (true) {
      int c = input.read();
      if (c < 0) {
        break;
      }
      length++;
      if (c == delimiter) {
        delimiterPositions.add(length);
        resultToken.append(replacement);
      } else {
        resultToken.append((char) c);
      }
    }
    delimitersCount = delimiterPositions.size();
    if (delimiterPositions.get(delimitersCount - 1) < length) {
      delimiterPositions.add(length);
      delimitersCount++;
    }
    if (resultTokenBuffer.length < resultToken.length()) {
      resultTokenBuffer = new char[resultToken.length()];
    }
    resultToken.getChars(0, resultToken.length(), resultTokenBuffer, 0);
    resultToken.setLength(0);
    int idx = delimitersCount - 1 - skip;
    if (idx >= 0) { // otherwise it's ok, because we will skip and return false
      endPosition = delimiterPositions.get(idx);
    }
    finalOffset = correctOffset(length);
    posAtt.setPositionIncrement(1);
  } else {
    posAtt.setPositionIncrement(0);
  }
  while (skipped < delimitersCount - skip - 1) {
    int start = delimiterPositions.get(skipped);
    termAtt.copyBuffer(resultTokenBuffer, start, endPosition - start);
    offsetAtt.setOffset(correctOffset(start), correctOffset(endPosition));
    skipped++;
    return true;
  }
  return false;
}
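// Hedged trace (assuming this is the reverse-order path tokenizer, with
// delimiter '/', replacement '/', skip=0) for the input "/a/b/c":
//   delimiterPositions = [0, 1, 3, 5, 6], endPosition = 6, finalOffset = 6
//   emitted tokens: "/a/b/c" [0,6), "a/b/c" [1,6), "b/c" [3,6), "c" [5,6)
// Only the first token gets positionIncrement 1; the rest stack at increment 0.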
@Override
public boolean incrementToken() throws IOException {
  if (tokenIter == null || !tokenIter.hasNext()) {
    if (input.incrementToken()) {
      tokStart = offsetAtt.startOffset();
      tokEnd = offsetAtt.endOffset();
      hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
      tokenBuffer = wordSegmenter.getTokendWords(termAtt.toString());
      tokenIter = tokenBuffer.iterator();
      if (!tokenIter.hasNext()) return false;
    } else {
      return false;
    }
  }
  clearAttributes();
  TokendWords nextWord = tokenIter.next();
  // call next() once and reuse the result; calling it twice would consume two elements
  char[] word = nextWord.next();
  termAtt.copyBuffer(word, 0, word.length);
  if (hasIllegalOffsets) {
    offsetAtt.setOffset(tokStart, tokEnd);
  } else {
    offsetAtt.setOffset(nextWord.start, nextWord.end);
  }
  typeAtt.setType("word");
  return true;
}
@Override
public boolean incrementToken() throws IOException {
  if (tokenIter == null || !tokenIter.hasNext()) {
    // there are no remaining tokens from the current sentence... are there more sentences?
    if (input.incrementToken()) {
      tokStart = offsetAtt.startOffset();
      tokEnd = offsetAtt.endOffset();
      // if the length implied by start + end offsets doesn't match the term text, assume
      // this is a synonym and don't adjust the offsets.
      hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
      // a new sentence is available: process it.
      tokenBuffer = splitIntoTokens(termAtt.toString(), offsetAtt.startOffset());
      tokenIter = tokenBuffer.iterator();
      // it should not be possible to have a sentence with 0 words, check just in case.
      // returning EOS isn't the best either, but it's the behavior of the original code.
      if (!tokenIter.hasNext()) return false;
    } else {
      return false; // no more sentences, end of stream!
    }
  }
  // WordTokenFilter must clear attributes, as it is creating new tokens.
  clearAttributes();
  // There are remaining tokens from the current sentence, return the next one.
  SegToken nextWord = tokenIter.next();
  termAtt.append(nextWord.term);
  // termAtt.copyBuffer(nextWord.charArray, 0, nextWord.charArray.length);
  if (hasIllegalOffsets) {
    offsetAtt.setOffset(tokStart, tokEnd);
  } else {
    offsetAtt.setOffset(nextWord.start, nextWord.end);
  }
  typeAtt.setType("word");
  return true;
}
private void emit(Token token) {
  emit(token.tok);
  OffsetAttribute offAttr = getOffsetAttribute();
  if (token.endPos > token.startPos && token.startPos >= 0) {
    offAttr.setOffset(token.startPos, token.endPos);
  }
}
@Override
public final boolean incrementToken() throws IOException {
  while (true) {
    if (curTermBuffer == null) {
      if (!input.incrementToken()) {
        return false;
      } else {
        curTermBuffer = termAtt.buffer().clone();
        curTermLength = termAtt.length();
        curGramSize = minGram;
        tokStart = offsetAtt.startOffset();
      }
    }
    if (curGramSize <= maxGram) {
      if (!(curGramSize > curTermLength // if the remaining input is too short, we can't generate any n-grams
          || curGramSize > maxGram)) { // if we have hit the end of our n-gram size range, quit
        // grab gramSize chars from front or back
        int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
        int end = start + curGramSize;
        clearAttributes();
        offsetAtt.setOffset(tokStart + start, tokStart + end);
        termAtt.copyBuffer(curTermBuffer, start, curGramSize);
        curGramSize++;
        return true;
      }
    }
    curTermBuffer = null;
  }
}
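// Hedged worked example for the edge n-gram logic above: term "lucene" at
// startOffset 0, minGram=1, maxGram=3 (Side is the enum from the snippet):
//   Side.FRONT emits "l" [0,1), "lu" [0,2), "luc" [0,3)
//   Side.BACK  emits "e" [5,6), "ne" [4,6), "ene" [3,6)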
/* (non-Javadoc)
 * @see org.apache.lucene.analysis.TokenStream#incrementToken()
 */
@Override
public boolean incrementToken() throws IOException {
  // clear all token attributes
  clearAttributes();
  skippedPositions = 0;
  Lexeme nextLexeme = _IKImplement.next();
  if (nextLexeme != null) {
    posIncrAtt.setPositionIncrement(skippedPositions + 1);
    // convert the Lexeme into attributes:
    // set the term text
    termAtt.append(nextLexeme.getLexemeText());
    // set the term length
    termAtt.setLength(nextLexeme.getLength());
    // set the term offsets
    offsetAtt.setOffset(
        correctOffset(nextLexeme.getBeginPosition()), correctOffset(nextLexeme.getEndPosition()));
    // record the end position of this segmentation step
    endPosition = nextLexeme.getEndPosition();
    // record the lexeme type
    typeAtt.setType(nextLexeme.getLexemeTypeString());
    // return true to signal that another token is available
    return true;
  }
  // return false to signal that the tokens are exhausted
  return false;
}
@Override
public final void end() throws IOException {
  super.end();
  // set final offset
  int finalOffset = correctOffset(charsRead);
  offsetAtt.setOffset(finalOffset, finalOffset);
}
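// Hedged usage sketch (not taken from the snippets above): the standard
// reset()/incrementToken()/end()/close() consume loop that makes the final
// offset set in end() observable. Assumes a Lucene version where
// WhitespaceTokenizer has a no-arg constructor plus setReader().
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class ConsumeLoop {
  public static void main(String[] args) throws Exception {
    try (Tokenizer ts = new WhitespaceTokenizer()) {
      ts.setReader(new StringReader("hello offset world"));
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
      ts.reset(); // required before the first incrementToken()
      while (ts.incrementToken()) {
        System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
      }
      ts.end(); // now startOffset() == endOffset() == the corrected end-of-input offset
      System.out.println("final offset: " + offset.endOffset());
    } // close() happens via try-with-resources
  }
}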
@Override
public final boolean incrementToken() throws IOException {
  if (!tokens.isEmpty()) {
    assert current != null;
    CompoundToken token = tokens.removeFirst();
    restoreState(current); // keep all other attributes untouched
    termAtt.setEmpty().append(token.txt);
    offsetAtt.setOffset(token.startOffset, token.endOffset);
    posIncAtt.setPositionIncrement(0);
    return true;
  }

  current = null; // not really needed, but for safety
  if (input.incrementToken()) {
    // Only words longer than minWordSize get processed
    if (termAtt.length() >= this.minWordSize) {
      decompose();
      // only capture the state if we really need it for producing new tokens
      if (!tokens.isEmpty()) {
        current = captureState();
      }
    }
    // return original token:
    return true;
  } else {
    return false;
  }
}
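// Hedged trace of the decompounding contract above: the original token is
// returned unchanged first; decomposed parts follow with positionIncrement 0
// and the per-part offsets recorded by decompose(). Illustrative only; the
// actual parts depend on the concrete decompose() implementation:
//   "Rindfleisch" [0,11) posInc=1   (original token)
//   "Rind"                posInc=0  (CompoundToken from decompose())
//   "fleisch"             posInc=0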
@Override
public void end() throws IOException {
  super.end();
  // set final offset
  int finalOffset = correctOffset(pos);
  offsetAtt.setOffset(finalOffset, finalOffset);
}
/*
 * (non-Javadoc)
 *
 * @see org.apache.lucene.analysis.TokenStream#next()
 */
@Override
public final boolean incrementToken() throws IOException {
  clearAttributes();
  skippedPositions = 0;
  while (true) {
    int tokenType = scanner.getNextToken();
    if (tokenType == StandardTokenizerInterface.YYEOF) {
      return false;
    }
    if (scanner.yylength() <= maxTokenLength) {
      posIncrAtt.setPositionIncrement(skippedPositions + 1);
      scanner.getText(termAtt);
      final int start = scanner.yychar();
      offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));
      // This 'if' should be removed in the next release. For now, it converts
      // invalid acronyms to HOST. When removed, only the 'else' part should
      // remain.
      if (tokenType == StandardTokenizer.ACRONYM_DEP) {
        typeAtt.setType(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST]);
        termAtt.setLength(termAtt.length() - 1); // remove extra '.'
      } else {
        typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
      }
      return true;
    } else {
      // When we skip a too-long term, we still increment the position increment
      skippedPositions++;
    }
  }
}
/*
 * (non-Javadoc)
 *
 * @see org.apache.lucene.analysis.TokenStream#next()
 */
@Override
public final boolean incrementToken() throws IOException {
  clearAttributes();
  skippedPositions = 0;
  while (true) {
    int tokenType = scanner.getNextToken();
    if (tokenType == StandardTokenizerImpl.YYEOF) {
      return false;
    }
    if (scanner.yylength() <= maxTokenLength) {
      posIncrAtt.setPositionIncrement(skippedPositions + 1);
      scanner.getText(termAtt);
      final int start = scanner.yychar();
      offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));
      typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
      return true;
    } else {
      // When we skip a too-long term, we still increment the position increment
      skippedPositions++;
    }
  }
}
@Override
public boolean incrementToken() throws IOException {
  if (!getNextPartialSnippet()) return false;
  clearAttributes();
  termAtt.setEmpty().append(snippet, startTerm, startTerm + lenTerm);
  offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset + lenTerm));
  return true;
}
@Override
public final void end() throws IOException {
  super.end();
  // set final offset
  int finalOffset = correctOffset(this.endPosition);
  offsetAtt.setOffset(finalOffset, finalOffset);
  posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
}
@Override
public boolean incrementToken() throws IOException {
  if (index >= str.length()) return false;
  clearAttributes();
  if (group >= 0) {
    // match a specific group
    while (matcher.find()) {
      final String match = matcher.group(group);
      if (match.length() == 0) continue;
      termAtt.setEmpty().append(match);
      index = matcher.start(group);
      offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.end(group)));
      return true;
    }
    index = Integer.MAX_VALUE; // mark exhausted
    return false;
  } else {
    // String.split() functionality
    while (matcher.find()) {
      if (matcher.start() - index > 0) {
        // found a non-zero-length token
        termAtt.setEmpty().append(str, index, matcher.start());
        offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.start()));
        index = matcher.end();
        return true;
      }
      index = matcher.end();
    }
    if (str.length() - index == 0) {
      index = Integer.MAX_VALUE; // mark exhausted
      return false;
    }
    termAtt.setEmpty().append(str, index, str.length());
    offsetAtt.setOffset(correctOffset(index), correctOffset(str.length()));
    index = Integer.MAX_VALUE; // mark exhausted
    return true;
  }
}
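// Hedged trace of the split branch above (group < 0), pattern "," over "a,,b":
//   find "," at 1: emits "a" [0,1), index=2
//   find "," at 2: zero-length gap, index=3
//   no more matches, one trailing char: emits "b" [3,4), then index marks exhausted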
@Override
public final void end() throws IOException {
  super.end();
  // set final offset
  int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
  offsetAtt.setOffset(finalOffset, finalOffset);
  // adjust any skipped tokens
  posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
}
public boolean incrementToken() throws IOException {
  if (inPhrase) {
    inPhrase = false;
    clearAttributes();
    termAtt.setTermBuffer("phrase2");
    offsetAtt.setOffset(savedStart, savedEnd);
    return true;
  } else {
    while (input.incrementToken()) {
      if (termAtt.term().equals("phrase")) {
        inPhrase = true;
        savedStart = offsetAtt.startOffset();
        savedEnd = offsetAtt.endOffset();
        termAtt.setTermBuffer("phrase1");
        offsetAtt.setOffset(savedStart, savedEnd);
        return true;
      } else if (!termAtt.term().equals("stop")) {
        return true;
      }
    }
  }
  return false;
}
@Override
public boolean incrementToken() {
  if (used) {
    return false;
  }
  clearAttributes();
  termAttribute.append(value);
  offsetAttribute.setOffset(0, value.length());
  used = true;
  return true;
}
/**
 * Generates a word/number part, updating the appropriate attributes.
 *
 * @param isSingleWord {@code true} if the generation is occurring from a single word, {@code
 *     false} otherwise
 */
private void generatePart(boolean isSingleWord) {
  clearAttributes();
  termAttribute.copyBuffer(savedBuffer, iterator.current, iterator.end - iterator.current);
  int startOffset = savedStartOffset + iterator.current;
  int endOffset = savedStartOffset + iterator.end;
  if (hasIllegalOffsets) {
    // historically this filter did this regardless of 'isSingleWord',
    // but we must do a sanity check:
    if (isSingleWord && startOffset <= savedEndOffset) {
      offsetAttribute.setOffset(startOffset, savedEndOffset);
    } else {
      offsetAttribute.setOffset(savedStartOffset, savedEndOffset);
    }
  } else {
    offsetAttribute.setOffset(startOffset, endOffset);
  }
  posIncAttribute.setPositionIncrement(position(false));
  typeAttribute.setType(savedType);
}
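// Hedged trace of the offset logic above: for a saved token at [10,15) whose
// part spans iterator.current=3 .. iterator.end=5, the part gets offsets
// [13,15) when the saved offsets were consistent; with "illegal" (synonym-style)
// offsets, the whole saved span [10,15) is reported instead.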
@Override
public void end() throws IOException {
  super.end();
  // NOTE: somewhat... hackish, but we need this to satisfy BTSTC:
  final int lastOffset;
  if (tokens != null && !tokens.isEmpty()) {
    lastOffset = tokens.get(tokens.size() - 1).endOffset();
  } else {
    lastOffset = 0;
  }
  offsetAtt.setOffset(correctOffset(lastOffset), correctOffset(inputLength));
}
public boolean incrementToken() throws IOException {
  clearAttributes();
  Token token = nextToken(reusableToken);
  if (token != null) {
    termAtt.copyBuffer(token.buffer(), 0, token.length());
    offsetAtt.setOffset(token.startOffset(), token.endOffset());
    typeAtt.setType(token.type());
    return true;
  } else {
    end();
    return false;
  }
}
@Override
public boolean incrementToken() throws IOException {
  clearAttributes();
  // if the tokens from tokenIteractor are exhausted, keep asking the reader for more input
  while (tokenIteractor == null || !tokenIteractor.hasNext()) {
    // System.out.println(dissected);
    int read = 0;
    // chars still left in the buffer before re-reading from the reader;
    // a negative value means no read from the reader is needed right now
    int remainning = -1;
    if (dissected >= beef.length()) {
      remainning = 0;
    } else if (dissected < 0) {
      remainning = bufferLength + dissected;
    }
    if (remainning >= 0) {
      if (remainning > 0) {
        System.arraycopy(buffer, -dissected, buffer, 0, remainning);
      }
      read = input.read(buffer, remainning, bufferLength - remainning);
      inputLength += read;
      int charCount = remainning + read;
      if (charCount < 0) {
        // the reader is exhausted; signal end of stream as the next() contract requires
        return false;
      }
      if (charCount < bufferLength) {
        buffer[charCount++] = 0;
      }
      // build the "beef" and use the knife to "dissect" it (Paoding's butcher metaphor)
      beef.set(0, charCount);
      offset += Math.abs(dissected);
      // offset -= remainning;
      dissected = 0;
    }
    dissected = knife.dissect(this, beef, dissected);
    // offset += read; // !!!
    tokenIteractor = tokenCollector.iterator();
  }
  if (tokenIteractor.hasNext()) {
    // return the next Token from tokenIteractor
    Token token = tokenIteractor.next();
    termAtt.setEmpty();
    termAtt.append(token.charSequence());
    offsetAtt.setOffset(correctOffset(token.startOffset()), correctOffset(token.endOffset()));
    positionIncrementAttribute.setPositionIncrement(token.endOffset());
    return true;
  }
  return tokenIteractor.hasNext();
}
@Override
public boolean incrementToken() throws IOException {
  if (index >= tokens.length) {
    return false;
  } else {
    clearAttributes();
    Token token = tokens[index++];
    termAtt.setEmpty().append(token);
    offsetAtt.setOffset(token.startOffset(), token.endOffset());
    posIncAtt.setPositionIncrement(token.getPositionIncrement());
    flagsAtt.setFlags(token.getFlags());
    typeAtt.setType(token.type());
    payloadAtt.setPayload(token.getPayload());
    return true;
  }
}
@Override
public boolean incrementToken() throws IOException {
  if (iterator == null) {
    initializeIterator();
  }
  if (iterator.hasNext()) {
    clearAttributes();
    AnnotationFS next = iterator.next();
    termAttr.append(next.getCoveredText());
    offsetAttr.setOffset(correctOffset(next.getBegin()), correctOffset(next.getEnd()));
    return true;
  } else {
    return false;
  }
}
@Override
public boolean incrementToken() throws IOException {
  boolean tokenAvailable = false;
  int builtGramSize = 0;
  if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) {
    shiftInputWindow();
    gramBuilder.setLength(0);
  } else {
    builtGramSize = gramSize.getPreviousValue();
  }
  if (inputWindow.size() >= gramSize.getValue()) {
    boolean isAllFiller = true;
    InputWindowToken nextToken = null;
    Iterator<InputWindowToken> iter = inputWindow.iterator();
    for (int gramNum = 1; iter.hasNext() && builtGramSize < gramSize.getValue(); ++gramNum) {
      nextToken = iter.next();
      if (builtGramSize < gramNum) {
        if (builtGramSize > 0) {
          gramBuilder.append(tokenSeparator);
        }
        gramBuilder.append(nextToken.termAtt.buffer(), 0, nextToken.termAtt.length());
        ++builtGramSize;
      }
      if (isAllFiller && nextToken.isFiller) {
        if (gramNum == gramSize.getValue()) {
          gramSize.advance();
        }
      } else {
        isAllFiller = false;
      }
    }
    if (!isAllFiller && builtGramSize == gramSize.getValue()) {
      inputWindow.getFirst().attSource.copyTo(this);
      posIncrAtt.setPositionIncrement(isOutputHere ? 0 : 1);
      termAtt.setEmpty().append(gramBuilder);
      if (gramSize.getValue() > 1) {
        typeAtt.setType(tokenType);
        noShingleOutput = false;
      }
      offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
      posLenAtt.setPositionLength(builtGramSize);
      isOutputHere = true;
      gramSize.advance();
      tokenAvailable = true;
    }
  }
  return tokenAvailable;
}
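// Hedged example of the shingle output above, with tokenSeparator " ",
// maxShingleSize=2 and unigram output enabled, for the input "please divide this":
//   "please" posInc=1, "please divide" posInc=0,
//   "divide" posInc=1, "divide this"  posInc=0,
//   "this"   posInc=1
// Each shingle's end offset is taken from the last token in the window.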
@Override
public boolean incrementToken() throws IOException {
  clearAttributes();
  buffer.setLength(0);
  int ci;
  char ch, pch;
  boolean atBegin = true;
  tokenStart = tokenEnd;
  ci = input.read();
  ch = (char) ci;
  while (true) {
    if (ci == -1) {
      break;
    } else if (PUNCTION.indexOf(ch) != -1) {
      // end of a sentence
      buffer.append(ch);
      tokenEnd++;
      break;
    } else if (atBegin && SPACES.indexOf(ch) != -1) {
      tokenStart++;
      tokenEnd++;
      ci = input.read();
      ch = (char) ci;
    } else {
      buffer.append(ch);
      atBegin = false;
      tokenEnd++;
      pch = ch;
      ci = input.read();
      ch = (char) ci;
      // two consecutive whitespace chars, such as CR followed by LF, also end the sentence
      if (SPACES.indexOf(ch) != -1 && SPACES.indexOf(pch) != -1) {
        // buffer.append(ch);
        tokenEnd++;
        break;
      }
    }
  }
  if (buffer.length() == 0) {
    return false;
  } else {
    termAtt.setEmpty().append(buffer);
    offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenEnd));
    typeAtt.setType("sentence");
    return true;
  }
}
/** Writes the joined unhyphenated term */
private void unhyphenate() {
  int endOffset = offsetAttribute.endOffset();
  restoreState(savedState);
  savedState = null;
  char[] term = termAttribute.buffer();
  int length = hyphenated.length();
  if (length > termAttribute.length()) {
    term = termAttribute.resizeBuffer(length);
  }
  hyphenated.getChars(0, length, term, 0);
  termAttribute.setLength(length);
  offsetAttribute.setOffset(offsetAttribute.startOffset(), endOffset);
  hyphenated.setLength(0);
}
@Override
public boolean incrementToken() {
  if (upto < tokens.length) {
    final Token token = tokens[upto++];
    // TODO: can we just capture/restoreState so we get all attrs...?
    clearAttributes();
    termAtt.setEmpty();
    termAtt.append(token.toString());
    posIncrAtt.setPositionIncrement(token.getPositionIncrement());
    posLengthAtt.setPositionLength(token.getPositionLength());
    offsetAtt.setOffset(token.startOffset(), token.endOffset());
    payloadAtt.setPayload(token.getPayload());
    return true;
  } else {
    return false;
  }
}
@Override
public boolean incrementToken() throws IOException {
  // parse() is able to return w/o producing any new tokens, when the tokens
  // it had produced were entirely punctuation. So we loop here until we get
  // a real token or we end:
  while (pending.size() == 0) {
    if (end) {
      return false;
    }
    // Push Viterbi forward some more:
    parse();
  }

  final Token token = pending.remove(pending.size() - 1);

  int position = token.getPosition();
  int length = token.getLength();
  clearAttributes();
  assert length > 0;
  // System.out.println("off=" + token.getOffset() + " len=" + length + " vs " + token.getSurfaceForm().length);
  termAtt.copyBuffer(token.getSurfaceForm(), token.getOffset(), length);
  offsetAtt.setOffset(correctOffset(position), correctOffset(position + length));
  basicFormAtt.setToken(token);
  posAtt.setToken(token);
  readingAtt.setToken(token);
  inflectionAtt.setToken(token);
  if (token.getPosition() == lastTokenPos) {
    posIncAtt.setPositionIncrement(0);
    posLengthAtt.setPositionLength(token.getPositionLength());
  } else {
    assert token.getPosition() > lastTokenPos;
    posIncAtt.setPositionIncrement(1);
    posLengthAtt.setPositionLength(1);
  }
  if (VERBOSE) {
    System.out.println(Thread.currentThread().getName() + ": incToken: return token=" + token);
  }
  lastTokenPos = token.getPosition();
  return true;
}
@Override
public boolean incrementToken() throws IOException {
  if (tokens == null) {
    fillTokens();
  }
  // System.out.println("graphTokenizer: incr upto=" + upto + " vs " + tokens.size());
  if (upto == tokens.size()) {
    // System.out.println("  END @ " + tokens.size());
    return false;
  }
  final Token t = tokens.get(upto++);
  // System.out.println("  return token=" + t);
  clearAttributes();
  termAtt.append(t.toString());
  offsetAtt.setOffset(t.startOffset(), t.endOffset());
  posIncrAtt.setPositionIncrement(t.getPositionIncrement());
  posLengthAtt.setPositionLength(t.getPositionLength());
  return true;
}
@Override
public final boolean incrementToken() throws IOException {
  while (true) {
    if (curTermBuffer == null) {
      if (!input.incrementToken()) {
        return false;
      } else {
        curTermBuffer = termAtt.buffer().clone();
        curTermLength = termAtt.length();
        curCodePointCount = charUtils.codePointCount(termAtt);
        curGramSize = minGram;
        tokStart = offsetAtt.startOffset();
        tokEnd = offsetAtt.endOffset();
        savePosIncr += posIncrAtt.getPositionIncrement();
        savePosLen = posLenAtt.getPositionLength();
      }
    }
    if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
      if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
        // grab gramSize chars from front or back
        clearAttributes();
        offsetAtt.setOffset(tokStart, tokEnd);
        // first ngram gets increment, others don't
        if (curGramSize == minGram) {
          posIncrAtt.setPositionIncrement(savePosIncr);
          savePosIncr = 0;
        } else {
          posIncrAtt.setPositionIncrement(0);
        }
        posLenAtt.setPositionLength(savePosLen);
        final int charLength =
            charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
        termAtt.copyBuffer(curTermBuffer, 0, charLength);
        curGramSize++;
        return true;
      }
    }
    curTermBuffer = null;
  }
}
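// Hedged illustration of the code-point handling above: for the term "𝄞ab"
// (4 UTF-16 chars, 3 code points; "𝄞" is a surrogate pair) with minGram=1,
// maxGram=2, the front grams are "𝄞" and "𝄞a"; offsetByCodePoints converts
// the gram size in code points (1, 2) into char lengths (2, 3) for copyBuffer.
// Note that every gram keeps the original token's offsets [tokStart, tokEnd).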