/** * Fills {@link #inputWindow} with input stream tokens, if available, shifting to the right if the * window was previously full. * * <p>Resets {@link #gramSize} to its minimum value. * * @throws IOException if there's a problem getting the next token */ private void shiftInputWindow() throws IOException { InputWindowToken firstToken = null; if (inputWindow.size() > 0) { firstToken = inputWindow.removeFirst(); } while (inputWindow.size() < maxShingleSize) { if (null != firstToken) { // recycle the firstToken, if available if (null != getNextToken(firstToken)) { inputWindow.add(firstToken); // the firstToken becomes the last firstToken = null; } else { break; // end of input stream } } else { InputWindowToken nextToken = getNextToken(null); if (null != nextToken) { inputWindow.add(nextToken); } else { break; // end of input stream } } } if (outputUnigramsIfNoShingles && noShingleOutput && gramSize.minValue > 1 && inputWindow.size() < minShingleSize) { gramSize.minValue = 1; } gramSize.reset(); isOutputHere = false; }
@Override public boolean incrementToken() throws IOException { boolean tokenAvailable = false; int builtGramSize = 0; if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) { shiftInputWindow(); gramBuilder.setLength(0); } else { builtGramSize = gramSize.getPreviousValue(); } if (inputWindow.size() >= gramSize.getValue()) { boolean isAllFiller = true; InputWindowToken nextToken = null; Iterator<InputWindowToken> iter = inputWindow.iterator(); for (int gramNum = 1; iter.hasNext() && builtGramSize < gramSize.getValue(); ++gramNum) { nextToken = iter.next(); if (builtGramSize < gramNum) { if (builtGramSize > 0) { gramBuilder.append(tokenSeparator); } gramBuilder.append(nextToken.termAtt.buffer(), 0, nextToken.termAtt.length()); ++builtGramSize; } if (isAllFiller && nextToken.isFiller) { if (gramNum == gramSize.getValue()) { gramSize.advance(); } } else { isAllFiller = false; } } if (!isAllFiller && builtGramSize == gramSize.getValue()) { inputWindow.getFirst().attSource.copyTo(this); posIncrAtt.setPositionIncrement(isOutputHere ? 0 : 1); termAtt.setEmpty().append(gramBuilder); if (gramSize.getValue() > 1) { typeAtt.setType(tokenType); noShingleOutput = false; } offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset()); posLenAtt.setPositionLength(builtGramSize); isOutputHere = true; gramSize.advance(); tokenAvailable = true; } } return tokenAvailable; }
@Override public void reset() throws IOException { super.reset(); gramSize.reset(); inputWindow.clear(); nextInputStreamToken = null; isNextInputStreamToken = false; numFillerTokensToInsert = 0; isOutputHere = false; noShingleOutput = true; exhausted = false; endState = null; if (outputUnigramsIfNoShingles && !outputUnigrams) { // Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles gramSize.minValue = minShingleSize; } }