@Override public boolean incrementToken() throws IOException { // parse() is able to return w/o producing any new // tokens, when the tokens it had produced were entirely // punctuation. So we loop here until we get a real // token or we end: while (pending.size() == 0) { if (end) { return false; } // Push Viterbi forward some more: parse(); } final Token token = pending.remove(pending.size() - 1); int position = token.getPosition(); int length = token.getLength(); clearAttributes(); assert length > 0; // System.out.println("off=" + token.getOffset() + " len=" + length + " vs " + // token.getSurfaceForm().length); termAtt.copyBuffer(token.getSurfaceForm(), token.getOffset(), length); offsetAtt.setOffset(correctOffset(position), correctOffset(position + length)); basicFormAtt.setToken(token); posAtt.setToken(token); readingAtt.setToken(token); inflectionAtt.setToken(token); if (token.getPosition() == lastTokenPos) { posIncAtt.setPositionIncrement(0); posLengthAtt.setPositionLength(token.getPositionLength()); } else { assert token.getPosition() > lastTokenPos; posIncAtt.setPositionIncrement(1); posLengthAtt.setPositionLength(1); } if (VERBOSE) { System.out.println(Thread.currentThread().getName() + ": incToken: return token=" + token); } lastTokenPos = token.getPosition(); return true; }
/**
 * Builds and emits the next gram from the tokens currently held in {@code inputWindow}.
 *
 * <p>A call may return {@code false} without emitting anything, either because the window
 * holds fewer tokens than the current gram size, or because the candidate gram consisted only
 * of filler tokens.
 *
 * @return {@code true} if a gram was written to this stream's attributes, else {@code false}
 * @throws IOException declared by the overridden method; not thrown directly in this body
 */
@Override
public boolean incrementToken() throws IOException {
  int builtGramSize;
  if (!gramSize.atMinValue() && inputWindow.size() >= gramSize.getValue()) {
    // Continue from the gram built on the previous call.
    builtGramSize = gramSize.getPreviousValue();
  } else {
    // Start over: shift the window and begin with an empty gram.
    shiftInputWindow();
    gramBuilder.setLength(0);
    builtGramSize = 0;
  }

  if (inputWindow.size() < gramSize.getValue()) {
    return false;
  }

  boolean onlyFillers = true;
  InputWindowToken lastToken = null;
  final Iterator<InputWindowToken> windowIter = inputWindow.iterator();
  int gramNum = 1;
  // gramSize.getValue() is re-read on every pass because advance() below may change it.
  while (windowIter.hasNext() && builtGramSize < gramSize.getValue()) {
    lastToken = windowIter.next();

    if (builtGramSize < gramNum) {
      // This window token is not part of the gram yet: append it.
      if (builtGramSize > 0) {
        gramBuilder.append(tokenSeparator);
      }
      gramBuilder.append(lastToken.termAtt.buffer(), 0, lastToken.termAtt.length());
      builtGramSize++;
    }

    onlyFillers = onlyFillers && lastToken.isFiller;
    if (onlyFillers && gramNum == gramSize.getValue()) {
      // Every token up to the requested size was filler: move on to the next size.
      gramSize.advance();
    }

    gramNum++;
  }

  if (onlyFillers || builtGramSize != gramSize.getValue()) {
    return false;
  }

  // Start from the attributes captured for the first window token, then overwrite.
  inputWindow.getFirst().attSource.copyTo(this);
  final int posIncr = isOutputHere ? 0 : 1;
  posIncrAtt.setPositionIncrement(posIncr);
  termAtt.setEmpty().append(gramBuilder);
  if (gramSize.getValue() > 1) {
    typeAtt.setType(tokenType);
    noShingleOutput = false;
  }
  // Keep the copied start offset; end at the last token that went into the gram.
  offsetAtt.setOffset(offsetAtt.startOffset(), lastToken.offsetAtt.endOffset());
  posLenAtt.setPositionLength(builtGramSize);
  isOutputHere = true;
  gramSize.advance();
  return true;
}
@Override public boolean incrementToken() { if (upto < tokens.length) { final Token token = tokens[upto++]; // TODO: can we just capture/restoreState so // we get all attrs...? clearAttributes(); termAtt.setEmpty(); termAtt.append(token.toString()); posIncrAtt.setPositionIncrement(token.getPositionIncrement()); posLengthAtt.setPositionLength(token.getPositionLength()); offsetAtt.setOffset(token.startOffset(), token.endOffset()); payloadAtt.setPayload(token.getPayload()); return true; } else { return false; } }
@Override public boolean incrementToken() throws IOException { if (tokens == null) { fillTokens(); } // System.out.println("graphTokenizer: incr upto=" + upto + " vs " + tokens.size()); if (upto == tokens.size()) { // System.out.println(" END @ " + tokens.size()); return false; } final Token t = tokens.get(upto++); // System.out.println(" return token=" + t); clearAttributes(); termAtt.append(t.toString()); offsetAtt.setOffset(t.startOffset(), t.endOffset()); posIncrAtt.setPositionIncrement(t.getPositionIncrement()); posLengthAtt.setPositionLength(t.getPositionLength()); return true; }
@Override public final boolean incrementToken() throws IOException { while (true) { if (curTermBuffer == null) { if (!input.incrementToken()) { return false; } else { curTermBuffer = termAtt.buffer().clone(); curTermLength = termAtt.length(); curCodePointCount = charUtils.codePointCount(termAtt); curGramSize = minGram; tokStart = offsetAtt.startOffset(); tokEnd = offsetAtt.endOffset(); savePosIncr += posIncrAtt.getPositionIncrement(); savePosLen = posLenAtt.getPositionLength(); } } if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any // n-grams // grab gramSize chars from front or back clearAttributes(); offsetAtt.setOffset(tokStart, tokEnd); // first ngram gets increment, others don't if (curGramSize == minGram) { posIncrAtt.setPositionIncrement(savePosIncr); savePosIncr = 0; } else { posIncrAtt.setPositionIncrement(0); } posLenAtt.setPositionLength(savePosLen); final int charLength = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize); termAtt.copyBuffer(curTermBuffer, 0, charLength); curGramSize++; return true; } } curTermBuffer = null; } }
/**
 * Emits the next token: either an original input token or a buffered synonym output.
 *
 * <p>Each pass of the outer loop works in three stages:
 *
 * <ol>
 *   <li>While {@code inputSkipCount} is non-zero, replay the buffered state at slot
 *       {@code nextRead} of {@code futureInputs}/{@code futureOutputs}: first the original
 *       token (if it is to be kept or was not matched), then each pending output.
 *   <li>If {@code finished} is set and {@code nextRead == nextWrite}, drain any outputs still
 *       pending at that slot, or report end of stream.
 *   <li>Otherwise call {@code parse()} to look for new matches and loop again.
 * </ol>
 *
 * @return {@code true} if a token was emitted, {@code false} when the stream is exhausted
 * @throws IOException declared by the overridden method; presumably propagated from
 *     {@code parse()} (not visible here)
 */
@Override
public boolean incrementToken() throws IOException {
  while (true) {
    // First play back any buffered future inputs/outputs
    // w/o running parsing again:
    while (inputSkipCount != 0) {
      // At each position, we first output the original
      // token
      // TODO: maybe just a PendingState class, holding
      // both input & outputs?
      final PendingInput input = futureInputs[nextRead];
      final PendingOutputs outputs = futureOutputs[nextRead];
      if (!input.consumed && (input.keepOrig || !input.matched)) {
        if (input.state != null) {
          // Return a previously saved token (because we
          // had to lookahead):
          restoreState(input.state);
        } else {
          // Pass-through case: return token we just pulled
          // but didn't capture:
          assert inputSkipCount == 1 : "inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead;
        }
        input.reset();
        if (outputs.count > 0) {
          // Outputs remain at this slot; they are emitted on later calls
          // with a position increment of 0.
          outputs.posIncr = 0;
        } else {
          // Nothing else buffered at this slot: advance the read cursor.
          nextRead = rollIncr(nextRead);
          inputSkipCount--;
        }
        return true;
      } else if (outputs.upto < outputs.count) {
        // Still have pending outputs to replay at this
        // position
        input.reset();
        // Read posIncr before pullNext().
        // NOTE(review): presumably pullNext() may modify outputs.posIncr - confirm.
        final int posIncr = outputs.posIncr;
        final CharsRef output = outputs.pullNext();
        clearAttributes();
        termAtt.copyBuffer(output.chars, output.offset, output.length);
        typeAtt.setType(TYPE_SYNONYM);
        // -1 means no end offset is available from the outputs, so fall
        // back to the end offset of the buffered input token.
        int endOffset = outputs.getLastEndOffset();
        if (endOffset == -1) {
          endOffset = input.endOffset;
        }
        offsetAtt.setOffset(input.startOffset, endOffset);
        posIncrAtt.setPositionIncrement(posIncr);
        posLenAtt.setPositionLength(outputs.getLastPosLength());
        // NOTE(review): presumably pullNext() resets count to 0 once the
        // last output has been pulled - confirm against PendingOutputs.
        if (outputs.count == 0) {
          // Done with the buffered input and all outputs at
          // this position
          nextRead = rollIncr(nextRead);
          inputSkipCount--;
        }
        return true;
      } else {
        // Done with the buffered input and all outputs at
        // this position
        input.reset();
        nextRead = rollIncr(nextRead);
        inputSkipCount--;
      }
    }

    if (finished && nextRead == nextWrite) {
      // End case: if any output syns went beyond end of
      // input stream, enumerate them now:
      final PendingOutputs outputs = futureOutputs[nextRead];
      if (outputs.upto < outputs.count) {
        final int posIncr = outputs.posIncr;
        final CharsRef output = outputs.pullNext();
        futureInputs[nextRead].reset();
        if (outputs.count == 0) {
          // Slot fully drained: move both cursors forward together.
          nextWrite = nextRead = rollIncr(nextRead);
        }
        clearAttributes();
        // Keep offset from last input token:
        offsetAtt.setOffset(lastStartOffset, lastEndOffset);
        termAtt.copyBuffer(output.chars, output.offset, output.length);
        typeAtt.setType(TYPE_SYNONYM);
        // NOTE(review): unlike the replay branch above, this path does not set
        // posLenAtt after clearAttributes() - confirm that is intended.
        posIncrAtt.setPositionIncrement(posIncr);
        return true;
      } else {
        return false;
      }
    }

    // Find new synonym matches:
    parse();
  }
}