@Override
  public final boolean incrementToken() throws IOException {
    // The first call (delimitersCount == -1) drains the whole input, records where every
    // delimiter sits and buffers the text with delimiters swapped for the replacement char.
    // Every call, including the first, then emits the slice that starts at the next recorded
    // position and runs up to endPosition.
    clearAttributes();
    final boolean firstCall = delimitersCount == -1;
    if (firstCall) {
      int charsRead = 0;
      delimiterPositions.add(0);
      for (int ch = input.read(); ch >= 0; ch = input.read()) {
        charsRead++;
        if (ch != delimiter) {
          resultToken.append((char) ch);
          continue;
        }
        delimiterPositions.add(charsRead);
        resultToken.append(replacement);
      }

      // Make sure the end of the input is itself recorded as a boundary.
      delimitersCount = delimiterPositions.size();
      if (delimiterPositions.get(delimitersCount - 1) < charsRead) {
        delimiterPositions.add(charsRead);
        delimitersCount++;
      }

      // Move the accumulated text into the reusable char buffer, growing it if required.
      final int bufferedLength = resultToken.length();
      if (resultTokenBuffer.length < bufferedLength) {
        resultTokenBuffer = new char[bufferedLength];
      }
      resultToken.getChars(0, bufferedLength, resultTokenBuffer, 0);
      resultToken.setLength(0);

      // A negative index means everything is skipped; the guard below then returns false.
      final int endIndex = delimitersCount - 1 - skip;
      if (endIndex >= 0) {
        endPosition = delimiterPositions.get(endIndex);
      }
      finalOffset = correctOffset(charsRead);
    }
    // Only the very first emitted token advances the position.
    posAtt.setPositionIncrement(firstCall ? 1 : 0);

    if (skipped >= delimitersCount - skip - 1) {
      return false;
    }

    final int start = delimiterPositions.get(skipped);
    termAtt.copyBuffer(resultTokenBuffer, start, endPosition - start);
    offsetAtt.setOffset(correctOffset(start), correctOffset(endPosition));
    skipped++;
    return true;
  }
Example #2
0
  /**
   * Passes tokens through, giving special treatment to URI tokens for which
   * {@code isMailtoScheme()} holds: such a token is first emitted without its leading
   * 7 characters (presumably the {@code "mailto:"} scheme prefix) and, when the
   * {@code isMailto} flag is set, the full buffered term is emitted on the following call
   * with a position increment of 0.
   *
   * @return {@code true} if a token was produced, {@code false} at end of stream
   */
  @Override
  public final boolean incrementToken() throws IOException {
    if (isMailto) {
      // Pending second emission: the complete buffered term (scheme + mail part),
      // stacked on the same position as the previous token.
      isMailto = false;
      termAtt.setEmpty();
      posIncrAtt.setPositionIncrement(0);
      termAtt.copyBuffer(termBuffer.array(), 0, termBuffer.position());
      return true;
    }

    if (!input.incrementToken()) {
      return false;
    }

    final boolean isUriToken =
        typeAtt.type().equals(TupleTokenizer.getTokenTypes()[TupleTokenizer.URI]);
    if (isUriToken && this.isMailtoScheme()) {
      this.updateBuffer();
      termBuffer.put(termAtt.buffer(), 0, termAtt.length());
      // First emission: only the part after the 7-character prefix.
      posIncrAtt.setPositionIncrement(1);
      termAtt.copyBuffer(termBuffer.array(), 7, termBuffer.position() - 7);
    }
    return true;
  }
  /**
   * Emits each input token unchanged and then, on subsequent calls, every term produced by
   * {@code permuteTerms} for that token.
   *
   * <p>Queued permutations are drained first, each with a position increment of 1. Once
   * the queue is empty the next input token is pulled, its permutations are queued, and the
   * original token is returned as-is.
   *
   * @return {@code true} if a token was produced, {@code false} when the input is exhausted
   *     and no permutations remain queued
   * @throws IOException if the wrapped stream fails
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!terms.isEmpty()) {
      final char[] queued = terms.poll();
      termAttribute.setEmpty();
      termAttribute.copyBuffer(queued, 0, queued.length);
      posIncAttr.setPositionIncrement(1);
      return true;
    }

    if (!input.incrementToken()) {
      return false;
    }

    final char[] term = termAttribute.buffer();
    final int length = termAttribute.length();

    // Queue every permutation for later calls. The backing array of each CharBuffer is
    // stored whole, exactly as before.
    // NOTE(review): array() returns the full backing array; confirm permuteTerms sizes
    // its buffers exactly, otherwise trailing unused chars end up in the emitted term.
    final LinkedList<CharBuffer> buffers = permuteTerms(term, 0, length);
    for (CharBuffer permutation : buffers) {
      terms.add(permutation.array());
    }

    // The original token is left untouched and returned first.
    return true;
  }
  /**
   * Returns each input token followed by the sub-tokens that {@code decompose()} queued for
   * it. Queued sub-tokens are emitted at the same position (increment 0) on top of the
   * captured state of the original token.
   *
   * @return {@code true} if a token was produced, {@code false} at end of stream
   */
  @Override
  public final boolean incrementToken() throws IOException {
    if (tokens.isEmpty()) {
      current = null; // not strictly required, cleared for safety
      if (!input.incrementToken()) {
        return false;
      }
      // Words shorter than minWordSize are passed through without decomposition.
      if (termAtt.length() >= this.minWordSize) {
        decompose();
        // Capturing state is only worthwhile when sub-tokens were actually produced.
        if (!tokens.isEmpty()) {
          current = captureState();
        }
      }
      // The original token goes out first.
      return true;
    }

    assert current != null;
    final CompoundToken part = tokens.removeFirst();
    restoreState(current); // every attribute other than term/offset/posInc stays as captured
    termAtt.setEmpty().append(part.txt);
    offsetAtt.setOffset(part.startOffset, part.endOffset);
    posIncAtt.setPositionIncrement(0);
    return true;
  }
  /* (non-Javadoc)
   * @see org.apache.lucene.analysis.TokenStream#incrementToken()
   *
   * Pulls the next Lexeme from the IK segmenter and copies its text, offsets and type
   * into the token attributes. Returns false once the segmenter yields null.
   */
  @Override
  public boolean incrementToken() throws IOException {
    // Reset all token attributes.
    clearAttributes();
    skippedPositions = 0;

    final Lexeme lexeme = _IKImplement.next();
    if (lexeme == null) {
      // No lexeme left: tell the caller that output is finished.
      return false;
    }

    posIncrAtt.setPositionIncrement(skippedPositions + 1);

    // Convert the Lexeme into attributes: text first, then trim to the lexeme length.
    termAtt.append(lexeme.getLexemeText());
    termAtt.setLength(lexeme.getLength());
    // Token offsets.
    offsetAtt.setOffset(
        correctOffset(lexeme.getBeginPosition()), correctOffset(lexeme.getEndPosition()));

    // Remember where segmentation last ended.
    endPosition = lexeme.getEndPosition();
    // Record the lexeme category.
    typeAtt.setType(lexeme.getLexemeTypeString());
    // More lexemes may follow.
    return true;
  }
Example #6
0
  /*
   * (non-Javadoc)
   *
   * @see org.apache.lucene.analysis.TokenStream#incrementToken()
   *
   * Emits the next scanner token whose length does not exceed maxTokenLength. Longer
   * tokens are dropped, but each one still adds to the position increment of the next
   * emitted token.
   */
  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();
    skippedPositions = 0;

    for (int tokenType = scanner.getNextToken();
        tokenType != StandardTokenizerImpl.YYEOF;
        tokenType = scanner.getNextToken()) {
      if (scanner.yylength() > maxTokenLength) {
        // Too long: skip it, but account for the position it would have taken.
        skippedPositions++;
        continue;
      }

      posIncrAtt.setPositionIncrement(skippedPositions + 1);
      scanner.getText(termAtt);
      final int begin = scanner.yychar();
      offsetAtt.setOffset(correctOffset(begin), correctOffset(begin + termAtt.length()));
      typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
      return true;
    }
    return false;
  }
  /*
   * (non-Javadoc)
   *
   * @see org.apache.lucene.analysis.TokenStream#incrementToken()
   *
   * Emits the next scanner token whose length does not exceed maxTokenLength. Longer
   * tokens are dropped, but each one still adds to the position increment of the next
   * emitted token. Tokens of type ACRONYM_DEP are re-typed as HOST and lose their last
   * character.
   */
  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();
    skippedPositions = 0;

    for (int tokenType = scanner.getNextToken();
        tokenType != StandardTokenizerInterface.YYEOF;
        tokenType = scanner.getNextToken()) {
      if (scanner.yylength() > maxTokenLength) {
        // Too long: skip it, but account for the position it would have taken.
        skippedPositions++;
        continue;
      }

      posIncrAtt.setPositionIncrement(skippedPositions + 1);
      scanner.getText(termAtt);
      final int begin = scanner.yychar();
      offsetAtt.setOffset(correctOffset(begin), correctOffset(begin + termAtt.length()));

      // The ACRONYM_DEP special case is slated for removal in a later release; it maps
      // invalid acronyms to HOST. Once it is gone only the plain type assignment remains.
      final boolean deprecatedAcronym = tokenType == StandardTokenizer.ACRONYM_DEP;
      if (!deprecatedAcronym) {
        typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
      } else {
        typeAtt.setType(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST]);
        termAtt.setLength(termAtt.length() - 1); // drop the extra trailing '.'
      }
      return true;
    }
    return false;
  }
 /**
  * Finishes the stream: sets both offsets to the corrected {@code endPosition} and adds any
  * positions skipped since the last emitted token to the position increment.
  */
 @Override
 public final void end() throws IOException {
   super.end();
   // Start and end offset both point at the corrected final position.
   final int lastOffset = correctOffset(this.endPosition);
   offsetAtt.setOffset(lastOffset, lastOffset);
   final int trailingIncrement = posIncrAtt.getPositionIncrement() + skippedPositions;
   posIncrAtt.setPositionIncrement(trailingIncrement);
 }
Example #9
0
 /**
  * Finishes the stream: sets both offsets to the corrected position just past the scanner's
  * last match and adds any positions skipped since the last emitted token to the position
  * increment.
  */
 @Override
 public final void end() throws IOException {
   super.end();
   // Final offset = start of the last scanner match plus its length, after correction.
   final int lastOffset = correctOffset(scanner.yychar() + scanner.yylength());
   offsetAtt.setOffset(lastOffset, lastOffset);
   // Carry over tokens that were skipped at the tail of the input.
   final int trailingIncrement = posIncrAtt.getPositionIncrement() + skippedPositions;
   posIncrAtt.setPositionIncrement(trailingIncrement);
 }
  /**
   * Emits the last token of the {@code pending} list, calling {@code parse()} as often as
   * needed to refill it.
   *
   * <p>A token at the same position as the previously emitted one gets a position increment
   * of 0 and its own position length; otherwise increment and length are both 1.
   *
   * @return {@code true} if a token was produced, {@code false} once {@code end} is set and
   *     nothing is pending
   */
  @Override
  public boolean incrementToken() throws IOException {

    // parse() may come back without any new tokens (e.g. when everything it produced was
    // punctuation), so keep going until something is pending or the end is reached.
    while (pending.size() == 0) {
      if (end) {
        return false;
      }
      // Advance the Viterbi search.
      parse();
    }

    final int lastIndex = pending.size() - 1;
    final Token token = pending.remove(lastIndex);

    final int position = token.getPosition();
    final int length = token.getLength();
    clearAttributes();
    assert length > 0;
    termAtt.copyBuffer(token.getSurfaceForm(), token.getOffset(), length);
    offsetAtt.setOffset(correctOffset(position), correctOffset(position + length));
    basicFormAtt.setToken(token);
    posAtt.setToken(token);
    readingAtt.setToken(token);
    inflectionAtt.setToken(token);

    if (position != lastTokenPos) {
      assert position > lastTokenPos;
      posIncAtt.setPositionIncrement(1);
      posLengthAtt.setPositionLength(1);
    } else {
      // Same start position as the previous token: stack it.
      posIncAtt.setPositionIncrement(0);
      posLengthAtt.setPositionLength(token.getPositionLength());
    }

    if (VERBOSE) {
      System.out.println(Thread.currentThread().getName() + ":    incToken: return token=" + token);
    }
    lastTokenPos = position;
    return true;
  }
Example #11
0
 /**
  * Emits one token per configured prefix (prefix followed by {@code separator}) before
  * delegating to the wrapped input. The first prefix token gets a position increment of 1,
  * the remaining prefix tokens 0.
  *
  * @return {@code true} if a token was produced, otherwise whatever the input reports
  */
 @Override
 public boolean incrementToken() throws IOException {
   if (currentPrefix == null) {
     // First call: start iterating the prefixes.
     currentPrefix = prefixes.iterator();
     termAttr.setEmpty();
     posAttr.setPositionIncrement(1);
     assert (currentPrefix.hasNext()) : "one or more prefixes needed";
   } else if (currentPrefix.hasNext()) {
     posAttr.setPositionIncrement(0);
   } else {
     // All prefixes have been emitted; hand over to the input stream.
     return input.incrementToken();
   }

   termAttr.setEmpty();
   termAttr.append(currentPrefix.next());
   termAttr.append(separator);
   return true;
 }
 /**
  * Emits, for each input token, its leading n-grams of {@code minGram} to {@code maxGram}
  * code points (shortest first). All grams of one token share that token's offsets and
  * position length; only the first gram carries the accumulated position increment.
  *
  * @return {@code true} if a gram was produced, {@code false} when the input is exhausted
  */
 @Override
 public final boolean incrementToken() throws IOException {
   for (;;) {
     if (curTermBuffer == null) {
       // Load the next input token and remember everything needed to build its grams.
       if (!input.incrementToken()) {
         return false;
       }
       curTermBuffer = termAtt.buffer().clone();
       curTermLength = termAtt.length();
       curCodePointCount = charUtils.codePointCount(termAtt);
       curGramSize = minGram;
       tokStart = offsetAtt.startOffset();
       tokEnd = offsetAtt.endOffset();
       savePosIncr += posIncrAtt.getPositionIncrement();
       savePosLen = posLenAtt.getPositionLength();
     }

     // Stop with this token once the gram size leaves the configured range or the token
     // has too few code points for the requested size.
     final boolean gramFits = curGramSize <= maxGram && curGramSize <= curCodePointCount;
     if (!gramFits) {
       curTermBuffer = null;
       continue;
     }

     clearAttributes();
     offsetAtt.setOffset(tokStart, tokEnd);

     // Only the first gram of a token advances the position.
     final boolean firstGram = curGramSize == minGram;
     posIncrAtt.setPositionIncrement(firstGram ? savePosIncr : 0);
     if (firstGram) {
       savePosIncr = 0;
     }
     posLenAtt.setPositionLength(savePosLen);

     // Translate the gram size from code points to chars before copying.
     final int charLength =
         charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
     termAtt.copyBuffer(curTermBuffer, 0, charLength);
     curGramSize++;
     return true;
   }
 }
  /**
   * Returns the next input token that is not contained in {@code stopWords}. The position
   * increments of dropped tokens are added to the increment of the token that is returned.
   *
   * @return {@code true} if a token was produced, {@code false} at end of stream
   */
  public final boolean incrementToken() throws IOException {
    int droppedIncrement = 0;
    while (input.incrementToken()) {
      final boolean isStopWord =
          stopWords.contains(termAttr.termBuffer(), 0, termAttr.termLength());
      if (isStopWord) {
        droppedIncrement += posIncrAttr.getPositionIncrement();
        continue;
      }
      posIncrAttr.setPositionIncrement(posIncrAttr.getPositionIncrement() + droppedIncrement);
      return true;
    }

    return false;
  }
 /**
  * Produces a fixed sequence of four tokens: "a" and "b" with position increment 1, then
  * "c" and "d" with position increment 0. Returns {@code false} once {@code upto} is 4.
  */
 @Override
 public boolean incrementToken() {
   clearAttributes();
   if (upto == 4) {
     return false;
   }

   final String text = upto == 0 ? "a" : upto == 1 ? "b" : upto == 2 ? "c" : "d";
   final int increment = (upto == 0 || upto == 1) ? 1 : 0;

   posIncr.setPositionIncrement(increment);
   term.setEmpty().append(text);
   upto++;
   return true;
 }
  /**
   * Returns the next token collected by the knife, refilling the character buffer from the
   * reader and re-running {@code knife.dissect} whenever the current token iterator is
   * missing or exhausted.
   *
   * @return {@code true} if a token was copied into the attributes, {@code false} when the
   *     reader yields no more characters and nothing remains in the buffer
   * @throws IOException if reading from the input fails
   */
  @Override
  public boolean incrementToken() throws IOException {
    clearAttributes();

    // The token iterator is missing or exhausted, so keep requesting data from the reader.
    while (tokenIteractor == null || !tokenIteractor.hasNext()) {
      // System.out.println(dissected);
      int read = 0;
      int remainning = -1; // chars still left in the buffer before re-reading from the reader; negative means no read is needed for now
      if (dissected >= beef.length()) {
        // Everything in the beef was consumed: nothing to carry over.
        remainning = 0;
      } else if (dissected < 0) {
        // A negative dissected value encodes the index (-dissected) from which chars must
        // be kept; bufferLength + dissected chars are carried over.
        remainning = bufferLength + dissected;
      }
      if (remainning >= 0) {
        if (remainning > 0) {
          // Shift the unconsumed tail to the front of the buffer.
          System.arraycopy(buffer, -dissected, buffer, 0, remainning);
        }
        read = input.read(buffer, remainning, bufferLength - remainning);
        // NOTE(review): read is -1 at end of input, which decrements inputLength here - confirm
        // whether that is intended.
        inputLength += read;
        int charCount = remainning + read;
        if (charCount < 0) {
          // The reader is exhausted and nothing was carried over: signal end of stream.
          // (The original comment referred to the older next() contract of returning null.)
          return false;
        }
        // NOTE(review): when remainning > 0 and read == -1, charCount is remainning - 1, so the
        // terminator below overwrites the last carried-over char - verify this is intended.
        if (charCount < bufferLength) {
          // Buffer not full: append a 0 char after the valid content.
          buffer[charCount++] = 0;
        }
        // Build the "beef" over the buffer content; the knife "dissects" it below.
        beef.set(0, charCount);
        offset += Math.abs(dissected);
        // offset -= remainning;
        dissected = 0;
      }
      dissected = knife.dissect(this, beef, dissected);
      // offset += read;// !!!
      tokenIteractor = tokenCollector.iterator();
    }

    if (tokenIteractor.hasNext()) {
      // Hand out the next token from the iterator.
      Token token = tokenIteractor.next();
      termAtt.setEmpty();
      termAtt.append(token.charSequence());
      offsetAtt.setOffset(correctOffset(token.startOffset()), correctOffset(token.endOffset()));
      // NOTE(review): the position increment is set to the token's END OFFSET, not a
      // position delta - this looks unintended; confirm against the Token API.
      positionIncrementAttribute.setPositionIncrement(token.endOffset());
      return true;
    }
    // NOTE(review): the loop above only exits once hasNext() is true, so this line appears
    // unreachable unless hasNext() is not idempotent.
    return tokenIteractor.hasNext();
  }
Example #16
0
 /**
  * Replays the pre-built {@code tokens} array one entry per call, copying term, offsets,
  * position increment, flags, type and payload into the attributes.
  *
  * @return {@code true} while entries remain, {@code false} afterwards
  */
 @Override
 public boolean incrementToken() throws IOException {
   if (index < tokens.length) {
     clearAttributes();
     final Token token = tokens[index++];
     termAtt.setEmpty().append(token);
     offsetAtt.setOffset(token.startOffset(), token.endOffset());
     posIncAtt.setPositionIncrement(token.getPositionIncrement());
     flagsAtt.setFlags(token.getFlags());
     typeAtt.setType(token.type());
     payloadAtt.setPayload(token.getPayload());
     return true;
   }
   return false;
 }
 /**
  * Drops every token whose term is exactly "a". The position increments of dropped tokens
  * accumulate in {@code pendingPosInc} and are added to the next token that is returned.
  *
  * @return {@code true} if a token was produced, {@code false} at end of stream
  */
 @Override
 public boolean incrementToken() throws IOException {
   while (input.incrementToken()) {
     if (termAtt.toString().equals("a")) {
       // Removed token: remember the gap it leaves behind.
       pendingPosInc += posIncAtt.getPositionIncrement();
       continue;
     }
     posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement());
     pendingPosInc = 0;
     return true;
   }
   return false;
 }
 /**
  * Builds the next shingle from the tokens in {@code inputWindow}.
  *
  * <p>The window is shifted when the gram size is at its minimum or the window holds fewer
  * tokens than the current gram size; otherwise the text already in {@code gramBuilder} is
  * extended. A shingle made up only of filler tokens is never emitted.
  *
  * @return {@code true} if a shingle was written to the attributes, {@code false} otherwise
  */
 @Override
 public boolean incrementToken() throws IOException {
   int builtGramSize = 0;
   if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) {
     shiftInputWindow();
     gramBuilder.setLength(0);
   } else {
     // Continue from the gram that was built on the previous call.
     builtGramSize = gramSize.getPreviousValue();
   }

   if (inputWindow.size() < gramSize.getValue()) {
     return false;
   }

   boolean isAllFiller = true;
   InputWindowToken nextToken = null;
   final Iterator<InputWindowToken> iter = inputWindow.iterator();
   int gramNum = 1;
   while (iter.hasNext() && builtGramSize < gramSize.getValue()) {
     nextToken = iter.next();
     if (builtGramSize < gramNum) {
       // This token is not part of the builder yet: append it, separated if needed.
       if (builtGramSize > 0) {
         gramBuilder.append(tokenSeparator);
       }
       gramBuilder.append(nextToken.termAtt.buffer(), 0, nextToken.termAtt.length());
       ++builtGramSize;
     }
     if (isAllFiller && nextToken.isFiller) {
       // Still only fillers: move on to the next gram size once this size is reached.
       if (gramNum == gramSize.getValue()) {
         gramSize.advance();
       }
     } else {
       isAllFiller = false;
     }
     ++gramNum;
   }

   if (isAllFiller || builtGramSize != gramSize.getValue()) {
     return false;
   }

   inputWindow.getFirst().attSource.copyTo(this);
   posIncrAtt.setPositionIncrement(isOutputHere ? 0 : 1);
   termAtt.setEmpty().append(gramBuilder);
   if (gramSize.getValue() > 1) {
     typeAtt.setType(tokenType);
     noShingleOutput = false;
   }
   // The shingle spans from the first token's start to the last appended token's end.
   offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
   posLenAtt.setPositionLength(builtGramSize);
   isOutputHere = true;
   gramSize.advance();
   return true;
 }
 /**
  * Returns the next input token for which {@code filter.run} yields {@code false}. When
  * {@code enablePositionIncrements} is set, the increments of the dropped tokens are added
  * to the returned token's position increment.
  *
  * @return {@code true} if a token was produced, {@code false} at end of stream
  */
 @Override
 public boolean incrementToken() throws IOException {
   int droppedPositions = 0;
   while (input.incrementToken()) {
     final boolean rejected = filter.run(termAtt.buffer(), 0, termAtt.length());
     if (rejected) {
       droppedPositions += posIncrAtt.getPositionIncrement();
       continue;
     }
     if (enablePositionIncrements) {
       posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + droppedPositions);
     }
     return true;
   }
   // End of stream reached without finding an acceptable token.
   return false;
 }
    /**
     * Passes input tokens through and, directly after every token whose term is "国",
     * injects the synonym "國" at the same position (increment 0).
     *
     * @return {@code true} if a token was produced, {@code false} at end of stream
     */
    @Override
    public final boolean incrementToken() throws IOException {
      if (!addSynonym) {
        if (!input.incrementToken()) {
          return false;
        }
        // Schedule the synonym for the next call when the trigger term was seen.
        addSynonym = termAtt.toString().equals("国");
        return true;
      }

      // Emit the pending synonym.
      clearAttributes();
      termAtt.setEmpty().append("國");
      posIncAtt.setPositionIncrement(0);
      addSynonym = false;
      return true;
    }
Example #21
0
 /**
  * Replays the canned {@code tokens} array one entry per call, copying term, position
  * increment, position length, offsets and payload into the attributes.
  *
  * @return {@code true} while entries remain, {@code false} afterwards
  */
 @Override
 public boolean incrementToken() {
   if (upto >= tokens.length) {
     return false;
   }

   final Token next = tokens[upto++];
   // TODO: could capture/restoreState be used instead so that all attributes carry over?
   clearAttributes();
   termAtt.setEmpty();
   termAtt.append(next.toString());
   posIncrAtt.setPositionIncrement(next.getPositionIncrement());
   posLengthAtt.setPositionLength(next.getPositionLength());
   offsetAtt.setOffset(next.startOffset(), next.endOffset());
   payloadAtt.setPayload(next.getPayload());
   return true;
 }
 /**
  * Emits the tokens prepared by {@code fillTokens()} one per call. The list is filled
  * lazily on the first call.
  *
  * @return {@code true} if a token was produced, {@code false} once all were emitted
  */
 @Override
 public boolean incrementToken() throws IOException {
   if (tokens == null) {
     fillTokens();
   }

   final boolean exhausted = upto == tokens.size();
   if (exhausted) {
     return false;
   }

   final Token next = tokens.get(upto++);
   clearAttributes();
   termAtt.append(next.toString());
   offsetAtt.setOffset(next.startOffset(), next.endOffset());
   posIncrAtt.setPositionIncrement(next.getPositionIncrement());
   posLengthAtt.setPositionLength(next.getPositionLength());
   return true;
 }
  /**
   * Removes the head of {@code morphQueue} and copies its term, offsets and morpheme data
   * into the attributes.
   *
   * @param isFirst {@code true} for the first token taken for the current input token; in
   *     that case the incoming position increment is left as it is, and the attribute state
   *     is captured if further queue entries remain
   */
  private void setAttributesFromQueue(boolean isFirst) {
    final KoreanToken head = morphQueue.removeFirst();
    if (isFirst && !morphQueue.isEmpty()) {
      // More entries follow (e.g. after decompounding). Capture state for them with an
      // empty term so that later restores do not copy the term array needlessly.
      termAtt.setEmpty();
      currentState = captureState();
    }

    termAtt.setEmpty().append(head.getTerm());
    final int begin = head.getOffset();
    offsetAtt.setOffset(begin, begin + head.getLength());
    morphAtt.setToken(head);

    // Only follow-up tokens take their increment from the queue entry.
    if (!isFirst) {
      posIncrAtt.setPositionIncrement(head.getPosInc());
    }

    // TODO: How to handle PositionLengthAttribute correctly?
  }
Example #24
0
  /**
   * Writes the word/number part currently selected by {@code iterator} into the attributes:
   * term text, offsets, position increment and the saved type.
   *
   * @param isSingleWord {@code true} when the part is generated from a single word,
   *     {@code false} otherwise
   */
  private void generatePart(boolean isSingleWord) {
    clearAttributes();
    final int partStart = iterator.current;
    final int partEnd = iterator.end;
    termAttribute.copyBuffer(savedBuffer, partStart, partEnd - partStart);

    final int startOffset = savedStartOffset + partStart;
    final int endOffset = savedStartOffset + partEnd;

    if (!hasIllegalOffsets) {
      offsetAttribute.setOffset(startOffset, endOffset);
    } else if (isSingleWord && startOffset <= savedEndOffset) {
      // Historically this was done regardless of 'isSingleWord'; the extra comparison is
      // a sanity check so that start never ends up beyond end.
      offsetAttribute.setOffset(startOffset, savedEndOffset);
    } else {
      // Fall back to the offsets of the whole original token.
      offsetAttribute.setOffset(savedStartOffset, savedEndOffset);
    }

    posIncAttribute.setPositionIncrement(position(false));
    typeAttribute.setType(savedType);
  }
  /**
   * Publishes {@code token} through the term attribute, after whitespace replacement when
   * {@code replaceWhitespaceWith} is configured, and remembers it in {@code lastEmitted}.
   *
   * <p>If an offset attribute exists and its end offset is at least the token length, the
   * start offset is moved to {@code endOffset - token.length}. If a position increment
   * attribute exists it is set to the pre-incremented {@code positionIncr} counter.
   *
   * @param token characters of the token to emit
   */
  private void emit(char[] token) {
    Log.debug("emit: " + new String(token));
    final char[] emitted = (replaceWhitespaceWith != null) ? replaceWhiteSpace(token) : token;

    final CharTermAttribute termAttr = getTermAttribute();
    termAttr.setEmpty();
    termAttr.append(new StringBuilder().append(emitted));

    final OffsetAttribute offsets = getOffsetAttribute();
    if (offsets != null && offsets.endOffset() >= emitted.length) {
      // Anchor the token at the current end offset and stretch it backwards.
      final int start = offsets.endOffset() - emitted.length;
      offsets.setOffset(start, offsets.endOffset());
    }

    final PositionIncrementAttribute increments = getPositionIncrementAttribute();
    if (increments != null) {
      increments.setPositionIncrement(++positionIncr);
    }

    lastEmitted = emitted;
  }
Example #26
0
  /**
   * Emits the next term produced by {@code numericAtt.incrementShift}. On the first call
   * the numeric attribute is initialised from the value parsed out of the input. A shift of
   * 0 is typed as full precision with position increment 1; any other shift is typed as
   * lower precision with position increment 0.
   *
   * @return the value reported by {@code numericAtt.incrementShift}
   */
  @Override
  public final boolean incrementToken() throws IOException {
    // Lazily initialise the numeric attribute from the parsed input value.
    if (!isInitialised) {
      final long value = parser.parseAndConvert(this.input);
      numericAtt.init(parser.getNumericType(), value, parser.getValueSize());
      isInitialised = true;
    }

    // Clears only the other attributes of this TokenStream.
    this.clearAttributes();

    // Advance the shift and generate the next term. getShift() is undefined before the
    // first incrementShift call, so the dependent attributes are filled in afterwards.
    final boolean hasNext = numericAtt.incrementShift(termAtt);
    final boolean fullPrecision = numericAtt.getShift() == 0;
    typeAtt.setType(
        fullPrecision
            ? NumericTokenStream.TOKEN_TYPE_FULL_PREC
            : NumericTokenStream.TOKEN_TYPE_LOWER_PREC);
    posIncrAtt.setPositionIncrement(fullPrecision ? 1 : 0);

    return hasNext;
  }
Example #27
0
  /**
   * Produces the next output token of the word-delimiter state machine.
   *
   * <p>Each loop iteration either loads a new input token (when no state is saved) or
   * continues splitting the saved one with {@code iterator}. A call returns {@code true}
   * for one of: an input token passed through unchanged, the preserved original token, a
   * single-word part, or a buffered state restored from {@code buffered}. It returns
   * {@code false} only when the input stream is exhausted.
   *
   * @return {@code true} if a token was produced, {@code false} at end of stream
   * @throws IOException if the wrapped stream fails
   */
  @Override
  public boolean incrementToken() throws IOException {
    while (true) {
      if (!hasSavedState) {
        // process a new input word
        if (!input.incrementToken()) {
          return false;
        }

        int termLength = termAttribute.length();
        char[] termBuffer = termAttribute.buffer();

        // Position increments of swallowed tokens accumulate until a token is emitted.
        accumPosInc += posIncAttribute.getPositionIncrement();

        iterator.setText(termBuffer, termLength);
        iterator.next();

        // word of no delimiters, or protected word: just return it
        if ((iterator.current == 0 && iterator.end == termLength)
            || (protWords != null && protWords.contains(termBuffer, 0, termLength))) {
          posIncAttribute.setPositionIncrement(accumPosInc);
          accumPosInc = 0;
          first = false;
          return true;
        }

        // word of simply delimiters
        if (iterator.end == WordDelimiterIterator.DONE && !has(PRESERVE_ORIGINAL)) {
          // if the posInc is 1, simply ignore it in the accumulation
          // TODO: proper hole adjustment (FilteringTokenFilter-like) instead of this previous
          // logic!
          if (posIncAttribute.getPositionIncrement() == 1 && !first) {
            accumPosInc--;
          }
          continue;
        }

        // From here on the token is split; keep its state for the following iterations.
        saveState();

        hasOutputToken = false;
        hasOutputFollowingOriginal = !has(PRESERVE_ORIGINAL);
        lastConcatCount = 0;

        // Emit the untouched original first when PRESERVE_ORIGINAL is set.
        if (has(PRESERVE_ORIGINAL)) {
          posIncAttribute.setPositionIncrement(accumPosInc);
          accumPosInc = 0;
          first = false;
          return true;
        }
      }

      // at the end of the string, output any concatenations
      if (iterator.end == WordDelimiterIterator.DONE) {
        if (!concat.isEmpty()) {
          if (flushConcatenation(concat)) {
            buffer();
            continue;
          }
        }

        if (!concatAll.isEmpty()) {
          // only if we haven't output this same combo above!
          if (concatAll.subwordCount > lastConcatCount) {
            concatAll.writeAndClear();
            buffer();
            continue;
          }
          concatAll.clear();
        }

        // Drain the buffered states one per call; they are sorted once, before the first
        // one is handed out.
        if (bufferedPos < bufferedLen) {
          if (bufferedPos == 0) {
            sorter.sort(0, bufferedLen);
          }
          clearAttributes();
          restoreState(buffered[bufferedPos++]);
          if (first && posIncAttribute.getPositionIncrement() == 0) {
            // can easily happen with strange combinations (e.g. not outputting numbers, but
            // concat-all)
            posIncAttribute.setPositionIncrement(1);
          }
          first = false;
          return true;
        }

        // no saved concatenations, on to the next input word
        bufferedPos = bufferedLen = 0;
        hasSavedState = false;
        continue;
      }

      // word surrounded by delimiters: always output
      if (iterator.isSingleWord()) {
        generatePart(true);
        iterator.next();
        first = false;
        return true;
      }

      int wordType = iterator.type();

      // do we already have queued up incompatible concatenations?
      if (!concat.isEmpty() && (concat.type & wordType) == 0) {
        if (flushConcatenation(concat)) {
          hasOutputToken = false;
          buffer();
          continue;
        }
        hasOutputToken = false;
      }

      // add subwords depending upon options
      if (shouldConcatenate(wordType)) {
        if (concat.isEmpty()) {
          concat.type = wordType;
        }
        concatenate(concat);
      }

      // add all subwords (catenateAll)
      if (has(CATENATE_ALL)) {
        concatenate(concatAll);
      }

      // if we should output the word or number part
      if (shouldGenerateParts(wordType)) {
        generatePart(false);
        buffer();
      }

      // Advance to the next sub-word of the saved token.
      iterator.next();
    }
  }
 /**
  * Copies term text, position increment and offsets from {@code token} into this stream's
  * attributes.
  *
  * @param token source of the values to copy
  */
 private void applyToken(Token token) {
   final char[] termChars = token.termBuffer();
   final int termLength = token.termLength();
   termAtt.setTermBuffer(termChars, 0, termLength);

   final int increment = token.getPositionIncrement();
   posAtt.setPositionIncrement(increment);

   offsetAtt.setOffset(token.startOffset(), token.endOffset());
 }
 /**
  * Sets the position increment on {@code source}, registering a
  * {@link PositionIncrementAttribute} via {@code addAttribute} first.
  *
  * @param source attribute source to update
  * @param posIncr position increment to store
  */
 public static void setPositionIncrement(AttributeSource source, int posIncr) {
   source.addAttribute(PositionIncrementAttribute.class).setPositionIncrement(posIncr);
 }
  /**
   * Asserts that {@code ts} produces exactly the expected tokens and then ends.
   *
   * <p>For every expected term the attributes are first overwritten with bogus values, so
   * stale state cannot pass unnoticed, and the stream chain is checked for a call to
   * {@code clearAttributes()}. Offsets, types and position increments are verified only
   * when the corresponding array is non-null. The stream is reset before, and ended and
   * closed after, the comparison.
   *
   * @param ts stream under test
   * @param output expected terms; must not be {@code null}
   * @param startOffsets expected start offsets, or {@code null} to skip
   * @param endOffsets expected end offsets, or {@code null} to skip
   * @param types expected token types, or {@code null} to skip
   * @param posIncrements expected position increments, or {@code null} to skip
   * @param finalOffset expected end offset after {@code end()}, or {@code null} to skip
   * @throws IOException if the stream fails
   */
  public static void assertTokenStreamContents(
      TokenStream ts,
      String[] output,
      int startOffsets[],
      int endOffsets[],
      String types[],
      int posIncrements[],
      Integer finalOffset)
      throws IOException {
    assertNotNull(output);
    CheckClearAttributesAttribute checkClearAtt =
        (CheckClearAttributesAttribute) ts.addAttribute(CheckClearAttributesAttribute.class);

    assertTrue("has no TermAttribute", ts.hasAttribute(TermAttribute.class));
    TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);

    // Optional attributes are looked up only when something will be checked against them.
    final boolean needsOffsets =
        startOffsets != null || endOffsets != null || finalOffset != null;
    OffsetAttribute offsetAtt = null;
    if (needsOffsets) {
      assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
      offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
    }

    final boolean needsTypes = types != null;
    TypeAttribute typeAtt = null;
    if (needsTypes) {
      assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class));
      typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
    }

    final boolean needsPosIncrements = posIncrements != null;
    PositionIncrementAttribute posIncrAtt = null;
    if (needsPosIncrements) {
      assertTrue(
          "has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
      posIncrAtt = (PositionIncrementAttribute) ts.getAttribute(PositionIncrementAttribute.class);
    }

    ts.reset();
    for (int tokenIndex = 0; tokenIndex < output.length; tokenIndex++) {
      // Extra safety: make sure no state is preserved, and plant bogus values.
      ts.clearAttributes();
      termAtt.setTermBuffer("bogusTerm");
      if (offsetAtt != null) {
        offsetAtt.setOffset(14584724, 24683243);
      }
      if (typeAtt != null) {
        typeAtt.setType("bogusType");
      }
      if (posIncrAtt != null) {
        posIncrAtt.setPositionIncrement(45987657);
      }

      // Reset the flag, since clearAttributes() was just called above.
      checkClearAtt.getAndResetClearCalled();
      assertTrue("token " + tokenIndex + " does not exist", ts.incrementToken());
      assertTrue(
          "clearAttributes() was not called correctly in TokenStream chain",
          checkClearAtt.getAndResetClearCalled());

      assertEquals("term " + tokenIndex, output[tokenIndex], termAtt.term());
      if (startOffsets != null) {
        assertEquals(
            "startOffset " + tokenIndex, startOffsets[tokenIndex], offsetAtt.startOffset());
      }
      if (endOffsets != null) {
        assertEquals("endOffset " + tokenIndex, endOffsets[tokenIndex], offsetAtt.endOffset());
      }
      if (needsTypes) {
        assertEquals("type " + tokenIndex, types[tokenIndex], typeAtt.type());
      }
      if (needsPosIncrements) {
        assertEquals(
            "posIncrement " + tokenIndex,
            posIncrements[tokenIndex],
            posIncrAtt.getPositionIncrement());
      }
    }

    assertFalse("end of stream", ts.incrementToken());
    ts.end();
    if (finalOffset != null) {
      assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
    }
    ts.close();
  }