@Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();
    if (delimitersCount == -1) {
      int length = 0;
      delimiterPositions.add(0);
      while (true) {
        int c = input.read();
        if (c < 0) {
          break;
        }
        length++;
        if (c == delimiter) {
          delimiterPositions.add(length);
          resultToken.append(replacement);
        } else {
          resultToken.append((char) c);
        }
      }
      delimitersCount = delimiterPositions.size();
      if (delimiterPositions.get(delimitersCount - 1) < length) {
        delimiterPositions.add(length);
        delimitersCount++;
      }
      if (resultTokenBuffer.length < resultToken.length()) {
        resultTokenBuffer = new char[resultToken.length()];
      }
      resultToken.getChars(0, resultToken.length(), resultTokenBuffer, 0);
      resultToken.setLength(0);
      int idx = delimitersCount - 1 - skip;
      if (idx >= 0) {
        // otherwise it's OK, because we will skip and return false
        endPosition = delimiterPositions.get(idx);
      }
      finalOffset = correctOffset(length);
      posAtt.setPositionIncrement(1);
    } else {
      posAtt.setPositionIncrement(0);
    }

    while (skipped < delimitersCount - skip - 1) {
      int start = delimiterPositions.get(skipped);
      termAtt.copyBuffer(resultTokenBuffer, start, endPosition - start);
      offsetAtt.setOffset(correctOffset(start), correctOffset(endPosition));
      skipped++;
      return true;
    }

    return false;
  }
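
This first snippet matches the shape of Lucene's ReversePathHierarchyTokenizer and follows the standard TokenStream contract: clearAttributes(), populate the term/offset attributes, return true once per token, and let end() publish the final offset. A minimal consumption loop under that contract (a sketch, assuming a recent Lucene where Tokenizer has a no-arg constructor plus setReader(); WhitespaceTokenizer stands in for the tokenizer above):

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class ConsumeTokenStream {
  public static void main(String[] args) throws Exception {
    try (Tokenizer ts = new WhitespaceTokenizer()) {
      ts.setReader(new StringReader("one two three"));
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute off = ts.addAttribute(OffsetAttribute.class);
      ts.reset();                   // mandatory before the first incrementToken()
      while (ts.incrementToken()) { // each call overwrites the shared attributes
        System.out.println(term + " [" + off.startOffset() + "," + off.endOffset() + ")");
      }
      ts.end();                     // gives end() a chance to set the final offset
    }
  }
}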
  @Override
  public boolean incrementToken() throws IOException {

    if (tokenIter == null || !tokenIter.hasNext()) {
      if (input.incrementToken()) {
        tokStart = offsetAtt.startOffset();
        tokEnd = offsetAtt.endOffset();
        hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
        tokenBuffer = wordSegmenter.getTokendWords(termAtt.toString());
        tokenIter = tokenBuffer.iterator();
        if (!tokenIter.hasNext()) return false;

      } else {
        return false;
      }
    }

    clearAttributes();

    TokendWords nextWord = tokenIter.next();

    // read next() once and reuse it; calling next() twice would advance past a word
    char[] word = nextWord.next();
    termAtt.copyBuffer(word, 0, word.length);
    if (hasIllegalOffsets) {
      offsetAtt.setOffset(tokStart, tokEnd);
    } else {
      offsetAtt.setOffset(nextWord.start, nextWord.end);
    }
    typeAtt.setType("word");
    return true;
  }
  @Override
  public boolean incrementToken() throws IOException {
    if (tokenIter == null || !tokenIter.hasNext()) {
      // there are no remaining tokens from the current sentence... are there more sentences?
      if (input.incrementToken()) {
        tokStart = offsetAtt.startOffset();
        tokEnd = offsetAtt.endOffset();
        // if length by start + end offsets doesn't match the term text then assume
        // this is a synonym and don't adjust the offsets.
        hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
        // a new sentence is available: process it.
        tokenBuffer = splitIntoTokens(termAtt.toString(), offsetAtt.startOffset());
        tokenIter = tokenBuffer.iterator();

        // it should not be possible to have a sentence with 0 words, check just in case.
        // returning EOS isn't the best either, but it's the behavior of the original code.
        if (!tokenIter.hasNext()) return false;
      } else {
        return false; // no more sentences, end of stream!
      }
    }
    // WordTokenFilter must clear attributes, as it is creating new tokens.
    clearAttributes();
    // There are remaining tokens from the current sentence, return the next one.
    SegToken nextWord = tokenIter.next();
    termAtt.append(nextWord.term);
    // termAtt.copyBuffer(nextWord.charArray, 0, nextWord.charArray.length);
    if (hasIllegalOffsets) {
      offsetAtt.setOffset(tokStart, tokEnd);
    } else {
      offsetAtt.setOffset(nextWord.start, nextWord.end);
    }
    typeAtt.setType("word");
    return true;
  }
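
This filter is the word-splitting stage of a Chinese analysis chain in the style of Lucene's SmartChineseAnalyzer: the tokenizer emits whole sentences, and the filter re-segments each sentence into words. A hedged end-to-end sketch, assuming the SmartChineseAnalyzer class from the lucene-analyzers-smartcn module:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SmartCnDemo {
  public static void main(String[] args) throws Exception {
    Analyzer analyzer = new SmartChineseAnalyzer();
    try (TokenStream ts = analyzer.tokenStream("body", "我是中国人")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term); // one segmented word per call
      }
      ts.end();
    }
  }
}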
 private void emit(Token token) {
   emit(token.tok);
   OffsetAttribute offAttr = getOffsetAttribute();
   if (token.endPos > token.startPos && token.startPos >= 0) {
     offAttr.setOffset(token.startPos, token.endPos);
   }
 }
 @Override
 public final boolean incrementToken() throws IOException {
   while (true) {
     if (curTermBuffer == null) {
       if (!input.incrementToken()) {
         return false;
       } else {
         curTermBuffer = termAtt.buffer().clone();
         curTermLength = termAtt.length();
         curGramSize = minGram;
         tokStart = offsetAtt.startOffset();
       }
     }
     if (curGramSize <= maxGram) {
       if (!(curGramSize
               > curTermLength // if the remaining input is too short, we can't generate any
                               // n-grams
           || curGramSize > maxGram)) { // if we have hit the end of our n-gram size range, quit
         // grab gramSize chars from front or back
         int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
         int end = start + curGramSize;
         clearAttributes();
         offsetAtt.setOffset(tokStart + start, tokStart + end);
         termAtt.copyBuffer(curTermBuffer, start, curGramSize);
         curGramSize++;
         return true;
       }
     }
     curTermBuffer = null;
   }
 }
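
A worked trace of the edge n-gram loop above, with hypothetical values curTermBuffer = "abcde", minGram = 1, maxGram = 3; each incrementToken() call returns one gram, growing from the chosen side:

// Side.FRONT: "a" [0,1), "ab" [0,2), "abc" [0,3)   (offsets relative to tokStart)
// Side.BACK : "e" [4,5), "de" [3,5), "cde" [2,5)
// Once curGramSize exceeds maxGram, curTermBuffer is reset to null and the
// outer while(true) pulls the next token from the input stream.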
  /* (non-Javadoc)
   * @see org.apache.lucene.analysis.TokenStream#incrementToken()
   */
  @Override
  public boolean incrementToken() throws IOException {
    // clear all token attributes
    clearAttributes();
    skippedPositions = 0;

    Lexeme nextLexeme = _IKImplement.next();
    if (nextLexeme != null) {
      posIncrAtt.setPositionIncrement(skippedPositions + 1);

      // convert the Lexeme into attributes
      // set the token text
      termAtt.append(nextLexeme.getLexemeText());
      // set the token length
      termAtt.setLength(nextLexeme.getLength());
      // set the token offsets
      offsetAtt.setOffset(
          correctOffset(nextLexeme.getBeginPosition()), correctOffset(nextLexeme.getEndPosition()));

      // record the end position of the last token
      endPosition = nextLexeme.getEndPosition();
      // record the token type
      typeAtt.setType(nextLexeme.getLexemeTypeString());
      // return true to signal that another token is available
      return true;
    }
    // return false to signal that all tokens have been emitted
    return false;
  }
 @Override
 public final void end() throws IOException {
   super.end();
   // set final offset
   int finalOffset = correctOffset(charsRead);
   offsetAtt.setOffset(finalOffset, finalOffset);
 }
  @Override
  public final boolean incrementToken() throws IOException {
    if (!tokens.isEmpty()) {
      assert current != null;
      CompoundToken token = tokens.removeFirst();
      restoreState(current); // keep all other attributes untouched
      termAtt.setEmpty().append(token.txt);
      offsetAtt.setOffset(token.startOffset, token.endOffset);
      posIncAtt.setPositionIncrement(0);
      return true;
    }

    current = null; // not really needed, but for safety
    if (input.incrementToken()) {
      // Only words longer than minWordSize get processed
      if (termAtt.length() >= this.minWordSize) {
        decompose();
        // only capture the state if we really need it for producing new tokens
        if (!tokens.isEmpty()) {
          current = captureState();
        }
      }
      // return original token:
      return true;
    } else {
      return false;
    }
  }
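
The structure is that of Lucene's compound-word filters: the original token is always emitted first, and the subwords collected by decompose() are replayed afterwards at position increment 0 via the captured state. A usage sketch, assuming the DictionaryCompoundWordTokenFilter subclass and a made-up dictionary (CharArraySet lives in org.apache.lucene.analysis in recent releases):

import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;

Tokenizer tok = new WhitespaceTokenizer();
tok.setReader(new StringReader("softball"));
CharArraySet dict = new CharArraySet(Arrays.asList("soft", "ball"), true);
TokenStream ts = new DictionaryCompoundWordTokenFilter(tok, dict);
// expected: "softball" (posIncr 1), then "soft" and "ball" stacked at posIncr 0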
 @Override
 public void end() throws IOException {
   super.end();
   // Set final offset
   int finalOffset = correctOffset(pos);
   offsetAtt.setOffset(finalOffset, finalOffset);
 }
  /*
   * (non-Javadoc)
   *
   * @see org.apache.lucene.analysis.TokenStream#next()
   */
  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();
    skippedPositions = 0;

    while (true) {
      int tokenType = scanner.getNextToken();

      if (tokenType == StandardTokenizerInterface.YYEOF) {
        return false;
      }

      if (scanner.yylength() <= maxTokenLength) {
        posIncrAtt.setPositionIncrement(skippedPositions + 1);
        scanner.getText(termAtt);
        final int start = scanner.yychar();
        offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));
        // This 'if' should be removed in the next release. For now, it converts
        // invalid acronyms to HOST. When removed, only the 'else' part should
        // remain.
        if (tokenType == StandardTokenizer.ACRONYM_DEP) {
          typeAtt.setType(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST]);
          termAtt.setLength(termAtt.length() - 1); // remove extra '.'
        } else {
          typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
        }
        return true;
      } else
        // When we skip a too-long term, we still increment the
        // position increment
        skippedPositions++;
    }
  }
  /*
   * (non-Javadoc)
   *
   * @see org.apache.lucene.analysis.TokenStream#next()
   */
  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();
    skippedPositions = 0;

    while (true) {
      int tokenType = scanner.getNextToken();

      if (tokenType == StandardTokenizerImpl.YYEOF) {
        return false;
      }

      if (scanner.yylength() <= maxTokenLength) {
        posIncrAtt.setPositionIncrement(skippedPositions + 1);
        scanner.getText(termAtt);
        final int start = scanner.yychar();
        offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));
        typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
        return true;
      } else
        // When we skip a too-long term, we still increment the
        // position increment
        skippedPositions++;
    }
  }
 @Override
 public boolean incrementToken() throws IOException {
   if (!getNextPartialSnippet()) return false;
   clearAttributes();
   termAtt.setEmpty().append(snippet, startTerm, startTerm + lenTerm);
   offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset + lenTerm));
   return true;
 }
 @Override
 public final void end() throws IOException {
   super.end();
   // set final offset
   int finalOffset = correctOffset(this.endPosition);
   offsetAtt.setOffset(finalOffset, finalOffset);
   posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
 }
  @Override
  public boolean incrementToken() throws IOException {
    if (index >= str.length()) return false;
    clearAttributes();
    if (group >= 0) {

      // match a specific group
      while (matcher.find()) {
        final String match = matcher.group(group);
        if (match.length() == 0) continue;
        termAtt.setEmpty().append(match);
        index = matcher.start(group);
        offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.end(group)));
        return true;
      }

      index = Integer.MAX_VALUE; // mark exhausted
      return false;

    } else {

      // String.split() functionality
      while (matcher.find()) {
        if (matcher.start() - index > 0) {
          // found a non-zero-length token
          termAtt.setEmpty().append(str, index, matcher.start());
          offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.start()));
          index = matcher.end();
          return true;
        }

        index = matcher.end();
      }

      if (str.length() - index == 0) {
        index = Integer.MAX_VALUE; // mark exhausted
        return false;
      }

      termAtt.setEmpty().append(str, index, str.length());
      offsetAtt.setOffset(correctOffset(index), correctOffset(str.length()));
      index = Integer.MAX_VALUE; // mark exhausted
      return true;
    }
  }
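
The two branches mirror the two modes of Lucene's PatternTokenizer: a non-negative group emits each match of that capture group, while group -1 treats the pattern as a delimiter, String.split() style. A minimal sketch, assuming the PatternTokenizer(Pattern, int) constructor:

import java.util.regex.Pattern;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.pattern.PatternTokenizer;

// group 1: emit the text inside single quotes, one token per match
Tokenizer quoted = new PatternTokenizer(Pattern.compile("'([^']+)'"), 1);
// group -1: commas are delimiters; the text between them becomes the tokens
Tokenizer csv = new PatternTokenizer(Pattern.compile(","), -1);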
 @Override
 public final void end() throws IOException {
   super.end();
   // set final offset
   int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
   offsetAtt.setOffset(finalOffset, finalOffset);
   // adjust any skipped tokens
   posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
 }
 public boolean incrementToken() throws IOException {
   if (inPhrase) {
     inPhrase = false;
     clearAttributes();
     termAtt.setTermBuffer("phrase2");
     offsetAtt.setOffset(savedStart, savedEnd);
     return true;
   } else
     while (input.incrementToken()) {
       if (termAtt.term().equals("phrase")) {
         inPhrase = true;
         savedStart = offsetAtt.startOffset();
         savedEnd = offsetAtt.endOffset();
         termAtt.setTermBuffer("phrase1");
         offsetAtt.setOffset(savedStart, savedEnd);
         return true;
       } else if (!termAtt.term().equals("stop")) return true;
     }
   return false;
 }
 @Override
 public boolean incrementToken() {
   if (used) {
     return false;
   }
   clearAttributes();
   termAttribute.append(value);
   offsetAttribute.setOffset(0, value.length());
   used = true;
   return true;
 }
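
For this single-token stream to be reusable, Lucene's TokenStream workflow requires a matching reset() that clears the flag; a sketch of the companion override (assumed, since it is not part of the excerpt):

@Override
public void reset() throws IOException {
  super.reset();
  used = false; // allow the single value to be emitted again on reuse
}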
  /**
   * Generates a word/number part, updating the appropriate attributes
   *
   * @param isSingleWord {@code true} if the generation is occurring from a single word, {@code
   *     false} otherwise
   */
  private void generatePart(boolean isSingleWord) {
    clearAttributes();
    termAttribute.copyBuffer(savedBuffer, iterator.current, iterator.end - iterator.current);

    int startOffset = savedStartOffset + iterator.current;
    int endOffset = savedStartOffset + iterator.end;

    if (hasIllegalOffsets) {
      // historically this filter did this regardless of 'isSingleWord',
      // but we must sanity-check the offsets:
      if (isSingleWord && startOffset <= savedEndOffset) {
        offsetAttribute.setOffset(startOffset, savedEndOffset);
      } else {
        offsetAttribute.setOffset(savedStartOffset, savedEndOffset);
      }
    } else {
      offsetAttribute.setOffset(startOffset, endOffset);
    }
    posIncAttribute.setPositionIncrement(position(false));
    typeAttribute.setType(savedType);
  }
 @Override
 public void end() throws IOException {
   super.end();
   // NOTE: somewhat... hackish, but we need this to
   // satisfy BTSTC:
   final int lastOffset;
   if (tokens != null && !tokens.isEmpty()) {
     lastOffset = tokens.get(tokens.size() - 1).endOffset();
   } else {
     lastOffset = 0;
   }
   offsetAtt.setOffset(correctOffset(lastOffset), correctOffset(inputLength));
 }
 public boolean incrementToken() throws IOException {
   clearAttributes();
   Token token = nextToken(reusableToken);
   if (token != null) {
     termAtt.copyBuffer(token.buffer(), 0, token.length());
     offsetAtt.setOffset(token.startOffset(), token.endOffset());
     typeAtt.setType(token.type());
     return true;
   } else {
     end();
     return false;
   }
 }
  @Override
  public boolean incrementToken() throws IOException {
    clearAttributes();

    // the Tokens from tokenIteractor are exhausted; keep requesting more data from the reader
    while (tokenIteractor == null || !tokenIteractor.hasNext()) {
      // System.out.println(dissected);
      int read = 0;
      int remainning = -1; // chars left in the buffer before the next read from the reader; negative means no read is needed for now
      if (dissected >= beef.length()) {
        remainning = 0;
      } else if (dissected < 0) {
        remainning = bufferLength + dissected;
      }
      if (remainning >= 0) {
        if (remainning > 0) {
          System.arraycopy(buffer, -dissected, buffer, 0, remainning);
        }
        read = input.read(buffer, remainning, bufferLength - remainning);
        if (read > 0) {
          inputLength += read; // read is -1 at EOF; don't let it skew the total
        }
        int charCount = remainning + read;
        if (charCount < 0) {
          // the reader is exhausted; signal end of stream (the old next() contract returned null here)
          return false;
        }
        if (charCount < bufferLength) {
          buffer[charCount++] = 0;
        }
        // construct the "beef" (the input) and let the knife "dissect" it
        beef.set(0, charCount);
        offset += Math.abs(dissected);
        // offset -= remainning;
        dissected = 0;
      }
      dissected = knife.dissect(this, beef, dissected);
      // offset += read;// !!!
      tokenIteractor = tokenCollector.iterator();
    }

    if (tokenIteractor.hasNext()) {
      // return the next Token from tokenIteractor
      Token token = tokenIteractor.next();
      termAtt.setEmpty();
      termAtt.append(token.charSequence());
      offsetAtt.setOffset(correctOffset(token.startOffset()), correctOffset(token.endOffset()));
      positionIncrementAttribute.setPositionIncrement(token.endOffset());
      return true;
    }
    return tokenIteractor.hasNext();
  }
 @Override
 public boolean incrementToken() throws IOException {
   if (index >= tokens.length) return false;
   else {
     clearAttributes();
     Token token = tokens[index++];
     termAtt.setEmpty().append(token);
     offsetAtt.setOffset(token.startOffset(), token.endOffset());
     posIncAtt.setPositionIncrement(token.getPositionIncrement());
     flagsAtt.setFlags(token.getFlags());
     typeAtt.setType(token.type());
     payloadAtt.setPayload(token.getPayload());
     return true;
   }
 }
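
This is the canned-token pattern Lucene's tests use: prebuilt Token objects are replayed one per call, with term, offsets, position increment, flags, type, and payload all copied across. A hedged construction sketch (Token is the old org.apache.lucene.analysis.Token convenience class; the values are made up):

import org.apache.lucene.analysis.Token;

Token t1 = new Token("hello", 0, 5);   // term text plus start/end offsets
Token t2 = new Token("world", 6, 11);
t2.setPositionIncrement(1);            // 1 is the default; shown for illustration
Token[] tokens = new Token[] { t1, t2 };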
 @Override
 public boolean incrementToken() throws IOException {
   if (iterator == null) {
     initializeIterator();
   }
   if (iterator.hasNext()) {
     clearAttributes();
     AnnotationFS next = iterator.next();
     termAttr.append(next.getCoveredText());
     offsetAttr.setOffset(correctOffset(next.getBegin()), correctOffset(next.getEnd()));
     return true;
   } else {
     return false;
   }
 }
 @Override
 public boolean incrementToken() throws IOException {
   boolean tokenAvailable = false;
   int builtGramSize = 0;
   if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) {
     shiftInputWindow();
     gramBuilder.setLength(0);
   } else {
     builtGramSize = gramSize.getPreviousValue();
   }
   if (inputWindow.size() >= gramSize.getValue()) {
     boolean isAllFiller = true;
     InputWindowToken nextToken = null;
     Iterator<InputWindowToken> iter = inputWindow.iterator();
     for (int gramNum = 1; iter.hasNext() && builtGramSize < gramSize.getValue(); ++gramNum) {
       nextToken = iter.next();
       if (builtGramSize < gramNum) {
         if (builtGramSize > 0) {
           gramBuilder.append(tokenSeparator);
         }
         gramBuilder.append(nextToken.termAtt.buffer(), 0, nextToken.termAtt.length());
         ++builtGramSize;
       }
       if (isAllFiller && nextToken.isFiller) {
         if (gramNum == gramSize.getValue()) {
           gramSize.advance();
         }
       } else {
         isAllFiller = false;
       }
     }
     if (!isAllFiller && builtGramSize == gramSize.getValue()) {
       inputWindow.getFirst().attSource.copyTo(this);
       posIncrAtt.setPositionIncrement(isOutputHere ? 0 : 1);
       termAtt.setEmpty().append(gramBuilder);
       if (gramSize.getValue() > 1) {
         typeAtt.setType(tokenType);
         noShingleOutput = false;
       }
       offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
       posLenAtt.setPositionLength(builtGramSize);
       isOutputHere = true;
       gramSize.advance();
       tokenAvailable = true;
     }
   }
   return tokenAvailable;
 }
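
This is recognizably the heart of Lucene's ShingleFilter, which concatenates adjacent tokens into word n-grams. A sketch of the observable output, assuming the standard ShingleFilter(TokenStream, int, int) constructor and the default single-space separator:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;

Tokenizer tok = new WhitespaceTokenizer();
tok.setReader(new StringReader("please divide this"));
TokenStream shingles = new ShingleFilter(tok, 2, 2);
// expected order: "please", "please divide", "divide", "divide this", "this"
// unigrams advance the position (posIncr 1); each shingle is stacked on its
// leading unigram with posIncr 0, matching the isOutputHere logic above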
  @Override
  public boolean incrementToken() throws IOException {
    clearAttributes();
    buffer.setLength(0);
    int ci;
    char ch, pch;
    boolean atBegin = true;
    tokenStart = tokenEnd;
    ci = input.read();
    ch = (char) ci;

    while (true) {
      if (ci == -1) {
        break;
      } else if (PUNCTION.indexOf(ch) != -1) {
        // End of a sentence
        buffer.append(ch);
        tokenEnd++;
        break;
      } else if (atBegin && SPACES.indexOf(ch) != -1) {
        tokenStart++;
        tokenEnd++;
        ci = input.read();
        ch = (char) ci;
      } else {
        buffer.append(ch);
        atBegin = false;
        tokenEnd++;
        pch = ch;
        ci = input.read();
        ch = (char) ci;
        // Two consecutive whitespace chars (e.g. CR followed by LF) end the sentence
        if (SPACES.indexOf(ch) != -1 && SPACES.indexOf(pch) != -1) {
          // buffer.append(ch);
          tokenEnd++;
          break;
        }
      }
    }
    if (buffer.length() == 0) return false;
    else {
      termAtt.setEmpty().append(buffer);
      offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenEnd));
      typeAtt.setType("sentence");
      return true;
    }
  }
  /** Writes the joined unhyphenated term */
  private void unhyphenate() {
    int endOffset = offsetAttribute.endOffset();

    restoreState(savedState);
    savedState = null;

    char term[] = termAttribute.buffer();
    int length = hyphenated.length();
    if (length > termAttribute.length()) {
      term = termAttribute.resizeBuffer(length);
    }

    hyphenated.getChars(0, length, term, 0);
    termAttribute.setLength(length);
    offsetAttribute.setOffset(offsetAttribute.startOffset(), endOffset);
    hyphenated.setLength(0);
  }
 @Override
 public boolean incrementToken() {
   if (upto < tokens.length) {
     final Token token = tokens[upto++];
     // TODO: can we just capture/restoreState so
     // we get all attrs...?
     clearAttributes();
     termAtt.setEmpty();
     termAtt.append(token.toString());
     posIncrAtt.setPositionIncrement(token.getPositionIncrement());
     posLengthAtt.setPositionLength(token.getPositionLength());
     offsetAtt.setOffset(token.startOffset(), token.endOffset());
     payloadAtt.setPayload(token.getPayload());
     return true;
   } else {
     return false;
   }
 }
  @Override
  public boolean incrementToken() throws IOException {

    // parse() is able to return w/o producing any new
    // tokens, when the tokens it had produced were entirely
    // punctuation.  So we loop here until we get a real
    // token or we end:
    while (pending.size() == 0) {
      if (end) {
        return false;
      }

      // Push Viterbi forward some more:
      parse();
    }

    final Token token = pending.remove(pending.size() - 1);

    int position = token.getPosition();
    int length = token.getLength();
    clearAttributes();
    assert length > 0;
    // System.out.println("off=" + token.getOffset() + " len=" + length + " vs " +
    // token.getSurfaceForm().length);
    termAtt.copyBuffer(token.getSurfaceForm(), token.getOffset(), length);
    offsetAtt.setOffset(correctOffset(position), correctOffset(position + length));
    basicFormAtt.setToken(token);
    posAtt.setToken(token);
    readingAtt.setToken(token);
    inflectionAtt.setToken(token);
    if (token.getPosition() == lastTokenPos) {
      posIncAtt.setPositionIncrement(0);
      posLengthAtt.setPositionLength(token.getPositionLength());
    } else {
      assert token.getPosition() > lastTokenPos;
      posIncAtt.setPositionIncrement(1);
      posLengthAtt.setPositionLength(1);
    }
    if (VERBOSE) {
      System.out.println(Thread.currentThread().getName() + ":    incToken: return token=" + token);
    }
    lastTokenPos = token.getPosition();
    return true;
  }
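
The attribute set here (base form, part of speech, reading, inflection) marks this as a Kuromoji-style Japanese tokenizer. A hedged construction sketch, assuming the JapaneseTokenizer(UserDictionary, boolean, Mode) constructor from the lucene-analyzers-kuromoji module (no user dictionary, punctuation discarded):

import org.apache.lucene.analysis.ja.JapaneseTokenizer;

JapaneseTokenizer tok =
    new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
// SEARCH mode also emits decompounded segments; the posIncr/posLength logic
// above stacks such segments at the same position as the compound token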
 @Override
 public boolean incrementToken() throws IOException {
   if (tokens == null) {
     fillTokens();
   }
   // System.out.println("graphTokenizer: incr upto=" + upto + " vs " + tokens.size());
   if (upto == tokens.size()) {
     // System.out.println("  END @ " + tokens.size());
     return false;
   }
   final Token t = tokens.get(upto++);
   // System.out.println("  return token=" + t);
   clearAttributes();
   termAtt.append(t.toString());
   offsetAtt.setOffset(t.startOffset(), t.endOffset());
   posIncrAtt.setPositionIncrement(t.getPositionIncrement());
   posLengthAtt.setPositionLength(t.getPositionLength());
   return true;
 }
 @Override
 public final boolean incrementToken() throws IOException {
   while (true) {
     if (curTermBuffer == null) {
       if (!input.incrementToken()) {
         return false;
       } else {
         curTermBuffer = termAtt.buffer().clone();
         curTermLength = termAtt.length();
         curCodePointCount = charUtils.codePointCount(termAtt);
         curGramSize = minGram;
         tokStart = offsetAtt.startOffset();
         tokEnd = offsetAtt.endOffset();
         savePosIncr += posIncrAtt.getPositionIncrement();
         savePosLen = posLenAtt.getPositionLength();
       }
     }
     if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
       if (curGramSize
           <= curCodePointCount) { // if the remaining input is too short, we can't generate any
         // n-grams
         // grab gramSize chars from front or back
         clearAttributes();
         offsetAtt.setOffset(tokStart, tokEnd);
         // first ngram gets increment, others don't
         if (curGramSize == minGram) {
           posIncrAtt.setPositionIncrement(savePosIncr);
           savePosIncr = 0;
         } else {
           posIncrAtt.setPositionIncrement(0);
         }
         posLenAtt.setPositionLength(savePosLen);
         final int charLength =
             charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
         termAtt.copyBuffer(curTermBuffer, 0, charLength);
         curGramSize++;
         return true;
       }
     }
     curTermBuffer = null;
   }
 }
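
A worked trace of this newer variant, with hypothetical values "abc", minGram = 1, maxGram = 2; the loop yields one gram per call:

// call 1: "a"  posIncr = accumulated increment (savePosIncr), offsets [tokStart, tokEnd]
// call 2: "ab" posIncr = 0,                                   offsets [tokStart, tokEnd]
// Unlike the older Side-based filter above, every gram keeps the original
// token's offsets, and gram lengths are measured in code points so that
// surrogate pairs are never split in half.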