/* (non-Javadoc)
   * @see org.apache.lucene.analysis.TokenStream#incrementToken()
   */
  @Override
  public boolean incrementToken() throws IOException {
    // Clear all token attributes
    clearAttributes();
    skippedPositions = 0;

    Lexeme nextLexeme = _IKImplement.next();
    if (nextLexeme != null) {
      posIncrAtt.setPositionIncrement(skippedPositions + 1);

      // Convert the Lexeme into attributes
      // Set the term text
      termAtt.append(nextLexeme.getLexemeText());
      // Set the term length
      termAtt.setLength(nextLexeme.getLength());
      // Set the term offsets
      offsetAtt.setOffset(
          correctOffset(nextLexeme.getBeginPosition()), correctOffset(nextLexeme.getEndPosition()));

      // Record the end position of the last token
      endPosition = nextLexeme.getEndPosition();
      // Record the token type
      typeAtt.setType(nextLexeme.getLexemeTypeString());
      // Return true to signal that another token is available
      return true;
    }
    // Return false to signal that no more tokens remain
    return false;
  }
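/*
 * Every implementation in this collection follows the same TokenStream contract: the caller
 * must reset() the stream, pull tokens with incrementToken() until it returns false, then
 * call end() and close(). A minimal consumer sketch, assuming a recent Lucene where
 * StandardAnalyzer has a no-argument constructor; the field name "body" and the sample text
 * are illustrative only.
 */
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenStreamConsumerSketch {
  public static void main(String[] args) throws IOException {
    try (Analyzer analyzer = new StandardAnalyzer();
        TokenStream ts = analyzer.tokenStream("body", "the quick brown fox")) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      ts.reset(); // mandatory before the first incrementToken() call
      while (ts.incrementToken()) { // each call advances to the next token
        System.out.println(
            termAtt + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + "]");
      }
      ts.end(); // records the final offset state
    } // try-with-resources closes the stream, then the analyzer
  }
}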
  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      char[] buffer = termAtt.buffer();
      int length = termAtt.length();

      for (int i = 0; i < length; i++) {
        int ch = Character.codePointAt(buffer, i, length);
        // look for digits outside of basic latin
        if (ch > 0x7F && Character.isDigit(ch)) {
          // replace with equivalent basic latin digit
          buffer[i] = (char) ('0' + Character.getNumericValue(ch));
          // if the original was supplementary, shrink the string
          if (ch > 0xFFFF) {
            length = StemmerUtil.delete(buffer, i + 1, length);
            termAtt.setLength(length);
          }
        }
      }

      return true;
    } else {
      return false;
    }
  }
  /*
   * (non-Javadoc)
   *
   * @see org.apache.lucene.analysis.TokenStream#incrementToken()
   */
  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();
    skippedPositions = 0;

    while (true) {
      int tokenType = scanner.getNextToken();

      if (tokenType == StandardTokenizerInterface.YYEOF) {
        return false;
      }

      if (scanner.yylength() <= maxTokenLength) {
        posIncrAtt.setPositionIncrement(skippedPositions + 1);
        scanner.getText(termAtt);
        final int start = scanner.yychar();
        offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));
        // This 'if' should be removed in the next release. For now, it converts
        // invalid acronyms to HOST. When removed, only the 'else' part should
        // remain.
        if (tokenType == StandardTokenizer.ACRONYM_DEP) {
          typeAtt.setType(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST]);
          termAtt.setLength(termAtt.length() - 1); // remove extra '.'
        } else {
          typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
        }
        return true;
      } else
        // When we skip a too-long term, we still increment the
        // position increment
        skippedPositions++;
    }
  }
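  /*
   * Tokens longer than maxTokenLength are skipped above, but their positions still count.
   * The companion end() override usually folds the positions skipped at end-of-stream into
   * the final position increment and records the final offset, as Lucene's StandardTokenizer
   * does. A sketch assuming the same scanner, offsetAtt, posIncrAtt and skippedPositions
   * members used by the method above.
   */
  @Override
  public final void end() throws IOException {
    super.end();
    // final offset: just past the last character consumed by the scanner
    int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
    offsetAtt.setOffset(finalOffset, finalOffset);
    // account for over-long tokens skipped after the last emitted token
    posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
  }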
 @Override
 public boolean incrementToken() {
   if (index < n) {
     clearAttributes();
     termAtt.buffer()[0] = 'a';
     termAtt.setLength(1);
     index++;
     return true;
   }
   return false;
 }
 @Override
 public boolean incrementToken() throws IOException {
   if (input.incrementToken()) {
     if (termAtt.length() > 0 && termAtt.buffer()[0] == 't') {
       termAtt.setLength(0);
     }
     return true;
   } else {
     return false;
   }
 }
 @Override
 public boolean incrementToken() throws IOException {
   if (input.incrementToken()) {
     if (!keywordAttr.isKeyword()) {
       final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
       termAtt.setLength(newlen);
     }
     return true;
   } else {
     return false;
   }
 }
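/*
 * The isKeyword() gate above only has an effect if an earlier filter in the chain marks
 * protected terms. A minimal analyzer sketch wiring Lucene's SetKeywordMarkerFilter ahead
 * of PorterStemFilter (which performs the same isKeyword() check internally); it assumes
 * Lucene 5+ where createComponents takes only the field name, and the protected word list
 * is illustrative.
 */
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;

public class ProtectedStemAnalyzerSketch extends Analyzer {
  // terms in this set keep their original form; the entries are illustrative
  private final CharArraySet protectedWords =
      new CharArraySet(Arrays.asList("lucene", "solr"), true);

  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new WhitespaceTokenizer();
    // SetKeywordMarkerFilter sets KeywordAttribute, so the stemmer leaves these terms alone
    TokenStream chain = new SetKeywordMarkerFilter(source, protectedWords);
    chain = new PorterStemFilter(chain);
    return new TokenStreamComponents(source, chain);
  }
}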
 @Override
 public final boolean incrementToken() throws IOException {
   if (input.incrementToken()) {
     final int length = termAttribute.length();
     if (length > size) {
       termAttribute.setLength(size);
     }
     return true;
   } else {
     return false;
   }
 }
 @Override
 public boolean incrementToken() throws IOException {
   if (input.incrementToken()) {
     if (!keywordAttr.isKeyword()) {
       // this stemmer increases word length by 1: worst case '*çom' -> '*ción'
       final int len = termAtt.length();
       final int newlen = stemmer.stem(termAtt.resizeBuffer(len + 1), len);
       termAtt.setLength(newlen);
     }
     return true;
   } else {
     return false;
   }
 }
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }

    final char[] buffer = termAtt.buffer();
    final int bufferLength = termAtt.length();

    if (bufferLength >= 2
        && buffer[bufferLength - 2] == '\''
        && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
      termAtt.setLength(bufferLength - 2); // Strip last 2 characters off

    return true;
  }
  /** Writes the joined unhyphenated term */
  private void unhyphenate() {
    int endOffset = offsetAttribute.endOffset();

    restoreState(savedState);
    savedState = null;

    char[] term = termAttribute.buffer();
    int length = hyphenated.length();
    if (length > termAttribute.length()) {
      term = termAttribute.resizeBuffer(length);
    }

    hyphenated.getChars(0, length, term, 0);
    termAttribute.setLength(length);
    offsetAttribute.setOffset(offsetAttribute.startOffset(), endOffset);
    hyphenated.setLength(0);
  }
 @Override
 public boolean incrementToken() throws IOException {
   if (input.incrementToken()) {
     final char[] buffer = termAtt.buffer();
     final int length = termAtt.length();
     for (int i = 0; i < length; i++) {
       if (buffer[i] == delimiter) {
         payAtt.setPayload(encoder.encode(buffer, i + 1, (length - (i + 1))));
         termAtt.setLength(i); // simply set a new length
         return true;
       }
     }
     // we have not seen the delimiter
     payAtt.setPayload(null);
     return true;
   } else return false;
 }
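/*
 * Usage sketch for the delimiter/payload pattern above: with '|' as the delimiter and a
 * float encoder, an input token such as "apple|2.5" is emitted as the term "apple" with the
 * encoded float 2.5 attached as its payload, while a token without the delimiter gets a null
 * payload. The sketch assumes Lucene's org.apache.lucene.analysis.payloads classes
 * (DelimitedPayloadTokenFilter, FloatEncoder, PayloadHelper); verify the exact signatures
 * against your Lucene version.
 */
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
import org.apache.lucene.analysis.payloads.FloatEncoder;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.util.BytesRef;

public class PayloadFilterSketch {
  public static void main(String[] args) throws IOException {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("apple|2.5 pie"));
    try (TokenStream ts = new DelimitedPayloadTokenFilter(tokenizer, '|', new FloatEncoder())) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      PayloadAttribute payAtt = ts.addAttribute(PayloadAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        BytesRef payload = payAtt.getPayload(); // null for "pie", which has no delimiter
        float weight = payload == null
            ? 1.0f
            : PayloadHelper.decodeFloat(payload.bytes, payload.offset);
        System.out.println(termAtt + " -> " + weight);
      }
      ts.end();
    }
  }
}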
 @Override
 public boolean incrementToken() throws IOException {
   if (input.incrementToken()) {
     char[] termBuffer = termAtt.buffer();
     String termText = new String(termBuffer, 0, termAtt.length());
     collator.getRawCollationKey(termText, reusableKey);
     int encodedLength =
         IndexableBinaryStringTools.getEncodedLength(reusableKey.bytes, 0, reusableKey.size);
     if (encodedLength > termBuffer.length) {
       termAtt.resizeBuffer(encodedLength);
     }
     termAtt.setLength(encodedLength);
     IndexableBinaryStringTools.encode(
         reusableKey.bytes, 0, reusableKey.size, termAtt.buffer(), 0, encodedLength);
     return true;
   } else {
     return false;
   }
 }
 @Override
 public final boolean incrementToken() throws IOException {
   if (!done) {
     clearAttributes();
     done = true;
     int upto = 0;
     char[] buffer = termAtt.buffer();
     while (true) {
       final int length = input.read(buffer, upto, buffer.length - upto);
       if (length == -1) break;
       upto += length;
       if (upto == buffer.length) buffer = termAtt.resizeBuffer(1 + buffer.length);
     }
     termAtt.setLength(upto);
     finalOffset = correctOffset(upto);
     offsetAtt.setOffset(correctOffset(0), finalOffset);
     return true;
   }
   return false;
 }
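  /*
   * Tokenizers built around a done flag, like the one above, also need reset() and end()
   * overrides before they can be reused across documents. A sketch modeled on Lucene's
   * KeywordTokenizer, assuming the same done, finalOffset and offsetAtt fields as above.
   */
  @Override
  public final void end() throws IOException {
    super.end();
    // report the end-of-input offset recorded when the single token was produced
    offsetAtt.setOffset(finalOffset, finalOffset);
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    // allow the tokenizer to emit a token again for the next reader
    this.done = false;
  }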
  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();

    if (!done) {
      done = true;
      int upto = 0;
      char[] buffer = termAtt.buffer();
      while (true) {
        final int length = input.read(buffer, upto, buffer.length - upto);
        if (length == -1) break;
        upto += length;
        if (upto == buffer.length) buffer = termAtt.resizeBuffer(1 + buffer.length);
      }
      termAtt.setLength(upto);
      String str = termAtt.toString();
      termAtt.setEmpty();
      StringBuilder stringBuilder = new StringBuilder();
      StringBuilder firstLetters = new StringBuilder();
      for (int i = 0; i < str.length(); i++) {
        char c = str.charAt(i);
        if (c < 128) {
          stringBuilder.append(c);
        } else {
          try {
            String[] strs = PinyinHelper.toHanyuPinyinStringArray(c, format);
            if (strs != null) {
              // get first result by default
              String first_value = strs[0];
              // TODO more than one pinyin
              stringBuilder.append(first_value);
              if (this.padding_char.length() > 0) {
                stringBuilder.append(this.padding_char);
              }
              firstLetters.append(first_value.charAt(0));
            }
          } catch (BadHanyuPinyinOutputFormatCombination badHanyuPinyinOutputFormatCombination) {
            badHanyuPinyinOutputFormatCombination.printStackTrace();
          }
        }
      }

      // let's join them
      if (first_letter.equals("prefix")) {
        termAtt.append(firstLetters.toString());
        if (this.padding_char.length() > 0) {
          termAtt.append(this.padding_char); // TODO splitter
        }
        termAtt.append(stringBuilder.toString());
      } else if (first_letter.equals("append")) {
        termAtt.append(stringBuilder.toString());
        if (this.padding_char.length() > 0) {
          if (!stringBuilder.toString().endsWith(this.padding_char)) {
            termAtt.append(this.padding_char);
          }
        }
        termAtt.append(firstLetters.toString());
      } else if (first_letter.equals("none")) {
        termAtt.append(stringBuilder.toString());
      } else if (first_letter.equals("only")) {
        termAtt.append(firstLetters.toString());
      }

      finalOffset = correctOffset(upto);
      offsetAtt.setOffset(correctOffset(0), finalOffset);
      return true;
    }
    return false;
  }
  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();
    termAtt.append(resultToken);
    if (resultToken.length() == 0) {
      posAtt.setPositionIncrement(1);
    } else {
      posAtt.setPositionIncrement(0);
    }
    int length = 0;
    boolean added = false;
    if (endDelimiter) {
      termAtt.append(replacement);
      length++;
      endDelimiter = false;
      added = true;
    }

    while (true) {
      int c = input.read();
      if (c >= 0) {
        charsRead++;
      } else {
        if (skipped > skip) {
          length += resultToken.length();
          termAtt.setLength(length);
          offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition + length));
          if (added) {
            resultToken.setLength(0);
            resultToken.append(termAtt.buffer(), 0, length);
          }
          return added;
        } else {
          return false;
        }
      }
      if (!added) {
        added = true;
        skipped++;
        if (skipped > skip) {
          termAtt.append(c == delimiter ? replacement : (char) c);
          length++;
        } else {
          startPosition++;
        }
      } else {
        if (c == delimiter) {
          if (skipped > skip) {
            endDelimiter = true;
            break;
          }
          skipped++;
          if (skipped > skip) {
            termAtt.append(replacement);
            length++;
          } else {
            startPosition++;
          }
        } else {
          if (skipped > skip) {
            termAtt.append((char) c);
            length++;
          } else {
            startPosition++;
          }
        }
      }
    }
    length += resultToken.length();
    termAtt.setLength(length);
    offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition + length));
    resultToken.setLength(0);
    resultToken.append(termAtt.buffer(), 0, length);
    return true;
  }
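/*
 * The skip/delimiter/replacement logic above implements path-hierarchy tokenization: an
 * input such as "/usr/local/bin" yields the prefix tokens "/usr", "/usr/local" and
 * "/usr/local/bin", with every token after the first at position increment 0. A usage sketch
 * assuming Lucene's PathHierarchyTokenizer with its default delimiter settings.
 */
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PathHierarchySketch {
  public static void main(String[] args) throws IOException {
    try (PathHierarchyTokenizer tokenizer = new PathHierarchyTokenizer()) {
      tokenizer.setReader(new StringReader("/usr/local/bin"));
      CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
      tokenizer.reset();
      while (tokenizer.incrementToken()) {
        System.out.println(termAtt); // prints "/usr", then "/usr/local", then "/usr/local/bin"
      }
      tokenizer.end();
    }
  }
}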
  /**
   * In Lucene 4.2.x, CharacterUtils.fill returns true after jaso (Hangul consonant/vowel)
   * decomposition whenever data is available, but here it returns false, so the loop
   * condition unavoidably had to be changed to check the ioBuffer size instead.
   *
   * @author 최일규
   * @since 2014-07-11
   */
  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();

    int length = 0;
    int start = -1; // this variable is always initialized
    char[] buffer = termAtt.buffer();
    while (true) {
      if (bufferIndex >= dataLen) {

        offset += dataLen;
        boolean isDecompose =
            charUtils.fill(ioBuffer, jasoDecompose(input, decomposeMode, typoMode));

        // Analyze as long as the buffer has data (keep refilling until it returns false)
        if (ioBuffer.getLength() == 0) {
          dataLen = 0; // so next offset += dataLen won't decrement offset
          if (length > 0) {
            break;
          } else {
            finalOffset = correctOffset(offset);
            return false;
          }
        }
        dataLen = ioBuffer.getLength();
        bufferIndex = 0;
      }
      // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based
      // methods are gone
      final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex, dataLen);
      bufferIndex += Character.charCount(c);

      // if it's a token char
      if (isTokenChar(c)) {

        // start of token
        if (length == 0) {
          assert start == -1;
          // step back over the code point just read (it may be supplementary)
          start = offset + bufferIndex - Character.charCount(c);

          // check if a supplementary could run out of bounds
        } else if (length >= buffer.length - 1) {

          // make sure a supplementary fits in the buffer
          buffer = termAtt.resizeBuffer(2 + length);
        }

        // buffer it, normalized
        length += Character.toChars(normalize(c), buffer, length);
        if (length >= MAX_WORD_LEN) {
          break;
        }
      } else if (length > 0) {
        // return 'em
        break;
      }
    }

    termAtt.setLength(length);
    assert start != -1;
    offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(start + length));
    return true;
  }