public KeywordTokenizer(AttributeFactory factory, Reader input, int bufferSize) {
   super(factory, input);
   if (bufferSize > 0) {
     // Pre-size the term buffer so the single whole-input token avoids repeated growth.
     termAtt.resizeBuffer(bufferSize);
   } else {
     throw new IllegalArgumentException("bufferSize must be > 0");
   }
 }
 /**
  * Creates a tokenizer that converts Chinese input to pinyin.
  *
  * @param input the reader to tokenize
  * @param bufferSize initial size of the term buffer; must be positive
  * @throws IllegalArgumentException if {@code bufferSize} is not positive
  */
 public PinyinTokenizer(Reader input, int bufferSize) {
   super(input);
   // Same guard as KeywordTokenizer: a non-positive size is a caller error and
   // should fail loudly rather than be passed through to resizeBuffer.
   if (bufferSize <= 0) {
     throw new IllegalArgumentException("bufferSize must be > 0");
   }
   termAtt.resizeBuffer(bufferSize);
   // Emit lowercase pinyin, without tone marks, rendering 'ü' as 'v'.
   format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
   format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
   format.setVCharType(HanyuPinyinVCharType.WITH_V);
 }
Example #3
 /**
  * Creates an n-gram tokenizer with a sliding window of {@code ngramSize} characters.
  *
  * @param ngramSize width of the n-gram window; must be at least 1
  * @throws IllegalArgumentException if {@code ngramSize} is less than 1
  */
 public NGTokenizer(int ngramSize) {
   super();
   if (ngramSize < 1) {
     throw new IllegalArgumentException("ngramSize < 1");
   }
   this.ngramSize = ngramSize;
   termAtt = addAttribute(CharTermAttribute.class);
   termAtt.resizeBuffer(ngramSize);
   // Sliding window holding the most recent ngramSize chars, primed with spaces.
   // Arrays.fill replaces the hand-rolled priming loop.
   buffer = new char[ngramSize];
   java.util.Arrays.fill(buffer, ' ');
 }
Example #4
 /**
  * Advances to the next token and stems it in place, unless the token is
  * flagged as a keyword (keywords pass through unchanged).
  */
 @Override
 public boolean incrementToken() throws IOException {
   if (!input.incrementToken()) {
     return false;
   }
   if (!keywordAttr.isKeyword()) {
     // Grow the buffer by one slot before stemming: this stemmer can lengthen
     // a word by at most one character (worst case '*çom' -> '*ción').
     final int length = termAtt.length();
     termAtt.setLength(stemmer.stem(termAtt.resizeBuffer(length + 1), length));
   }
   return true;
 }
  /**
   * Writes the joined unhyphenated term into the term attribute, restoring the
   * state saved when the hyphenated fragment was first seen, and resets the
   * {@code hyphenated} accumulator for the next word.
   */
  private void unhyphenate() {
    int endOffset = offsetAttribute.endOffset();

    restoreState(savedState);
    savedState = null;

    char term[] = termAttribute.buffer();
    int length = hyphenated.length();
    // Compare against the buffer CAPACITY (term.length), not the current token
    // length: the original check against termAttribute.length() only worked
    // indirectly (capacity >= token length) and triggered redundant resize calls.
    if (length > term.length) {
      term = termAttribute.resizeBuffer(length);
    }

    hyphenated.getChars(0, length, term, 0);
    termAttribute.setLength(length);
    // Token spans from the saved start to the end of the last joined fragment.
    offsetAttribute.setOffset(offsetAttribute.startOffset(), endOffset);
    hyphenated.setLength(0);
  }
 /**
  * Replaces each term with the binary-encoded form of its collation key, so
  * that index term order matches collator order.
  */
 @Override
 public boolean incrementToken() throws IOException {
   if (!input.incrementToken()) {
     return false;
   }
   // Compute the raw collation key for the current term text.
   char[] termChars = termAtt.buffer();
   String termText = new String(termChars, 0, termAtt.length());
   collator.getRawCollationKey(termText, reusableKey);
   // Overwrite the term with the index-safe encoding of the key bytes.
   int encodedLength =
       IndexableBinaryStringTools.getEncodedLength(reusableKey.bytes, 0, reusableKey.size);
   if (encodedLength > termChars.length) {
     termAtt.resizeBuffer(encodedLength);
   }
   termAtt.setLength(encodedLength);
   IndexableBinaryStringTools.encode(
       reusableKey.bytes, 0, reusableKey.size, termAtt.buffer(), 0, encodedLength);
   return true;
 }
 /**
  * Emits the entire input as a single token, exactly once; subsequent calls
  * return false.
  */
 @Override
 public final boolean incrementToken() throws IOException {
   if (done) {
     return false;
   }
   clearAttributes();
   done = true;
   // Slurp the whole reader into the term buffer, growing it whenever full.
   char[] buffer = termAtt.buffer();
   int upto = 0;
   int read;
   while ((read = input.read(buffer, upto, buffer.length - upto)) != -1) {
     upto += read;
     if (upto == buffer.length) {
       buffer = termAtt.resizeBuffer(1 + buffer.length);
     }
   }
   termAtt.setLength(upto);
   finalOffset = correctOffset(upto);
   offsetAtt.setOffset(correctOffset(0), finalOffset);
   return true;
 }
  /**
   * Emits the whole input as one token whose text is the pinyin rendering of
   * the original characters, exactly once; subsequent calls return false.
   * ASCII chars (&lt; 128) are copied verbatim; other chars are converted via
   * PinyinHelper. The {@code first_letter} mode ("prefix"/"append"/"none"/
   * "only") controls whether the first-letter abbreviation is prepended,
   * appended, omitted, or emitted alone.
   */
  @Override
  public final boolean incrementToken() throws IOException {
    // NOTE(review): attributes are cleared even on the already-done path,
    // unlike the sibling tokenizer that clears only before producing a token.
    clearAttributes();

    if (!done) {
      done = true;
      // Read the entire input into the term buffer, growing it as needed.
      int upto = 0;
      char[] buffer = termAtt.buffer();
      while (true) {
        final int length = input.read(buffer, upto, buffer.length - upto);
        if (length == -1) break;
        upto += length;
        if (upto == buffer.length) buffer = termAtt.resizeBuffer(1 + buffer.length);
      }
      termAtt.setLength(upto);
      // Snapshot the raw text, then reuse the term attribute for the output.
      String str = termAtt.toString();
      termAtt.setEmpty();
      StringBuilder stringBuilder = new StringBuilder();   // full pinyin (plus ASCII passthrough)
      StringBuilder firstLetters = new StringBuilder();    // first letter of each pinyin syllable
      for (int i = 0; i < str.length(); i++) {
        char c = str.charAt(i);
        if (c < 128) {
          // ASCII passes through untranslated (and contributes no first letter).
          stringBuilder.append(c);
        } else {
          try {
            String[] strs = PinyinHelper.toHanyuPinyinStringArray(c, format);
            if (strs != null) {
              // get first result by default
              String first_value = strs[0];
              // TODO more than one pinyin
              stringBuilder.append(first_value);
              if (this.padding_char.length() > 0) {
                stringBuilder.append(this.padding_char);
              }
              firstLetters.append(first_value.charAt(0));
            }
          } catch (BadHanyuPinyinOutputFormatCombination badHanyuPinyinOutputFormatCombination) {
            // NOTE(review): swallowed with printStackTrace — consider a logger;
            // the offending char is silently dropped from the output.
            badHanyuPinyinOutputFormatCombination.printStackTrace();
          }
        }
      }

      // let's join them
      if (first_letter.equals("prefix")) {
        // "prefix": abbreviation, then padding, then full pinyin.
        termAtt.append(firstLetters.toString());
        if (this.padding_char.length() > 0) {
          termAtt.append(this.padding_char); // TODO splitter
        }
        termAtt.append(stringBuilder.toString());
      } else if (first_letter.equals("append")) {
        // "append": full pinyin, then padding (unless already trailing), then abbreviation.
        termAtt.append(stringBuilder.toString());
        if (this.padding_char.length() > 0) {
          if (!stringBuilder.toString().endsWith(this.padding_char)) {
            termAtt.append(this.padding_char);
          }
        }
        termAtt.append(firstLetters.toString());
      } else if (first_letter.equals("none")) {
        // "none": full pinyin only.
        termAtt.append(stringBuilder.toString());
      } else if (first_letter.equals("only")) {
        // "only": abbreviation only.
        termAtt.append(firstLetters.toString());
      }
      // NOTE(review): any other first_letter value yields an empty token.

      // Offsets span the raw (pre-conversion) input length.
      finalOffset = correctOffset(upto);
      offsetAtt.setOffset(correctOffset(0), finalOffset);
      return true;
    }
    return false;
  }
  /**
   * Tokenizes the (jaso-decomposed) input, emitting one token per run of
   * token characters, normalized via {@code normalize(c)}.
   *
   * <p>Translated from the original Korean note: in Lucene 4.2.x,
   * {@code CharacterUtils.fill} returned true when data remained after jaso
   * (Korean consonant/vowel) decomposition, but here it returns false, so the
   * loop condition was unavoidably changed to check the ioBuffer length
   * instead (CharacterUtils.fill).
   *
   * @author 최일규 (Choi Il-gyu)
   * @since 2014-07-11
   */
  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();

    int length = 0;          // chars buffered for the current token
    int start = -1; // this variable is always initialized
    char[] buffer = termAtt.buffer();
    while (true) {
      // Refill the I/O buffer when the previous fill has been consumed.
      if (bufferIndex >= dataLen) {

        offset += dataLen;
        // NOTE(review): isDecompose is unused — fill's return value is ignored
        // (see the Javadoc above); the length check below replaces it.
        boolean isDecompose =
            charUtils.fill(ioBuffer, jasoDecompose(input, decomposeMode, typoMode));

        // If the buffer has content, keep analyzing (until it reads empty /
        // the original comment mentions this loops like a recursive call).
        if (ioBuffer.getLength() == 0) {
          dataLen = 0; // so next offset += dataLen won't decrement offset
          if (length > 0) {
            break;
          } else {
            finalOffset = correctOffset(offset);
            return false;
          }
        }
        dataLen = ioBuffer.getLength();
        bufferIndex = 0;
      }
      // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based
      // methods are gone
      final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex, dataLen);
      bufferIndex += Character.charCount(c);

      // if it's a token char
      if (isTokenChar(c)) {

        // start of token
        if (length == 0) {
          assert start == -1;
          start = offset + bufferIndex - 1;

          // check if a supplementary could run out of bounds
        } else if (length >= buffer.length - 1) {

          // make sure a supplementary fits in the buffer
          buffer = termAtt.resizeBuffer(2 + length);
        }

        // buffer it, normalized
        length += Character.toChars(normalize(c), buffer, length);
        if (length >= MAX_WORD_LEN) {
          break;
        }
      } else if (length > 0) {
        // return 'em
        break;
      }
    }

    termAtt.setLength(length);
    assert start != -1;
    offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(start + length));
    return true;
  }