public KeywordTokenizer(AttributeFactory factory, Reader input, int bufferSize) {
  super(factory, input);
  if (bufferSize <= 0) {
    throw new IllegalArgumentException("bufferSize must be > 0");
  }
  termAtt.resizeBuffer(bufferSize);
}
public PinyinTokenizer(Reader input, int bufferSize) {
  super(input);
  termAtt.resizeBuffer(bufferSize);
  format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
  format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
  format.setVCharType(HanyuPinyinVCharType.WITH_V);
}
public NGTokenizer(int ngramSize) {
  super();
  if (ngramSize < 1) {
    throw new IllegalArgumentException("ngramSize < 1");
  }
  this.ngramSize = ngramSize;
  termAtt = addAttribute(CharTermAttribute.class);
  termAtt.resizeBuffer(ngramSize);
  buffer = new char[ngramSize];
  for (int idx = 0; idx < ngramSize; idx++) {
    buffer[idx] = ' ';
  }
}
@Override
public boolean incrementToken() throws IOException {
  if (input.incrementToken()) {
    if (!keywordAttr.isKeyword()) {
      // this stemmer increases word length by 1: worst case '*çom' -> '*ción'
      final int len = termAtt.length();
      final int newlen = stemmer.stem(termAtt.resizeBuffer(len + 1), len);
      termAtt.setLength(newlen);
    }
    return true;
  } else {
    return false;
  }
}
/** Writes the joined unhyphenated term */
private void unhyphenate() {
  int endOffset = offsetAttribute.endOffset();

  restoreState(savedState);
  savedState = null;

  char[] term = termAttribute.buffer();
  int length = hyphenated.length();
  if (length > termAttribute.length()) {
    term = termAttribute.resizeBuffer(length);
  }

  hyphenated.getChars(0, length, term, 0);
  termAttribute.setLength(length);
  offsetAttribute.setOffset(offsetAttribute.startOffset(), endOffset);
  hyphenated.setLength(0);
}
@Override
public boolean incrementToken() throws IOException {
  if (input.incrementToken()) {
    char[] termBuffer = termAtt.buffer();
    String termText = new String(termBuffer, 0, termAtt.length());
    // compute the raw collation key for the current term
    collator.getRawCollationKey(termText, reusableKey);
    // encode the binary key into chars so it can replace the term text,
    // growing the term buffer if the encoded form does not fit
    int encodedLength = IndexableBinaryStringTools.getEncodedLength(reusableKey.bytes, 0, reusableKey.size);
    if (encodedLength > termBuffer.length) {
      termAtt.resizeBuffer(encodedLength);
    }
    termAtt.setLength(encodedLength);
    IndexableBinaryStringTools.encode(
        reusableKey.bytes, 0, reusableKey.size, termAtt.buffer(), 0, encodedLength);
    return true;
  } else {
    return false;
  }
}
@Override
public final boolean incrementToken() throws IOException {
  if (!done) {
    clearAttributes();
    done = true;
    int upto = 0;
    char[] buffer = termAtt.buffer();
    while (true) {
      final int length = input.read(buffer, upto, buffer.length - upto);
      if (length == -1) break;
      upto += length;
      if (upto == buffer.length) {
        buffer = termAtt.resizeBuffer(1 + buffer.length);
      }
    }
    termAtt.setLength(upto);
    finalOffset = correctOffset(upto);
    offsetAtt.setOffset(correctOffset(0), finalOffset);
    return true;
  }
  return false;
}
@Override
public final boolean incrementToken() throws IOException {
  clearAttributes();
  if (!done) {
    done = true;
    int upto = 0;
    char[] buffer = termAtt.buffer();
    while (true) {
      final int length = input.read(buffer, upto, buffer.length - upto);
      if (length == -1) break;
      upto += length;
      if (upto == buffer.length) {
        buffer = termAtt.resizeBuffer(1 + buffer.length);
      }
    }
    termAtt.setLength(upto);

    String str = termAtt.toString();
    termAtt.setEmpty();
    StringBuilder stringBuilder = new StringBuilder();
    StringBuilder firstLetters = new StringBuilder();
    for (int i = 0; i < str.length(); i++) {
      char c = str.charAt(i);
      if (c < 128) {
        stringBuilder.append(c);
      } else {
        try {
          String[] strs = PinyinHelper.toHanyuPinyinStringArray(c, format);
          if (strs != null) {
            // get first result by default
            String first_value = strs[0];
            // TODO more than one pinyin
            stringBuilder.append(first_value);
            if (this.padding_char.length() > 0) {
              stringBuilder.append(this.padding_char);
            }
            firstLetters.append(first_value.charAt(0));
          }
        } catch (BadHanyuPinyinOutputFormatCombination badHanyuPinyinOutputFormatCombination) {
          badHanyuPinyinOutputFormatCombination.printStackTrace();
        }
      }
    }

    // let's join them
    if (first_letter.equals("prefix")) {
      termAtt.append(firstLetters.toString());
      if (this.padding_char.length() > 0) {
        termAtt.append(this.padding_char); // TODO splitter
      }
      termAtt.append(stringBuilder.toString());
    } else if (first_letter.equals("append")) {
      termAtt.append(stringBuilder.toString());
      if (this.padding_char.length() > 0) {
        if (!stringBuilder.toString().endsWith(this.padding_char)) {
          termAtt.append(this.padding_char);
        }
      }
      termAtt.append(firstLetters.toString());
    } else if (first_letter.equals("none")) {
      termAtt.append(stringBuilder.toString());
    } else if (first_letter.equals("only")) {
      termAtt.append(firstLetters.toString());
    }

    finalOffset = correctOffset(upto);
    offsetAtt.setOffset(correctOffset(0), finalOffset);
    return true;
  }
  return false;
}
/**
 * In Lucene 4.2.x, CharacterUtils.fill returns true after jaso decomposition when data remains,
 * but here it returns false, so the condition had to be changed to check the ioBuffer length
 * instead (CharacterUtils.fill).
 *
 * @author 최일규
 * @since 2014-07-11
 */
@Override
public final boolean incrementToken() throws IOException {
  clearAttributes();
  int length = 0;
  int start = -1; // this variable is always initialized
  char[] buffer = termAtt.buffer();
  while (true) {
    if (bufferIndex >= dataLen) {
      offset += dataLen;
      boolean isDecompose = charUtils.fill(ioBuffer, jasoDecompose(input, decomposeMode, typoMode));
      // keep analyzing while the buffer has content (called repeatedly until it returns false)
      if (ioBuffer.getLength() == 0) {
        dataLen = 0; // so next offset += dataLen won't decrement offset
        if (length > 0) {
          break;
        } else {
          finalOffset = correctOffset(offset);
          return false;
        }
      }
      dataLen = ioBuffer.getLength();
      bufferIndex = 0;
    }
    // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based
    // methods are gone
    final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex, dataLen);
    bufferIndex += Character.charCount(c);

    if (isTokenChar(c)) { // if it's a token char
      if (length == 0) { // start of token
        assert start == -1;
        start = offset + bufferIndex - 1;
      } else if (length >= buffer.length - 1) { // check if a supplementary could run out of bounds
        buffer = termAtt.resizeBuffer(2 + length); // make sure a supplementary fits in the buffer
      }
      // buffer it, normalized
      length += Character.toChars(normalize(c), buffer, length);
      if (length >= MAX_WORD_LEN) {
        break;
      }
    } else if (length > 0) {
      // return 'em
      break;
    }
  }
  termAtt.setLength(length);
  assert start != -1;
  offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(start + length));
  return true;
}