@Override
public final boolean incrementToken() throws IOException {
  // Second call after a mailto URI was matched: emit the full
  // "scheme + mail" form stacked at the same position as the mail-only
  // token returned by the previous call.
  if (isMailto) {
    termAtt.setEmpty();
    // return the scheme + the mail part
    isMailto = false;
    posIncrAtt.setPositionIncrement(0); // same position as previous token
    termAtt.copyBuffer(termBuffer.array(), 0, termBuffer.position());
    return true;
  }
  if (input.incrementToken()) {
    final String type = typeAtt.type();
    // Only URI tokens whose scheme is "mailto" receive special handling;
    // all other tokens pass through unchanged.
    if (type.equals(TupleTokenizer.getTokenTypes()[TupleTokenizer.URI]) && this.isMailtoScheme()) {
      this.updateBuffer();
      termBuffer.put(termAtt.buffer(), 0, termAtt.length());
      // return only the mail part
      posIncrAtt.setPositionIncrement(1);
      // NOTE(review): the hard-coded 7 assumes the term starts with the
      // 7-character prefix "mailto:" — confirm isMailtoScheme() guarantees it.
      termAtt.copyBuffer(termBuffer.array(), 7, termBuffer.position() - 7);
    }
    return true;
  }
  return false;
}
@Override public boolean incrementToken() throws IOException { if (!terms.isEmpty()) { char[] buffer = terms.poll(); termAttribute.setEmpty(); termAttribute.copyBuffer(buffer, 0, buffer.length); posIncAttr.setPositionIncrement(1); return true; } if (!input.incrementToken()) { return false; } else { final char term[] = termAttribute.buffer(); final int length = termAttribute.length(); int k = 0; for (; k < length; k++) { if (term[k] == tokenDelimiter) { break; } } LinkedList<CharBuffer> buffers = permuteTerms(term, 0, length); Iterator iter = buffers.iterator(); while (iter.hasNext()) { CharBuffer cb = (CharBuffer) iter.next(); terms.add(cb.array()); } // we return true and leave the original token unchanged return true; } }
private boolean getNextToken(final int pos) throws IOException {
  // Recursively accumulates the next ngramSize non-space characters into
  // the term attribute. Returns false once the input is exhausted.
  assert pos >= 0;
  if (pos == ngramSize) {
    return true; // ngram is complete
  }
  final int ich = input.read();
  if (ich == -1) {
    // End of input: drop the partially built term.
    termAtt.setEmpty();
    return false;
  }
  final char ch = (char) ich;
  if (ch == ' ') {
    // A space aborts the current ngram; restart from position 0.
    return getNextToken();
  }
  termAtt.append(ch);
  return getNextToken(pos + 1);
}
@Override
public final boolean incrementToken() throws IOException {
  // Emit buffered decompounded sub-tokens first; they are stacked at the
  // position of the original compound token (increment 0).
  if (!tokens.isEmpty()) {
    assert current != null;
    CompoundToken token = tokens.removeFirst();
    restoreState(current); // keep all other attributes untouched
    termAtt.setEmpty().append(token.txt);
    offsetAtt.setOffset(token.startOffset, token.endOffset);
    posIncAtt.setPositionIncrement(0);
    return true;
  }
  current = null; // not really needed, but for safety
  if (input.incrementToken()) {
    // Only words longer than minWordSize get processed
    if (termAtt.length() >= this.minWordSize) {
      decompose();
      // only capture the state if we really need it for producing new tokens
      if (!tokens.isEmpty()) {
        current = captureState();
      }
    }
    // return original token:
    return true;
  } else {
    return false;
  }
}
@Override
public final boolean incrementToken() throws IOException {
  // Produces exactly one token per input character.
  final int read = input.read();
  if (read < 0) {
    return false; // end of stream
  }
  clearAttributes();
  termAtt.setEmpty().append((char) read);
  return true;
}
@Override
public boolean incrementToken() throws IOException {
  // Advance to the next partial snippet; the helper updates snippet,
  // startTerm, lenTerm and startOffset as side effects.
  if (!getNextPartialSnippet()) return false;
  clearAttributes();
  termAtt.setEmpty().append(snippet, startTerm, startTerm + lenTerm);
  // Map snippet-relative offsets back onto the original input.
  offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset + lenTerm));
  return true;
}
@Override
public boolean incrementToken() throws IOException {
  // index doubles as the exhaustion flag: Integer.MAX_VALUE means done.
  if (index >= str.length()) return false;
  clearAttributes();
  if (group >= 0) {
    // match a specific group
    while (matcher.find()) {
      final String match = matcher.group(group);
      if (match.length() == 0) continue; // skip empty group matches
      termAtt.setEmpty().append(match);
      index = matcher.start(group);
      offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.end(group)));
      return true;
    }
    index = Integer.MAX_VALUE; // mark exhausted
    return false;
  } else {
    // String.split() functionality
    while (matcher.find()) {
      if (matcher.start() - index > 0) {
        // found a non-zero-length token
        termAtt.setEmpty().append(str, index, matcher.start());
        offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.start()));
        index = matcher.end();
        return true;
      }
      // adjacent delimiters: skip the zero-length gap
      index = matcher.end();
    }
    if (str.length() - index == 0) {
      index = Integer.MAX_VALUE; // mark exhausted
      return false;
    }
    // emit the trailing text after the last delimiter match
    termAtt.setEmpty().append(str, index, str.length());
    offsetAtt.setOffset(correctOffset(index), correctOffset(str.length()));
    index = Integer.MAX_VALUE; // mark exhausted
    return true;
  }
}
/** TODO: rewrite tests not to use string comparison. */ private static String tsToString(TokenStream in) throws IOException { StringBuilder out = new StringBuilder(); CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class); // extra safety to enforce, that the state is not preserved and also // assign bogus values in.clearAttributes(); termAtt.setEmpty().append("bogusTerm"); in.reset(); while (in.incrementToken()) { if (out.length() > 0) out.append(' '); out.append(termAtt.toString()); in.clearAttributes(); termAtt.setEmpty().append("bogusTerm"); } in.close(); return out.toString(); }
@Override
public boolean incrementToken() throws IOException {
  // Emits every configured prefix (each followed by the separator) before
  // delegating to the wrapped stream.
  if (currentPrefix != null) {
    if (!currentPrefix.hasNext()) {
      // All prefixes emitted: pass tokens through from the input.
      return input.incrementToken();
    } else {
      // Subsequent prefixes stack at the same position as the first.
      posAttr.setPositionIncrement(0);
    }
  } else {
    currentPrefix = prefixes.iterator();
    posAttr.setPositionIncrement(1);
    assert (currentPrefix.hasNext()) : "one or more prefixes needed";
  }
  // The term is cleared unconditionally here; the extra setEmpty() that
  // used to sit in the else-branch above was redundant and was removed.
  termAttr.setEmpty();
  termAttr.append(currentPrefix.next());
  termAttr.append(separator);
  return true;
}
@Override
public final boolean incrementToken() throws IOException {
  // One token per entry of the terms enum; stop when it is exhausted.
  clearAttributes();
  final BytesRef next = termsEnum.next();
  if (next == null) {
    return false;
  }
  charTerm.setEmpty().append(next.utf8ToString());
  return true;
}
@Override
public boolean incrementToken() throws IOException {
  // Pending second half of a "phrase" expansion from the previous call?
  if (inPhrase) {
    inPhrase = false;
    clearAttributes();
    termAtt.setEmpty().append("phrase2");
    offsetAtt.setOffset(savedStart, savedEnd);
    return true;
  }
  while (input.incrementToken()) {
    final String text = termAtt.toString();
    if (text.equals("phrase")) {
      // Rewrite "phrase" to "phrase1" now; "phrase2" follows next call.
      inPhrase = true;
      savedStart = offsetAtt.startOffset();
      savedEnd = offsetAtt.endOffset();
      termAtt.setEmpty().append("phrase1");
      offsetAtt.setOffset(savedStart, savedEnd);
      return true;
    }
    if (!text.equals("stop")) {
      return true; // ordinary tokens pass through; "stop" is dropped
    }
  }
  return false;
}
/**
 * Pops the head of {@code morphQueue} and copies its data into the stream
 * attributes. On the first token of a group the incoming position
 * increment is preserved; remaining queue state is captured for later.
 */
private void setAttributesFromQueue(boolean isFirst) {
  final KoreanToken iw = morphQueue.removeFirst();
  if (isFirst && !morphQueue.isEmpty()) {
    // our queue has more elements remaining (e.g. we decompounded)
    // capture state for those. We set the term attribute to be empty
    // so we save lots of array copying later.
    termAtt.setEmpty();
    currentState = captureState();
  }
  termAtt.setEmpty().append(iw.getTerm());
  offsetAtt.setOffset(iw.getOffset(), iw.getOffset() + iw.getLength());
  morphAtt.setToken(iw);
  // on the first Token we preserve incoming increment:
  if (!isFirst) {
    posIncrAtt.setPositionIncrement(iw.getPosInc());
  }
  // TODO: How to handle PositionLengthAttribute correctly?
}
@Override
public boolean incrementToken() {
  // Emits the fixed sequence a, b, c, d where c and d stack on their
  // predecessors (position increment 0), then signals end of stream.
  clearAttributes();
  switch (upto) {
    case 0:
      posIncr.setPositionIncrement(1);
      term.setEmpty().append("a");
      break;
    case 1:
      posIncr.setPositionIncrement(1);
      term.setEmpty().append("b");
      break;
    case 2:
      posIncr.setPositionIncrement(0);
      term.setEmpty().append("c");
      break;
    case 3:
      posIncr.setPositionIncrement(0);
      term.setEmpty().append("d");
      break;
    default:
      return false;
  }
  upto++;
  return true;
}
@Override
public boolean incrementToken() throws IOException {
  clearAttributes();
  // Token iterator exhausted: keep pulling more characters from the reader.
  while (tokenIteractor == null || !tokenIteractor.hasNext()) {
    // System.out.println(dissected);
    int read = 0;
    int remainning = -1; // chars still left in the buffer before re-reading; negative means no read is needed right now
    if (dissected >= beef.length()) {
      remainning = 0;
    } else if (dissected < 0) {
      remainning = bufferLength + dissected;
    }
    if (remainning >= 0) {
      if (remainning > 0) {
        // Shift the unconsumed tail of the buffer to the front.
        System.arraycopy(buffer, -dissected, buffer, 0, remainning);
      }
      read = input.read(buffer, remainning, bufferLength - remainning);
      // NOTE(review): read can be -1 at EOF, which decrements inputLength
      // here — confirm that is intended.
      inputLength += read;
      int charCount = remainning + read;
      if (charCount < 0) {
        // Reader exhausted; per the next() contract, signal end of stream.
        return false;
      }
      if (charCount < bufferLength) {
        buffer[charCount++] = 0;
      }
      // Build the "beef" and dissect it with the knife.
      beef.set(0, charCount);
      offset += Math.abs(dissected);
      // offset -= remainning;
      dissected = 0;
    }
    dissected = knife.dissect(this, beef, dissected);
    // offset += read;// !!!
    tokenIteractor = tokenCollector.iterator();
  }
  if (tokenIteractor.hasNext()) {
    // Return the next Token from the iterator.
    Token token = tokenIteractor.next();
    termAtt.setEmpty();
    termAtt.append(token.charSequence());
    offsetAtt.setOffset(correctOffset(token.startOffset()), correctOffset(token.endOffset()));
    // NOTE(review): setting the position increment to the token's END
    // OFFSET looks wrong — increments are normally small deltas (usually
    // 1). Verify against the token's actual position-increment value.
    positionIncrementAttribute.setPositionIncrement(token.endOffset());
    return true;
  }
  return tokenIteractor.hasNext();
}
@Override
public boolean incrementToken() throws IOException {
  // Replay pre-built tokens one at a time, copying each attribute over.
  if (index >= tokens.length) {
    return false;
  }
  clearAttributes();
  final Token t = tokens[index++];
  termAtt.setEmpty().append(t);
  typeAtt.setType(t.type());
  flagsAtt.setFlags(t.getFlags());
  offsetAtt.setOffset(t.startOffset(), t.endOffset());
  posIncAtt.setPositionIncrement(t.getPositionIncrement());
  payloadAtt.setPayload(t.getPayload());
  return true;
}
/** @return Returns true for next token in the stream, or false at EOS */ @Override public boolean incrementToken() throws IOException { if (input.incrementToken()) { String term = termAtt.toString(); if (!keywordAttr.isKeyword()) { String s = stemmer.stem(term); // If not stemmed, don't waste the time adjusting the token. if ((s != null) && !s.equals(term)) termAtt.setEmpty().append(s); } return true; } else { return false; } }
@Override
public boolean incrementToken() throws IOException {
  boolean tokenAvailable = false;
  int builtGramSize = 0;
  // Either start a fresh window position or resume building a longer
  // shingle from the gram assembled on the previous call.
  if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) {
    shiftInputWindow();
    gramBuilder.setLength(0);
  } else {
    builtGramSize = gramSize.getPreviousValue();
  }
  if (inputWindow.size() >= gramSize.getValue()) {
    boolean isAllFiller = true;
    InputWindowToken nextToken = null;
    Iterator<InputWindowToken> iter = inputWindow.iterator();
    // Append window tokens (separated by tokenSeparator) until the gram
    // reaches the currently requested size.
    for (int gramNum = 1; iter.hasNext() && builtGramSize < gramSize.getValue(); ++gramNum) {
      nextToken = iter.next();
      if (builtGramSize < gramNum) {
        if (builtGramSize > 0) {
          gramBuilder.append(tokenSeparator);
        }
        gramBuilder.append(nextToken.termAtt.buffer(), 0, nextToken.termAtt.length());
        ++builtGramSize;
      }
      // Track whether every token so far is a filler; a shingle made of
      // fillers only is never emitted.
      if (isAllFiller && nextToken.isFiller) {
        if (gramNum == gramSize.getValue()) {
          gramSize.advance();
        }
      } else {
        isAllFiller = false;
      }
    }
    if (!isAllFiller && builtGramSize == gramSize.getValue()) {
      // Base all attributes on the first token of the window, then
      // overwrite term/offset/posLen for the shingle.
      inputWindow.getFirst().attSource.copyTo(this);
      posIncrAtt.setPositionIncrement(isOutputHere ? 0 : 1);
      termAtt.setEmpty().append(gramBuilder);
      if (gramSize.getValue() > 1) {
        typeAtt.setType(tokenType);
        noShingleOutput = false;
      }
      offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
      posLenAtt.setPositionLength(builtGramSize);
      isOutputHere = true;
      gramSize.advance();
      tokenAvailable = true;
    }
  }
  return tokenAvailable;
}
/**
 * @return true if token was added to search/analysis stream
 * @throws IOException
 */
@Override
public final boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  final Optional<CharSequence> lemma = lemmatizer.lemmatize(termAtt);
  // Replace the term only when a lemma exists, the token is not a
  // keyword, and the lemma actually differs from the original term.
  if (lemma.isPresent()
      && !keywordAttr.isKeyword()
      && !equalCharSequences(lemma.get(), termAtt)) {
    termAtt.setEmpty().append(lemma.get());
  }
  return true;
}
@Override
public boolean incrementToken() throws IOException {
  // Reads one sentence per call: accumulates characters until a
  // punctuation mark or a run of two whitespace characters is seen.
  clearAttributes();
  buffer.setLength(0);
  int ci;
  char ch, pch;
  boolean atBegin = true;
  tokenStart = tokenEnd; // this sentence starts where the last one ended
  ci = input.read();
  ch = (char) ci;
  while (true) {
    if (ci == -1) {
      break; // end of input
    } else if (PUNCTION.indexOf(ch) != -1) {
      // End of a sentence
      buffer.append(ch);
      tokenEnd++;
      break;
    } else if (atBegin && SPACES.indexOf(ch) != -1) {
      // Skip leading whitespace, shifting the token start forward.
      tokenStart++;
      tokenEnd++;
      ci = input.read();
      ch = (char) ci;
    } else {
      buffer.append(ch);
      atBegin = false;
      tokenEnd++;
      pch = ch;
      ci = input.read();
      ch = (char) ci;
      // Two spaces, such as CR, LF
      if (SPACES.indexOf(ch) != -1 && SPACES.indexOf(pch) != -1) {
        // buffer.append(ch);
        tokenEnd++;
        break;
      }
    }
  }
  if (buffer.length() == 0) return false;
  else {
    termAtt.setEmpty().append(buffer);
    offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenEnd));
    typeAtt.setType("sentence");
    return true;
  }
}
@Override public final boolean incrementToken() throws IOException { if (addSynonym) { // inject our synonym clearAttributes(); termAtt.setEmpty().append("國"); posIncAtt.setPositionIncrement(0); addSynonym = false; return true; } if (input.incrementToken()) { addSynonym = termAtt.toString().equals("国"); return true; } else { return false; } }
@Override public boolean incrementToken() { if (upto < tokens.length) { final Token token = tokens[upto++]; // TODO: can we just capture/restoreState so // we get all attrs...? clearAttributes(); termAtt.setEmpty(); termAtt.append(token.toString()); posIncrAtt.setPositionIncrement(token.getPositionIncrement()); posLengthAtt.setPositionLength(token.getPositionLength()); offsetAtt.setOffset(token.startOffset(), token.endOffset()); payloadAtt.setPayload(token.getPayload()); return true; } else { return false; } }
/**
 * Pushes the given characters into the stream's term attribute and
 * adjusts offset and position-increment attributes accordingly.
 */
private void emit(char[] token) {
  Log.debug("emit: " + new String(token));
  if (replaceWhitespaceWith != null) {
    token = replaceWhiteSpace(token);
  }
  CharTermAttribute termAttr = getTermAttribute();
  // copyBuffer replaces the whole term in one call; the previous
  // setEmpty() + append(new StringBuilder().append(token)) round-trip
  // allocated two intermediate objects for the same effect.
  termAttr.copyBuffer(token, 0, token.length);
  OffsetAttribute offAttr = getOffsetAttribute();
  if (offAttr != null && offAttr.endOffset() >= token.length) {
    // Back-date the start offset so the span covers exactly this token.
    int start = offAttr.endOffset() - token.length;
    offAttr.setOffset(start, offAttr.endOffset());
  }
  PositionIncrementAttribute pia = getPositionIncrementAttribute();
  if (pia != null) {
    // NOTE(review): the increment grows by one on every emit — confirm a
    // monotonically increasing increment (rather than a constant 1) is
    // really intended.
    pia.setPositionIncrement(++positionIncr);
  }
  lastEmitted = token;
}
/**
 * Clears the term attribute and reads the next token starting from
 * position 0.
 *
 * @return true if a complete token was produced, false at end of input
 */
private boolean getNextToken() throws IOException {
  termAtt.setEmpty();
  return getNextToken(0);
}
/**
 * Creates a stream over the given text whose payload is produced by the
 * supplied encoder.
 *
 * @param text    the token text to place in the term attribute
 * @param encoder encoder used for the payload data
 * @throws IOException declared for API compatibility; not thrown here
 */
public DataTokenStream(String text, IntEncoder encoder) throws IOException {
  this.encoder = encoder;
  term.setEmpty().append(text);
}
@Override
public final boolean incrementToken() throws IOException {
  // Consumes the ENTIRE input as one token, converts CJK characters to
  // pinyin, and combines full pinyin with first-letter abbreviations
  // according to the first_letter mode ("prefix"/"append"/"none"/"only").
  clearAttributes();
  if (!done) {
    done = true; // single-token stream: only the first call produces output
    int upto = 0;
    char[] buffer = termAtt.buffer();
    // Slurp the whole reader into the term buffer, growing as needed.
    while (true) {
      final int length = input.read(buffer, upto, buffer.length - upto);
      if (length == -1) break;
      upto += length;
      if (upto == buffer.length) buffer = termAtt.resizeBuffer(1 + buffer.length);
    }
    termAtt.setLength(upto);
    String str = termAtt.toString();
    termAtt.setEmpty();
    StringBuilder stringBuilder = new StringBuilder();
    StringBuilder firstLetters = new StringBuilder();
    for (int i = 0; i < str.length(); i++) {
      char c = str.charAt(i);
      if (c < 128) {
        // ASCII passes through unchanged.
        stringBuilder.append(c);
      } else {
        try {
          String[] strs = PinyinHelper.toHanyuPinyinStringArray(c, format);
          if (strs != null) {
            // get first result by default
            String first_value = strs[0];
            // TODO more than one pinyin
            stringBuilder.append(first_value);
            if (this.padding_char.length() > 0) {
              stringBuilder.append(this.padding_char);
            }
            firstLetters.append(first_value.charAt(0));
          }
        } catch (BadHanyuPinyinOutputFormatCombination badHanyuPinyinOutputFormatCombination) {
          // NOTE(review): exception is swallowed after printing to stderr,
          // silently dropping the character — consider proper logging.
          badHanyuPinyinOutputFormatCombination.printStackTrace();
        }
      }
    }
    // let's join them
    if (first_letter.equals("prefix")) {
      termAtt.append(firstLetters.toString());
      if (this.padding_char.length() > 0) {
        termAtt.append(this.padding_char); // TODO splitter
      }
      termAtt.append(stringBuilder.toString());
    } else if (first_letter.equals("append")) {
      termAtt.append(stringBuilder.toString());
      if (this.padding_char.length() > 0) {
        if (!stringBuilder.toString().endsWith(this.padding_char)) {
          termAtt.append(this.padding_char);
        }
      }
      termAtt.append(firstLetters.toString());
    } else if (first_letter.equals("none")) {
      termAtt.append(stringBuilder.toString());
    } else if (first_letter.equals("only")) {
      termAtt.append(firstLetters.toString());
    }
    // Offsets span the full consumed input.
    finalOffset = correctOffset(upto);
    offsetAtt.setOffset(correctOffset(0), finalOffset);
    return true;
  }
  return false;
}