/**
 * Appends the subword currently selected by the iterator (taken from the saved term buffer) to
 * the given concatenation and updates the concatenation's offsets.
 *
 * <p>The start offset is only assigned while the concatenation is still empty; the end offset is
 * moved forward on every call.
 *
 * @param concatenation WordDelimiterConcatenation that receives the current subword
 */
private void concatenate(WordDelimiterConcatenation concatenation) {
  final int subwordStart = iterator.current;
  final int subwordEnd = iterator.end;

  // The first subword appended fixes where the concatenation begins.
  if (concatenation.isEmpty()) {
    concatenation.startOffset = savedStartOffset + subwordStart;
  }

  concatenation.append(savedBuffer, subwordStart, subwordEnd - subwordStart);

  // Every appended subword pushes the end of the concatenation forward.
  concatenation.endOffset = savedStartOffset + subwordEnd;
}
/**
 * Resets the parent stream and then returns this filter's own bookkeeping to its initial state:
 * no saved state, empty concatenations, zeroed position-increment accumulator and buffer
 * cursors, and the "first token" flag raised again.
 *
 * @throws IOException if the parent {@code reset()} fails
 */
@Override
public void reset() throws IOException {
  super.reset();

  hasSavedState = false;
  concat.clear();
  concatAll.clear();

  // Zero the buffered-token cursors and the pending position increment.
  bufferedLen = 0;
  bufferedPos = 0;
  accumPosInc = 0;

  first = true;
}
/**
 * Flushes the given WordDelimiterConcatenation: it is always cleared, and it is additionally
 * written out first unless it holds exactly one subword whose type already has its parts
 * generated.
 *
 * <p>As a side effect, {@code lastConcatCount} is set to the concatenation's subword count
 * before it is cleared.
 *
 * @param concatenation WordDelimiterConcatenation that will be flushed
 * @return {@code true} if the concatenation was written before it was cleared, {@code false} if
 *     it was only cleared
 */
private boolean flushConcatenation(WordDelimiterConcatenation concatenation) {
  lastConcatCount = concatenation.subwordCount;

  // A lone subword whose part is generated anyway is dropped instead of written.
  final boolean clearOnly =
      concatenation.subwordCount == 1 && shouldGenerateParts(concatenation.type);
  if (clearOnly) {
    concatenation.clear();
    return false;
  }

  concatenation.writeAndClear();
  return true;
}
@Override public boolean incrementToken() throws IOException { while (true) { if (!hasSavedState) { // process a new input word if (!input.incrementToken()) { return false; } int termLength = termAttribute.length(); char[] termBuffer = termAttribute.buffer(); accumPosInc += posIncAttribute.getPositionIncrement(); iterator.setText(termBuffer, termLength); iterator.next(); // word of no delimiters, or protected word: just return it if ((iterator.current == 0 && iterator.end == termLength) || (protWords != null && protWords.contains(termBuffer, 0, termLength))) { posIncAttribute.setPositionIncrement(accumPosInc); accumPosInc = 0; first = false; return true; } // word of simply delimiters if (iterator.end == WordDelimiterIterator.DONE && !has(PRESERVE_ORIGINAL)) { // if the posInc is 1, simply ignore it in the accumulation // TODO: proper hole adjustment (FilteringTokenFilter-like) instead of this previous // logic! if (posIncAttribute.getPositionIncrement() == 1 && !first) { accumPosInc--; } continue; } saveState(); hasOutputToken = false; hasOutputFollowingOriginal = !has(PRESERVE_ORIGINAL); lastConcatCount = 0; if (has(PRESERVE_ORIGINAL)) { posIncAttribute.setPositionIncrement(accumPosInc); accumPosInc = 0; first = false; return true; } } // at the end of the string, output any concatenations if (iterator.end == WordDelimiterIterator.DONE) { if (!concat.isEmpty()) { if (flushConcatenation(concat)) { buffer(); continue; } } if (!concatAll.isEmpty()) { // only if we haven't output this same combo above! if (concatAll.subwordCount > lastConcatCount) { concatAll.writeAndClear(); buffer(); continue; } concatAll.clear(); } if (bufferedPos < bufferedLen) { if (bufferedPos == 0) { sorter.sort(0, bufferedLen); } clearAttributes(); restoreState(buffered[bufferedPos++]); if (first && posIncAttribute.getPositionIncrement() == 0) { // can easily happen with strange combinations (e.g. 
not outputting numbers, but // concat-all) posIncAttribute.setPositionIncrement(1); } first = false; return true; } // no saved concatenations, on to the next input word bufferedPos = bufferedLen = 0; hasSavedState = false; continue; } // word surrounded by delimiters: always output if (iterator.isSingleWord()) { generatePart(true); iterator.next(); first = false; return true; } int wordType = iterator.type(); // do we already have queued up incompatible concatenations? if (!concat.isEmpty() && (concat.type & wordType) == 0) { if (flushConcatenation(concat)) { hasOutputToken = false; buffer(); continue; } hasOutputToken = false; } // add subwords depending upon options if (shouldConcatenate(wordType)) { if (concat.isEmpty()) { concat.type = wordType; } concatenate(concat); } // add all subwords (catenateAll) if (has(CATENATE_ALL)) { concatenate(concatAll); } // if we should output the word or number part if (shouldGenerateParts(wordType)) { generatePart(false); buffer(); } iterator.next(); } }