@Test
public void testStream_bytesRefArray() throws Exception {
  final BytesRefArray bArr = new BytesRefArray(Counter.newCounter(false));
  bArr.append(new BytesRef("foo"));
  bArr.append(new BytesRef("bar"));
  bArr.append(new BytesRef("baz"));

  Assert.assertEquals("Not all items streamed.", 3L, StreamUtils.stream(bArr).count());
  Assert.assertEquals(
      "Term not found.",
      1L,
      StreamUtils.stream(bArr).filter(br -> br.bytesEquals(new BytesRef("foo"))).count());
  Assert.assertEquals(
      "Term not found.",
      1L,
      StreamUtils.stream(bArr).filter(br -> br.bytesEquals(new BytesRef("bar"))).count());
  Assert.assertEquals(
      "Term not found.",
      1L,
      StreamUtils.stream(bArr).filter(br -> br.bytesEquals(new BytesRef("baz"))).count());
  Assert.assertEquals(
      "Unknown term found.",
      0L,
      StreamUtils.stream(bArr)
          .filter(
              t ->
                  !t.bytesEquals(new BytesRef("foo"))
                      && !t.bytesEquals(new BytesRef("bar"))
                      && !t.bytesEquals(new BytesRef("baz")))
          .count());
}
@Override
public BytesRef payload() {
  if (hasPayloads && curPos < payloads.size()) {
    return payloads.get(payloadSpare, curPos);
  }
  return null;
}
@Override
public BytesRef next() throws IOException {
  if (++curPos < entries.size()) {
    entries.get(spare, curPos);
    return spare.get();
  }
  return null;
}
private static void sort(
    final BytesRefBuilder scratch,
    final BytesRefBuilder scratch1,
    final BytesRefArray bytes,
    final int[] indices) {
  final int numValues = bytes.size();
  assert indices.length >= numValues;
  if (numValues > 1) {
    new InPlaceMergeSorter() {
      final Comparator<BytesRef> comparator = BytesRef.getUTF8SortedAsUnicodeComparator();

      @Override
      protected int compare(int i, int j) {
        return comparator.compare(bytes.get(scratch, indices[i]), bytes.get(scratch1, indices[j]));
      }

      @Override
      protected void swap(int i, int j) {
        int value_i = indices[i];
        indices[i] = indices[j];
        indices[j] = value_i;
      }
    }.sort(0, numValues);
  }
}
/** Resets the {@link MemoryIndex} to its initial state and recycles all internal buffers. */
public void reset() {
  fields.clear();
  this.normSimilarity = IndexSearcher.getDefaultSimilarity();
  byteBlockPool.reset(false, false); // no need to 0-fill the buffers
  intBlockPool.reset(true, false); // here we must 0-fill since we use slices
  if (payloadsBytesRefs != null) {
    payloadsBytesRefs.clear();
  }
  this.frozen = false;
}
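/*
 * Illustrative usage sketch (not part of the original class): recycling a single
 * MemoryIndex over many documents by calling reset() between them. The field name
 * "content", the analyzer, the query and the helper name are hypothetical
 * placeholders; the usual Lucene imports (Analyzer, Query) are assumed.
 */
static float highestScore(Iterable<String> docs, Analyzer analyzer, Query query) {
  MemoryIndex index = new MemoryIndex();
  float best = 0f;
  for (String text : docs) {
    index.addField("content", text, analyzer); // index a single document
    best = Math.max(best, index.search(query)); // 0.0f means no match
    index.reset(); // recycle all internal buffers before indexing the next document
  }
  return best;
}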
/** Creates a new iterator, buffering entries from the specified iterator */
public BufferedInputIterator(InputIterator source) throws IOException {
  BytesRef spare;
  int freqIndex = 0;
  hasPayloads = source.hasPayloads();
  hasContexts = source.hasContexts();
  while ((spare = source.next()) != null) {
    entries.append(spare);
    if (hasPayloads) {
      payloads.append(source.payload());
    }
    if (hasContexts) {
      contextSets.add(source.contexts());
    }
    if (freqIndex >= freqs.length) {
      freqs = ArrayUtil.grow(freqs, freqs.length + 1);
    }
    freqs[freqIndex++] = source.weight();
  }
}
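/*
 * Illustrative usage sketch (not part of the original class): buffering a suggester
 * dictionary's entries so they can be re-iterated after the source is exhausted.
 * Assumes a Dictionary whose getEntryIterator() yields an InputIterator; the
 * dictionary argument and the helper name are hypothetical.
 */
static BufferedInputIterator buffer(Dictionary dictionary) throws IOException {
  InputIterator source = dictionary.getEntryIterator();
  // entries, weights, payloads and contexts are all copied into memory here
  return new BufferedInputIterator(source);
}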
public static int sortAndDedup(final BytesRefArray bytes, final int[] indices) {
  final BytesRefBuilder scratch = new BytesRefBuilder();
  final BytesRefBuilder scratch1 = new BytesRefBuilder();
  final int numValues = bytes.size();
  assert indices.length >= numValues;
  if (numValues <= 1) {
    return numValues;
  }
  sort(scratch, scratch1, bytes, indices);
  int uniqueCount = 1;
  BytesRefBuilder previous = scratch;
  BytesRefBuilder current = scratch1;
  bytes.get(previous, indices[0]);
  for (int i = 1; i < numValues; ++i) {
    bytes.get(current, indices[i]);
    if (!previous.get().equals(current.get())) {
      indices[uniqueCount++] = indices[i];
    }
    BytesRefBuilder tmp = previous;
    previous = current;
    current = tmp;
  }
  return uniqueCount;
}
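/*
 * Illustrative usage sketch (not part of the original class): sorting and
 * de-duplicating a BytesRefArray through an index array, then reading the surviving
 * entries back in sorted order. Assumes sortAndDedup is visible from the calling
 * scope; the method name and sample values are hypothetical.
 */
static void sortAndDedupExample() {
  BytesRefArray bytes = new BytesRefArray(Counter.newCounter(false));
  bytes.append(new BytesRef("bar"));
  bytes.append(new BytesRef("foo"));
  bytes.append(new BytesRef("bar")); // duplicate, dropped by sortAndDedup
  int[] indices = new int[bytes.size()];
  for (int i = 0; i < indices.length; i++) {
    indices[i] = i; // identity mapping; sortAndDedup reorders these slots in place
  }
  int unique = sortAndDedup(bytes, indices); // 2 here: "bar", "foo"
  BytesRefBuilder scratch = new BytesRefBuilder();
  for (int i = 0; i < unique; i++) {
    BytesRef term = bytes.get(scratch, indices[i]); // terms come back in Unicode order
    System.out.println(term.utf8ToString());
  }
}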
/**
 * Iterates over the given token stream and adds the resulting terms to the index.
 * Equivalent to adding a tokenized, indexed, termVectorStored, unstored Lucene
 * {@link org.apache.lucene.document.Field}. Finally closes the token stream. Note that
 * untokenized keywords can be added with this method via
 * {@link #keywordTokenStream(Collection)}, the Lucene <code>KeywordTokenizer</code> or
 * similar utilities.
 *
 * @param fieldName a name to be associated with the text
 * @param tokenStream the token stream to retrieve tokens from. It's guaranteed to be closed no
 *     matter what.
 * @param boost the boost factor for hits for this field
 * @param positionIncrementGap the position increment gap if fields with the same name are added
 *     more than once
 * @param offsetGap the offset gap if fields with the same name are added more than once
 * @see org.apache.lucene.document.Field#setBoost(float)
 */
public void addField(
    String fieldName,
    TokenStream tokenStream,
    float boost,
    int positionIncrementGap,
    int offsetGap) {
  try (TokenStream stream = tokenStream) {
    if (frozen) {
      throw new IllegalArgumentException("Cannot call addField() when MemoryIndex is frozen");
    }
    if (fieldName == null) {
      throw new IllegalArgumentException("fieldName must not be null");
    }
    if (stream == null) {
      throw new IllegalArgumentException("token stream must not be null");
    }
    if (boost <= 0.0f) {
      throw new IllegalArgumentException("boost factor must be greater than 0.0");
    }
    int numTokens = 0;
    int numOverlapTokens = 0;
    int pos = -1;
    final BytesRefHash terms;
    final SliceByteStartArray sliceArray;
    Info info;
    long sumTotalTermFreq = 0;
    int offset = 0;
    FieldInfo fieldInfo;
    if ((info = fields.get(fieldName)) != null) {
      fieldInfo = info.fieldInfo;
      numTokens = info.numTokens;
      numOverlapTokens = info.numOverlapTokens;
      pos = info.lastPosition + positionIncrementGap;
      offset = info.lastOffset + offsetGap;
      terms = info.terms;
      boost *= info.boost;
      sliceArray = info.sliceArray;
      sumTotalTermFreq = info.sumTotalTermFreq;
    } else {
      fieldInfo =
          new FieldInfo(
              fieldName,
              fields.size(),
              true,
              false,
              this.storePayloads,
              this.storeOffsets
                  ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS
                  : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
              DocValuesType.NONE,
              -1,
              Collections.<String, String>emptyMap());
      sliceArray = new SliceByteStartArray(BytesRefHash.DEFAULT_CAPACITY);
      terms = new BytesRefHash(byteBlockPool, BytesRefHash.DEFAULT_CAPACITY, sliceArray);
    }

    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncrAttribute =
        stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    PayloadAttribute payloadAtt =
        storePayloads ? stream.addAttribute(PayloadAttribute.class) : null;
    BytesRef ref = termAtt.getBytesRef();
    stream.reset();

    while (stream.incrementToken()) {
      termAtt.fillBytesRef();
      // if (DEBUG) System.err.println("token='" + term + "'");
      numTokens++;
      final int posIncr = posIncrAttribute.getPositionIncrement();
      if (posIncr == 0) {
        numOverlapTokens++;
      }
      pos += posIncr;
      int ord = terms.add(ref);
      if (ord < 0) {
        ord = (-ord) - 1;
        postingsWriter.reset(sliceArray.end[ord]);
      } else {
        sliceArray.start[ord] = postingsWriter.startNewSlice();
      }
      sliceArray.freq[ord]++;
      sumTotalTermFreq++;
      postingsWriter.writeInt(pos);
      if (storeOffsets) {
        postingsWriter.writeInt(offsetAtt.startOffset() + offset);
        postingsWriter.writeInt(offsetAtt.endOffset() + offset);
      }
      if (storePayloads) {
        final BytesRef payload = payloadAtt.getPayload();
        final int pIndex;
        if (payload == null || payload.length == 0) {
          pIndex = -1;
        } else {
          pIndex = payloadsBytesRefs.append(payload);
        }
        postingsWriter.writeInt(pIndex);
      }
      sliceArray.end[ord] = postingsWriter.getCurrentOffset();
    }
    stream.end();

    // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
    if (numTokens > 0) {
      fields.put(
          fieldName,
          new Info(
              fieldInfo,
              terms,
              sliceArray,
              numTokens,
              numOverlapTokens,
              boost,
              pos,
              offsetAtt.endOffset() + offset,
              sumTotalTermFreq));
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
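/*
 * Illustrative usage sketch (not part of the original class): feeding this overload
 * a token stream produced by an analyzer, with an explicit boost and gap values.
 * The field name "body", the sample text, the analyzer and the helper name are
 * hypothetical placeholders; the stream is closed by addField() itself.
 */
static void addBodyField(MemoryIndex index, Analyzer analyzer) throws IOException {
  TokenStream stream = analyzer.tokenStream("body", "the quick brown fox");
  // boost 1.0f, positionIncrementGap 0, offsetGap 1
  index.addField("body", stream, 1.0f, 0, 1);
}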