@Test
  public void testStream_bytesRefArray() throws Exception {
    // Fill the array with the three terms the stream is expected to yield.
    final String[] expectedTerms = {"foo", "bar", "baz"};
    final BytesRefArray terms = new BytesRefArray(Counter.newCounter(false));
    for (final String term : expectedTerms) {
      terms.append(new BytesRef(term));
    }

    // Every appended entry has to show up in the stream.
    final long streamed = StreamUtils.stream(terms).count();
    Assert.assertEquals("Not all items streamed.", 3L, streamed);

    // Each individual term has to be streamed exactly once.
    for (final String term : expectedTerms) {
      final long hits =
          StreamUtils.stream(terms)
              .filter(candidate -> candidate.bytesEquals(new BytesRef(term)))
              .count();
      Assert.assertEquals("Term not found.", 1L, hits);
    }

    // Nothing besides the three appended terms may be streamed.
    final long unknown =
        StreamUtils.stream(terms)
            .filter(
                candidate ->
                    !(candidate.bytesEquals(new BytesRef("foo"))
                        || candidate.bytesEquals(new BytesRef("bar"))
                        || candidate.bytesEquals(new BytesRef("baz"))))
            .count();
    Assert.assertEquals("Unknown term found.", 0L, unknown);
  }
 /**
  * Returns the entry of {@code payloads} at the current position, or {@code null} when payloads
  * are disabled or the current position lies beyond the stored payloads.
  */
 @Override
 public BytesRef payload() {
   final boolean available = hasPayloads && curPos < payloads.size();
   if (!available) {
     return null;
   }
   return payloads.get(payloadSpare, curPos);
 }
 /**
  * Advances to the next buffered entry and returns it, or returns {@code null} once the position
  * has moved past the last entry. The position is advanced on every call.
  */
 @Override
 public BytesRef next() throws IOException {
   curPos++;
   if (curPos >= entries.size()) {
     return null;
   }
   // load the entry into the shared builder and hand out its current content
   entries.get(spare, curPos);
   return spare.get();
 }
  /**
   * Sorts the first {@code bytes.size()} slots of {@code indices} so that the values they point to
   * in {@code bytes} are ordered by {@link BytesRef#getUTF8SortedAsUnicodeComparator()}. Only the
   * index array is permuted.
   *
   * @param scratch builder used to load the left-hand value of each comparison
   * @param scratch1 builder used to load the right-hand value of each comparison
   * @param bytes the values being ordered
   * @param indices positions into {@code bytes}; must hold at least {@code bytes.size()} slots
   */
  private static void sort(
      final BytesRefBuilder scratch,
      final BytesRefBuilder scratch1,
      final BytesRefArray bytes,
      final int[] indices) {

    final int count = bytes.size();
    assert indices.length >= count;
    if (count <= 1) {
      // zero or one value is already in order
      return;
    }

    final Comparator<BytesRef> order = BytesRef.getUTF8SortedAsUnicodeComparator();
    final InPlaceMergeSorter sorter =
        new InPlaceMergeSorter() {
          @Override
          protected int compare(int i, int j) {
            // two separate builders are needed so both operands stay loaded at the same time
            final BytesRef left = bytes.get(scratch, indices[i]);
            final BytesRef right = bytes.get(scratch1, indices[j]);
            return order.compare(left, right);
          }

          @Override
          protected void swap(int i, int j) {
            final int held = indices[j];
            indices[j] = indices[i];
            indices[i] = held;
          }
        };
    sorter.sort(0, count);
  }
Example #5
0
 /**
  * Puts this {@link MemoryIndex} back into its initial, unfrozen state: all fields are dropped,
  * the default similarity is restored, and the block pools and payload storage are reset.
  */
 public void reset() {
   fields.clear();
   normSimilarity = IndexSearcher.getDefaultSimilarity();
   // byte blocks do not need to be zero-filled
   byteBlockPool.reset(false, false);
   // int blocks must be zero-filled because slices are used on them
   intBlockPool.reset(true, false);
   if (payloadsBytesRefs != null) {
     payloadsBytesRefs.clear();
   }
   frozen = false;
 }
 /**
  * Builds an iterator by draining {@code source} completely: every term is appended to the
  * buffered entries together with its weight and, when the source reports them, its payload and
  * contexts.
  *
  * @param source the iterator to read all entries from
  * @throws IOException if reading from {@code source} fails
  */
 public BufferedInputIterator(InputIterator source) throws IOException {
   hasPayloads = source.hasPayloads();
   hasContexts = source.hasContexts();
   int buffered = 0;
   for (BytesRef term = source.next(); term != null; term = source.next()) {
     entries.append(term);
     if (hasPayloads) {
       payloads.append(source.payload());
     }
     if (hasContexts) {
       contextSets.add(source.contexts());
     }
     // make room for one more weight when the array is full
     if (buffered >= freqs.length) {
       freqs = ArrayUtil.grow(freqs, freqs.length + 1);
     }
     freqs[buffered] = source.weight();
     buffered++;
   }
 }
 /**
  * Sorts {@code indices} by the values they reference in {@code bytes} and then compacts the
  * array so that its leading slots reference each distinct value once.
  *
  * @param bytes the values referenced by {@code indices}
  * @param indices positions into {@code bytes}; must hold at least {@code bytes.size()} slots and
  *     is modified in place
  * @return the number of leading slots of {@code indices} that reference distinct values
  */
 public static int sortAndDedup(final BytesRefArray bytes, final int[] indices) {
   final int total = bytes.size();
   assert indices.length >= total;
   if (total <= 1) {
     return total;
   }

   final BytesRefBuilder first = new BytesRefBuilder();
   final BytesRefBuilder second = new BytesRefBuilder();
   sort(first, second, bytes, indices);

   // Walk the sorted values, keeping an index whenever its value differs from its predecessor.
   BytesRefBuilder last = first;
   BytesRefBuilder probe = second;
   bytes.get(last, indices[0]);
   int kept = 1;
   int i = 1;
   while (i < total) {
     bytes.get(probe, indices[i]);
     final boolean duplicate = last.get().equals(probe.get());
     if (!duplicate) {
       indices[kept] = indices[i];
       kept++;
     }
     // the value just read becomes the predecessor; the other builder is reused for the next read
     final BytesRefBuilder recycled = last;
     last = probe;
     probe = recycled;
     i++;
   }
   return kept;
 }
Example #8
0
  /**
   * Iterates over the given token stream and adds the resulting terms to the index; Equivalent to
   * adding a tokenized, indexed, termVectorStored, unstored, Lucene {@link
   * org.apache.lucene.document.Field}. Finally closes the token stream. Note that untokenized
   * keywords can be added with this method via {@link #keywordTokenStream(Collection)}, the Lucene
   * <code>KeywordTokenizer</code> or similar utilities.
   *
   * <p>If a field with the same name was added before, the new tokens are appended to it:
   * positions continue at the previous last position plus {@code positionIncrementGap}, offsets
   * are shifted by the previous last offset plus {@code offsetGap}, and {@code boost} is
   * multiplied with the boost already stored for the field. A stream that yields no tokens leaves
   * the stored fields unchanged.
   *
   * @param fieldName a name to be associated with the text
   * @param tokenStream the token stream to retrieve tokens from. It's guaranteed to be closed no
   *     matter what.
   * @param boost the boost factor for hits for this field
   * @param positionIncrementGap the position increment gap if fields with the same name are added
   *     more than once
   * @param offsetGap the offset gap if fields with the same name are added more than once
   * @throws IllegalArgumentException if this index is frozen, if {@code fieldName} or {@code
   *     tokenStream} is null, or if {@code boost} is not greater than 0.0
   * @throws RuntimeException wrapping any {@link IOException} raised while consuming the stream
   * @see org.apache.lucene.document.Field#setBoost(float)
   */
  public void addField(
      String fieldName,
      TokenStream tokenStream,
      float boost,
      int positionIncrementGap,
      int offsetGap) {
    // The argument checks live inside the try-with-resources so that the stream is closed even
    // when validation fails.
    try (TokenStream stream = tokenStream) {
      if (frozen)
        throw new IllegalArgumentException("Cannot call addField() when MemoryIndex is frozen");
      if (fieldName == null) throw new IllegalArgumentException("fieldName must not be null");
      if (stream == null) throw new IllegalArgumentException("token stream must not be null");
      if (boost <= 0.0f)
        throw new IllegalArgumentException("boost factor must be greater than 0.0");
      int numTokens = 0;
      int numOverlapTokens = 0;
      int pos = -1;
      final BytesRefHash terms;
      final SliceByteStartArray sliceArray;
      Info info;
      long sumTotalTermFreq = 0;
      int offset = 0;
      FieldInfo fieldInfo;
      if ((info = fields.get(fieldName)) != null) {
        // Field already exists: continue from its counters, term hash and slices.
        fieldInfo = info.fieldInfo;
        numTokens = info.numTokens;
        numOverlapTokens = info.numOverlapTokens;
        pos = info.lastPosition + positionIncrementGap;
        offset = info.lastOffset + offsetGap;
        terms = info.terms;
        boost *= info.boost;
        sliceArray = info.sliceArray;
        sumTotalTermFreq = info.sumTotalTermFreq;
      } else {
        // First occurrence of this field name: create fresh field metadata and term storage.
        // The field number is the count of fields stored so far.
        fieldInfo =
            new FieldInfo(
                fieldName,
                fields.size(),
                true,
                false,
                this.storePayloads,
                this.storeOffsets
                    ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS
                    : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
                DocValuesType.NONE,
                -1,
                Collections.<String, String>emptyMap());
        sliceArray = new SliceByteStartArray(BytesRefHash.DEFAULT_CAPACITY);
        terms = new BytesRefHash(byteBlockPool, BytesRefHash.DEFAULT_CAPACITY, sliceArray);
      }

      // NOTE(review): the term attribute is fetched with getAttribute while the others use
      // addAttribute; presumably the stream is expected to already provide it - confirm.
      TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
      PositionIncrementAttribute posIncrAttribute =
          stream.addAttribute(PositionIncrementAttribute.class);
      OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
      PayloadAttribute payloadAtt =
          storePayloads ? stream.addAttribute(PayloadAttribute.class) : null;
      // ref is obtained once; fillBytesRef() is called for every token before ref is used, which
      // presumably refreshes its content with the current term.
      BytesRef ref = termAtt.getBytesRef();
      stream.reset();

      while (stream.incrementToken()) {
        termAtt.fillBytesRef();
        //        if (DEBUG) System.err.println("token='" + term + "'");
        numTokens++;
        final int posIncr = posIncrAttribute.getPositionIncrement();
        // a zero position increment is counted as an overlapping token
        if (posIncr == 0) numOverlapTokens++;
        pos += posIncr;
        int ord = terms.add(ref);
        if (ord < 0) {
          // A negative result is decoded to the ordinal of a term that is already present;
          // writing continues at the end of that term's existing slice.
          ord = (-ord) - 1;
          postingsWriter.reset(sliceArray.end[ord]);
        } else {
          // New term: start a new slice and remember where it begins.
          sliceArray.start[ord] = postingsWriter.startNewSlice();
        }
        sliceArray.freq[ord]++;
        sumTotalTermFreq++;
        // Per occurrence the writer receives: position, then (optionally) start and end offset,
        // then (optionally) the payload index.
        postingsWriter.writeInt(pos);
        if (storeOffsets) {
          postingsWriter.writeInt(offsetAtt.startOffset() + offset);
          postingsWriter.writeInt(offsetAtt.endOffset() + offset);
        }
        if (storePayloads) {
          final BytesRef payload = payloadAtt.getPayload();
          final int pIndex;
          if (payload == null || payload.length == 0) {
            // -1 marks an occurrence without payload
            pIndex = -1;
          } else {
            pIndex = payloadsBytesRefs.append(payload);
          }
          postingsWriter.writeInt(pIndex);
        }
        // remember where this term's postings currently end so later occurrences can append
        sliceArray.end[ord] = postingsWriter.getCurrentOffset();
      }
      stream.end();

      // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
      if (numTokens > 0) {
        // The end offset is read after stream.end() and becomes the field's last offset.
        fields.put(
            fieldName,
            new Info(
                fieldInfo,
                terms,
                sliceArray,
                numTokens,
                numOverlapTokens,
                boost,
                pos,
                offsetAtt.endOffset() + offset,
                sumTotalTermFreq));
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }