Exemplo n.º 1
1
    /**
     * Builds the set of global ordinals whose term belongs to this instance's hash partition.
     *
     * <p>Each term of {@code globalOrdinals} is hashed with 32-bit murmur3 seeded by {@code
     * HASH_PARTITIONING_SEED}. The ordinal is accepted when the hash, reduced with {@code
     * Math.floorMod} by {@code incNumPartitions}, equals {@code incZeroBasedPartition}.
     *
     * @param globalOrdinals the ordinals whose terms are enumerated
     * @return a bit set sized to the ordinal value count, with one bit set per accepted ordinal
     * @throws IOException if enumerating the terms fails
     */
    @Override
    public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals) throws IOException {
      final LongBitSet accepted = new LongBitSet(globalOrdinals.getValueCount());
      final TermsEnum terms = globalOrdinals.termsEnum();

      for (BytesRef current = terms.next(); current != null; current = terms.next()) {
        final int hash = StringHelper.murmurhash3_x86_32(current, HASH_PARTITIONING_SEED);
        // floorMod keeps the partition non-negative even when the hash is negative
        if (Math.floorMod(hash, incNumPartitions) == incZeroBasedPartition) {
          accepted.set(terms.ord());
        }
      }
      return accepted;
    }
 OrdinalsCollector(HyperLogLogPlusPlus counts, RandomAccessOrds values, BigArrays bigArrays) {
   Preconditions.checkArgument(values.getValueCount() <= Integer.MAX_VALUE);
   maxOrd = (int) values.getValueCount();
   this.bigArrays = bigArrays;
   this.counts = counts;
   this.values = values;
   visitedOrds = bigArrays.newObjectArray(1);
 }
 /**
  * Records every ordinal of document {@code doc} as visited for bucket {@code bucketOrd}.
  *
  * <p>The per-bucket bit set is created lazily the first time a bucket is seen.
  *
  * @param doc the document whose ordinals are read from {@code values}
  * @param bucketOrd the bucket the document was collected into
  */
 @Override
 public void collect(int doc, long bucketOrd) {
   // make sure the array has a slot for this bucket
   visitedOrds = bigArrays.grow(visitedOrds, bucketOrd + 1);
   FixedBitSet seen = visitedOrds.get(bucketOrd);
   if (seen == null) {
     seen = new FixedBitSet(maxOrd);
     visitedOrds.set(bucketOrd, seen);
   }

   values.setDocument(doc);
   final int count = values.cardinality();
   int index = 0;
   while (index < count) {
     seen.set((int) values.ordAt(index));
     index++;
   }
 }
    /**
     * Hashes every visited ordinal once and feeds the hashes into {@code counts} per bucket.
     *
     * <p>Runs in three passes: (1) union the per-bucket bit sets, (2) look up and hash the term
     * of each ordinal in the union, storing the first 64 bits of the 128-bit murmur3 hash, and
     * (3) replay each bucket's visited ordinals against the precomputed hashes.
     */
    @Override
    public void postCollect() {
      // pass 1: ordinals seen by at least one bucket
      final FixedBitSet union = new FixedBitSet(maxOrd);
      long bucket = visitedOrds.size();
      while (--bucket >= 0) {
        final FixedBitSet seen = visitedOrds.get(bucket);
        if (seen != null) {
          union.or(seen);
        }
      }

      final org.elasticsearch.common.hash.MurmurHash3.Hash128 scratch =
          new org.elasticsearch.common.hash.MurmurHash3.Hash128();
      try (LongArray hashes = bigArrays.newLongArray(maxOrd, false)) {
        // pass 2: hash each distinct term exactly once
        int ord = union.nextSetBit(0);
        while (ord != -1) {
          final BytesRef term = values.lookupOrd(ord);
          org.elasticsearch.common.hash.MurmurHash3.hash128(
              term.bytes, term.offset, term.length, 0, scratch);
          hashes.set(ord, scratch.h1);
          // only probe for a next bit while the start index is still inside the bit set
          final int next = ord + 1;
          ord = next < maxOrd ? union.nextSetBit(next) : -1;
        }

        // pass 3: hand each bucket the hashes of the ordinals it visited
        bucket = visitedOrds.size();
        while (--bucket >= 0) {
          final FixedBitSet seen = visitedOrds.get(bucket);
          if (seen == null) {
            continue;
          }
          int visited = seen.nextSetBit(0);
          while (visited != -1) {
            counts.collect(bucket, hashes.get(visited));
            final int next = visited + 1;
            visited = next < maxOrd ? seen.nextSetBit(next) : -1;
          }
        }
      }
    }
Exemplo n.º 5
0
 /**
  * Computes which global ordinals are accepted by this IncludeExclude instance.
  *
  * <p>The global ordinals are exposed as {@code Terms} and passed to {@code
  * compiled.getTermsEnum}; the ordinal of every term that enum returns is marked as accepted.
  *
  * @param globalOrdinals the ordinals to filter
  * @return a bit set sized to the ordinal value count, with one bit set per accepted ordinal
  * @throws IOException if enumerating the terms fails
  */
 @Override
 public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals) throws IOException {
   final LongBitSet accepted = new LongBitSet(globalOrdinals.getValueCount());
   // TODO: specialize based on compiled.type: for ALL and prefixes (sinkState >= 0 ) we can
   // avoid i/o and just set bits.
   final TermsEnum matching = compiled.getTermsEnum(new DocValuesTerms(globalOrdinals));
   while (matching.next() != null) {
     accepted.set(matching.ord());
   }
   return accepted;
 }
  /**
   * Selects the collector implementation for the current values source.
   *
   * <ul>
   *   <li>When {@code rehash} is false the values are used as hashes directly.
   *   <li>Numeric sources are hashed value by value (double or long, depending on the source).
   *   <li>Byte sources with ordinals use an {@code OrdinalsCollector} when its memory overhead is
   *       below a quarter of the HyperLogLog++ memory usage, and an {@code EmptyCollector} when
   *       there are no ordinals at all.
   *   <li>Everything else hashes the raw byte values.
   * </ul>
   *
   * @param reader the segment context; not read by this method
   * @return the collector to use
   */
  private Collector createCollector(AtomicReaderContext reader) {

    // if rehash is false then the value source is either already hashed, or the user explicitly
    // requested not to hash the values (perhaps they already hashed the values themselves before
    // indexing the doc), so the original value source can be used as is
    if (!rehash) {
      final ValuesSource.Numeric preHashed = (ValuesSource.Numeric) valuesSource;
      return new DirectCollector(counts, MurmurHash3Values.cast(preHashed.longValues()));
    }

    if (valuesSource instanceof ValuesSource.Numeric) {
      final ValuesSource.Numeric numeric = (ValuesSource.Numeric) valuesSource;
      final MurmurHash3Values hashed;
      if (numeric.isFloatingPoint()) {
        hashed = MurmurHash3Values.hash(numeric.doubleValues());
      } else {
        hashed = MurmurHash3Values.hash(numeric.longValues());
      }
      return new DirectCollector(counts, hashed);
    }

    if (valuesSource instanceof ValuesSource.Bytes.WithOrdinals) {
      final RandomAccessOrds ords =
          ((ValuesSource.Bytes.WithOrdinals) valuesSource).ordinalsValues();
      final long ordCount = ords.getValueCount();
      if (ordCount == 0) {
        return new EmptyCollector();
      }

      final long ordinalsCost = OrdinalsCollector.memoryOverhead(ordCount);
      final long sketchCost = HyperLogLogPlusPlus.memoryUsage(precision);
      // only use ordinals if they don't increase memory usage by more than 25%
      final boolean ordinalsAffordable = ordinalsCost < sketchCost / 4;
      if (ordinalsAffordable) {
        return new OrdinalsCollector(counts, ords, bigArrays);
      }
    }

    // fallback: hash the raw bytes of every value
    return new DirectCollector(counts, MurmurHash3Values.hash(valuesSource.bytesValues()));
  }
Exemplo n.º 7
0
 /**
  * Computes the accepted global ordinals from explicit include and exclude term sets.
  *
  * <p>With an include set, only the included terms that exist in {@code globalOrdinals} are
  * accepted; without one, every ordinal starts out accepted. Terms of the exclude set that exist
  * in {@code globalOrdinals} are then removed.
  *
  * @param globalOrdinals the ordinals used to resolve terms
  * @return a bit set sized to the ordinal value count, with one bit set per accepted ordinal
  * @throws IOException declared by the overridden method
  */
 @Override
 public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals) throws IOException {
   final LongBitSet accepted = new LongBitSet(globalOrdinals.getValueCount());

   if (includeValues == null) {
     // default to all terms being acceptable
     if (accepted.length() > 0) {
       accepted.set(0, accepted.length());
     }
   } else {
     for (BytesRef included : includeValues) {
       final long includedOrd = globalOrdinals.lookupTerm(included);
       // a negative result means the term has no ordinal here
       if (includedOrd >= 0) {
         accepted.set(includedOrd);
       }
     }
   }

   if (excludeValues != null) {
     for (BytesRef excluded : excludeValues) {
       final long excludedOrd = globalOrdinals.lookupTerm(excluded);
       if (excludedOrd >= 0) {
         accepted.clear(excludedOrd);
       }
     }
   }
   return accepted;
 }
  /**
   * Checks that global ordinals agree with the ordinals loaded directly from the reader.
   *
   * <p>Indexes random documents with up to three string values each (refreshing the reader at
   * random points), then for each of the {@code fst}, {@code paged_bytes} and {@code doc_values}
   * formats compares the value count and every term between the two ordinal views.
   */
  public void testDuelGlobalOrdinals() throws Exception {
    Random random = getRandom();
    final int numDocs = scaledRandomIntBetween(10, 1000);
    final int numValues = scaledRandomIntBetween(10, 500);

    // pool of candidate terms
    final String[] values = new String[numValues];
    for (int v = 0; v < values.length; v++) {
      values[v] = new String(RandomStrings.randomAsciiOfLength(random, 10));
    }
    final List<String> candidates = Arrays.asList(values);

    for (int docId = 0; docId < numDocs; docId++) {
      final Document doc = new Document();
      final int valuesInDoc = randomInt(3);
      for (int k = 0; k < valuesInDoc; k++) {
        final String picked = RandomPicks.randomFrom(random, candidates);
        doc.add(new StringField("string", picked, Field.Store.NO));
        doc.add(new SortedSetDocValuesField("bytes", new BytesRef(picked)));
      }
      writer.addDocument(doc);
      // occasionally refresh the reader while documents are still being added
      if (randomInt(10) == 0) {
        refreshReader();
      }
    }
    refreshReader();

    Map<FieldDataType, Type> typeMap = new HashMap<FieldDataType, DuelFieldDataTests.Type>();
    for (String format : new String[] {"fst", "paged_bytes", "doc_values"}) {
      typeMap.put(
          new FieldDataType("string", ImmutableSettings.builder().put("format", format)),
          Type.Bytes);
    }

    for (Map.Entry<FieldDataType, Type> entry : typeMap.entrySet()) {
      ifdService.clear();
      final String fieldName = entry.getValue().name().toLowerCase(Locale.ROOT);
      IndexOrdinalsFieldData fieldData = getForField(entry.getKey(), fieldName);

      RandomAccessOrds left = fieldData.load(readerContext).getOrdinalsValues();
      fieldData.clear();
      RandomAccessOrds right =
          fieldData
              .loadGlobal(topLevelReader)
              .load(topLevelReader.leaves().get(0))
              .getOrdinalsValues();

      assertEquals(left.getValueCount(), right.getValueCount());
      final long ordCount = left.getValueCount();
      long ord = 0;
      while (ord < ordCount) {
        assertEquals(left.lookupOrd(ord), right.lookupOrd(ord));
        ord++;
      }
    }
  }
  /**
   * Picks the in-memory layout with the smallest estimated footprint for a single-valued field.
   *
   * <p>Three estimates are compared:
   *
   * <ul>
   *   <li><b>PACKED</b>: one packed array with {@code reader.maxDoc()} entries, wide enough for
   *       {@code maxValue - minValue + 1} (the extra value leaves room for a "missing" marker);
   *   <li><b>ORDINALS</b>: the ordinals structure plus the distinct-values array;
   *   <li><b>PAGED</b>: one array of page references plus, for each page of {@code pageSize}
   *       documents, the result of {@code getPageMemoryUsage} for that page's minimum and maximum
   *       ordinal.
   * </ul>
   *
   * <p>Ties are resolved as in the comparisons below: PAGED wins over ORDINALS when it is not
   * larger, and PACKED wins over PAGED when PAGED is not strictly smaller.
   *
   * @param reader the segment reader; only {@code maxDoc()} is used
   * @param values the distinct values of the field
   * @param build the built ordinals, used for its memory usage
   * @param ordinals per-document ordinals; only the first ordinal of each document is read
   * @param minValue the smallest value of the field
   * @param maxValue the largest value of the field
   * @param acceptableOverheadRatio the packed-ints overhead ratio used for the estimates
   * @param pageSize the number of documents per page in the paged layout
   * @return the format with the smallest estimate
   */
  protected CommonSettings.MemoryStorageFormat chooseStorageFormat(
      LeafReader reader,
      PackedLongValues values,
      Ordinals build,
      RandomAccessOrds ordinals,
      long minValue,
      long maxValue,
      float acceptableOverheadRatio,
      int pageSize) {

    // estimate memory usage for a single packed array
    long packedDelta = maxValue - minValue + 1; // allow for a missing value
    // packedDelta can be negative if the difference between max and min values overflows the
    // positive side of longs.
    int bitsRequired = packedDelta < 0 ? 64 : PackedInts.bitsRequired(packedDelta);
    PackedInts.FormatAndBits formatAndBits =
        PackedInts.fastestFormatAndBits(reader.maxDoc(), bitsRequired, acceptableOverheadRatio);
    final long singleValuesSize =
        formatAndBits.format.longCount(
                PackedInts.VERSION_CURRENT, reader.maxDoc(), formatAndBits.bitsPerValue)
            * 8L;

    // ordinal memory usage
    final long ordinalsSize = build.ramBytesUsed() + values.ramBytesUsed();

    // estimate the memory signature of paged packing
    long pagedSingleValuesSize =
        (reader.maxDoc() / pageSize + 1) * RamUsageEstimator.NUM_BYTES_OBJECT_REF; // array of pages
    int pageIndex = 0;
    long pageMinOrdinal = Long.MAX_VALUE;
    long pageMaxOrdinal = Long.MIN_VALUE;
    // Start at document 0 so that every document is sampled and the estimated pages cover the
    // same document ranges as pages of pageSize documents counted from the first document.
    for (int i = 0; i < reader.maxDoc(); ++i, pageIndex = (pageIndex + 1) % pageSize) {
      ordinals.setDocument(i);
      if (ordinals.cardinality() > 0) {
        long ordinal = ordinals.ordAt(0);
        pageMaxOrdinal = Math.max(ordinal, pageMaxOrdinal);
        pageMinOrdinal = Math.min(ordinal, pageMinOrdinal);
      }
      if (pageIndex == pageSize - 1) {
        // end of page, we now know enough to estimate memory usage
        pagedSingleValuesSize +=
            getPageMemoryUsage(
                values, acceptableOverheadRatio, pageSize, pageMinOrdinal, pageMaxOrdinal);

        pageMinOrdinal = Long.MAX_VALUE;
        pageMaxOrdinal = Long.MIN_VALUE;
      }
    }

    if (pageIndex > 0) {
      // a partially filled last page remains: estimate it as well
      pagedSingleValuesSize +=
          getPageMemoryUsage(
              values, acceptableOverheadRatio, pageSize, pageMinOrdinal, pageMaxOrdinal);
    }

    if (ordinalsSize < singleValuesSize) {
      // ordinals beat the single packed array; check them against paging
      return ordinalsSize < pagedSingleValuesSize
          ? CommonSettings.MemoryStorageFormat.ORDINALS
          : CommonSettings.MemoryStorageFormat.PAGED;
    }
    // the single packed array is at least as small as ordinals; check it against paging
    return pagedSingleValuesSize < singleValuesSize
        ? CommonSettings.MemoryStorageFormat.PAGED
        : CommonSettings.MemoryStorageFormat.PACKED;
  }
  /**
   * Loads the numeric field data of one segment by reading the field's indexed terms.
   *
   * <p>When the field has no terms, empty field data is returned. Otherwise the terms are
   * converted to longs and ordinals are built. Multi-valued fields, and fields whose storage hint
   * is {@code ORDINALS}, are served through ordinals. Single-valued fields use the hinted format
   * or, when no hint is configured, the one picked by {@code chooseStorageFormat}:
   *
   * <ul>
   *   <li>{@code PACKED}: one packed array of {@code value - minValue}, with a reserved value
   *       for documents that have no value;
   *   <li>{@code PAGED}: delta-packed pages in which a document without a value repeats the
   *       previous document's value;
   *   <li>{@code ORDINALS}: the ordinals plus the distinct values.
   * </ul>
   *
   * <p>The circuit-breaker estimator is always adjusted in the {@code finally} block: with the
   * actual size on success, with {@code 0} on failure.
   *
   * @param context the segment to load
   * @return the loaded field data
   * @throws Exception if reading the terms or building the structures fails
   */
  @Override
  public AtomicNumericFieldData loadDirect(LeafReaderContext context) throws Exception {
    final LeafReader reader = context.reader();
    Terms terms = reader.terms(getFieldNames().indexName());
    AtomicNumericFieldData data = null;
    PackedArrayEstimator estimator =
        new PackedArrayEstimator(
            breakerService.getBreaker(CircuitBreaker.FIELDDATA),
            getNumericType(),
            getFieldNames().fullName());
    if (terms == null) {
      // the field has no terms in this segment: nothing to load
      data = AtomicLongFieldData.empty(reader.maxDoc());
      estimator.adjustForNoTerms(data.ramBytesUsed());
      return data;
    }
    // TODO: how can we guess the number of terms? numerics end up creating more terms per value...
    // Lucene encodes numeric data so that the lexicographical (encoded) order matches the integer
    // order so we know the sequence of
    // longs is going to be monotonically increasing
    final PackedLongValues.Builder valuesBuilder =
        PackedLongValues.monotonicBuilder(PackedInts.COMPACT);

    final float acceptableTransientOverheadRatio =
        fieldDataType
            .getSettings()
            .getAsFloat(
                "acceptable_transient_overhead_ratio",
                OrdinalsBuilder.DEFAULT_ACCEPTABLE_OVERHEAD_RATIO);
    TermsEnum termsEnum = estimator.beforeLoad(terms);
    assert !getNumericType().isFloatingPoint();
    boolean success = false;
    try (OrdinalsBuilder builder =
        new OrdinalsBuilder(-1, reader.maxDoc(), acceptableTransientOverheadRatio)) {
      // collect the distinct values (in term order) while the builder records doc -> ordinal
      BytesRefIterator iter = builder.buildFromTerms(termsEnum);
      BytesRef term;
      while ((term = iter.next()) != null) {
        final long value = numericType.toLong(term);
        valuesBuilder.add(value);
      }
      final PackedLongValues values = valuesBuilder.build();
      final Ordinals build = builder.build(fieldDataType.getSettings());
      CommonSettings.MemoryStorageFormat formatHint =
          CommonSettings.getMemoryStorageHint(fieldDataType);

      RandomAccessOrds ordinals = build.ordinals();
      if (FieldData.isMultiValued(ordinals)
          || formatHint == CommonSettings.MemoryStorageFormat.ORDINALS) {
        // multi-valued fields (or an explicit ORDINALS hint) are always served through ordinals
        final long ramBytesUsed = build.ramBytesUsed() + values.ramBytesUsed();
        data =
            new AtomicLongFieldData(ramBytesUsed) {

              @Override
              public SortedNumericDocValues getLongValues() {
                return withOrdinals(build, values, reader.maxDoc());
              }

              @Override
              public Collection<Accountable> getChildResources() {
                List<Accountable> resources = new ArrayList<>();
                resources.add(Accountables.namedAccountable("ordinals", build));
                resources.add(Accountables.namedAccountable("values", values));
                return Collections.unmodifiableList(resources);
              }
            };
      } else {
        // single-valued field: each document has at most one ordinal, at index 0
        final BitSet docsWithValues = builder.buildDocsWithValuesSet();

        // values are sorted, so the first and last entries are the minimum and maximum
        long minV, maxV;
        minV = maxV = 0;
        if (values.size() > 0) {
          minV = values.get(0);
          maxV = values.get(values.size() - 1);
        }

        final float acceptableOverheadRatio =
            fieldDataType.getSettings().getAsFloat("acceptable_overhead_ratio", PackedInts.DEFAULT);
        final int pageSize = fieldDataType.getSettings().getAsInt("single_value_page_size", 1024);

        if (formatHint == null) {
          formatHint =
              chooseStorageFormat(
                  reader, values, build, ordinals, minV, maxV, acceptableOverheadRatio, pageSize);
        }

        logger.trace(
            "single value format for field [{}] set to [{}]",
            getFieldNames().fullName(),
            formatHint);

        switch (formatHint) {
          case PACKED:
            // Encode document without a value with a special value
            long missingV = 0;
            if (docsWithValues != null) {
              if ((maxV - minV + 1) == values.size()) {
                // values are dense: no gap to reuse, so extend the range by one on either side
                if (minV > Long.MIN_VALUE) {
                  missingV = --minV;
                } else {
                  assert maxV != Long.MAX_VALUE;
                  missingV = ++maxV;
                }
              } else {
                // values are sparse: use the first gap between two consecutive values
                for (long i = 1; i < values.size(); ++i) {
                  if (values.get(i) > values.get(i - 1) + 1) {
                    missingV = values.get(i - 1) + 1;
                    break;
                  }
                }
              }
              // stored entries are offsets from minV, so the marker is too
              missingV -= minV;
            }
            final long missingValue = missingV;
            final long minValue = minV;
            final long maxValue = maxV;

            final long valuesDelta = maxValue - minValue;
            // a negative delta means the subtraction overflowed: use the full 64 bits
            int bitsRequired = valuesDelta < 0 ? 64 : PackedInts.bitsRequired(valuesDelta);
            final PackedInts.Mutable sValues =
                PackedInts.getMutable(reader.maxDoc(), bitsRequired, acceptableOverheadRatio);

            if (docsWithValues != null) {
              sValues.fill(0, sValues.size(), missingV);
            }

            for (int i = 0; i < reader.maxDoc(); i++) {
              ordinals.setDocument(i);
              if (ordinals.cardinality() > 0) {
                final long ord = ordinals.ordAt(0);
                long value = values.get(ord);
                sValues.set(i, value - minValue);
              }
            }
            // NOTE(review): this accounts for `values`, while the structure retained and reported
            // by getChildResources below is `sValues` - confirm which one was intended.
            long ramBytesUsed =
                values.ramBytesUsed()
                    + (docsWithValues == null ? 0 : docsWithValues.ramBytesUsed());
            data =
                new AtomicLongFieldData(ramBytesUsed) {

                  @Override
                  public SortedNumericDocValues getLongValues() {
                    if (docsWithValues == null) {
                      return singles(sValues, minValue);
                    } else {
                      return sparseSingles(sValues, minValue, missingValue, reader.maxDoc());
                    }
                  }

                  @Override
                  public Collection<Accountable> getChildResources() {
                    List<Accountable> resources = new ArrayList<>();
                    resources.add(Accountables.namedAccountable("values", sValues));
                    if (docsWithValues != null) {
                      resources.add(
                          Accountables.namedAccountable("missing bitset", docsWithValues));
                    }
                    return Collections.unmodifiableList(resources);
                  }
                };
            break;
          case PAGED:
            final PackedLongValues.Builder dpValues =
                PackedLongValues.deltaPackedBuilder(pageSize, acceptableOverheadRatio);

            // a document without a value repeats the previous value, which keeps deltas small
            long lastValue = 0;
            for (int i = 0; i < reader.maxDoc(); i++) {
              ordinals.setDocument(i);
              if (ordinals.cardinality() > 0) {
                // The argument of ordAt is an index into the current document's values, not a
                // document id; this branch is single-valued, so the only valid index is 0.
                final long ord = ordinals.ordAt(0);
                lastValue = values.get(ord);
              }
              dpValues.add(lastValue);
            }
            final PackedLongValues pagedValues = dpValues.build();
            ramBytesUsed = pagedValues.ramBytesUsed();
            if (docsWithValues != null) {
              ramBytesUsed += docsWithValues.ramBytesUsed();
            }
            data =
                new AtomicLongFieldData(ramBytesUsed) {

                  @Override
                  public SortedNumericDocValues getLongValues() {
                    return pagedSingles(pagedValues, docsWithValues);
                  }

                  @Override
                  public Collection<Accountable> getChildResources() {
                    List<Accountable> resources = new ArrayList<>();
                    resources.add(Accountables.namedAccountable("values", pagedValues));
                    if (docsWithValues != null) {
                      resources.add(
                          Accountables.namedAccountable("missing bitset", docsWithValues));
                    }
                    return Collections.unmodifiableList(resources);
                  }
                };
            break;
          case ORDINALS:
            ramBytesUsed = build.ramBytesUsed() + values.ramBytesUsed();
            data =
                new AtomicLongFieldData(ramBytesUsed) {

                  @Override
                  public SortedNumericDocValues getLongValues() {
                    return withOrdinals(build, values, reader.maxDoc());
                  }

                  @Override
                  public Collection<Accountable> getChildResources() {
                    List<Accountable> resources = new ArrayList<>();
                    resources.add(Accountables.namedAccountable("ordinals", build));
                    resources.add(Accountables.namedAccountable("values", values));
                    return Collections.unmodifiableList(resources);
                  }
                };
            break;
          default:
            throw new ElasticsearchException("unknown memory format: " + formatHint);
        }
      }

      success = true;
      return data;
    } finally {
      if (!success) {
        // If something went wrong, unwind any current estimations we've made
        estimator.afterLoad(termsEnum, 0);
      } else {
        // Adjust as usual, based on the actual size of the field data
        estimator.afterLoad(termsEnum, data.ramBytesUsed());
      }
    }
  }