@Override public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals) throws IOException { final long numOrds = globalOrdinals.getValueCount(); final LongBitSet acceptedGlobalOrdinals = new LongBitSet(numOrds); final TermsEnum termEnum = globalOrdinals.termsEnum(); BytesRef term = termEnum.next(); while (term != null) { if (Math.floorMod( StringHelper.murmurhash3_x86_32(term, HASH_PARTITIONING_SEED), incNumPartitions) == incZeroBasedPartition) { acceptedGlobalOrdinals.set(termEnum.ord()); } term = termEnum.next(); } return acceptedGlobalOrdinals; }
OrdinalsCollector(HyperLogLogPlusPlus counts, RandomAccessOrds values, BigArrays bigArrays) { Preconditions.checkArgument(values.getValueCount() <= Integer.MAX_VALUE); maxOrd = (int) values.getValueCount(); this.bigArrays = bigArrays; this.counts = counts; this.values = values; visitedOrds = bigArrays.newObjectArray(1); }
@Override public void collect(int doc, long bucketOrd) { visitedOrds = bigArrays.grow(visitedOrds, bucketOrd + 1); FixedBitSet bits = visitedOrds.get(bucketOrd); if (bits == null) { bits = new FixedBitSet(maxOrd); visitedOrds.set(bucketOrd, bits); } values.setDocument(doc); final int valueCount = values.cardinality(); for (int i = 0; i < valueCount; ++i) { bits.set((int) values.ordAt(i)); } }
@Override public void postCollect() { final FixedBitSet allVisitedOrds = new FixedBitSet(maxOrd); for (long bucket = visitedOrds.size() - 1; bucket >= 0; --bucket) { final FixedBitSet bits = visitedOrds.get(bucket); if (bits != null) { allVisitedOrds.or(bits); } } final org.elasticsearch.common.hash.MurmurHash3.Hash128 hash = new org.elasticsearch.common.hash.MurmurHash3.Hash128(); try (LongArray hashes = bigArrays.newLongArray(maxOrd, false)) { for (int ord = allVisitedOrds.nextSetBit(0); ord != -1; ord = ord + 1 < maxOrd ? allVisitedOrds.nextSetBit(ord + 1) : -1) { final BytesRef value = values.lookupOrd(ord); org.elasticsearch.common.hash.MurmurHash3.hash128( value.bytes, value.offset, value.length, 0, hash); hashes.set(ord, hash.h1); } for (long bucket = visitedOrds.size() - 1; bucket >= 0; --bucket) { final FixedBitSet bits = visitedOrds.get(bucket); if (bits != null) { for (int ord = bits.nextSetBit(0); ord != -1; ord = ord + 1 < maxOrd ? bits.nextSetBit(ord + 1) : -1) { counts.collect(bucket, hashes.get(ord)); } } } } }
/** Computes which global ordinals are accepted by this IncludeExclude instance. */ @Override public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals) throws IOException { LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount()); TermsEnum globalTermsEnum; Terms globalTerms = new DocValuesTerms(globalOrdinals); // TODO: specialize based on compiled.type: for ALL and prefixes (sinkState >= 0 ) we can // avoid i/o and just set bits. globalTermsEnum = compiled.getTermsEnum(globalTerms); for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) { acceptedGlobalOrdinals.set(globalTermsEnum.ord()); } return acceptedGlobalOrdinals; }
private Collector createCollector(AtomicReaderContext reader) { // if rehash is false then the value source is either already hashed, or the user explicitly // requested not to hash the values (perhaps they already hashed the values themselves before // indexing the doc) // so we can just work with the original value source as is if (!rehash) { MurmurHash3Values hashValues = MurmurHash3Values.cast(((ValuesSource.Numeric) valuesSource).longValues()); return new DirectCollector(counts, hashValues); } if (valuesSource instanceof ValuesSource.Numeric) { ValuesSource.Numeric source = (ValuesSource.Numeric) valuesSource; MurmurHash3Values hashValues = source.isFloatingPoint() ? MurmurHash3Values.hash(source.doubleValues()) : MurmurHash3Values.hash(source.longValues()); return new DirectCollector(counts, hashValues); } if (valuesSource instanceof ValuesSource.Bytes.WithOrdinals) { ValuesSource.Bytes.WithOrdinals source = (ValuesSource.Bytes.WithOrdinals) valuesSource; final RandomAccessOrds ordinalValues = source.ordinalsValues(); final long maxOrd = ordinalValues.getValueCount(); if (maxOrd == 0) { return new EmptyCollector(); } final long ordinalsMemoryUsage = OrdinalsCollector.memoryOverhead(maxOrd); final long countsMemoryUsage = HyperLogLogPlusPlus.memoryUsage(precision); // only use ordinals if they don't increase memory usage by more than 25% if (ordinalsMemoryUsage < countsMemoryUsage / 4) { return new OrdinalsCollector(counts, ordinalValues, bigArrays); } } return new DirectCollector(counts, MurmurHash3Values.hash(valuesSource.bytesValues())); }
@Override public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals) throws IOException { LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount()); if (includeValues != null) { for (BytesRef term : includeValues) { long ord = globalOrdinals.lookupTerm(term); if (ord >= 0) { acceptedGlobalOrdinals.set(ord); } } } else if (acceptedGlobalOrdinals.length() > 0) { // default to all terms being acceptable acceptedGlobalOrdinals.set(0, acceptedGlobalOrdinals.length()); } if (excludeValues != null) { for (BytesRef term : excludeValues) { long ord = globalOrdinals.lookupTerm(term); if (ord >= 0) { acceptedGlobalOrdinals.clear(ord); } } } return acceptedGlobalOrdinals; }
public void testDuelGlobalOrdinals() throws Exception { Random random = getRandom(); final int numDocs = scaledRandomIntBetween(10, 1000); final int numValues = scaledRandomIntBetween(10, 500); final String[] values = new String[numValues]; for (int i = 0; i < numValues; ++i) { values[i] = new String(RandomStrings.randomAsciiOfLength(random, 10)); } for (int i = 0; i < numDocs; i++) { Document d = new Document(); final int numVals = randomInt(3); for (int j = 0; j < numVals; ++j) { final String value = RandomPicks.randomFrom(random, Arrays.asList(values)); d.add(new StringField("string", value, Field.Store.NO)); d.add(new SortedSetDocValuesField("bytes", new BytesRef(value))); } writer.addDocument(d); if (randomInt(10) == 0) { refreshReader(); } } refreshReader(); Map<FieldDataType, Type> typeMap = new HashMap<FieldDataType, DuelFieldDataTests.Type>(); typeMap.put( new FieldDataType("string", ImmutableSettings.builder().put("format", "fst")), Type.Bytes); typeMap.put( new FieldDataType("string", ImmutableSettings.builder().put("format", "paged_bytes")), Type.Bytes); typeMap.put( new FieldDataType("string", ImmutableSettings.builder().put("format", "doc_values")), Type.Bytes); for (Map.Entry<FieldDataType, Type> entry : typeMap.entrySet()) { ifdService.clear(); IndexOrdinalsFieldData fieldData = getForField(entry.getKey(), entry.getValue().name().toLowerCase(Locale.ROOT)); RandomAccessOrds left = fieldData.load(readerContext).getOrdinalsValues(); fieldData.clear(); RandomAccessOrds right = fieldData .loadGlobal(topLevelReader) .load(topLevelReader.leaves().get(0)) .getOrdinalsValues(); assertEquals(left.getValueCount(), right.getValueCount()); for (long ord = 0; ord < left.getValueCount(); ++ord) { assertEquals(left.lookupOrd(ord), right.lookupOrd(ord)); } } }
protected CommonSettings.MemoryStorageFormat chooseStorageFormat( LeafReader reader, PackedLongValues values, Ordinals build, RandomAccessOrds ordinals, long minValue, long maxValue, float acceptableOverheadRatio, int pageSize) { CommonSettings.MemoryStorageFormat format; // estimate memory usage for a single packed array long packedDelta = maxValue - minValue + 1; // allow for a missing value // valuesDelta can be negative if the difference between max and min values overflows the // positive side of longs. int bitsRequired = packedDelta < 0 ? 64 : PackedInts.bitsRequired(packedDelta); PackedInts.FormatAndBits formatAndBits = PackedInts.fastestFormatAndBits(reader.maxDoc(), bitsRequired, acceptableOverheadRatio); final long singleValuesSize = formatAndBits.format.longCount( PackedInts.VERSION_CURRENT, reader.maxDoc(), formatAndBits.bitsPerValue) * 8L; // ordinal memory usage final long ordinalsSize = build.ramBytesUsed() + values.ramBytesUsed(); // estimate the memory signature of paged packing long pagedSingleValuesSize = (reader.maxDoc() / pageSize + 1) * RamUsageEstimator.NUM_BYTES_OBJECT_REF; // array of pages int pageIndex = 0; long pageMinOrdinal = Long.MAX_VALUE; long pageMaxOrdinal = Long.MIN_VALUE; for (int i = 1; i < reader.maxDoc(); ++i, pageIndex = (pageIndex + 1) % pageSize) { ordinals.setDocument(i); if (ordinals.cardinality() > 0) { long ordinal = ordinals.ordAt(0); pageMaxOrdinal = Math.max(ordinal, pageMaxOrdinal); pageMinOrdinal = Math.min(ordinal, pageMinOrdinal); } if (pageIndex == pageSize - 1) { // end of page, we now know enough to estimate memory usage pagedSingleValuesSize += getPageMemoryUsage( values, acceptableOverheadRatio, pageSize, pageMinOrdinal, pageMaxOrdinal); pageMinOrdinal = Long.MAX_VALUE; pageMaxOrdinal = Long.MIN_VALUE; } } if (pageIndex > 0) { // last page estimation pageIndex++; pagedSingleValuesSize += getPageMemoryUsage( values, acceptableOverheadRatio, pageSize, pageMinOrdinal, pageMaxOrdinal); } if (ordinalsSize < singleValuesSize) { if (ordinalsSize < pagedSingleValuesSize) { format = CommonSettings.MemoryStorageFormat.ORDINALS; } else { format = CommonSettings.MemoryStorageFormat.PAGED; } } else { if (pagedSingleValuesSize < singleValuesSize) { format = CommonSettings.MemoryStorageFormat.PAGED; } else { format = CommonSettings.MemoryStorageFormat.PACKED; } } return format; }
@Override public AtomicNumericFieldData loadDirect(LeafReaderContext context) throws Exception { final LeafReader reader = context.reader(); Terms terms = reader.terms(getFieldNames().indexName()); AtomicNumericFieldData data = null; PackedArrayEstimator estimator = new PackedArrayEstimator( breakerService.getBreaker(CircuitBreaker.FIELDDATA), getNumericType(), getFieldNames().fullName()); if (terms == null) { data = AtomicLongFieldData.empty(reader.maxDoc()); estimator.adjustForNoTerms(data.ramBytesUsed()); return data; } // TODO: how can we guess the number of terms? numerics end up creating more terms per value... // Lucene encodes numeric data so that the lexicographical (encoded) order matches the integer // order so we know the sequence of // longs is going to be monotonically increasing final PackedLongValues.Builder valuesBuilder = PackedLongValues.monotonicBuilder(PackedInts.COMPACT); final float acceptableTransientOverheadRatio = fieldDataType .getSettings() .getAsFloat( "acceptable_transient_overhead_ratio", OrdinalsBuilder.DEFAULT_ACCEPTABLE_OVERHEAD_RATIO); TermsEnum termsEnum = estimator.beforeLoad(terms); assert !getNumericType().isFloatingPoint(); boolean success = false; try (OrdinalsBuilder builder = new OrdinalsBuilder(-1, reader.maxDoc(), acceptableTransientOverheadRatio)) { BytesRefIterator iter = builder.buildFromTerms(termsEnum); BytesRef term; while ((term = iter.next()) != null) { final long value = numericType.toLong(term); valuesBuilder.add(value); } final PackedLongValues values = valuesBuilder.build(); final Ordinals build = builder.build(fieldDataType.getSettings()); CommonSettings.MemoryStorageFormat formatHint = CommonSettings.getMemoryStorageHint(fieldDataType); RandomAccessOrds ordinals = build.ordinals(); if (FieldData.isMultiValued(ordinals) || formatHint == CommonSettings.MemoryStorageFormat.ORDINALS) { final long ramBytesUsed = build.ramBytesUsed() + values.ramBytesUsed(); data = new AtomicLongFieldData(ramBytesUsed) { @Override public SortedNumericDocValues getLongValues() { return withOrdinals(build, values, reader.maxDoc()); } @Override public Collection<Accountable> getChildResources() { List<Accountable> resources = new ArrayList<>(); resources.add(Accountables.namedAccountable("ordinals", build)); resources.add(Accountables.namedAccountable("values", values)); return Collections.unmodifiableList(resources); } }; } else { final BitSet docsWithValues = builder.buildDocsWithValuesSet(); long minV, maxV; minV = maxV = 0; if (values.size() > 0) { minV = values.get(0); maxV = values.get(values.size() - 1); } final float acceptableOverheadRatio = fieldDataType.getSettings().getAsFloat("acceptable_overhead_ratio", PackedInts.DEFAULT); final int pageSize = fieldDataType.getSettings().getAsInt("single_value_page_size", 1024); if (formatHint == null) { formatHint = chooseStorageFormat( reader, values, build, ordinals, minV, maxV, acceptableOverheadRatio, pageSize); } logger.trace( "single value format for field [{}] set to [{}]", getFieldNames().fullName(), formatHint); switch (formatHint) { case PACKED: // Encode document without a value with a special value long missingV = 0; if (docsWithValues != null) { if ((maxV - minV + 1) == values.size()) { // values are dense if (minV > Long.MIN_VALUE) { missingV = --minV; } else { assert maxV != Long.MAX_VALUE; missingV = ++maxV; } } else { for (long i = 1; i < values.size(); ++i) { if (values.get(i) > values.get(i - 1) + 1) { missingV = values.get(i - 1) + 1; break; } } } missingV -= minV; } final long missingValue = missingV; final long minValue = minV; final long maxValue = maxV; final long valuesDelta = maxValue - minValue; int bitsRequired = valuesDelta < 0 ? 64 : PackedInts.bitsRequired(valuesDelta); final PackedInts.Mutable sValues = PackedInts.getMutable(reader.maxDoc(), bitsRequired, acceptableOverheadRatio); if (docsWithValues != null) { sValues.fill(0, sValues.size(), missingV); } for (int i = 0; i < reader.maxDoc(); i++) { ordinals.setDocument(i); if (ordinals.cardinality() > 0) { final long ord = ordinals.ordAt(0); long value = values.get(ord); sValues.set(i, value - minValue); } } long ramBytesUsed = values.ramBytesUsed() + (docsWithValues == null ? 0 : docsWithValues.ramBytesUsed()); data = new AtomicLongFieldData(ramBytesUsed) { @Override public SortedNumericDocValues getLongValues() { if (docsWithValues == null) { return singles(sValues, minValue); } else { return sparseSingles(sValues, minValue, missingValue, reader.maxDoc()); } } @Override public Collection<Accountable> getChildResources() { List<Accountable> resources = new ArrayList<>(); resources.add(Accountables.namedAccountable("values", sValues)); if (docsWithValues != null) { resources.add( Accountables.namedAccountable("missing bitset", docsWithValues)); } return Collections.unmodifiableList(resources); } }; break; case PAGED: final PackedLongValues.Builder dpValues = PackedLongValues.deltaPackedBuilder(pageSize, acceptableOverheadRatio); long lastValue = 0; for (int i = 0; i < reader.maxDoc(); i++) { ordinals.setDocument(i); if (ordinals.cardinality() > 0) { final long ord = ordinals.ordAt(i); lastValue = values.get(ord); } dpValues.add(lastValue); } final PackedLongValues pagedValues = dpValues.build(); ramBytesUsed = pagedValues.ramBytesUsed(); if (docsWithValues != null) { ramBytesUsed += docsWithValues.ramBytesUsed(); } data = new AtomicLongFieldData(ramBytesUsed) { @Override public SortedNumericDocValues getLongValues() { return pagedSingles(pagedValues, docsWithValues); } @Override public Collection<Accountable> getChildResources() { List<Accountable> resources = new ArrayList<>(); resources.add(Accountables.namedAccountable("values", pagedValues)); if (docsWithValues != null) { resources.add( Accountables.namedAccountable("missing bitset", docsWithValues)); } return Collections.unmodifiableList(resources); } }; break; case ORDINALS: ramBytesUsed = build.ramBytesUsed() + values.ramBytesUsed(); data = new AtomicLongFieldData(ramBytesUsed) { @Override public SortedNumericDocValues getLongValues() { return withOrdinals(build, values, reader.maxDoc()); } @Override public Collection<Accountable> getChildResources() { List<Accountable> resources = new ArrayList<>(); resources.add(Accountables.namedAccountable("ordinals", build)); resources.add(Accountables.namedAccountable("values", values)); return Collections.unmodifiableList(resources); } }; break; default: throw new ElasticsearchException("unknown memory format: " + formatHint); } } success = true; return data; } finally { if (!success) { // If something went wrong, unwind any current estimations we've made estimator.afterLoad(termsEnum, 0); } else { // Adjust as usual, based on the actual size of the field data estimator.afterLoad(termsEnum, data.ramBytesUsed()); } } }