private static Bucket countBucket(Collection<Bucket> dist, double numGroups) { Constant cntLow = new DoubleConstant(1.0); Double totalFreq = 0.0; for (Bucket bkt : dist) { totalFreq += bkt.frequency(); } double maxGroupSize = totalFreq - numGroups + 1; Constant cntHigh = new DoubleConstant(maxGroupSize); ConstantRange countRange = ConstantRange.newInstance(cntLow, true, cntHigh, true); // discard percentiles return new Bucket(countRange, numGroups, numGroups); }
private static Bucket avgBucket(Collection<Bucket> dist, double numGroups) { Constant avgLow = null, avgHigh = null; for (Bucket bkt : dist) { if (avgLow == null || bkt.valueRange().low().compareTo(avgLow) < 0) avgLow = bkt.valueRange().low(); if (avgHigh == null || bkt.valueRange().high().compareTo(avgHigh) > 0) avgHigh = bkt.valueRange().high(); } ConstantRange avgRange = ConstantRange.newInstance(avgLow, true, avgHigh, true); // discard percentiles return new Bucket(avgRange, numGroups, numGroups); }
private static Bucket distinctCountBucket(Collection<Bucket> dist, double numGroups) { Constant dcLow = new DoubleConstant(1.0); Double totalFreq = 0.0, dv = 0.0; for (Bucket bkt : dist) { totalFreq += bkt.frequency(); dv += bkt.distinctValues(); } double maxGroupSize = totalFreq - numGroups + 1; Constant dcHigh = new DoubleConstant(Math.min(maxGroupSize, dv)); ConstantRange distinctCountRange = ConstantRange.newInstance(dcLow, true, dcHigh, true); // discard percentiles return new Bucket(distinctCountRange, numGroups, numGroups); }
private static Bucket sumBucket(Collection<Bucket> dist, double numGroups) { Constant sumLow = null, sumHigh = new DoubleConstant(1.0); double totalFreq = 0.0; Map<Constant, Bucket> highs = new HashMap<Constant, Bucket>(); for (Bucket bkt : dist) { // estimate sumLow as the only one smallest value in a group if (sumLow == null || bkt.valueRange().low().compareTo(sumLow) < 0) sumLow = bkt.valueRange().low(); totalFreq += bkt.frequency(); highs.put(bkt.valueRange().high(), bkt); } SortedSet<Constant> desc = new TreeSet<Constant>(highs.keySet()).descendingSet(); // estimate sumHigh as the sum of top maxGroupSize values double maxGroupSize = totalFreq - numGroups + 1; double currSize = 0.0; for (Constant high : desc) { Bucket bkt = highs.get(high); double recsToSum = Math.min(bkt.frequency(), maxGroupSize - currSize); sumHigh = sumHigh.add(high.mul(new DoubleConstant(recsToSum))); currSize += recsToSum; if (Double.compare(currSize, maxGroupSize) >= 0) break; } ConstantRange sumRange = ConstantRange.newInstance(sumLow, true, sumHigh, true); // discard percentiles return new Bucket(sumRange, numGroups, numGroups); }
private static Bucket maxBucket(Collection<Bucket> dist, double numGroups) { Constant maxLow = null, maxHigh = null; Double dv = 0.0; for (Bucket bkt : dist) { if (maxLow == null || bkt.valueRange().low().compareTo(maxLow) < 0) maxLow = bkt.valueRange().low(); if (maxHigh == null || bkt.valueRange().high().compareTo(maxHigh) > 0) maxHigh = bkt.valueRange().high(); dv += bkt.distinctValues(); } ConstantRange maxRange = ConstantRange.newInstance(maxLow, true, maxHigh, true); // discard percentiles return new Bucket(maxRange, numGroups, Math.min(numGroups, dv)); }
/** * Returns a histogram that, for each field, approximates distribution of values in the group-by * and aggregation fields. * * <p>Assumes that: * * <ul> * <li>Distributions of values in group-by fields are independent with each other * <li>Aggregate values in different groups are distinct * <li> * </ul> * * @param hist the input join distribution of field values * @param groupFlds the fields to group by. Can be empty, which means that all records are in a * single group. * @param aggFns the aggregation functions. Optional, can be null. * @return a histogram that, for each field, approximates distribution of values in the group-by * and aggregation fields */ public static Histogram groupByHistogram( Histogram hist, Set<String> groupFlds, Set<AggregationFn> aggFns) { if (Double.compare(hist.recordsOutput(), 1.0) < 0) return new Histogram(hist.fields()); double dvProd = 1.0; // the maximal number of group for (String fld : groupFlds) { double dv = 0.0; for (Bucket bkt : hist.buckets(fld)) dv += bkt.distinctValues(); dvProd *= dv; } double numGroups = Math.min(dvProd, hist.recordsOutput()); double gbReduction = numGroups / hist.recordsOutput(); Histogram gbHist = new Histogram(groupFlds); for (String fld : groupFlds) { for (Bucket bkt : hist.buckets(fld)) { double newFreq = bkt.frequency() * gbReduction; if (Double.compare(newFreq, 1.0) < 0) continue; gbHist.addBucket( fld, new Bucket(bkt.valueRange(), newFreq, bkt.distinctValues(), bkt.valuePercentiles())); } } if (aggFns != null) { for (AggregationFn aggFn : aggFns) { String argFld = aggFn.argumentFieldName(); String fld = aggFn.fieldName(); if (aggFn instanceof SumFn) gbHist.addBucket(fld, sumBucket(hist.buckets(argFld), numGroups)); else if (aggFn instanceof AvgFn) gbHist.addBucket(fld, avgBucket(hist.buckets(argFld), numGroups)); else if (aggFn instanceof CountFn) gbHist.addBucket(fld, countBucket(hist.buckets(argFld), numGroups)); else if (aggFn instanceof DistinctCountFn) gbHist.addBucket(fld, distinctCountBucket(hist.buckets(argFld), numGroups)); else if (aggFn instanceof MinFn) gbHist.addBucket(fld, minBucket(hist.buckets(argFld), numGroups)); else if (aggFn instanceof MaxFn) gbHist.addBucket(fld, maxBucket(hist.buckets(argFld), numGroups)); else throw new UnsupportedOperationException(); } } return syncHistogram(gbHist); }