/** * Returns a histogram that, for each field, approximates distribution of values in the group-by * and aggregation fields. * * <p>Assumes that: * * <ul> * <li>Distributions of values in group-by fields are independent with each other * <li>Aggregate values in different groups are distinct * <li> * </ul> * * @param hist the input join distribution of field values * @param groupFlds the fields to group by. Can be empty, which means that all records are in a * single group. * @param aggFns the aggregation functions. Optional, can be null. * @return a histogram that, for each field, approximates distribution of values in the group-by * and aggregation fields */ public static Histogram groupByHistogram( Histogram hist, Set<String> groupFlds, Set<AggregationFn> aggFns) { if (Double.compare(hist.recordsOutput(), 1.0) < 0) return new Histogram(hist.fields()); double dvProd = 1.0; // the maximal number of group for (String fld : groupFlds) { double dv = 0.0; for (Bucket bkt : hist.buckets(fld)) dv += bkt.distinctValues(); dvProd *= dv; } double numGroups = Math.min(dvProd, hist.recordsOutput()); double gbReduction = numGroups / hist.recordsOutput(); Histogram gbHist = new Histogram(groupFlds); for (String fld : groupFlds) { for (Bucket bkt : hist.buckets(fld)) { double newFreq = bkt.frequency() * gbReduction; if (Double.compare(newFreq, 1.0) < 0) continue; gbHist.addBucket( fld, new Bucket(bkt.valueRange(), newFreq, bkt.distinctValues(), bkt.valuePercentiles())); } } if (aggFns != null) { for (AggregationFn aggFn : aggFns) { String argFld = aggFn.argumentFieldName(); String fld = aggFn.fieldName(); if (aggFn instanceof SumFn) gbHist.addBucket(fld, sumBucket(hist.buckets(argFld), numGroups)); else if (aggFn instanceof AvgFn) gbHist.addBucket(fld, avgBucket(hist.buckets(argFld), numGroups)); else if (aggFn instanceof CountFn) gbHist.addBucket(fld, countBucket(hist.buckets(argFld), numGroups)); else if (aggFn instanceof DistinctCountFn) gbHist.addBucket(fld, 
distinctCountBucket(hist.buckets(argFld), numGroups)); else if (aggFn instanceof MinFn) gbHist.addBucket(fld, minBucket(hist.buckets(argFld), numGroups)); else if (aggFn instanceof MaxFn) gbHist.addBucket(fld, maxBucket(hist.buckets(argFld), numGroups)); else throw new UnsupportedOperationException(); } } return syncHistogram(gbHist); }
/**
 * Returns the record count reported by this object's {@code hist}, converted to {@code long}.
 *
 * @return the value of {@code hist.recordsOutput()} after a {@code (long)} cast
 */
@Override
public long recordsOutput() {
  final long recordCount = (long) hist.recordsOutput();
  return recordCount;
}