/** * Returns a histogram that, for each field, approximates distribution of values in the group-by * and aggregation fields. * * <p>Assumes that: * * <ul> * <li>Distributions of values in group-by fields are independent with each other * <li>Aggregate values in different groups are distinct * <li> * </ul> * * @param hist the input join distribution of field values * @param groupFlds the fields to group by. Can be empty, which means that all records are in a * single group. * @param aggFns the aggregation functions. Optional, can be null. * @return a histogram that, for each field, approximates distribution of values in the group-by * and aggregation fields */ public static Histogram groupByHistogram( Histogram hist, Set<String> groupFlds, Set<AggregationFn> aggFns) { if (Double.compare(hist.recordsOutput(), 1.0) < 0) return new Histogram(hist.fields()); double dvProd = 1.0; // the maximal number of group for (String fld : groupFlds) { double dv = 0.0; for (Bucket bkt : hist.buckets(fld)) dv += bkt.distinctValues(); dvProd *= dv; } double numGroups = Math.min(dvProd, hist.recordsOutput()); double gbReduction = numGroups / hist.recordsOutput(); Histogram gbHist = new Histogram(groupFlds); for (String fld : groupFlds) { for (Bucket bkt : hist.buckets(fld)) { double newFreq = bkt.frequency() * gbReduction; if (Double.compare(newFreq, 1.0) < 0) continue; gbHist.addBucket( fld, new Bucket(bkt.valueRange(), newFreq, bkt.distinctValues(), bkt.valuePercentiles())); } } if (aggFns != null) { for (AggregationFn aggFn : aggFns) { String argFld = aggFn.argumentFieldName(); String fld = aggFn.fieldName(); if (aggFn instanceof SumFn) gbHist.addBucket(fld, sumBucket(hist.buckets(argFld), numGroups)); else if (aggFn instanceof AvgFn) gbHist.addBucket(fld, avgBucket(hist.buckets(argFld), numGroups)); else if (aggFn instanceof CountFn) gbHist.addBucket(fld, countBucket(hist.buckets(argFld), numGroups)); else if (aggFn instanceof DistinctCountFn) gbHist.addBucket(fld, 
distinctCountBucket(hist.buckets(argFld), numGroups)); else if (aggFn instanceof MinFn) gbHist.addBucket(fld, minBucket(hist.buckets(argFld), numGroups)); else if (aggFn instanceof MaxFn) gbHist.addBucket(fld, maxBucket(hist.buckets(argFld), numGroups)); else throw new UnsupportedOperationException(); } } return syncHistogram(gbHist); }
/**
 * Builds a group-by plan on top of the given plan. Records are grouped by the given collection
 * of group fields, and each group is summarised by the given collection of aggregation
 * functions.
 *
 * <p>The output schema holds the group-by fields (with the definitions they have in {@code p}'s
 * schema) followed by one field per aggregation function. The plan's histogram is estimated from
 * {@code p}'s histogram via {@code groupByHistogram}.
 *
 * @param p a plan for the underlying query
 * @param groupFlds the fields to group by. Can be empty, which means that all records are in a
 *     single group.
 * @param aggFns the aggregation functions. Optional, can be null.
 * @param tx the calling transaction
 */
public GroupByPlan(Plan p, Set<String> groupFlds, Set<AggregationFn> aggFns, Transaction tx) {
  this.schema = new Schema();
  this.groupFlds = groupFlds;

  if (this.groupFlds.isEmpty()) {
    // all records are in a single group, so p is already sorted
    this.sp = p;
  } else {
    for (String groupFld : groupFlds) {
      this.schema.add(groupFld, p.schema());
    }
    // sort records by group-by fields with default direction
    ArrayList<String> sortFlds = new ArrayList<String>(groupFlds);
    this.sp = new SortPlan(p, sortFlds, tx);
  }

  this.aggFns = aggFns;
  if (aggFns != null) {
    for (AggregationFn aggFn : aggFns) {
      // The result type either follows the argument field's type or is fixed by the function.
      Type resultType;
      if (aggFn.isArgumentTypeDependent()) {
        resultType = p.schema().type(aggFn.argumentFieldName());
      } else {
        resultType = aggFn.fieldType();
      }
      this.schema.addField(aggFn.fieldName(), resultType);
    }
  }

  this.hist = groupByHistogram(p.histogram(), this.groupFlds, aggFns);
}