Example #1
  /**
   * Returns a histogram that, for each field, approximates distribution of values in the group-by
   * and aggregation fields.
   *
   * <p>Assumes that:
   *
   * <ul>
   *   <li>Distributions of values in group-by fields are independent of each other
   *   <li>Aggregate values in different groups are distinct
   * </ul>
   *
   * @param hist the input join distribution of field values
   * @param groupFlds the fields to group by. Can be empty, which means that all records are in a
   *     single group.
   * @param aggFns the aggregation functions. Optional, can be null.
   * @return a histogram that, for each field, approximates distribution of values in the group-by
   *     and aggregation fields
   */
  public static Histogram groupByHistogram(
      Histogram hist, Set<String> groupFlds, Set<AggregationFn> aggFns) {
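    // fewer than one expected output record, so there is nothing to estimate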
    if (Double.compare(hist.recordsOutput(), 1.0) < 0) return new Histogram(hist.fields());

    double dvProd = 1.0; // the maximal number of groups
    for (String fld : groupFlds) {
      double dv = 0.0;
      for (Bucket bkt : hist.buckets(fld)) dv += bkt.distinctValues();
      dvProd *= dv;
    }
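    // the number of groups cannot exceed the number of input records; scale each bucket accordingly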
    double numGroups = Math.min(dvProd, hist.recordsOutput());
    double gbReduction = numGroups / hist.recordsOutput();
    Histogram gbHist = new Histogram(groupFlds);
    for (String fld : groupFlds) {
      for (Bucket bkt : hist.buckets(fld)) {
        double newFreq = bkt.frequency() * gbReduction;
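        // drop buckets whose scaled frequency falls below one record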
        if (Double.compare(newFreq, 1.0) < 0) continue;
        gbHist.addBucket(
            fld,
            new Bucket(bkt.valueRange(), newFreq, bkt.distinctValues(), bkt.valuePercentiles()));
      }
    }
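    // estimate one output bucket per aggregation field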
    if (aggFns != null) {
      for (AggregationFn aggFn : aggFns) {
        String argFld = aggFn.argumentFieldName();
        String fld = aggFn.fieldName();
        if (aggFn instanceof SumFn)
          gbHist.addBucket(fld, sumBucket(hist.buckets(argFld), numGroups));
        else if (aggFn instanceof AvgFn)
          gbHist.addBucket(fld, avgBucket(hist.buckets(argFld), numGroups));
        else if (aggFn instanceof CountFn)
          gbHist.addBucket(fld, countBucket(hist.buckets(argFld), numGroups));
        else if (aggFn instanceof DistinctCountFn)
          gbHist.addBucket(fld, distinctCountBucket(hist.buckets(argFld), numGroups));
        else if (aggFn instanceof MinFn)
          gbHist.addBucket(fld, minBucket(hist.buckets(argFld), numGroups));
        else if (aggFn instanceof MaxFn)
          gbHist.addBucket(fld, maxBucket(hist.buckets(argFld), numGroups));
        else throw new UnsupportedOperationException();
      }
    }
    return syncHistogram(gbHist);
  }
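
The estimate hinges on numGroups and gbReduction; the following minimal sketch walks through the arithmetic with hypothetical cardinalities and record counts (none of these numbers come from the example above):

  double recordsOutput = 10_000.0;                    // records entering the group-by
  double dvProd = 50.0 * 4.0;                         // distinct values of two group-by fields
  double numGroups = Math.min(dvProd, recordsOutput); // 200 groups at most
  double gbReduction = numGroups / recordsOutput;     // 0.02
  // every bucket frequency is scaled by 0.02, and buckets whose scaled
  // frequency drops below 1 are discarded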
Example #2
 /**
  * Creates a group-by plan for the underlying query. The grouping is determined by the specified
  * collection of group fields, and the aggregation is computed by the specified collection of
  * aggregation functions.
  *
  * @param p a plan for the underlying query
  * @param groupFlds the fields to group by. Can be empty, which means that all records are in a
  *     single group.
  * @param aggFns the aggregation functions. Optional, can be null.
  * @param tx the calling transaction
  */
 public GroupByPlan(Plan p, Set<String> groupFlds, Set<AggregationFn> aggFns, Transaction tx) {
   schema = new Schema();
   this.groupFlds = groupFlds;
   if (!this.groupFlds.isEmpty()) {
     for (String fld : groupFlds) schema.add(fld, p.schema());
     // sort records by group-by fields with default direction
     sp = new SortPlan(p, new ArrayList<String>(groupFlds), tx);
   } else
     // all records form a single group, so no sorting is needed
     sp = p;
   this.aggFns = aggFns;
   if (aggFns != null)
     for (AggregationFn fn : aggFns) {
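       // the output field's type either follows the argument field or is fixed by the function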
       Type t =
           fn.isArgumentTypeDependent() ? p.schema().type(fn.argumentFieldName()) : fn.fieldType();
       schema.addField(fn.fieldName(), t);
     }
   hist = groupByHistogram(p.histogram(), this.groupFlds, aggFns);
 }
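
A hedged sketch of how a caller might build this plan; the table name, the field names, and the single-argument constructors of TablePlan, SumFn, and CountFn shown here are assumptions about the surrounding engine rather than details taken from the example:

  // tx is assumed to be an open Transaction; "account", "branch", and "balance" are hypothetical
  Plan tablePlan = new TablePlan("account", tx);                  // assumed constructor
  Set<String> groupFlds = new HashSet<>(Arrays.asList("branch"));
  Set<AggregationFn> aggFns = new HashSet<>();
  aggFns.add(new SumFn("balance"));                               // assumed single-argument constructor
  aggFns.add(new CountFn("balance"));
  Plan gbPlan = new GroupByPlan(tablePlan, groupFlds, aggFns, tx);
  // gbPlan.schema() now holds "branch" plus one field per aggregation function (fn.fieldName())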