예제 #1
0
  /**
   * Returns a histogram that, for each field, approximates distribution of values in the group-by
   * and aggregation fields.
   *
   * <p>Assumes that:
   *
   * <ul>
   *   <li>Distributions of values in group-by fields are independent with each other
   *   <li>Aggregate values in different groups are distinct
   *   <li>
   * </ul>
   *
   * @param hist the input join distribution of field values
   * @param groupFlds the fields to group by. Can be empty, which means that all records are in a
   *     single group.
   * @param aggFns the aggregation functions. Optional, can be null.
   * @return a histogram that, for each field, approximates distribution of values in the group-by
   *     and aggregation fields
   */
  public static Histogram groupByHistogram(
      Histogram hist, Set<String> groupFlds, Set<AggregationFn> aggFns) {
    if (Double.compare(hist.recordsOutput(), 1.0) < 0) return new Histogram(hist.fields());

    double dvProd = 1.0; // the maximal number of group
    for (String fld : groupFlds) {
      double dv = 0.0;
      for (Bucket bkt : hist.buckets(fld)) dv += bkt.distinctValues();
      dvProd *= dv;
    }
    double numGroups = Math.min(dvProd, hist.recordsOutput());
    double gbReduction = numGroups / hist.recordsOutput();
    Histogram gbHist = new Histogram(groupFlds);
    for (String fld : groupFlds) {
      for (Bucket bkt : hist.buckets(fld)) {
        double newFreq = bkt.frequency() * gbReduction;
        if (Double.compare(newFreq, 1.0) < 0) continue;
        gbHist.addBucket(
            fld,
            new Bucket(bkt.valueRange(), newFreq, bkt.distinctValues(), bkt.valuePercentiles()));
      }
    }
    if (aggFns != null) {
      for (AggregationFn aggFn : aggFns) {
        String argFld = aggFn.argumentFieldName();
        String fld = aggFn.fieldName();
        if (aggFn instanceof SumFn)
          gbHist.addBucket(fld, sumBucket(hist.buckets(argFld), numGroups));
        else if (aggFn instanceof AvgFn)
          gbHist.addBucket(fld, avgBucket(hist.buckets(argFld), numGroups));
        else if (aggFn instanceof CountFn)
          gbHist.addBucket(fld, countBucket(hist.buckets(argFld), numGroups));
        else if (aggFn instanceof DistinctCountFn)
          gbHist.addBucket(fld, distinctCountBucket(hist.buckets(argFld), numGroups));
        else if (aggFn instanceof MinFn)
          gbHist.addBucket(fld, minBucket(hist.buckets(argFld), numGroups));
        else if (aggFn instanceof MaxFn)
          gbHist.addBucket(fld, maxBucket(hist.buckets(argFld), numGroups));
        else throw new UnsupportedOperationException();
      }
    }
    return syncHistogram(gbHist);
  }
예제 #2
0
 @Override
 public long recordsOutput() {
   return (long) hist.recordsOutput();
 }