Пример #1
0
 /**
  * Estimates the single bucket describing the result of a count aggregate.
  *
  * <p>The lower bound of the range is fixed at 1. The upper bound is the largest group size
  * considered possible: the total frequency of {@code dist} minus {@code numGroups} plus 1, i.e.
  * what is left for one group when each of the other groups holds a single record.
  *
  * @param dist the buckets of the aggregated (argument) field
  * @param numGroups the estimated number of groups; passed as both the second and third argument
  *     of the returned bucket (presumably its frequency and distinct-value count)
  * @return a bucket ranging from 1 to the maximum group size, built without percentiles
  */
 private static Bucket countBucket(Collection<Bucket> dist, double numGroups) {
   // Primitive accumulator: a boxed Double would be unboxed and re-boxed on every addition.
   double totalFreq = 0.0;
   for (Bucket bkt : dist) {
     totalFreq += bkt.frequency();
   }
   double maxGroupSize = totalFreq - numGroups + 1;
   Constant cntLow = new DoubleConstant(1.0);
   Constant cntHigh = new DoubleConstant(maxGroupSize);
   ConstantRange countRange = ConstantRange.newInstance(cntLow, true, cntHigh, true);
   // discard percentiles
   return new Bucket(countRange, numGroups, numGroups);
 }
Пример #2
0
 /**
  * Estimates the single bucket describing the result of an average aggregate.
  *
  * <p>The range spans from the smallest lower bound to the largest upper bound found among the
  * buckets of {@code dist}. If {@code dist} is empty, both bounds stay {@code null} and are handed
  * to {@code ConstantRange.newInstance} as such.
  *
  * @param dist the buckets of the aggregated (argument) field
  * @param numGroups the estimated number of groups; passed as both the second and third argument
  *     of the returned bucket (presumably its frequency and distinct-value count)
  * @return a bucket covering the overall value range of {@code dist}, built without percentiles
  */
 private static Bucket avgBucket(Collection<Bucket> dist, double numGroups) {
   Constant lowest = null;
   Constant highest = null;
   for (Bucket bucket : dist) {
     Constant lo = bucket.valueRange().low();
     if (lowest == null || lo.compareTo(lowest) < 0) {
       lowest = lo;
     }
     Constant hi = bucket.valueRange().high();
     if (highest == null || hi.compareTo(highest) > 0) {
       highest = hi;
     }
   }
   // Percentiles are intentionally not carried over.
   return new Bucket(
       ConstantRange.newInstance(lowest, true, highest, true), numGroups, numGroups);
 }
Пример #3
0
 /**
  * Estimates the single bucket describing the result of a distinct-count aggregate.
  *
  * <p>The lower bound of the range is fixed at 1. The upper bound is the smaller of (a) the
  * largest group size considered possible, computed as the total frequency of {@code dist} minus
  * {@code numGroups} plus 1, and (b) the sum of the distinct-value counts of all buckets in
  * {@code dist}.
  *
  * @param dist the buckets of the aggregated (argument) field
  * @param numGroups the estimated number of groups; passed as both the second and third argument
  *     of the returned bucket (presumably its frequency and distinct-value count)
  * @return a bucket ranging from 1 to the estimated maximum distinct count, built without
  *     percentiles
  */
 private static Bucket distinctCountBucket(Collection<Bucket> dist, double numGroups) {
   // Primitive accumulators: boxed Doubles would be unboxed and re-boxed on every addition.
   double totalFreq = 0.0;
   double dv = 0.0;
   for (Bucket bkt : dist) {
     totalFreq += bkt.frequency();
     dv += bkt.distinctValues();
   }
   double maxGroupSize = totalFreq - numGroups + 1;
   Constant dcLow = new DoubleConstant(1.0);
   // A group cannot hold more distinct values than it has records, nor more than exist overall.
   Constant dcHigh = new DoubleConstant(Math.min(maxGroupSize, dv));
   ConstantRange distinctCountRange = ConstantRange.newInstance(dcLow, true, dcHigh, true);
   // discard percentiles
   return new Bucket(distinctCountRange, numGroups, numGroups);
 }
Пример #4
0
 /**
  * Estimates the single bucket describing the result of a sum aggregate.
  *
  * <p>The lower bound is the smallest lower bound among the buckets of {@code dist} (a group made
  * of only the smallest value). The upper bound is obtained by walking the buckets in descending
  * order of their upper bound and adding {@code high * records} until the largest group size
  * considered possible (total frequency minus {@code numGroups} plus 1) has been consumed.
  *
  * <p>If {@code dist} is empty, the lower bound stays {@code null} and the upper bound stays 0.
  *
  * @param dist the buckets of the aggregated (argument) field
  * @param numGroups the estimated number of groups; passed as both the second and third argument
  *     of the returned bucket (presumably its frequency and distinct-value count)
  * @return a bucket ranging from the estimated smallest to the estimated largest group sum, built
  *     without percentiles
  */
 private static Bucket sumBucket(Collection<Bucket> dist, double numGroups) {
   Constant sumLow = null;
   // Start from the additive identity so the bound is exactly the sum of the selected values.
   Constant sumHigh = new DoubleConstant(0.0);
   double totalFreq = 0.0;
   // NOTE(review): buckets sharing the same upper bound overwrite each other in this map, so
   // only the last such bucket contributes its frequency below.
   Map<Constant, Bucket> highs = new HashMap<Constant, Bucket>();
   for (Bucket bkt : dist) {
     // estimate sumLow as the only one smallest value in a group
     if (sumLow == null || bkt.valueRange().low().compareTo(sumLow) < 0) {
       sumLow = bkt.valueRange().low();
     }
     totalFreq += bkt.frequency();
     highs.put(bkt.valueRange().high(), bkt);
   }
   SortedSet<Constant> desc = new TreeSet<Constant>(highs.keySet()).descendingSet();
   // estimate sumHigh as the sum of top maxGroupSize values
   double maxGroupSize = totalFreq - numGroups + 1;
   double currSize = 0.0;
   for (Constant high : desc) {
     Bucket bkt = highs.get(high);
     // Take the whole bucket, or only as many records as still fit in the largest group.
     double recsToSum = Math.min(bkt.frequency(), maxGroupSize - currSize);
     sumHigh = sumHigh.add(high.mul(new DoubleConstant(recsToSum)));
     currSize += recsToSum;
     if (Double.compare(currSize, maxGroupSize) >= 0) {
       break;
     }
   }
   ConstantRange sumRange = ConstantRange.newInstance(sumLow, true, sumHigh, true);
   // discard percentiles
   return new Bucket(sumRange, numGroups, numGroups);
 }
Пример #5
0
 /**
  * Estimates the single bucket describing the result of a max aggregate.
  *
  * <p>The range spans from the smallest lower bound to the largest upper bound found among the
  * buckets of {@code dist}. The third bucket argument (presumably the distinct-value count) is
  * capped at the sum of the distinct-value counts of {@code dist}. If {@code dist} is empty, both
  * bounds stay {@code null} and are handed to {@code ConstantRange.newInstance} as such.
  *
  * @param dist the buckets of the aggregated (argument) field
  * @param numGroups the estimated number of groups; passed as the second argument of the returned
  *     bucket (presumably its frequency)
  * @return a bucket covering the overall value range of {@code dist}, built without percentiles
  */
 private static Bucket maxBucket(Collection<Bucket> dist, double numGroups) {
   Constant maxLow = null;
   Constant maxHigh = null;
   // Primitive accumulator: a boxed Double would be unboxed and re-boxed on every addition.
   double dv = 0.0;
   for (Bucket bkt : dist) {
     if (maxLow == null || bkt.valueRange().low().compareTo(maxLow) < 0) {
       maxLow = bkt.valueRange().low();
     }
     if (maxHigh == null || bkt.valueRange().high().compareTo(maxHigh) > 0) {
       maxHigh = bkt.valueRange().high();
     }
     dv += bkt.distinctValues();
   }
   ConstantRange maxRange = ConstantRange.newInstance(maxLow, true, maxHigh, true);
   // discard percentiles
   return new Bucket(maxRange, numGroups, Math.min(numGroups, dv));
 }
Пример #6
0
  /**
   * Estimates the histogram that results from grouping {@code hist} by {@code groupFlds} and
   * evaluating {@code aggFns} over the groups.
   *
   * <p>Assumes that:
   *
   * <ul>
   *   <li>Distributions of values in group-by fields are independent of each other
   *   <li>Aggregate values in different groups are distinct
   * </ul>
   *
   * <p>The number of groups is estimated as the product, over all group-by fields, of the summed
   * distinct-value counts of that field's buckets, capped at the number of input records. Each
   * group-by bucket keeps its range, distinct values and percentiles, while its frequency is
   * scaled by {@code numGroups / recordsOutput}; buckets whose scaled frequency falls below 1 are
   * dropped. If the input holds fewer than one record, an empty histogram over the input fields
   * is returned.
   *
   * @param hist the input join distribution of field values
   * @param groupFlds the fields to group by. Can be empty, which means that all records are in a
   *     single group.
   * @param aggFns the aggregation functions. Optional, can be null.
   * @return a histogram that, for each field, approximates distribution of values in the group-by
   *     and aggregation fields
   * @throws UnsupportedOperationException if an aggregation function is not a sum, average,
   *     count, distinct-count, min or max function
   */
  public static Histogram groupByHistogram(
      Histogram hist, Set<String> groupFlds, Set<AggregationFn> aggFns) {
    if (Double.compare(hist.recordsOutput(), 1.0) < 0) {
      return new Histogram(hist.fields());
    }

    // Upper bound on the number of groups: product of per-field distinct-value totals.
    double maxGroups = 1.0;
    for (String groupFld : groupFlds) {
      double fieldDistinct = 0.0;
      for (Bucket bucket : hist.buckets(groupFld)) {
        fieldDistinct += bucket.distinctValues();
      }
      maxGroups *= fieldDistinct;
    }
    double numGroups = Math.min(maxGroups, hist.recordsOutput());
    double scale = numGroups / hist.recordsOutput();

    Histogram grouped = new Histogram(groupFlds);
    for (String groupFld : groupFlds) {
      for (Bucket bucket : hist.buckets(groupFld)) {
        double scaledFreq = bucket.frequency() * scale;
        if (Double.compare(scaledFreq, 1.0) >= 0) {
          grouped.addBucket(
              groupFld,
              new Bucket(
                  bucket.valueRange(),
                  scaledFreq,
                  bucket.distinctValues(),
                  bucket.valuePercentiles()));
        }
      }
    }

    if (aggFns == null) {
      return syncHistogram(grouped);
    }

    for (AggregationFn fn : aggFns) {
      String srcFld = fn.argumentFieldName();
      String outFld = fn.fieldName();
      final Bucket estimate;
      if (fn instanceof SumFn) {
        estimate = sumBucket(hist.buckets(srcFld), numGroups);
      } else if (fn instanceof AvgFn) {
        estimate = avgBucket(hist.buckets(srcFld), numGroups);
      } else if (fn instanceof CountFn) {
        estimate = countBucket(hist.buckets(srcFld), numGroups);
      } else if (fn instanceof DistinctCountFn) {
        estimate = distinctCountBucket(hist.buckets(srcFld), numGroups);
      } else if (fn instanceof MinFn) {
        estimate = minBucket(hist.buckets(srcFld), numGroups);
      } else if (fn instanceof MaxFn) {
        estimate = maxBucket(hist.buckets(srcFld), numGroups);
      } else {
        throw new UnsupportedOperationException();
      }
      grouped.addBucket(outFld, estimate);
    }
    return syncHistogram(grouped);
  }