Example #1
0
  /**
   * Returns summary statistics for all attributes.
   *
   * <p>The method works in three passes: it first determines the measurement scale of every
   * column, then feeds every included cell into the per-attribute accumulators, and finally
   * converts the accumulators into one {@code StatisticsSummary} per attribute. Which measures a
   * summary carries depends on the scale of the attribute (nominal, ordinal, interval or ratio).
   *
   * @param <T> the value type the per-attribute data type is cast to when building a summary
   * @param listwiseDeletion A flag enabling list-wise deletion: if {@code true}, a row is skipped
   *     entirely when it is suppressed or when at least one of its cells is a null value
   * @return a map from attribute name to the summary statistics of that attribute; an attribute
   *     whose scale is none of NOMINAL, ORDINAL, INTERVAL or RATIO has no entry
   */
  @SuppressWarnings({"unchecked", "rawtypes"})
  public <T> Map<String, StatisticsSummary<?>> getSummaryStatistics(boolean listwiseDeletion) {

    Map<String, DescriptiveStatistics> statistics = new HashMap<String, DescriptiveStatistics>();
    Map<String, StatisticsSummaryOrdinal> ordinal = new HashMap<String, StatisticsSummaryOrdinal>();
    Map<String, DataScale> scales = new HashMap<String, DataScale>();

    // Detect scales
    for (int col = 0; col < handle.getNumColumns(); col++) {

      // Meta
      String attribute = handle.getAttributeName(col);
      DataType<?> type = handle.getDataType(attribute);

      // Scale
      DataScale scale = type.getDescription().getScale();

      // Try to replace nominal scale with ordinal scale based on base data type:
      // only for generalized attributes whose base type is not a plain string and
      // for which a hierarchy is available
      if (scale == DataScale.NOMINAL && handle.getGeneralization(attribute) != 0) {
        if (!(handle.getBaseDataType(attribute) instanceof ARXString)
            && getHierarchy(col, true) != null) {
          scale = DataScale.ORDINAL;
        }
      }

      // Store the scale and create one empty accumulator of each kind per attribute
      scales.put(attribute, scale);
      statistics.put(attribute, new DescriptiveStatistics());
      ordinal.put(
          attribute,
          getSummaryStatisticsOrdinal(
              handle.getGeneralization(attribute),
              handle.getDataType(attribute),
              handle.getBaseDataType(attribute),
              getHierarchy(col, true)));
    }

    // Compute summary statistics
    for (int row = 0; row < handle.getNumRows(); row++) {

      // Check, if we should include this row
      boolean include = true;
      if (listwiseDeletion) {
        // Suppression is a property of the row, so it is tested once instead of once per column
        if (handle.isSuppressed(row)) {
          include = false;
        } else {
          for (int col = 0; col < handle.getNumColumns(); col++) {
            if (DataType.isNull(handle.getValue(row, col))) {
              include = false;
              break;
            }
          }
        }
      }

      // Check
      checkInterrupt();

      // If yes, add
      if (include) {

        // For each column
        for (int col = 0; col < handle.getNumColumns(); col++) {

          // Meta
          String value = handle.getValue(row, col);
          String attribute = handle.getAttributeName(col);
          DataType<?> type = handle.getDataType(attribute);

          // Analyze. The null check comes first: calling value.equals(...) on a null
          // reference would throw before the null guard had a chance to run.
          // (presumably DataType.isNull accepts a null reference - verify)
          if (!DataType.isNull(value) && !value.equals(handle.getSuppressionString())) {
            ordinal.get(attribute).addValue(value);
            // Numeric measures are only collected for types that can be mapped to a double
            if (type instanceof DataTypeWithRatioScale) {
              statistics
                  .get(attribute)
                  .addValue(((DataTypeWithRatioScale) type).toDouble(type.parse(value)));
            }
          }
        }
      }
    }

    // Convert
    Map<String, StatisticsSummary<?>> result = new HashMap<String, StatisticsSummary<?>>();
    for (int col = 0; col < handle.getNumColumns(); col++) {

      // Check
      checkInterrupt();

      // Depending on scale
      String attribute = handle.getAttributeName(col);
      DataScale scale = scales.get(attribute);
      DataType<T> type = (DataType<T>) handle.getDataType(attribute);
      ordinal.get(attribute).analyze();
      if (scale == DataScale.NOMINAL) {
        // Nominal: number of measures and mode only
        StatisticsSummaryOrdinal stats = ordinal.get(attribute);
        result.put(
            attribute,
            new StatisticsSummary<T>(
                DataScale.NOMINAL,
                stats.getNumberOfMeasures(),
                stats.getMode(),
                type.parse(stats.getMode())));
      } else if (scale == DataScale.ORDINAL) {
        // Ordinal: additionally median, minimum and maximum
        StatisticsSummaryOrdinal stats = ordinal.get(attribute);
        result.put(
            attribute,
            new StatisticsSummary<T>(
                DataScale.ORDINAL,
                stats.getNumberOfMeasures(),
                stats.getMode(),
                type.parse(stats.getMode()),
                stats.getMedian(),
                type.parse(stats.getMedian()),
                stats.getMin(),
                type.parse(stats.getMin()),
                stats.getMax(),
                type.parse(stats.getMax())));
      } else if (scale == DataScale.INTERVAL) {
        // Interval: additionally mean, variances, standard deviation, range and kurtosis
        StatisticsSummaryOrdinal stats = ordinal.get(attribute);
        DescriptiveStatistics stats2 = statistics.get(attribute);
        // For date-valued attributes the dispersion measures are passed to toString(...)
        // with the period flag set
        boolean isPeriod = type.getDescription().getWrappedClass() == Date.class;

        // TODO: Something is wrong with commons math's kurtosis
        // NOTE(review): if DescriptiveStatistics is the Apache Commons Math class, its
        // kurtosis estimator is documented as bias-corrected and can be legitimately
        // negative, so mapping negatives to NaN may discard valid values - confirm.
        double kurtosis = stats2.getKurtosis();
        kurtosis = kurtosis < 0d ? Double.NaN : kurtosis;
        double range = stats2.getMax() - stats2.getMin();
        double stddev = Math.sqrt(stats2.getVariance());

        result.put(
            attribute,
            new StatisticsSummary<T>(
                DataScale.INTERVAL,
                stats.getNumberOfMeasures(),
                stats.getMode(),
                type.parse(stats.getMode()),
                stats.getMedian(),
                type.parse(stats.getMedian()),
                stats.getMin(),
                type.parse(stats.getMin()),
                stats.getMax(),
                type.parse(stats.getMax()),
                toString(type, stats2.getMean(), false, false),
                toValue(type, stats2.getMean()),
                stats2.getMean(),
                toString(type, stats2.getVariance(), isPeriod, true),
                toValue(type, stats2.getVariance()),
                stats2.getVariance(),
                toString(type, stats2.getPopulationVariance(), isPeriod, true),
                toValue(type, stats2.getPopulationVariance()),
                stats2.getPopulationVariance(),
                toString(type, stddev, isPeriod, false),
                toValue(type, stddev),
                stddev,
                toString(type, range, isPeriod, false),
                toValue(type, range),
                range,
                toString(type, kurtosis, isPeriod, false),
                toValue(type, kurtosis),
                kurtosis));
      } else if (scale == DataScale.RATIO) {
        // Ratio: everything reported for interval scale plus the geometric mean
        StatisticsSummaryOrdinal stats = ordinal.get(attribute);
        DescriptiveStatistics stats2 = statistics.get(attribute);

        // TODO: Something is wrong with commons math's kurtosis
        // NOTE(review): see the note in the INTERVAL branch; negative values may be valid.
        double kurtosis = stats2.getKurtosis();
        kurtosis = kurtosis < 0d ? Double.NaN : kurtosis;
        double range = stats2.getMax() - stats2.getMin();
        double stddev = Math.sqrt(stats2.getVariance());

        result.put(
            attribute,
            new StatisticsSummary<T>(
                DataScale.RATIO,
                stats.getNumberOfMeasures(),
                stats.getMode(),
                type.parse(stats.getMode()),
                stats.getMedian(),
                type.parse(stats.getMedian()),
                stats.getMin(),
                type.parse(stats.getMin()),
                stats.getMax(),
                type.parse(stats.getMax()),
                toString(type, stats2.getMean(), false, false),
                toValue(type, stats2.getMean()),
                stats2.getMean(),
                toString(type, stats2.getVariance(), false, false),
                toValue(type, stats2.getVariance()),
                stats2.getVariance(),
                toString(type, stats2.getPopulationVariance(), false, false),
                toValue(type, stats2.getPopulationVariance()),
                stats2.getPopulationVariance(),
                toString(type, stddev, false, false),
                toValue(type, stddev),
                stddev,
                toString(type, range, false, false),
                toValue(type, range),
                range,
                toString(type, kurtosis, false, false),
                toValue(type, kurtosis),
                kurtosis,
                toString(type, stats2.getGeometricMean(), false, false),
                toValue(type, stats2.getGeometricMean()),
                stats2.getGeometricMean()));
      }
    }

    return result;
  }