Beispiel #1
0
  /**
   * Returns summary statistics for all attributes.
   *
   * <p>The method works in three passes:
   *
   * <ol>
   *   <li>For every column, the scale of measure is read from the description of the column's data
   *       type. A nominal attribute with a generalization level other than zero is treated as
   *       ordinal if its base data type is not an {@code ARXString} and a hierarchy is available.
   *   <li>All rows are scanned. Values that equal the suppression string or that are considered
   *       null by {@code DataType.isNull} are ignored. Every other value is added to the ordinal
   *       statistics of its attribute and, if the data type is a {@code DataTypeWithRatioScale},
   *       also to the numeric statistics.
   *   <li>The collected statistics are converted into one {@code StatisticsSummary} per attribute,
   *       with the set of reported measures depending on the scale.
   * </ol>
   *
   * @param listwiseDeletion A flag enabling list-wise deletion: if {@code true}, a row is skipped
   *     entirely when it is suppressed or when any of its cells is null
   * @return A map from attribute name to its summary statistics, containing an entry for every
   *     attribute with a nominal, ordinal, interval or ratio scale
   */
  @SuppressWarnings({"unchecked", "rawtypes"})
  public <T> Map<String, StatisticsSummary<?>> getSummaryStatistics(boolean listwiseDeletion) {

    Map<String, DescriptiveStatistics> statistics = new HashMap<String, DescriptiveStatistics>();
    Map<String, StatisticsSummaryOrdinal> ordinal = new HashMap<String, StatisticsSummaryOrdinal>();
    Map<String, DataScale> scales = new HashMap<String, DataScale>();

    // Detect scales
    for (int col = 0; col < handle.getNumColumns(); col++) {

      // Meta
      String attribute = handle.getAttributeName(col);
      DataType<?> type = handle.getDataType(attribute);

      // Scale
      DataScale scale = type.getDescription().getScale();

      // Try to replace nominal scale with ordinal scale based on base data type
      if (scale == DataScale.NOMINAL && handle.getGeneralization(attribute) != 0) {
        if (!(handle.getBaseDataType(attribute) instanceof ARXString)
            && getHierarchy(col, true) != null) {
          scale = DataScale.ORDINAL;
        }
      }

      // Store
      scales.put(attribute, scale);
      statistics.put(attribute, new DescriptiveStatistics());
      ordinal.put(
          attribute,
          getSummaryStatisticsOrdinal(
              handle.getGeneralization(attribute),
              handle.getDataType(attribute),
              handle.getBaseDataType(attribute),
              getHierarchy(col, true)));
    }

    // The suppression string does not depend on the row or column, so fetch it only once
    String suppressionString = handle.getSuppressionString();

    // Compute summary statistics
    for (int row = 0; row < handle.getNumRows(); row++) {

      // Check, if we should include this row
      boolean include = true;
      if (listwiseDeletion) {
        if (handle.isSuppressed(row)) {
          // Suppression is a property of the row, so it is checked once, not once per column
          include = false;
        } else {
          for (int col = 0; col < handle.getNumColumns(); col++) {
            if (DataType.isNull(handle.getValue(row, col))) {
              include = false;
              break;
            }
          }
        }
      }

      // Check
      checkInterrupt();

      // If yes, add
      if (include) {

        // For each column
        for (int col = 0; col < handle.getNumColumns(); col++) {

          // Meta
          String value = handle.getValue(row, col);
          String attribute = handle.getAttributeName(col);
          DataType<?> type = handle.getDataType(attribute);

          // Analyze. The null check comes first so that equals() is never invoked on a
          // null reference (assumes DataType.isNull tolerates null - TODO confirm)
          if (!DataType.isNull(value) && !value.equals(suppressionString)) {
            ordinal.get(attribute).addValue(value);
            if (type instanceof DataTypeWithRatioScale) {
              statistics
                  .get(attribute)
                  .addValue(((DataTypeWithRatioScale) type).toDouble(type.parse(value)));
            }
          }
        }
      }
    }

    // Convert
    Map<String, StatisticsSummary<?>> result = new HashMap<String, StatisticsSummary<?>>();
    for (int col = 0; col < handle.getNumColumns(); col++) {

      // Check
      checkInterrupt();

      // Depending on scale
      String attribute = handle.getAttributeName(col);
      DataScale scale = scales.get(attribute);
      DataType<T> type = (DataType<T>) handle.getDataType(attribute);

      // The ordinal statistics are analyzed for every attribute, regardless of its scale
      StatisticsSummaryOrdinal stats = ordinal.get(attribute);
      stats.analyze();

      if (scale == DataScale.NOMINAL) {

        // Nominal: number of measures and mode only
        result.put(
            attribute,
            new StatisticsSummary<T>(
                DataScale.NOMINAL,
                stats.getNumberOfMeasures(),
                stats.getMode(),
                type.parse(stats.getMode())));
      } else if (scale == DataScale.ORDINAL) {

        // Ordinal: additionally median, minimum and maximum
        result.put(
            attribute,
            new StatisticsSummary<T>(
                DataScale.ORDINAL,
                stats.getNumberOfMeasures(),
                stats.getMode(),
                type.parse(stats.getMode()),
                stats.getMedian(),
                type.parse(stats.getMedian()),
                stats.getMin(),
                type.parse(stats.getMin()),
                stats.getMax(),
                type.parse(stats.getMax())));
      } else if (scale == DataScale.INTERVAL) {

        // Interval: additionally mean, variances, standard deviation, range and kurtosis
        DescriptiveStatistics stats2 = statistics.get(attribute);
        boolean isPeriod = type.getDescription().getWrappedClass() == Date.class;

        // TODO: Something is wrong with commons math's kurtosis
        // Negative kurtosis values are reported as NaN
        double kurtosis = stats2.getKurtosis();
        kurtosis = kurtosis < 0d ? Double.NaN : kurtosis;
        double range = stats2.getMax() - stats2.getMin();
        double stddev = Math.sqrt(stats2.getVariance());

        result.put(
            attribute,
            new StatisticsSummary<T>(
                DataScale.INTERVAL,
                stats.getNumberOfMeasures(),
                stats.getMode(),
                type.parse(stats.getMode()),
                stats.getMedian(),
                type.parse(stats.getMedian()),
                stats.getMin(),
                type.parse(stats.getMin()),
                stats.getMax(),
                type.parse(stats.getMax()),
                toString(type, stats2.getMean(), false, false),
                toValue(type, stats2.getMean()),
                stats2.getMean(),
                toString(type, stats2.getVariance(), isPeriod, true),
                toValue(type, stats2.getVariance()),
                stats2.getVariance(),
                toString(type, stats2.getPopulationVariance(), isPeriod, true),
                toValue(type, stats2.getPopulationVariance()),
                stats2.getPopulationVariance(),
                toString(type, stddev, isPeriod, false),
                toValue(type, stddev),
                stddev,
                toString(type, range, isPeriod, false),
                toValue(type, range),
                range,
                toString(type, kurtosis, isPeriod, false),
                toValue(type, kurtosis),
                kurtosis));
      } else if (scale == DataScale.RATIO) {

        // Ratio: everything reported for interval scale plus the geometric mean
        DescriptiveStatistics stats2 = statistics.get(attribute);

        // TODO: Something is wrong with commons math's kurtosis
        // Negative kurtosis values are reported as NaN
        double kurtosis = stats2.getKurtosis();
        kurtosis = kurtosis < 0d ? Double.NaN : kurtosis;
        double range = stats2.getMax() - stats2.getMin();
        double stddev = Math.sqrt(stats2.getVariance());

        result.put(
            attribute,
            new StatisticsSummary<T>(
                DataScale.RATIO,
                stats.getNumberOfMeasures(),
                stats.getMode(),
                type.parse(stats.getMode()),
                stats.getMedian(),
                type.parse(stats.getMedian()),
                stats.getMin(),
                type.parse(stats.getMin()),
                stats.getMax(),
                type.parse(stats.getMax()),
                toString(type, stats2.getMean(), false, false),
                toValue(type, stats2.getMean()),
                stats2.getMean(),
                toString(type, stats2.getVariance(), false, false),
                toValue(type, stats2.getVariance()),
                stats2.getVariance(),
                toString(type, stats2.getPopulationVariance(), false, false),
                toValue(type, stats2.getPopulationVariance()),
                stats2.getPopulationVariance(),
                toString(type, stddev, false, false),
                toValue(type, stddev),
                stddev,
                toString(type, range, false, false),
                toValue(type, range),
                range,
                toString(type, kurtosis, false, false),
                toValue(type, kurtosis),
                kurtosis,
                toString(type, stats2.getGeometricMean(), false, false),
                toValue(type, stats2.getGeometricMean()),
                stats2.getGeometricMean()));
      }
    }

    return result;
  }
Beispiel #2
0
  /**
   * Returns the distinct data items of the given column as a sorted array.
   *
   * <p>If no hierarchy is available, or the attribute's generalization level is zero, the items
   * are sorted by the attribute's data type. Otherwise the order is derived from the hierarchy:
   * the valid base-level items are sorted by the base data type, and every item on the current
   * generalization level takes the position of the first hierarchy row in which it occurs and
   * whose base item has a position. The suppression string, if there is one, is given a position
   * greater than all others.
   *
   * @param column The column
   * @param hierarchy The hierarchy, may be null
   * @return The distinct values of the column, sorted as described above
   */
  public String[] getDistinctValuesOrdered(int column, Hierarchy hierarchy) {

    // Clear the stop flag before starting
    interrupt = false;

    // Collect the values together with the attribute's meta data
    final String[] values = getDistinctValues(column);
    final String attribute = handle.getAttributeName(column);
    final DataType<?> datatype = handle.getDataType(attribute);
    final int level = handle.getGeneralization(attribute);
    final String[][] rows = (hierarchy == null) ? null : hierarchy.getHierarchy();

    // Without a hierarchy, or on level zero, the data type alone defines the order
    if (rows == null || level == 0) {
      sort(values, datatype, handle.getSuppressionString());
      return values;
    }

    // Collect the base-level items of the hierarchy.
    // Make sure that only elements from the hierarchy
    // are added that are included in the data
    // TODO: Calling isValid is only a work-around
    final DataType<?> baseType = handle.getBaseDataType(attribute);
    final Set<String> baseValues = new HashSet<String>();
    for (String[] row : rows) {
      final String element = row[0];
      checkInterrupt();
      if (baseType.isValid(element)) {
        baseValues.add(element);
      }
    }

    // Sort the base-level items and remember the position of each one
    final String[] sortedBase = baseValues.toArray(new String[baseValues.size()]);
    sort(sortedBase, handle.getBaseDataType(attribute), handle.getSuppressionString());
    final Map<String, Integer> basePositions = new HashMap<String, Integer>();
    int index = 0;
    for (String element : sortedBase) {
      checkInterrupt();
      basePositions.put(element, index++);
    }

    // Derive the positions on the current level from the base positions
    final Map<String, Integer> positions = new HashMap<String, Integer>();
    int suppressedPosition = 0; // Ends up greater than every position that was assigned
    for (String[] row : rows) {
      checkInterrupt();
      final String generalized = row[level];
      if (positions.containsKey(generalized)) {
        continue;
      }
      final Integer position = basePositions.get(row[0]);
      if (position == null) {
        continue;
      }
      positions.put(generalized, position);
      suppressedPosition = Math.max(position, suppressedPosition) + 1;
    }

    // The suppression string comes after everything else
    final String suppressed = handle.getSuppressionString();
    if (suppressed != null) {
      positions.put(suppressed, suppressedPosition);
    }

    // Sort the values according to the derived positions
    sort(values, positions);
    return values;
  }