Example #1
  /**
   * Returns a frequency distribution for the values in the given column. The order of string data
   * items is derived from the provided hierarchy.
   *
   * @param column The column
   * @param hierarchy The hierarchy, may be null
   * @return The frequency distribution
   */
  public StatisticsFrequencyDistribution getFrequencyDistribution(int column, Hierarchy hierarchy) {

    // Reset stop flag
    interrupt = false;

    // Init
    String[] values = getDistinctValuesOrdered(column, hierarchy);
    double[] frequencies = new double[values.length];

    // Create map of indexes
    Map<String, Integer> indexes = new HashMap<String, Integer>();
    for (int i = 0; i < values.length; i++) {
      checkInterrupt();
      indexes.put(values[i], i);
    }

    // Count frequencies
    for (int row = 0; row < handle.getNumRows(); row++) {
      checkInterrupt();
      String value = handle.getValue(row, column);
      frequencies[indexes.get(value)]++;
    }

    // Divide by count
    int count = handle.getNumRows();
    for (int i = 0; i < frequencies.length; i++) {
      checkInterrupt();
      frequencies[i] /= (double) count;
    }

    // Return
    return new StatisticsFrequencyDistribution(values, frequencies, count);
  }
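
For context, a minimal usage sketch with hypothetical data. It assumes the standard ARX entry points Data.create() and DataHandle.getStatistics(), and the public values/frequency fields of StatisticsFrequencyDistribution; verify these names against your ARX version.

  import org.deidentifier.arx.Data;
  import org.deidentifier.arx.Data.DefaultData;
  import org.deidentifier.arx.DataHandle;
  import org.deidentifier.arx.aggregates.StatisticsFrequencyDistribution;

  public class FrequencyDistributionExample {

    public static void main(String[] args) {

      // Hypothetical in-memory dataset; the first row defines the attribute names
      DefaultData data = Data.create();
      data.add("age", "gender");
      data.add("34", "male");
      data.add("45", "female");
      data.add("34", "female");

      DataHandle handle = data.getHandle();

      // Column 0 is "age"; a null hierarchy falls back to the natural value order
      StatisticsFrequencyDistribution distribution =
          handle.getStatistics().getFrequencyDistribution(0, null);

      // "34" occurs in two of three rows (~0.67), "45" in one (~0.33)
      for (int i = 0; i < distribution.values.length; i++) {
        System.out.printf("%s: %.2f%n", distribution.values[i], distribution.frequency[i]);
      }
    }
  }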
Example #2
  /**
   * Returns summary statistics for all attributes.
   *
   * @param listwiseDeletion A flag enabling list-wise deletion
   * @return A map from attribute name to summary statistics
   */
  @SuppressWarnings({"unchecked", "rawtypes"})
  public <T> Map<String, StatisticsSummary<?>> getSummaryStatistics(boolean listwiseDeletion) {

    Map<String, DescriptiveStatistics> statistics = new HashMap<String, DescriptiveStatistics>();
    Map<String, StatisticsSummaryOrdinal> ordinal = new HashMap<String, StatisticsSummaryOrdinal>();
    Map<String, DataScale> scales = new HashMap<String, DataScale>();

    // Detect scales
    for (int col = 0; col < handle.getNumColumns(); col++) {

      // Meta
      String attribute = handle.getAttributeName(col);
      DataType<?> type = handle.getDataType(attribute);

      // Scale
      DataScale scale = type.getDescription().getScale();

      // Try to replace nominal scale with ordinal scale based on base data type
      if (scale == DataScale.NOMINAL && handle.getGeneralization(attribute) != 0) {
        if (!(handle.getBaseDataType(attribute) instanceof ARXString)
            && getHierarchy(col, true) != null) {
          scale = DataScale.ORDINAL;
        }
      }

      // Store
      scales.put(attribute, scale);
      statistics.put(attribute, new DescriptiveStatistics());
      ordinal.put(
          attribute,
          getSummaryStatisticsOrdinal(
              handle.getGeneralization(attribute),
              handle.getDataType(attribute),
              handle.getBaseDataType(attribute),
              getHierarchy(col, true)));
    }

    // Compute summary statistics
    for (int row = 0; row < handle.getNumRows(); row++) {

      // Check whether this row should be included
      boolean include = true;
      if (listwiseDeletion) {
        if (handle.isSuppressed(row)) {
          include = false;
        } else {
          for (int col = 0; col < handle.getNumColumns(); col++) {
            if (DataType.isNull(handle.getValue(row, col))) {
              include = false;
              break;
            }
          }
        }
      }

      // Check
      checkInterrupt();

      // If yes, add
      if (include) {

        // For each column
        for (int col = 0; col < handle.getNumColumns(); col++) {

          // Meta
          String value = handle.getValue(row, col);
          String attribute = handle.getAttributeName(col);
          DataType<?> type = handle.getDataType(attribute);

          // Analyze
          if (!value.equals(handle.getSuppressionString()) && !DataType.isNull(value)) {
            ordinal.get(attribute).addValue(value);
            if (type instanceof DataTypeWithRatioScale) {
              statistics
                  .get(attribute)
                  .addValue(((DataTypeWithRatioScale) type).toDouble(type.parse(value)));
            }
          }
        }
      }
    }

    // Convert
    Map<String, StatisticsSummary<?>> result = new HashMap<String, StatisticsSummary<?>>();
    for (int col = 0; col < handle.getNumColumns(); col++) {

      // Check
      checkInterrupt();

      // Depending on scale
      String attribute = handle.getAttributeName(col);
      DataScale scale = scales.get(attribute);
      DataType<T> type = (DataType<T>) handle.getDataType(attribute);
      ordinal.get(attribute).analyze();
      if (scale == DataScale.NOMINAL) {
        StatisticsSummaryOrdinal stats = ordinal.get(attribute);
        result.put(
            attribute,
            new StatisticsSummary<T>(
                DataScale.NOMINAL,
                stats.getNumberOfMeasures(),
                stats.getMode(),
                type.parse(stats.getMode())));
      } else if (scale == DataScale.ORDINAL) {
        StatisticsSummaryOrdinal stats = ordinal.get(attribute);
        result.put(
            attribute,
            new StatisticsSummary<T>(
                DataScale.ORDINAL,
                stats.getNumberOfMeasures(),
                stats.getMode(),
                type.parse(stats.getMode()),
                stats.getMedian(),
                type.parse(stats.getMedian()),
                stats.getMin(),
                type.parse(stats.getMin()),
                stats.getMax(),
                type.parse(stats.getMax())));
      } else if (scale == DataScale.INTERVAL) {
        StatisticsSummaryOrdinal stats = ordinal.get(attribute);
        DescriptiveStatistics stats2 = statistics.get(attribute);
        boolean isPeriod = type.getDescription().getWrappedClass() == Date.class;

        // TODO: Something is wrong with commons math's kurtosis
        double kurtosis = stats2.getKurtosis();
        kurtosis = kurtosis < 0d ? Double.NaN : kurtosis;
        double range = stats2.getMax() - stats2.getMin();
        double stddev = Math.sqrt(stats2.getVariance());

        result.put(
            attribute,
            new StatisticsSummary<T>(
                DataScale.INTERVAL,
                stats.getNumberOfMeasures(),
                stats.getMode(),
                type.parse(stats.getMode()),
                stats.getMedian(),
                type.parse(stats.getMedian()),
                stats.getMin(),
                type.parse(stats.getMin()),
                stats.getMax(),
                type.parse(stats.getMax()),
                toString(type, stats2.getMean(), false, false),
                toValue(type, stats2.getMean()),
                stats2.getMean(),
                toString(type, stats2.getVariance(), isPeriod, true),
                toValue(type, stats2.getVariance()),
                stats2.getVariance(),
                toString(type, stats2.getPopulationVariance(), isPeriod, true),
                toValue(type, stats2.getPopulationVariance()),
                stats2.getPopulationVariance(),
                toString(type, stddev, isPeriod, false),
                toValue(type, stddev),
                stddev,
                toString(type, range, isPeriod, false),
                toValue(type, range),
                range,
                toString(type, kurtosis, isPeriod, false),
                toValue(type, kurtosis),
                kurtosis));
      } else if (scale == DataScale.RATIO) {
        StatisticsSummaryOrdinal stats = ordinal.get(attribute);
        DescriptiveStatistics stats2 = statistics.get(attribute);

        // TODO: Something is wrong with commons math's kurtosis
        double kurtosis = stats2.getKurtosis();
        kurtosis = kurtosis < 0d ? Double.NaN : kurtosis;
        double range = stats2.getMax() - stats2.getMin();
        double stddev = Math.sqrt(stats2.getVariance());

        result.put(
            attribute,
            new StatisticsSummary<T>(
                DataScale.RATIO,
                stats.getNumberOfMeasures(),
                stats.getMode(),
                type.parse(stats.getMode()),
                stats.getMedian(),
                type.parse(stats.getMedian()),
                stats.getMin(),
                type.parse(stats.getMin()),
                stats.getMax(),
                type.parse(stats.getMax()),
                toString(type, stats2.getMean(), false, false),
                toValue(type, stats2.getMean()),
                stats2.getMean(),
                toString(type, stats2.getVariance(), false, false),
                toValue(type, stats2.getVariance()),
                stats2.getVariance(),
                toString(type, stats2.getPopulationVariance(), false, false),
                toValue(type, stats2.getPopulationVariance()),
                stats2.getPopulationVariance(),
                toString(type, stddev, false, false),
                toValue(type, stddev),
                stddev,
                toString(type, range, false, false),
                toValue(type, range),
                range,
                toString(type, kurtosis, false, false),
                toValue(type, kurtosis),
                kurtosis,
                toString(type, stats2.getGeometricMean(), false, false),
                toValue(type, stats2.getGeometricMean()),
                stats2.getGeometricMean()));
      }
    }

    return result;
  }
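
A short usage sketch for this method. It again assumes the DataHandle.getStatistics() entry point; the printed output relies on StatisticsSummary's toString(), whose exact format is not specified here, so treat it as illustrative.

  import java.util.Map;
  import org.deidentifier.arx.DataHandle;
  import org.deidentifier.arx.aggregates.StatisticsSummary;

  public class SummaryStatisticsExample {

    // Prints one summary per attribute; 'handle' is an initialized DataHandle
    static void printSummaries(DataHandle handle) {

      // 'true' enables list-wise deletion: rows that are suppressed or
      // contain null values are excluded for all attributes
      Map<String, StatisticsSummary<?>> summaries =
          handle.getStatistics().getSummaryStatistics(true);

      for (Map.Entry<String, StatisticsSummary<?>> entry : summaries.entrySet()) {
        System.out.println(entry.getKey() + " -> " + entry.getValue());
      }
    }
  }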
Example #3
  /**
   * Returns a contingency table for the given columns. The order of string data items is derived
   * from the provided hierarchies.
   *
   * @param column1 The first column
   * @param hierarchy1 The hierarchy for the first column, may be null
   * @param column2 The second column
   * @param hierarchy2 The hierarchy for the second column, may be null
   * @return The contingency table
   */
  public StatisticsContingencyTable getContingencyTable(
      int column1, Hierarchy hierarchy1, int column2, Hierarchy hierarchy2) {

    // Reset stop flag
    interrupt = false;

    // Init
    String[] values1 = getDistinctValuesOrdered(column1, hierarchy1);
    String[] values2 = getDistinctValuesOrdered(column2, hierarchy2);

    // Create maps of indexes
    Map<String, Integer> indexes1 = new HashMap<String, Integer>();
    for (int i = 0; i < values1.length; i++) {
      checkInterrupt();
      indexes1.put(values1[i], i);
    }
    Map<String, Integer> indexes2 = new HashMap<String, Integer>();
    for (int i = 0; i < values2.length; i++) {
      checkInterrupt();
      indexes2.put(values2[i], i);
    }

    // Create entry set
    int max = Integer.MIN_VALUE;
    final Map<Entry, Integer> entries = new HashMap<Entry, Integer>();
    for (int row = 0; row < handle.getNumRows(); row++) {
      checkInterrupt();
      int index1 = indexes1.get(handle.getValue(row, column1));
      int index2 = indexes2.get(handle.getValue(row, column2));
      Entry entry = new Entry(index1, index2);
      Integer previous = entries.get(entry);
      int value = previous != null ? previous + 1 : 1;
      max = Math.max(max, value);
      entries.put(entry, value);
    }

    // Create iterator
    final int count = handle.getNumRows();
    final Iterator<Entry> internal = entries.keySet().iterator();
    final Iterator<Entry> iterator =
        new Iterator<Entry>() {

          private Map<Entry, Integer> _entries = entries;
          private Iterator<Entry> _internal = internal;

          @Override
          public boolean hasNext() {

            if (_internal == null) return false;
            boolean result = _internal.hasNext();

            // Try to release resources as early as possible
            if (!result) {
              _internal = null;
              _entries = null;
            }
            return result;
          }

          @Override
          public Entry next() {
            if (_internal == null) return null;
            Entry e = _internal.next();
            e.frequency = (double) _entries.get(e) / (double) count;
            return e;
          }

          @Override
          public void remove() {
            throw new UnsupportedOperationException();
          }
        };

    // Return result
    return new StatisticsContingencyTable(
        values1, values2, count, (double) max / (double) count, iterator);
  }
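
Finally, a hedged sketch of consuming the contingency table. The frequency field on Entry is visible in the iterator code above; the value1/value2 index fields and the public values1/values2/iterator members of StatisticsContingencyTable are assumptions to check against your ARX version.

  import java.util.Iterator;
  import org.deidentifier.arx.DataHandle;
  import org.deidentifier.arx.aggregates.StatisticsContingencyTable;
  import org.deidentifier.arx.aggregates.StatisticsContingencyTable.Entry;

  public class ContingencyTableExample {

    // Prints the sparse contingency table of columns 0 and 1;
    // 'handle' is an initialized DataHandle
    static void printTable(DataHandle handle) {

      // Null hierarchies fall back to the natural value order
      StatisticsContingencyTable table =
          handle.getStatistics().getContingencyTable(0, null, 1, null);

      // The table is sparse: only value combinations that actually occur are returned
      Iterator<Entry> iterator = table.iterator;
      while (iterator.hasNext()) {
        Entry entry = iterator.next();
        // value1/value2 index into table.values1/values2 (assumed field names)
        System.out.printf("(%s, %s): %.3f%n",
            table.values1[entry.value1],
            table.values2[entry.value2],
            entry.frequency);
      }
    }
  }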