Пример #1
0
  /**
   * Computes the relative frequency of every distinct value in a column. The distinct values are
   * obtained (and ordered) via {@link #getDistinctValuesOrdered(int, Hierarchy)}, so the order of
   * string data items follows the supplied hierarchy.
   *
   * @param column The column
   * @param hierarchy The hierarchy, may be null
   * @return A distribution holding the ordered distinct values, their relative frequencies (each
   *     occurrence count divided by the number of rows) and the number of rows
   */
  public StatisticsFrequencyDistribution getFrequencyDistribution(int column, Hierarchy hierarchy) {

    // Clear any pending stop request
    interrupt = false;

    // Ordered distinct values; one frequency slot per value
    final String[] distinct = getDistinctValuesOrdered(column, hierarchy);
    final double[] shares = new double[distinct.length];

    // Remember the slot that belongs to each value
    final Map<String, Integer> slotOf = new HashMap<String, Integer>();
    int slot = 0;
    for (String item : distinct) {
      checkInterrupt();
      slotOf.put(item, slot++);
    }

    // Tally the occurrences of each value over all rows
    int row = 0;
    while (row < handle.getNumRows()) {
      checkInterrupt();
      final int target = slotOf.get(handle.getValue(row, column));
      shares[target] += 1d;
      row++;
    }

    // Turn absolute counts into relative frequencies
    final int total = handle.getNumRows();
    for (int i = 0; i < shares.length; i++) {
      checkInterrupt();
      shares[i] = shares[i] / (double) total;
    }

    return new StatisticsFrequencyDistribution(distinct, shares, total);
  }
Пример #2
0
  /**
   * Looks up the hierarchy that may be used for ordering the values of a column.
   *
   * <p>A hierarchy is only returned if the caller asked for it, the column's data type is a string
   * type, and the attribute type stored in the data definition is itself a {@link Hierarchy}.
   *
   * @param column The column
   * @param orderFromDefinition Whether a hierarchy from the definition should be considered at all
   * @return The hierarchy, or null if any of the above conditions does not hold
   */
  private Hierarchy getHierarchy(int column, boolean orderFromDefinition) {

    // Collect the metadata of the attribute up front
    final String name = handle.getAttributeName(column);
    final AttributeType attributeType = handle.getDefinition().getAttributeType(name);
    final DataType<?> dataType = handle.getDataType(name);

    // Bail out as soon as one of the prerequisites is missing
    if (!orderFromDefinition) {
      return null;
    }
    if (!(dataType instanceof ARXString)) {
      return null;
    }
    if (!(attributeType instanceof Hierarchy)) {
      return null;
    }

    return (Hierarchy) attributeType;
  }
Пример #3
0
  /**
   * Returns summary statistics for all attributes.
   *
   * <p>The scale of measure of each attribute is taken from the description of its data type. A
   * nominal attribute is treated as ordinal if it has been generalized (generalization level
   * other than 0), its base data type is not a string type, and a hierarchy is available for it.
   *
   * <p>Depending on the scale, the summary for an attribute contains:
   *
   * <ul>
   *   <li>nominal: number of measures and mode
   *   <li>ordinal: additionally median, minimum and maximum
   *   <li>interval: additionally mean, sample variance, population variance, standard deviation,
   *       range and kurtosis
   *   <li>ratio: additionally the geometric mean
   * </ul>
   *
   * <p>Values that equal the suppression string or that are null values are never counted. An
   * attribute whose scale matches none of the four scales above gets no entry in the result.
   *
   * @param listwiseDeletion A flag enabling list-wise deletion: if set, a row is ignored entirely
   *     when it is suppressed or when any of its values is a null value
   * @return A map from attribute name to the summary statistics of that attribute
   */
  @SuppressWarnings({"unchecked", "rawtypes"})
  public <T> Map<String, StatisticsSummary<?>> getSummaryStatistics(boolean listwiseDeletion) {

    // Reset stop flag, exactly like the other public analysis methods do. Without this, a stop
    // request left over from a previously cancelled computation would presumably abort this
    // computation at its first interrupt check.
    interrupt = false;

    Map<String, DescriptiveStatistics> statistics = new HashMap<String, DescriptiveStatistics>();
    Map<String, StatisticsSummaryOrdinal> ordinal = new HashMap<String, StatisticsSummaryOrdinal>();
    Map<String, DataScale> scales = new HashMap<String, DataScale>();

    // Detect scales
    for (int col = 0; col < handle.getNumColumns(); col++) {

      // Meta
      String attribute = handle.getAttributeName(col);
      DataType<?> type = handle.getDataType(attribute);
      DataType<?> baseType = handle.getBaseDataType(attribute);
      int generalization = handle.getGeneralization(attribute);
      Hierarchy hierarchy = getHierarchy(col, true);

      // Scale
      DataScale scale = type.getDescription().getScale();

      // Try to replace nominal scale with ordinal scale based on base data type
      if (scale == DataScale.NOMINAL
          && generalization != 0
          && !(baseType instanceof ARXString)
          && hierarchy != null) {
        scale = DataScale.ORDINAL;
      }

      // Store
      scales.put(attribute, scale);
      statistics.put(attribute, new DescriptiveStatistics());
      ordinal.put(
          attribute, getSummaryStatisticsOrdinal(generalization, type, baseType, hierarchy));
    }

    // Compute summary statistics
    for (int row = 0; row < handle.getNumRows(); row++) {

      // Check, if we should include this row
      boolean include = true;
      if (listwiseDeletion) {
        for (int col = 0; col < handle.getNumColumns(); col++) {
          if (handle.isSuppressed(row) || DataType.isNull(handle.getValue(row, col))) {
            include = false;
            break;
          }
        }
      }

      // Check
      checkInterrupt();

      // If yes, add
      if (include) {

        // For each column
        for (int col = 0; col < handle.getNumColumns(); col++) {

          // Meta
          String value = handle.getValue(row, col);
          String attribute = handle.getAttributeName(col);
          DataType<?> type = handle.getDataType(attribute);

          // Analyze: skip suppressed and null values
          if (!value.equals(handle.getSuppressionString()) && !DataType.isNull(value)) {
            ordinal.get(attribute).addValue(value);

            // Numeric statistics are only collected for types that can be mapped to doubles
            if (type instanceof DataTypeWithRatioScale) {
              statistics
                  .get(attribute)
                  .addValue(((DataTypeWithRatioScale) type).toDouble(type.parse(value)));
            }
          }
        }
      }
    }

    // Convert
    Map<String, StatisticsSummary<?>> result = new HashMap<String, StatisticsSummary<?>>();
    for (int col = 0; col < handle.getNumColumns(); col++) {

      // Check
      checkInterrupt();

      // Depending on scale
      String attribute = handle.getAttributeName(col);
      DataScale scale = scales.get(attribute);
      DataType<T> type = (DataType<T>) handle.getDataType(attribute);
      StatisticsSummaryOrdinal stats = ordinal.get(attribute);
      stats.analyze();
      if (scale == DataScale.NOMINAL) {
        result.put(
            attribute,
            new StatisticsSummary<T>(
                DataScale.NOMINAL,
                stats.getNumberOfMeasures(),
                stats.getMode(),
                type.parse(stats.getMode())));
      } else if (scale == DataScale.ORDINAL) {
        result.put(
            attribute,
            new StatisticsSummary<T>(
                DataScale.ORDINAL,
                stats.getNumberOfMeasures(),
                stats.getMode(),
                type.parse(stats.getMode()),
                stats.getMedian(),
                type.parse(stats.getMedian()),
                stats.getMin(),
                type.parse(stats.getMin()),
                stats.getMax(),
                type.parse(stats.getMax())));
      } else if (scale == DataScale.INTERVAL) {
        DescriptiveStatistics stats2 = statistics.get(attribute);

        // For date-based types, dispersion measures are rendered as periods
        boolean isPeriod = type.getDescription().getWrappedClass() == Date.class;

        // TODO: Something is wrong with commons math's kurtosis
        // Negative values are reported as NaN
        double kurtosis = stats2.getKurtosis();
        kurtosis = kurtosis < 0d ? Double.NaN : kurtosis;
        double range = stats2.getMax() - stats2.getMin();
        double stddev = Math.sqrt(stats2.getVariance());

        result.put(
            attribute,
            new StatisticsSummary<T>(
                DataScale.INTERVAL,
                stats.getNumberOfMeasures(),
                stats.getMode(),
                type.parse(stats.getMode()),
                stats.getMedian(),
                type.parse(stats.getMedian()),
                stats.getMin(),
                type.parse(stats.getMin()),
                stats.getMax(),
                type.parse(stats.getMax()),
                toString(type, stats2.getMean(), false, false),
                toValue(type, stats2.getMean()),
                stats2.getMean(),
                toString(type, stats2.getVariance(), isPeriod, true),
                toValue(type, stats2.getVariance()),
                stats2.getVariance(),
                toString(type, stats2.getPopulationVariance(), isPeriod, true),
                toValue(type, stats2.getPopulationVariance()),
                stats2.getPopulationVariance(),
                toString(type, stddev, isPeriod, false),
                toValue(type, stddev),
                stddev,
                toString(type, range, isPeriod, false),
                toValue(type, range),
                range,
                toString(type, kurtosis, isPeriod, false),
                toValue(type, kurtosis),
                kurtosis));
      } else if (scale == DataScale.RATIO) {
        DescriptiveStatistics stats2 = statistics.get(attribute);

        // TODO: Something is wrong with commons math's kurtosis
        // Negative values are reported as NaN
        double kurtosis = stats2.getKurtosis();
        kurtosis = kurtosis < 0d ? Double.NaN : kurtosis;
        double range = stats2.getMax() - stats2.getMin();
        double stddev = Math.sqrt(stats2.getVariance());

        result.put(
            attribute,
            new StatisticsSummary<T>(
                DataScale.RATIO,
                stats.getNumberOfMeasures(),
                stats.getMode(),
                type.parse(stats.getMode()),
                stats.getMedian(),
                type.parse(stats.getMedian()),
                stats.getMin(),
                type.parse(stats.getMin()),
                stats.getMax(),
                type.parse(stats.getMax()),
                toString(type, stats2.getMean(), false, false),
                toValue(type, stats2.getMean()),
                stats2.getMean(),
                toString(type, stats2.getVariance(), false, false),
                toValue(type, stats2.getVariance()),
                stats2.getVariance(),
                toString(type, stats2.getPopulationVariance(), false, false),
                toValue(type, stats2.getPopulationVariance()),
                stats2.getPopulationVariance(),
                toString(type, stddev, false, false),
                toValue(type, stddev),
                stddev,
                toString(type, range, false, false),
                toValue(type, range),
                range,
                toString(type, kurtosis, false, false),
                toValue(type, kurtosis),
                kurtosis,
                toString(type, stats2.getGeometricMean(), false, false),
                toValue(type, stats2.getGeometricMean()),
                stats2.getGeometricMean()));
      }
    }

    return result;
  }
Пример #4
0
  /**
   * Computes the distinct values of a column and returns them in sorted order.
   *
   * <p>If no hierarchy is given, or the attribute has generalization level 0, the values are sorted
   * according to the column's data type. Otherwise the order is derived from the hierarchy: the
   * level-0 entries are sorted by the base data type, and every generalized value inherits the
   * position of the first hierarchy row in which it appears. The suppression string, if any, is
   * placed after all other values.
   *
   * @param column The column
   * @param hierarchy The hierarchy, may be null
   * @return The distinct values of the column, sorted as described above
   */
  public String[] getDistinctValuesOrdered(int column, Hierarchy hierarchy) {

    // Clear any pending stop request
    interrupt = false;

    // Gather the values together with the metadata that drives the ordering
    final String[] distinct = getDistinctValues(column);
    final String name = handle.getAttributeName(column);
    final DataType<?> type = handle.getDataType(name);
    final int generalization = handle.getGeneralization(name);
    final String[][] rows = (hierarchy == null) ? null : hierarchy.getHierarchy();

    // Without a hierarchy, or on ungeneralized data, the data type alone defines the order
    if (rows == null || generalization == 0) {
      sort(distinct, type, handle.getSuppressionString());
      return distinct;
    }

    // Collect the level-0 entries of the hierarchy, keeping only those that the base data type
    // accepts; this is meant to restrict the set to elements that are included in the data
    // TODO: Calling isValid is only a work-around
    final DataType<?> baseType = handle.getBaseDataType(name);
    final Set<String> accepted = new HashSet<String>();
    for (String[] hierarchyRow : rows) {
      final String leaf = hierarchyRow[0];
      checkInterrupt();
      if (baseType.isValid(leaf)) {
        accepted.add(leaf);
      }
    }

    // Sort the level-0 entries by the base data type and remember the rank of each of them
    final String[] sortedLeaves = accepted.toArray(new String[accepted.size()]);
    sort(sortedLeaves, baseType, handle.getSuppressionString());
    final Map<String, Integer> leafRank = new HashMap<String, Integer>();
    int rank = 0;
    for (String leaf : sortedLeaves) {
      checkInterrupt();
      leafRank.put(leaf, rank++);
    }

    // A generalized value takes the rank of the first hierarchy row that mentions it. While doing
    // so, keep track of a rank that is larger than every rank handed out so far.
    final Map<String, Integer> ranking = new HashMap<String, Integer>();
    int suppressionRank = 0;
    for (String[] hierarchyRow : rows) {
      checkInterrupt();
      final String generalized = hierarchyRow[generalization];
      if (ranking.containsKey(generalized)) {
        continue;
      }
      final Integer position = leafRank.get(hierarchyRow[0]);
      if (position == null) {
        continue;
      }
      ranking.put(generalized, position);
      suppressionRank = Math.max(position, suppressionRank) + 1;
    }

    // The suppression string goes last
    final String suppressed = handle.getSuppressionString();
    if (suppressed != null) {
      ranking.put(suppressed, suppressionRank);
    }

    // Order the distinct values by the ranking just built
    sort(distinct, ranking);
    return distinct;
  }
Пример #5
0
  /**
   * Builds a contingency table for two columns. The distinct values of both columns are obtained
   * (and ordered) via {@link #getDistinctValuesOrdered(int, Hierarchy)}, so the order of string
   * data items follows the supplied hierarchies.
   *
   * <p>The table's entries are produced lazily by an iterator over all value combinations that
   * occur in the data; the relative frequency of an entry is assigned when the iterator returns it.
   * The iterator does not support removal, and its {@code next()} returns null once {@code
   * hasNext()} has reported exhaustion.
   *
   * @param column1 The first column
   * @param hierarchy1 The hierarchy for the first column, may be null
   * @param column2 The second column
   * @param hierarchy2 The hierarchy for the second column, may be null
   * @return A contingency table built from the ordered distinct values of both columns, the number
   *     of rows, the largest relative frequency of any combination, and the entry iterator
   */
  public StatisticsContingencyTable getContingencyTable(
      int column1, Hierarchy hierarchy1, int column2, Hierarchy hierarchy2) {

    // Clear any pending stop request
    interrupt = false;

    // Ordered distinct values of both columns
    final String[] values1 = getDistinctValuesOrdered(column1, hierarchy1);
    final String[] values2 = getDistinctValuesOrdered(column2, hierarchy2);

    // Position of each value within its ordered array, first column
    final Map<String, Integer> positions1 = new HashMap<String, Integer>();
    int position = 0;
    for (String item : values1) {
      checkInterrupt();
      positions1.put(item, position++);
    }

    // Position of each value within its ordered array, second column
    final Map<String, Integer> positions2 = new HashMap<String, Integer>();
    position = 0;
    for (String item : values2) {
      checkInterrupt();
      positions2.put(item, position++);
    }

    // Count how often each combination of positions occurs and track the largest count
    final Map<Entry, Integer> cells = new HashMap<Entry, Integer>();
    int highest = Integer.MIN_VALUE;
    for (int row = 0; row < handle.getNumRows(); row++) {
      checkInterrupt();
      final int first = positions1.get(handle.getValue(row, column1));
      final int second = positions2.get(handle.getValue(row, column2));
      final Entry cell = new Entry(first, second);
      final Integer seen = cells.get(cell);
      final int updated = (seen == null) ? 1 : seen + 1;
      if (updated > highest) {
        highest = updated;
      }
      cells.put(cell, updated);
    }

    // Iterator that hands out the cells and fills in their relative frequency on the fly
    final int total = handle.getNumRows();
    final Iterator<Entry> cellIterator =
        new Iterator<Entry>() {

          /** Absolute counts per cell; dropped once iteration is complete. */
          private Map<Entry, Integer> counts = cells;

          /** Underlying key iterator; dropped once iteration is complete. */
          private Iterator<Entry> keys = cells.keySet().iterator();

          @Override
          public boolean hasNext() {
            if (keys == null) {
              return false;
            }
            if (keys.hasNext()) {
              return true;
            }

            // Exhausted: let go of the references as early as possible
            keys = null;
            counts = null;
            return false;
          }

          @Override
          public Entry next() {
            if (keys == null) {
              return null;
            }
            final Entry cell = keys.next();
            cell.frequency = counts.get(cell).doubleValue() / (double) total;
            return cell;
          }

          @Override
          public void remove() {
            throw new UnsupportedOperationException();
          }
        };

    // Assemble the table
    final double highestFrequency = (double) highest / (double) total;
    return new StatisticsContingencyTable(values1, values2, total, highestFrequency, cellIterator);
  }