/** * Returns a frequency distribution for the values in the given column. The order for string data * items is derived from the provided hierarchy * * @param column The column * @param hierarchy The hierarchy, may be null * @return */ public StatisticsFrequencyDistribution getFrequencyDistribution(int column, Hierarchy hierarchy) { // Reset stop flag interrupt = false; // Init String[] values = getDistinctValuesOrdered(column, hierarchy); double[] frequencies = new double[values.length]; // Create map of indexes Map<String, Integer> indexes = new HashMap<String, Integer>(); for (int i = 0; i < values.length; i++) { checkInterrupt(); indexes.put(values[i], i); } // Count frequencies for (int row = 0; row < handle.getNumRows(); row++) { checkInterrupt(); String value = handle.getValue(row, column); frequencies[indexes.get(value)]++; } // Divide by count int count = handle.getNumRows(); for (int i = 0; i < frequencies.length; i++) { checkInterrupt(); frequencies[i] /= (double) count; } // Return return new StatisticsFrequencyDistribution(values, frequencies, count); }
/** * Returns summary statistics for all attributes. * * @param listwiseDeletion A flag enabling list-wise deletion * @return */ @SuppressWarnings({"unchecked", "rawtypes"}) public <T> Map<String, StatisticsSummary<?>> getSummaryStatistics(boolean listwiseDeletion) { Map<String, DescriptiveStatistics> statistics = new HashMap<String, DescriptiveStatistics>(); Map<String, StatisticsSummaryOrdinal> ordinal = new HashMap<String, StatisticsSummaryOrdinal>(); Map<String, DataScale> scales = new HashMap<String, DataScale>(); // Detect scales for (int col = 0; col < handle.getNumColumns(); col++) { // Meta String attribute = handle.getAttributeName(col); DataType<?> type = handle.getDataType(attribute); // Scale DataScale scale = type.getDescription().getScale(); // Try to replace nominal scale with ordinal scale based on base data type if (scale == DataScale.NOMINAL && handle.getGeneralization(attribute) != 0) { if (!(handle.getBaseDataType(attribute) instanceof ARXString) && getHierarchy(col, true) != null) { scale = DataScale.ORDINAL; } } // Store scales.put(attribute, scale); statistics.put(attribute, new DescriptiveStatistics()); ordinal.put( attribute, getSummaryStatisticsOrdinal( handle.getGeneralization(attribute), handle.getDataType(attribute), handle.getBaseDataType(attribute), getHierarchy(col, true))); } // Compute summary statistics for (int row = 0; row < handle.getNumRows(); row++) { // Check, if we should include this row boolean include = true; if (listwiseDeletion) { for (int col = 0; col < handle.getNumColumns(); col++) { if (handle.isSuppressed(row) || DataType.isNull(handle.getValue(row, col))) { include = false; break; } } } // Check checkInterrupt(); // If yes, add if (include) { // For each column for (int col = 0; col < handle.getNumColumns(); col++) { // Meta String value = handle.getValue(row, col); String attribute = handle.getAttributeName(col); DataType<?> type = handle.getDataType(attribute); // Analyze if (!value.equals(handle.getSuppressionString()) && !DataType.isNull(value)) { ordinal.get(attribute).addValue(value); if (type instanceof DataTypeWithRatioScale) { statistics .get(attribute) .addValue(((DataTypeWithRatioScale) type).toDouble(type.parse(value))); } } } } } // Convert Map<String, StatisticsSummary<?>> result = new HashMap<String, StatisticsSummary<?>>(); for (int col = 0; col < handle.getNumColumns(); col++) { // Check checkInterrupt(); // Depending on scale String attribute = handle.getAttributeName(col); DataScale scale = scales.get(attribute); DataType<T> type = (DataType<T>) handle.getDataType(attribute); ordinal.get(attribute).analyze(); if (scale == DataScale.NOMINAL) { StatisticsSummaryOrdinal stats = ordinal.get(attribute); result.put( attribute, new StatisticsSummary<T>( DataScale.NOMINAL, stats.getNumberOfMeasures(), stats.getMode(), type.parse(stats.getMode()))); } else if (scale == DataScale.ORDINAL) { StatisticsSummaryOrdinal stats = ordinal.get(attribute); result.put( attribute, new StatisticsSummary<T>( DataScale.ORDINAL, stats.getNumberOfMeasures(), stats.getMode(), type.parse(stats.getMode()), stats.getMedian(), type.parse(stats.getMedian()), stats.getMin(), type.parse(stats.getMin()), stats.getMax(), type.parse(stats.getMax()))); } else if (scale == DataScale.INTERVAL) { StatisticsSummaryOrdinal stats = ordinal.get(attribute); DescriptiveStatistics stats2 = statistics.get(attribute); boolean isPeriod = type.getDescription().getWrappedClass() == Date.class; // TODO: Something is wrong with commons math's kurtosis double kurtosis = stats2.getKurtosis(); kurtosis = kurtosis < 0d ? Double.NaN : kurtosis; double range = stats2.getMax() - stats2.getMin(); double stddev = Math.sqrt(stats2.getVariance()); result.put( attribute, new StatisticsSummary<T>( DataScale.INTERVAL, stats.getNumberOfMeasures(), stats.getMode(), type.parse(stats.getMode()), stats.getMedian(), type.parse(stats.getMedian()), stats.getMin(), type.parse(stats.getMin()), stats.getMax(), type.parse(stats.getMax()), toString(type, stats2.getMean(), false, false), toValue(type, stats2.getMean()), stats2.getMean(), toString(type, stats2.getVariance(), isPeriod, true), toValue(type, stats2.getVariance()), stats2.getVariance(), toString(type, stats2.getPopulationVariance(), isPeriod, true), toValue(type, stats2.getPopulationVariance()), stats2.getPopulationVariance(), toString(type, stddev, isPeriod, false), toValue(type, stddev), stddev, toString(type, range, isPeriod, false), toValue(type, range), stats2.getMax() - stats2.getMin(), toString(type, kurtosis, isPeriod, false), toValue(type, kurtosis), kurtosis)); } else if (scale == DataScale.RATIO) { StatisticsSummaryOrdinal stats = ordinal.get(attribute); DescriptiveStatistics stats2 = statistics.get(attribute); // TODO: Something is wrong with commons math's kurtosis double kurtosis = stats2.getKurtosis(); kurtosis = kurtosis < 0d ? Double.NaN : kurtosis; double range = stats2.getMax() - stats2.getMin(); double stddev = Math.sqrt(stats2.getVariance()); result.put( attribute, new StatisticsSummary<T>( DataScale.RATIO, stats.getNumberOfMeasures(), stats.getMode(), type.parse(stats.getMode()), stats.getMedian(), type.parse(stats.getMedian()), stats.getMin(), type.parse(stats.getMin()), stats.getMax(), type.parse(stats.getMax()), toString(type, stats2.getMean(), false, false), toValue(type, stats2.getMean()), stats2.getMean(), toString(type, stats2.getVariance(), false, false), toValue(type, stats2.getVariance()), stats2.getVariance(), toString(type, stats2.getPopulationVariance(), false, false), toValue(type, stats2.getPopulationVariance()), stats2.getPopulationVariance(), toString(type, stddev, false, false), toValue(type, stddev), stddev, toString(type, range, false, false), toValue(type, range), range, toString(type, kurtosis, false, false), toValue(type, kurtosis), kurtosis, toString(type, stats2.getGeometricMean(), false, false), toValue(type, stats2.getGeometricMean()), stats2.getGeometricMean())); } } return result; }
/** * Returns a contingency table for the given columns. The order for string data items is derived * from the provided hierarchies * * @param column1 The first column * @param hierarchy1 The hierarchy for the first column, may be null * @param column2 The second column * @param hierarchy2 The hierarchy for the second column, may be null * @return */ public StatisticsContingencyTable getContingencyTable( int column1, Hierarchy hierarchy1, int column2, Hierarchy hierarchy2) { // Reset stop flag interrupt = false; // Init String[] values1 = getDistinctValuesOrdered(column1, hierarchy1); String[] values2 = getDistinctValuesOrdered(column2, hierarchy2); // Create maps of indexes Map<String, Integer> indexes1 = new HashMap<String, Integer>(); for (int i = 0; i < values1.length; i++) { checkInterrupt(); indexes1.put(values1[i], i); } Map<String, Integer> indexes2 = new HashMap<String, Integer>(); for (int i = 0; i < values2.length; i++) { checkInterrupt(); indexes2.put(values2[i], i); } // Create entry set int max = Integer.MIN_VALUE; final Map<Entry, Integer> entries = new HashMap<Entry, Integer>(); for (int row = 0; row < handle.getNumRows(); row++) { checkInterrupt(); int index1 = indexes1.get(handle.getValue(row, column1)); int index2 = indexes2.get(handle.getValue(row, column2)); Entry entry = new Entry(index1, index2); Integer previous = entries.get(entry); int value = previous != null ? previous + 1 : 1; max = Math.max(max, value); entries.put(entry, value); } // Create iterator final int count = handle.getNumRows(); final Iterator<Entry> internal = entries.keySet().iterator(); final Iterator<Entry> iterator = new Iterator<Entry>() { private Map<Entry, Integer> _entries = entries; private Iterator<Entry> _internal = internal; @Override public boolean hasNext() { if (_internal == null) return false; boolean result = _internal.hasNext(); // Try to release resources as early as possible if (!result) { _internal = null; _entries = null; } return result; } @Override public Entry next() { if (_internal == null) return null; Entry e = _internal.next(); e.frequency = (double) _entries.get(e) / (double) count; return e; } @Override public void remove() { throw new UnsupportedOperationException(); } }; // Result result return new StatisticsContingencyTable( values1, values2, count, (double) max / (double) count, iterator); }