/** * Returns summary statistics for all attributes. * * @param listwiseDeletion A flag enabling list-wise deletion * @return */ @SuppressWarnings({"unchecked", "rawtypes"}) public <T> Map<String, StatisticsSummary<?>> getSummaryStatistics(boolean listwiseDeletion) { Map<String, DescriptiveStatistics> statistics = new HashMap<String, DescriptiveStatistics>(); Map<String, StatisticsSummaryOrdinal> ordinal = new HashMap<String, StatisticsSummaryOrdinal>(); Map<String, DataScale> scales = new HashMap<String, DataScale>(); // Detect scales for (int col = 0; col < handle.getNumColumns(); col++) { // Meta String attribute = handle.getAttributeName(col); DataType<?> type = handle.getDataType(attribute); // Scale DataScale scale = type.getDescription().getScale(); // Try to replace nominal scale with ordinal scale based on base data type if (scale == DataScale.NOMINAL && handle.getGeneralization(attribute) != 0) { if (!(handle.getBaseDataType(attribute) instanceof ARXString) && getHierarchy(col, true) != null) { scale = DataScale.ORDINAL; } } // Store scales.put(attribute, scale); statistics.put(attribute, new DescriptiveStatistics()); ordinal.put( attribute, getSummaryStatisticsOrdinal( handle.getGeneralization(attribute), handle.getDataType(attribute), handle.getBaseDataType(attribute), getHierarchy(col, true))); } // Compute summary statistics for (int row = 0; row < handle.getNumRows(); row++) { // Check, if we should include this row boolean include = true; if (listwiseDeletion) { for (int col = 0; col < handle.getNumColumns(); col++) { if (handle.isSuppressed(row) || DataType.isNull(handle.getValue(row, col))) { include = false; break; } } } // Check checkInterrupt(); // If yes, add if (include) { // For each column for (int col = 0; col < handle.getNumColumns(); col++) { // Meta String value = handle.getValue(row, col); String attribute = handle.getAttributeName(col); DataType<?> type = handle.getDataType(attribute); // Analyze if (!value.equals(handle.getSuppressionString()) && !DataType.isNull(value)) { ordinal.get(attribute).addValue(value); if (type instanceof DataTypeWithRatioScale) { statistics .get(attribute) .addValue(((DataTypeWithRatioScale) type).toDouble(type.parse(value))); } } } } } // Convert Map<String, StatisticsSummary<?>> result = new HashMap<String, StatisticsSummary<?>>(); for (int col = 0; col < handle.getNumColumns(); col++) { // Check checkInterrupt(); // Depending on scale String attribute = handle.getAttributeName(col); DataScale scale = scales.get(attribute); DataType<T> type = (DataType<T>) handle.getDataType(attribute); ordinal.get(attribute).analyze(); if (scale == DataScale.NOMINAL) { StatisticsSummaryOrdinal stats = ordinal.get(attribute); result.put( attribute, new StatisticsSummary<T>( DataScale.NOMINAL, stats.getNumberOfMeasures(), stats.getMode(), type.parse(stats.getMode()))); } else if (scale == DataScale.ORDINAL) { StatisticsSummaryOrdinal stats = ordinal.get(attribute); result.put( attribute, new StatisticsSummary<T>( DataScale.ORDINAL, stats.getNumberOfMeasures(), stats.getMode(), type.parse(stats.getMode()), stats.getMedian(), type.parse(stats.getMedian()), stats.getMin(), type.parse(stats.getMin()), stats.getMax(), type.parse(stats.getMax()))); } else if (scale == DataScale.INTERVAL) { StatisticsSummaryOrdinal stats = ordinal.get(attribute); DescriptiveStatistics stats2 = statistics.get(attribute); boolean isPeriod = type.getDescription().getWrappedClass() == Date.class; // TODO: Something is wrong with commons math's kurtosis double kurtosis = stats2.getKurtosis(); kurtosis = kurtosis < 0d ? Double.NaN : kurtosis; double range = stats2.getMax() - stats2.getMin(); double stddev = Math.sqrt(stats2.getVariance()); result.put( attribute, new StatisticsSummary<T>( DataScale.INTERVAL, stats.getNumberOfMeasures(), stats.getMode(), type.parse(stats.getMode()), stats.getMedian(), type.parse(stats.getMedian()), stats.getMin(), type.parse(stats.getMin()), stats.getMax(), type.parse(stats.getMax()), toString(type, stats2.getMean(), false, false), toValue(type, stats2.getMean()), stats2.getMean(), toString(type, stats2.getVariance(), isPeriod, true), toValue(type, stats2.getVariance()), stats2.getVariance(), toString(type, stats2.getPopulationVariance(), isPeriod, true), toValue(type, stats2.getPopulationVariance()), stats2.getPopulationVariance(), toString(type, stddev, isPeriod, false), toValue(type, stddev), stddev, toString(type, range, isPeriod, false), toValue(type, range), stats2.getMax() - stats2.getMin(), toString(type, kurtosis, isPeriod, false), toValue(type, kurtosis), kurtosis)); } else if (scale == DataScale.RATIO) { StatisticsSummaryOrdinal stats = ordinal.get(attribute); DescriptiveStatistics stats2 = statistics.get(attribute); // TODO: Something is wrong with commons math's kurtosis double kurtosis = stats2.getKurtosis(); kurtosis = kurtosis < 0d ? Double.NaN : kurtosis; double range = stats2.getMax() - stats2.getMin(); double stddev = Math.sqrt(stats2.getVariance()); result.put( attribute, new StatisticsSummary<T>( DataScale.RATIO, stats.getNumberOfMeasures(), stats.getMode(), type.parse(stats.getMode()), stats.getMedian(), type.parse(stats.getMedian()), stats.getMin(), type.parse(stats.getMin()), stats.getMax(), type.parse(stats.getMax()), toString(type, stats2.getMean(), false, false), toValue(type, stats2.getMean()), stats2.getMean(), toString(type, stats2.getVariance(), false, false), toValue(type, stats2.getVariance()), stats2.getVariance(), toString(type, stats2.getPopulationVariance(), false, false), toValue(type, stats2.getPopulationVariance()), stats2.getPopulationVariance(), toString(type, stddev, false, false), toValue(type, stddev), stddev, toString(type, range, false, false), toValue(type, range), range, toString(type, kurtosis, false, false), toValue(type, kurtosis), kurtosis, toString(type, stats2.getGeometricMean(), false, false), toValue(type, stats2.getGeometricMean()), stats2.getGeometricMean())); } } return result; }
/** * Returns an ordered list of the distinct set of data items from the given column. This method * assumes that the order of string data items can (and should) be derived from the provided * hierarchy * * @param column The column * @param hierarchy The hierarchy, may be null * @return */ public String[] getDistinctValuesOrdered(int column, Hierarchy hierarchy) { // Reset stop flag interrupt = false; // Obtain list and data type final String[] list = getDistinctValues(column); final String attribute = handle.getAttributeName(column); final DataType<?> datatype = handle.getDataType(attribute); final int level = handle.getGeneralization(attribute); final String[][] _hierarchy = hierarchy != null ? hierarchy.getHierarchy() : null; // Sort by data type if (_hierarchy == null || level == 0) { sort(list, datatype, handle.getSuppressionString()); // Sort by hierarchy and data type } else { // Build order directly from the hierarchy final Map<String, Integer> order = new HashMap<String, Integer>(); int max = 0; // The order to use for the suppression string // Create base order Set<String> baseSet = new HashSet<String>(); DataType<?> baseType = handle.getBaseDataType(attribute); for (int i = 0; i < _hierarchy.length; i++) { String element = _hierarchy[i][0]; checkInterrupt(); // Make sure that only elements from the hierarchy // are added that are included in the data // TODO: Calling isValid is only a work-around if (baseType.isValid(element)) baseSet.add(element); } String[] baseArray = baseSet.toArray(new String[baseSet.size()]); sort(baseArray, handle.getBaseDataType(attribute), handle.getSuppressionString()); Map<String, Integer> baseOrder = new HashMap<String, Integer>(); for (int i = 0; i < baseArray.length; i++) { checkInterrupt(); baseOrder.put(baseArray[i], i); } // Build higher level order from base order for (int i = 0; i < _hierarchy.length; i++) { checkInterrupt(); if (!order.containsKey(_hierarchy[i][level])) { Integer position = baseOrder.get(_hierarchy[i][0]); if (position != null) { order.put(_hierarchy[i][level], position); max = Math.max(position, max) + 1; } } } // Add suppression string String supp = handle.getSuppressionString(); if (supp != null) order.put(supp, max); // Sort sort(list, order); } // Done return list; }