public Model learn(ExampleSet exampleSet) throws OperatorException { double value = 0.0; double[] confidences = null; int method = getParameterAsInt(PARAMETER_METHOD); Attribute label = exampleSet.getAttributes().getLabel(); if ((label.isNominal()) && ((method == MEDIAN) || (method == AVERAGE))) { logWarning( "Cannot use method '" + METHODS[method] + "' for nominal labels: changing to 'mode'!"); method = MODE; } else if ((!label.isNominal()) && (method == MODE)) { logWarning( "Cannot use method '" + METHODS[method] + "' for numerical labels: changing to 'average'!"); method = AVERAGE; } switch (method) { case MEDIAN: double[] labels = new double[exampleSet.size()]; Iterator<Example> r = exampleSet.iterator(); int counter = 0; while (r.hasNext()) { Example example = r.next(); labels[counter++] = example.getValue(example.getAttributes().getLabel()); } java.util.Arrays.sort(labels); value = labels[exampleSet.size() / 2]; break; case AVERAGE: exampleSet.recalculateAttributeStatistics(label); value = exampleSet.getStatistics(label, Statistics.AVERAGE); break; case MODE: exampleSet.recalculateAttributeStatistics(label); value = exampleSet.getStatistics(label, Statistics.MODE); confidences = new double[label.getMapping().size()]; for (int i = 0; i < confidences.length; i++) { confidences[i] = exampleSet.getStatistics(label, Statistics.COUNT, label.getMapping().mapIndex(i)) / exampleSet.size(); } break; case CONSTANT: value = getParameterAsDouble(PARAMETER_CONSTANT); break; case ATTRIBUTE: return new AttributeDefaultModel( exampleSet, getParameterAsString(PARAMETER_ATTRIBUTE_NAME)); default: // cannot happen throw new OperatorException("DefaultLearner: Unknown default method '" + method + "'!"); } log( "Default value is '" + (label.isNominal() ? label.getMapping().mapIndex((int) value) : value + "") + "'."); return new DefaultModel(exampleSet, value, confidences); }
private RuleModel createNumericalRuleModel(ExampleSet trainingSet, Attribute attribute) { RuleModel model = new RuleModel(trainingSet); // split by best attribute int oldSize = -1; while ((trainingSet.size() > 0) && (trainingSet.size() != oldSize)) { ExampleSet exampleSet = (ExampleSet) trainingSet.clone(); Split bestSplit = splitter.getBestSplit(exampleSet, attribute, null); double bestSplitValue = bestSplit.getSplitPoint(); if (!Double.isNaN(bestSplitValue)) { SplittedExampleSet splittedSet = SplittedExampleSet.splitByAttribute(exampleSet, attribute, bestSplitValue); Attribute label = splittedSet.getAttributes().getLabel(); splittedSet.selectSingleSubset(0); SplitCondition condition = new LessEqualsSplitCondition(attribute, bestSplitValue); splittedSet.recalculateAttributeStatistics(label); int labelValue = (int) splittedSet.getStatistics(label, Statistics.MODE); String labelName = label.getMapping().mapIndex(labelValue); Rule rule = new Rule(labelName, condition); int[] frequencies = new int[label.getMapping().size()]; int counter = 0; for (String value : label.getMapping().getValues()) frequencies[counter++] = (int) splittedSet.getStatistics(label, Statistics.COUNT, value); rule.setFrequencies(frequencies); model.addRule(rule); oldSize = trainingSet.size(); trainingSet = rule.removeCovered(trainingSet); } else { break; } } // add default rule if some examples were not yet covered if (trainingSet.size() > 0) { Attribute label = trainingSet.getAttributes().getLabel(); trainingSet.recalculateAttributeStatistics(label); int index = (int) trainingSet.getStatistics(label, Statistics.MODE); String defaultLabel = label.getMapping().mapIndex(index); Rule defaultRule = new Rule(defaultLabel); int[] frequencies = new int[label.getMapping().size()]; int counter = 0; for (String value : label.getMapping().getValues()) frequencies[counter++] = (int) (trainingSet.getStatistics(label, Statistics.COUNT, value)); defaultRule.setFrequencies(frequencies); 
model.addRule(defaultRule); } return model; }
/**
 * Computes one representative value per attribute of the example set.
 *
 * <p>After recalculating all attribute statistics, each attribute contributes its minimum if its
 * value type is a date-time type, its mode if it is nominal, and its average otherwise.
 *
 * @param exampleSet the example set to summarise
 * @return the representative values in attribute iteration order
 */
private double[] getMeanVector(ExampleSet exampleSet) {
    exampleSet.recalculateAllAttributeStatistics();
    Attributes attributes = exampleSet.getAttributes();
    double[] meanVector = new double[attributes.size()];

    int position = 0;
    for (Attribute attribute : attributes) {
        double representative;
        if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(attribute.getValueType(), Ontology.DATE_TIME)) {
            representative = exampleSet.getStatistics(attribute, Statistics.MINIMUM);
        } else if (attribute.isNominal()) {
            representative = exampleSet.getStatistics(attribute, Statistics.MODE);
        } else {
            representative = exampleSet.getStatistics(attribute, Statistics.AVERAGE);
        }
        meanVector[position] = representative;
        position++;
    }
    return meanVector;
}
private NeuralDataSet getTraining(ExampleSet exampleSet) { double[][] data = new double[exampleSet.size()][exampleSet.getAttributes().size()]; double[][] labels = new double[exampleSet.size()][1]; int index = 0; Attribute label = exampleSet.getAttributes().getLabel(); this.attributeMin = new double[exampleSet.getAttributes().size()]; this.attributeMax = new double[attributeMin.length]; exampleSet.recalculateAllAttributeStatistics(); int a = 0; for (Attribute attribute : exampleSet.getAttributes()) { this.attributeMin[a] = exampleSet.getStatistics(attribute, Statistics.MINIMUM); this.attributeMax[a] = exampleSet.getStatistics(attribute, Statistics.MAXIMUM); a++; } this.labelMin = exampleSet.getStatistics(label, Statistics.MINIMUM); this.labelMax = exampleSet.getStatistics(label, Statistics.MAXIMUM); for (Example example : exampleSet) { // attributes a = 0; for (Attribute attribute : exampleSet.getAttributes()) { if (attributeMin[a] != attributeMax[a]) { data[index][a] = (example.getValue(attribute) - attributeMin[a]) / (attributeMax[a] - attributeMin[a]); } else { data[index][a] = example.getValue(attribute) - attributeMin[a]; } a++; } // label if (label.isNominal()) { labels[index][0] = example.getValue(label); } else { if (labelMax != labelMin) { labels[index][0] = (example.getValue(label) - labelMin) / (labelMax - labelMin); } else { labels[index][0] = example.getValue(label) - labelMin; } } index++; } return new BasicNeuralDataSet(data, labels); }
/**
 * Creates a PCA model from the given eigen decomposition.
 *
 * <p>Stores the name and the average of every attribute of {@code eSet}, wraps column {@code i} of
 * {@code eigenvectors} together with {@code eigenvalues[i]} into an {@code Eigenvector}, sorts the
 * resulting list and finally triggers the cumulative variance calculation.
 *
 * @param eSet the example set the decomposition was computed on
 * @param eigenvalues one eigenvalue per component
 * @param eigenvectors matrix whose columns are the eigenvectors (indexed {@code [attribute][component]})
 */
public PCAModel(ExampleSet eSet, double[] eigenvalues, double[][] eigenvectors) {
    super(eSet);
    this.keepAttributes = false;

    String[] names = new String[eSet.getAttributes().size()];
    double[] averages = new double[eSet.getAttributes().size()];
    eSet.recalculateAllAttributeStatistics(); // ensures that the statistics were created
    int position = 0;
    for (Attribute attribute : eSet.getAttributes()) {
        names[position] = attribute.getName();
        averages[position] = eSet.getStatistics(attribute, Statistics.AVERAGE);
        position++;
    }
    this.attributeNames = names;
    this.means = averages;

    this.eigenVectors = new ArrayList<Eigenvector>(eigenvalues.length);
    for (int component = 0; component < eigenvalues.length; component++) {
        // Copy column 'component' of the matrix into its own vector.
        double[] column = new double[eSet.getAttributes().size()];
        for (int rowIndex = 0; rowIndex < column.length; rowIndex++) {
            column[rowIndex] = eigenvectors[rowIndex][component];
        }
        this.eigenVectors.add(new Eigenvector(column, eigenvalues[component]));
    }

    // order the eigenvectors by the eigenvalues
    Collections.sort(this.eigenVectors);
    calculateCumulativeVariance();
}
/**
 * Creates a model that drops nominal values which do not occur in the example set.
 *
 * <p>For every nominal attribute a new mapping is filled with the values whose count statistic is
 * greater than zero. A translation is registered under the attribute's name only if the new
 * mapping is smaller than the original one; it is sorted first when the sort parameter is set.
 *
 * <p>The mapping is only accessed and cloned for nominal attributes. Other attributes are skipped
 * before any mapping access, since no translation is ever stored for them.
 *
 * @param exampleSet the example set whose nominal mappings are inspected
 * @return the model holding the translations by attribute name
 * @throws OperatorException declared by the overridden method; reading the sort parameter is
 *     presumably the only possible source here
 */
@Override
public PreprocessingModel createPreprocessingModel(ExampleSet exampleSet) throws OperatorException {
    boolean sortMappings = getParameterAsBoolean(PARAMETER_SORT_MAPPING_ALPHABETICALLY);
    Map<String, MappingTranslation> translations = new HashMap<String, MappingTranslation>();
    exampleSet.recalculateAllAttributeStatistics();

    for (Attribute attribute : exampleSet.getAttributes()) {
        if (!attribute.isNominal()) {
            // Nothing to translate; do not touch the mapping of non-nominal attributes.
            continue;
        }

        MappingTranslation translation =
                new MappingTranslation((NominalMapping) attribute.getMapping().clone());
        for (String value : attribute.getMapping().getValues()) {
            double count = exampleSet.getStatistics(attribute, Statistics.COUNT, value);
            if (count > 0) {
                translation.newMapping.mapString(value);
            }
        }

        // Only attributes that actually lose values need a translation.
        if (translation.newMapping.size() < attribute.getMapping().size()) {
            if (sortMappings) {
                translation.newMapping.sortMappings();
            }
            translations.put(attribute.getName(), translation);
        }
    }
    return new RemoveUnusedNominalValuesModel(exampleSet, translations);
}
@Override public void doWork() throws OperatorException { ExampleSet exampleSet = exampleSetInput.getData(ExampleSet.class); // only use numeric attributes Tools.onlyNumericalAttributes(exampleSet, "KernelPCA"); Tools.onlyNonMissingValues(exampleSet, getOperatorClassName(), this); Attributes attributes = exampleSet.getAttributes(); int numberOfExamples = exampleSet.size(); // calculating means for later zero centering exampleSet.recalculateAllAttributeStatistics(); double[] means = new double[exampleSet.getAttributes().size()]; int i = 0; for (Attribute attribute : exampleSet.getAttributes()) { means[i] = exampleSet.getStatistics(attribute, Statistics.AVERAGE); i++; } // kernel Kernel kernel = Kernel.createKernel(this); // copying zero centered exampleValues ArrayList<double[]> exampleValues = new ArrayList<double[]>(numberOfExamples); i = 0; for (Example columnExample : exampleSet) { double[] columnValues = getAttributeValues(columnExample, attributes, means); exampleValues.add(columnValues); i++; } // filling kernel matrix Matrix kernelMatrix = new Matrix(numberOfExamples, numberOfExamples); for (i = 0; i < numberOfExamples; i++) { for (int j = 0; j < numberOfExamples; j++) { kernelMatrix.set( i, j, kernel.calculateDistance(exampleValues.get(i), exampleValues.get(j))); } } // calculating eigenVectors EigenvalueDecomposition eig = kernelMatrix.eig(); Model model = new KernelPCAModel(exampleSet, means, eig.getV(), exampleValues, kernel); if (exampleSetOutput.isConnected()) { exampleSetOutput.deliver(model.apply(exampleSet)); } originalOutput.deliver(exampleSet); modelOutput.deliver(model); }
@Override public void updateStatistics(final ExampleSet exampleSet) { final String days = WHITESPACE + I18N.getMessage( I18N.getGUIBundle(), "gui.label.attribute_statistics.statistics.days.label"); final String hours = WHITESPACE + I18N.getMessage( I18N.getGUIBundle(), "gui.label.attribute_statistics.statistics.hours.label"); long minMilliseconds = (long) exampleSet.getStatistics(getAttribute(), Statistics.MINIMUM); long maxMilliseconds = (long) exampleSet.getStatistics(getAttribute(), Statistics.MAXIMUM); long difference = maxMilliseconds - minMilliseconds; String dura = ""; if (getAttribute().getValueType() == Ontology.DATE) { // days dura += com.rapidminer.tools.Tools.formatIntegerIfPossible( Math.floor(difference / (H_IN_D * M_IN_H * S_IN_M * MS_IN_S)), 3) + days; } else if (getAttribute().getValueType() == Ontology.TIME) { // hours dura += com.rapidminer.tools.Tools.formatIntegerIfPossible( Math.floor(difference / (M_IN_H * S_IN_M * MS_IN_S)), 3) + hours; } else if (getAttribute().getValueType() == Ontology.DATE_TIME) { // days + hours + minutes + seconds dura += com.rapidminer.tools.Tools.formatIntegerIfPossible( Math.floor(difference / (H_IN_D * M_IN_H * S_IN_M * MS_IN_S)), 3) + SHORT_DAY; dura += WHITESPACE; double leftoverMilliSeconds = difference % (H_IN_D * M_IN_H * S_IN_M * MS_IN_S); dura += com.rapidminer.tools.Tools.formatIntegerIfPossible( Math.floor(leftoverMilliSeconds / (M_IN_H * S_IN_M * MS_IN_S)), 3) + SHORT_HOUR; dura += WHITESPACE; leftoverMilliSeconds = leftoverMilliSeconds % (M_IN_H * S_IN_M * MS_IN_S); dura += com.rapidminer.tools.Tools.formatIntegerIfPossible( Math.floor(leftoverMilliSeconds / (S_IN_M * MS_IN_S)), 3) + SHORT_MINUTE; dura += WHITESPACE; leftoverMilliSeconds = leftoverMilliSeconds % (S_IN_M * MS_IN_S); dura += com.rapidminer.tools.Tools.formatIntegerIfPossible( Math.floor(leftoverMilliSeconds / MS_IN_S), 3) + SHORT_SECOND; } String minResult = null; String maxResult = null; if (getAttribute().getValueType() == Ontology.DATE) 
{ minResult = FORMAT_DATE.format(new Date(minMilliseconds)); maxResult = FORMAT_DATE.format(new Date(maxMilliseconds)); } else if (getAttribute().getValueType() == Ontology.TIME) { minResult = FORMAT_TIME.format(new Date(minMilliseconds)); maxResult = FORMAT_TIME.format(new Date(maxMilliseconds)); } else if (getAttribute().getValueType() == Ontology.DATE_TIME) { minResult = FORMAT_DATE_TIME.format(new Date(minMilliseconds)); maxResult = FORMAT_DATE_TIME.format(new Date(maxMilliseconds)); } missing = exampleSet.getStatistics(getAttribute(), Statistics.UNKNOWN); from = minResult; until = maxResult; duration = dura; fireStatisticsChangedEvent(); }
@Override public void doWork() throws OperatorException { ExampleSet exampleSet = exampleSetInput.getData(ExampleSet.class); exampleSet.recalculateAllAttributeStatistics(); List<String[]> attributeValueOptions = getParameterList(PARAMETER_ATTRIBUTES); LinkedHashMap<Attribute, Integer> attributeValueOptionsMap = new LinkedHashMap<Attribute, Integer>(); int[] valueOptions = new int[attributeValueOptions.size()]; Pattern[] attributeRegexPatterns = new Pattern[attributeValueOptions.size()]; Attribute[] attributes = new Attribute[attributeValueOptions.size()]; Iterator<String[]> iterator = attributeValueOptions.iterator(); int j = 0; while (iterator.hasNext()) { String[] pair = iterator.next(); String regex = pair[0]; try { attributeRegexPatterns[j] = Pattern.compile(regex); } catch (PatternSyntaxException e) { throw new UserError(this, 206, regex, e.getMessage()); } attributes[j] = exampleSet.getAttributes().get(pair[0]); valueOptions[j] = ((ParameterTypeCategory) ((ParameterTypeList) getParameterType(PARAMETER_ATTRIBUTES)).getValueType()) .getIndex(pair[1]); j++; } for (int i = 0; i < attributeRegexPatterns.length; i++) { Iterator<Attribute> a = exampleSet.getAttributes().allAttributes(); while (a.hasNext()) { Attribute attribute = a.next(); Matcher matcher = attributeRegexPatterns[i].matcher(attribute.getName()); if (matcher.matches()) { attributeValueOptionsMap.put(attribute, valueOptions[i]); } } } double p = getParameterAsDouble(PARAMETER_P); boolean filterAttribute = getParameterAsBoolean(PARAMETER_FILTER_ATTRIBUTE); String iterationMacro = getParameterAsString(PARAMETER_ITERATION_MACRO); // applying on complete set if (getParameterAsBoolean(PARAMETER_APPLY_ON_COMPLETE_SET)) { if (iterationMacro != null) { getProcess().getMacroHandler().addMacro(iterationMacro, "ALL"); } innerExampleSetSource.deliver(exampleSet); getSubprocess(0).execute(); } // applying on subgroups defined by attributes for (Entry<Attribute, Integer> attributeEntry : 
attributeValueOptionsMap.entrySet()) { Attribute attribute = attributeEntry.getKey(); if (!attribute.isNominal()) { continue; } List<String> values = null; switch (attributeEntry.getValue()) { case VALUE_OPTION_ALL: values = attribute.getMapping().getValues(); break; case VALUE_OPTION_ABOVE_P: values = new Vector<String>(); for (String value : attribute.getMapping().getValues()) { if (exampleSet.getStatistics(attribute, Statistics.COUNT, value) / exampleSet.size() >= p) { values.add(value); } } break; default: values = attribute.getMapping().getValues(); break; } for (String value : values) { if (exampleSet.getStatistics(attribute, Statistics.COUNT, value) > 0) { String className = "attribute_value_filter"; String parameter = attribute.getName() + "=" + value; log("Creating condition '" + className + "' with parameter '" + parameter + "'"); Condition condition = null; try { condition = ConditionedExampleSet.createCondition(className, exampleSet, parameter); } catch (ConditionCreationException e) { throw new UserError(this, 904, className, e.getMessage()); } ExampleSet subgroupSet = new ConditionedExampleSet(exampleSet, condition, false); if (filterAttribute) { subgroupSet.getAttributes().remove(attribute); } if (iterationMacro != null) { getProcess().getMacroHandler().addMacro(iterationMacro, parameter.replace(' ', '_')); } // applying subprocess innerExampleSetSource.deliver(subgroupSet); getSubprocess(0).execute(); if (filterAttribute) { subgroupSet.getAttributes().addRegular(attribute); } } inApplyLoop(); } } if (iterationMacro != null) { getProcess().getMacroHandler().addMacro(iterationMacro, null); } }
@Override public Model learn(ExampleSet exampleSet) throws OperatorException { Attribute label = exampleSet.getAttributes().getLabel(); RuleModel ruleModel = new RuleModel(exampleSet); double pureness = getParameterAsDouble(PARAMETER_PURENESS); TermDetermination termDetermination = new TermDetermination(new AccuracyCriterion(), 0.5d); ExampleSet trainingSet = (ExampleSet) exampleSet.clone(); for (String labelName : label.getMapping().getValues()) { trainingSet.recalculateAttributeStatistics(label); int oldSize = -1; while (trainingSet.size() > 0 && trainingSet.size() != oldSize && trainingSet.getStatistics(label, Statistics.COUNT, labelName) > 0) { Rule rule = new Rule(labelName); ExampleSet oldTrainingSet = (ExampleSet) trainingSet.clone(); // grow rule int growOldSize = -1; ExampleSet growSet = (ExampleSet) trainingSet.clone(); while (growSet.size() > 0 && growSet.size() != growOldSize && !rule.isPure(growSet, pureness) && growSet.getAttributes().size() > 0) { SplitCondition term = termDetermination.getBestTerm(growSet, labelName); if (term == null) { break; } rule.addTerm(term); Attribute splitAttribute = growSet.getAttributes().get(term.getAttributeName()); growSet.getAttributes().remove(splitAttribute); growOldSize = growSet.size(); growSet = rule.getCovered(growSet); } // add rule if not empty if (rule.getTerms().size() > 0) { growSet = rule.getCovered(trainingSet); growSet.recalculateAttributeStatistics(label); int[] frequencies = new int[label.getMapping().size()]; int counter = 0; for (String value : label.getMapping().getValues()) { frequencies[counter++] = (int) growSet.getStatistics(label, Statistics.COUNT, value); } rule.setFrequencies(frequencies); ruleModel.addRule(rule); oldSize = trainingSet.size(); trainingSet = rule.removeCovered(oldTrainingSet); } else { break; // no other terms found for this class --> next class } trainingSet.recalculateAttributeStatistics(label); } checkForStop(); } // training set not empty? 
add default rule if (trainingSet.size() > 0) { trainingSet.recalculateAttributeStatistics(label); int index = (int) trainingSet.getStatistics(label, Statistics.MODE); String defaultLabel = label.getMapping().mapIndex(index); Rule defaultRule = new Rule(defaultLabel); int[] frequencies = new int[label.getMapping().size()]; int counter = 0; for (String value : label.getMapping().getValues()) { frequencies[counter++] = (int) trainingSet.getStatistics(label, Statistics.COUNT, value); } defaultRule.setFrequencies(frequencies); ruleModel.addRule(defaultRule); } return ruleModel; }
@Override public PreprocessingModel createPreprocessingModel(ExampleSet exampleSet) throws OperatorException { HashMap<Attribute, double[]> ranges = new HashMap<Attribute, double[]>(); // Get and check parametervalues boolean useSqrt = getParameterAsBoolean(PARAMETER_USE_SQRT_OF_EXAMPLES); int numberOfBins = 0; if (!useSqrt) { // if not automatic sizing of bins, use parametervalue numberOfBins = getParameterAsInt(PARAMETER_NUMBER_OF_BINS); if (numberOfBins >= (exampleSet.size() - 1)) { throw new UserError( this, 116, PARAMETER_NUMBER_OF_BINS, "number of bins must be smaller than number of examples (here: " + exampleSet.size() + ")"); } } else { exampleSet.recalculateAllAttributeStatistics(); } for (Attribute currentAttribute : exampleSet.getAttributes()) { if (useSqrt) { numberOfBins = (int) Math.round( Math.sqrt( exampleSet.size() - (int) exampleSet.getStatistics(currentAttribute, Statistics.UNKNOWN))); } double[] attributeRanges = new double[numberOfBins]; ExampleSet sortedSet = new SortedExampleSet(exampleSet, currentAttribute, SortedExampleSet.INCREASING); // finding ranges double examplesPerBin = exampleSet.size() / (double) numberOfBins; double currentBinSpace = examplesPerBin; double lastValue = Double.NaN; int currentBin = 0; for (Example example : sortedSet) { double value = example.getValue(currentAttribute); if (!Double.isNaN(value)) { // change bin if full and not last if (currentBinSpace < 1 && currentBin < numberOfBins && value != lastValue) { if (!Double.isNaN(lastValue)) { attributeRanges[currentBin] = (lastValue + value) / 2; currentBin++; currentBinSpace += examplesPerBin; // adding because same values might // cause binspace to be negative if (currentBinSpace < 1) { throw new UserError(this, 944, currentAttribute.getName()); } } } currentBinSpace--; lastValue = value; } } attributeRanges[numberOfBins - 1] = Double.POSITIVE_INFINITY; ranges.put(currentAttribute, attributeRanges); } DiscretizationModel model = new DiscretizationModel(exampleSet); 
// determine number of digits int numberOfDigits = -1; if (getParameterAsBoolean(PARAMETER_AUTOMATIC_NUMBER_OF_DIGITS) == false) { numberOfDigits = getParameterAsInt(PARAMETER_NUMBER_OF_DIGITS); } model.setRanges(ranges, "range", getParameterAsInt(PARAMETER_RANGE_NAME_TYPE), numberOfDigits); return model; }