/**
 * Computes mean and variance of every attribute of the given example set.
 *
 * <p>The result is keyed by the position of the attribute in the iteration order of {@code
 * exampleSet.getAttributes()}. The variance is computed as {@code E[x^2] - E[x]^2}. For (nearly)
 * constant attributes rounding can make that difference slightly negative; it is therefore
 * clamped at zero so that taking the square root of the variance can never produce {@code NaN}
 * for otherwise valid data. A {@code NaN} variance (e.g. from missing values or an empty example
 * set) is passed through unchanged.
 *
 * @param exampleSet the examples to summarise
 * @return map from attribute position to the mean and variance of that attribute
 */
private static Map<Integer, MeanVariance> createMeanVariances(
    com.rapidminer.example.ExampleSet exampleSet) {
  int numberOfAttributes = exampleSet.getAttributes().size();
  double[] sum = new double[numberOfAttributes];
  double[] squaredSum = new double[numberOfAttributes];

  // single pass: accumulate sum and sum of squares per attribute position
  for (com.rapidminer.example.Example example : exampleSet) {
    int a = 0;
    for (Attribute attribute : exampleSet.getAttributes()) {
      double value = example.getValue(attribute);
      sum[a] += value;
      squaredSum[a] += value * value;
      a++;
    }
  }

  int numberOfExamples = exampleSet.size();
  Map<Integer, MeanVariance> meanVariances = new HashMap<Integer, MeanVariance>();
  for (int a = 0; a < numberOfAttributes; a++) {
    double mean = sum[a] / numberOfExamples;
    double meanOfSquares = squaredSum[a] / numberOfExamples;
    // clamp: cancellation may yield a tiny negative number; Math.max keeps NaN as NaN
    double variance = Math.max(0.0d, meanOfSquares - (mean * mean));
    meanVariances.put(a, new MeanVariance(mean, variance));
  }
  return meanVariances;
}
@Override public ClusterModel generateClusterModel(ExampleSet exampleSet) throws OperatorException { // checking and creating ids if necessary Tools.checkAndCreateIds(exampleSet); // generating assignment RandomGenerator random = RandomGenerator.getRandomGenerator(this); int clusterAssignments[] = new int[exampleSet.size()]; int k = getParameterAsInt(PARAMETER_NUMBER_OF_CLUSTERS); for (int i = 0; i < exampleSet.size(); i++) { clusterAssignments[i] = random.nextInt(k); } ClusterModel model = new ClusterModel( exampleSet, k, getParameterAsBoolean(RMAbstractClusterer.PARAMETER_ADD_AS_LABEL), getParameterAsBoolean(RMAbstractClusterer.PARAMETER_REMOVE_UNLABELED)); model.setClusterAssignments(clusterAssignments, exampleSet); // generating cluster attribute if (addsClusterAttribute()) { Attribute cluster = AttributeFactory.createAttribute("cluster", Ontology.NOMINAL); exampleSet.getExampleTable().addAttribute(cluster); exampleSet.getAttributes().setCluster(cluster); int i = 0; for (Example example : exampleSet) { example.setValue(cluster, "cluster_" + clusterAssignments[i]); i++; } } return model; }
/** Creates a new evolutionary SVM optimization. */ public ClassificationEvoOptimization( ExampleSet exampleSet, // training data Kernel kernel, double c, // SVM paras int initType, // start population creation type para int maxIterations, int generationsWithoutImprovement, int popSize, // GA paras int selectionType, double tournamentFraction, boolean keepBest, // selection paras int mutationType, // type of mutation double crossoverProb, boolean showConvergencePlot, boolean showPopulationPlot, ExampleSet holdOutSet, RandomGenerator random, LoggingHandler logging, Operator executingOperator) { super( EvoSVM.createBoundArray(0.0d, exampleSet.size()), EvoSVM.determineMax(c, kernel, exampleSet, selectionType, exampleSet.size()), popSize, exampleSet.size(), initType, maxIterations, generationsWithoutImprovement, selectionType, tournamentFraction, keepBest, mutationType, Double.NaN, crossoverProb, showConvergencePlot, showPopulationPlot, random, logging, executingOperator); this.exampleSet = exampleSet; this.holdOutSet = holdOutSet; this.populationSize = popSize; this.kernel = kernel; this.c = getMax(0); // label values this.ys = new double[exampleSet.size()]; Iterator<Example> reader = exampleSet.iterator(); int index = 0; Attribute label = exampleSet.getAttributes().getLabel(); while (reader.hasNext()) { Example example = reader.next(); ys[index++] = example.getLabel() == label.getMapping().getPositiveIndex() ? 1.0d : -1.0d; } // optimization function this.optimizationFunction = new ClassificationOptimizationFunction(selectionType == NON_DOMINATED_SORTING_SELECTION); }
public Model learn(ExampleSet exampleSet) throws OperatorException { double value = 0.0; double[] confidences = null; int method = getParameterAsInt(PARAMETER_METHOD); Attribute label = exampleSet.getAttributes().getLabel(); if ((label.isNominal()) && ((method == MEDIAN) || (method == AVERAGE))) { logWarning( "Cannot use method '" + METHODS[method] + "' for nominal labels: changing to 'mode'!"); method = MODE; } else if ((!label.isNominal()) && (method == MODE)) { logWarning( "Cannot use method '" + METHODS[method] + "' for numerical labels: changing to 'average'!"); method = AVERAGE; } switch (method) { case MEDIAN: double[] labels = new double[exampleSet.size()]; Iterator<Example> r = exampleSet.iterator(); int counter = 0; while (r.hasNext()) { Example example = r.next(); labels[counter++] = example.getValue(example.getAttributes().getLabel()); } java.util.Arrays.sort(labels); value = labels[exampleSet.size() / 2]; break; case AVERAGE: exampleSet.recalculateAttributeStatistics(label); value = exampleSet.getStatistics(label, Statistics.AVERAGE); break; case MODE: exampleSet.recalculateAttributeStatistics(label); value = exampleSet.getStatistics(label, Statistics.MODE); confidences = new double[label.getMapping().size()]; for (int i = 0; i < confidences.length; i++) { confidences[i] = exampleSet.getStatistics(label, Statistics.COUNT, label.getMapping().mapIndex(i)) / exampleSet.size(); } break; case CONSTANT: value = getParameterAsDouble(PARAMETER_CONSTANT); break; case ATTRIBUTE: return new AttributeDefaultModel( exampleSet, getParameterAsString(PARAMETER_ATTRIBUTE_NAME)); default: // cannot happen throw new OperatorException("DefaultLearner: Unknown default method '" + method + "'!"); } log( "Default value is '" + (label.isNominal() ? label.getMapping().mapIndex((int) value) : value + "") + "'."); return new DefaultModel(exampleSet, value, confidences); }
/**
 * Reports whether another example is available.
 *
 * <p>If {@code nextInvoked} is set, this call clears the flag, advances {@code currentIndex} by
 * one and, when the new index is still inside the parent, loads that example into {@code
 * currentExample}. Otherwise the cursor is left untouched and only the bounds check is done.
 *
 * @return true if {@code currentIndex} is smaller than {@code parent.size()} after the optional
 *     advance
 */
public boolean hasNext() {
  if (!this.nextInvoked) {
    // nothing consumed since the last check: cursor stays where it is
    return this.currentIndex < parent.size();
  }

  this.nextInvoked = false;
  this.currentIndex++;
  boolean available = this.currentIndex < parent.size();
  if (available) {
    this.currentExample = this.parent.getExample(this.currentIndex);
  }
  return available;
}
private RuleModel createNumericalRuleModel(ExampleSet trainingSet, Attribute attribute) { RuleModel model = new RuleModel(trainingSet); // split by best attribute int oldSize = -1; while ((trainingSet.size() > 0) && (trainingSet.size() != oldSize)) { ExampleSet exampleSet = (ExampleSet) trainingSet.clone(); Split bestSplit = splitter.getBestSplit(exampleSet, attribute, null); double bestSplitValue = bestSplit.getSplitPoint(); if (!Double.isNaN(bestSplitValue)) { SplittedExampleSet splittedSet = SplittedExampleSet.splitByAttribute(exampleSet, attribute, bestSplitValue); Attribute label = splittedSet.getAttributes().getLabel(); splittedSet.selectSingleSubset(0); SplitCondition condition = new LessEqualsSplitCondition(attribute, bestSplitValue); splittedSet.recalculateAttributeStatistics(label); int labelValue = (int) splittedSet.getStatistics(label, Statistics.MODE); String labelName = label.getMapping().mapIndex(labelValue); Rule rule = new Rule(labelName, condition); int[] frequencies = new int[label.getMapping().size()]; int counter = 0; for (String value : label.getMapping().getValues()) frequencies[counter++] = (int) splittedSet.getStatistics(label, Statistics.COUNT, value); rule.setFrequencies(frequencies); model.addRule(rule); oldSize = trainingSet.size(); trainingSet = rule.removeCovered(trainingSet); } else { break; } } // add default rule if some examples were not yet covered if (trainingSet.size() > 0) { Attribute label = trainingSet.getAttributes().getLabel(); trainingSet.recalculateAttributeStatistics(label); int index = (int) trainingSet.getStatistics(label, Statistics.MODE); String defaultLabel = label.getMapping().mapIndex(index); Rule defaultRule = new Rule(defaultLabel); int[] frequencies = new int[label.getMapping().size()]; int counter = 0; for (String value : label.getMapping().getValues()) frequencies[counter++] = (int) (trainingSet.getStatistics(label, Statistics.COUNT, value)); defaultRule.setFrequencies(frequencies); 
model.addRule(defaultRule); } return model; }
private NeuralDataSet getTraining(ExampleSet exampleSet) { double[][] data = new double[exampleSet.size()][exampleSet.getAttributes().size()]; double[][] labels = new double[exampleSet.size()][1]; int index = 0; Attribute label = exampleSet.getAttributes().getLabel(); this.attributeMin = new double[exampleSet.getAttributes().size()]; this.attributeMax = new double[attributeMin.length]; exampleSet.recalculateAllAttributeStatistics(); int a = 0; for (Attribute attribute : exampleSet.getAttributes()) { this.attributeMin[a] = exampleSet.getStatistics(attribute, Statistics.MINIMUM); this.attributeMax[a] = exampleSet.getStatistics(attribute, Statistics.MAXIMUM); a++; } this.labelMin = exampleSet.getStatistics(label, Statistics.MINIMUM); this.labelMax = exampleSet.getStatistics(label, Statistics.MAXIMUM); for (Example example : exampleSet) { // attributes a = 0; for (Attribute attribute : exampleSet.getAttributes()) { if (attributeMin[a] != attributeMax[a]) { data[index][a] = (example.getValue(attribute) - attributeMin[a]) / (attributeMax[a] - attributeMin[a]); } else { data[index][a] = example.getValue(attribute) - attributeMin[a]; } a++; } // label if (label.isNominal()) { labels[index][0] = example.getValue(label); } else { if (labelMax != labelMin) { labels[index][0] = (example.getValue(label) - labelMin) / (labelMax - labelMin); } else { labels[index][0] = example.getValue(label) - labelMin; } } index++; } return new BasicNeuralDataSet(data, labels); }
/** @see com.rapidminer.operator.OperatorChain#doWork() */ @Override public void doWork() throws OperatorException { List<Operator> nested = this.getImmediateChildren(); log.info("This StreamProcess has {} nested operators", nested.size()); for (Operator op : nested) { log.info(" op: {}", op); if (op instanceof DataStreamOperator) { log.info("Resetting stream-operator {}", op); ((DataStreamOperator) op).reset(); } } log.info("Starting some work in doWork()"); ExampleSet exampleSet = input.getData(ExampleSet.class); log.info("input is an example set with {} examples", exampleSet.size()); int i = 0; Iterator<Example> it = exampleSet.iterator(); while (it.hasNext()) { Example example = it.next(); log.info("Processing example {}", i); DataObject datum = StreamUtils.wrap(example); log.info("Wrapped data-object is: {}", datum); dataStream.deliver(datum); getSubprocess(0).execute(); inApplyLoop(); i++; } // super.doWork(); log.info("doWork() is finished."); }
/**
 * Weights every regular attribute by the absolute value of its correlation with the label.
 *
 * <p>Whether the squared correlation is used is taken from {@code
 * PARAMETER_SQUARED_CORRELATION}. Operator progress is only pushed once the number of examples
 * processed since the last update exceeds {@code PROGRESS_UPDATE_STEPS}.
 *
 * @param exampleSet the examples to analyse
 * @return the attribute weights
 * @throws OperatorException declared by the contract; raised by the called operator helpers
 */
@Override
public AttributeWeights calculateWeights(ExampleSet exampleSet) throws OperatorException {
  Attributes attributes = exampleSet.getAttributes();
  Attribute labelAttribute = attributes.getLabel();
  boolean useSquaredCorrelation = getParameterAsBoolean(PARAMETER_SQUARED_CORRELATION);
  AttributeWeights weights = new AttributeWeights(exampleSet);

  getProgress().setTotal(attributes.size());
  final int examplesPerAttribute = exampleSet.size();
  int finishedAttributes = 0;
  int examplesSinceUpdate = 0;

  for (Attribute attribute : attributes) {
    double correlation =
        MathFunctions.correlation(exampleSet, labelAttribute, attribute, useSquaredCorrelation);
    weights.setWeight(attribute.getName(), Math.abs(correlation));

    finishedAttributes++;
    examplesSinceUpdate += examplesPerAttribute;
    if (examplesSinceUpdate > PROGRESS_UPDATE_STEPS) {
      examplesSinceUpdate = 0;
      getProgress().setCompleted(finishedAttributes);
    }
  }
  return weights;
}
/** * Gets the input data and macro name and iterates over the example set while updating the current * iteration in the given macro. */ @Override public void doWork() throws OperatorException { outExtender.reset(); ExampleSet exampleSet = exampleSetInput.getData(ExampleSet.class); String iterationMacroName = getParameterAsString(PARAMETER_ITERATION_MACRO); boolean innerSinkIsConnected = exampleSetInnerSink.isConnected(); for (iteration = 1; iteration <= exampleSet.size(); iteration++) { getProcess().getMacroHandler().addMacro(iterationMacroName, String.valueOf(iteration)); // passing in clone or if connected the result from last iteration exampleSetInnerSource.deliver( innerSinkIsConnected ? exampleSet : (ExampleSet) exampleSet.clone()); getSubprocess(0).execute(); inApplyLoop(); if (innerSinkIsConnected) { exampleSet = exampleSetInnerSink.getData(ExampleSet.class); } outExtender.collect(); } getProcess().getMacroHandler().removeMacro(iterationMacroName); exampleSetOutput.deliver(exampleSet); }
/** * Creates a {@link HistogramDataset} for this {@link Attribute}. * * @param exampleSet * @return */ private HistogramDataset createHistogramDataset(final ExampleSet exampleSet) { HistogramDataset dataset = new HistogramDataset(); double[] array = new double[exampleSet.size()]; int count = 0; for (Example example : exampleSet) { double value = example.getDataRow().get(getAttribute()); // don't use missing values because otherwise JFreeChart tries to plot them too which // can lead to false histograms if (!Double.isNaN(value)) { array[count++] = value; } } // add points to data set (if any) if (count > 0) { // truncate array if necessary if (count < array.length) { array = Arrays.copyOf(array, count); } dataset.addSeries( getAttribute().getName(), array, Math.min(array.length, MAX_BINS_HISTOGRAM)); } return dataset; }
/**
 * Evaluates an individual as the (optionally weighted) log-likelihood of a logistic regression
 * model whose coefficients are the individual's values.
 *
 * <p>For every example the linear predictor {@code eta} is the dot product of the coefficients
 * with the attribute values, plus the last coefficient if {@code addIntercept} is set. The
 * contribution of the example is {@code y * log(pi) + (1 - y) * log(1 - pi)} with {@code pi =
 * exp(eta) / (1 + exp(eta))}.
 *
 * <p>The two logarithms are computed directly from {@code eta} instead of via {@code pi}:
 * {@code log(pi) = -(max(-eta, 0) + log1p(exp(-|eta|)))} and {@code log(1 - pi) = -(max(eta, 0)
 * + log1p(exp(-|eta|)))}. This is mathematically identical but avoids the {@code NaN} results
 * the naive form yields once {@code exp(eta)} overflows or {@code pi} rounds to exactly 0 or 1.
 *
 * @param individual the individual holding the coefficient vector
 * @return a performance vector with the single criterion {@code "log_reg_fitness"}
 */
@Override
public PerformanceVector evaluateIndividual(Individual individual) {
  double[] beta = individual.getValues();
  double fitness = 0.0d;
  for (Example example : exampleSet) {
    // linear predictor
    double eta = 0.0d;
    int i = 0;
    for (Attribute attribute : example.getAttributes()) {
      double value = example.getValue(attribute);
      eta += beta[i] * value;
      i++;
    }
    if (addIntercept) {
      eta += beta[beta.length - 1];
    }

    // log(1 + exp(-|eta|)) is shared by both log-probabilities and never overflows
    double tail = Math.log1p(Math.exp(-Math.abs(eta)));
    double logPi = -(Math.max(-eta, 0.0d) + tail);
    double logOneMinusPi = -(Math.max(eta, 0.0d) + tail);

    double classValue = example.getValue(label);
    double currentFitness = classValue * logPi + (1 - classValue) * logOneMinusPi;
    double weightValue = 1.0d;
    if (weight != null) {
      weightValue = example.getValue(weight);
    }
    fitness += weightValue * currentFitness;
  }

  PerformanceVector performanceVector = new PerformanceVector();
  performanceVector.addCriterion(
      new EstimatedPerformance("log_reg_fitness", fitness, exampleSet.size(), false));
  return performanceVector;
}
/**
 * Creates a fresh example set from a RapidMiner example set. The alpha values and b are zero,
 * the label will be set if it is known.
 *
 * <p>Every example is stored sparsely: only values that are not the attribute's default are
 * kept, together with the position of their attribute. A kept value whose position has an entry
 * in {@code meanVariances} is replaced by {@code (value - mean) / sqrt(variance)}, or by 0 if
 * the variance is exactly 0. Nominal labels are encoded as 1 (positive index of the mapping) or
 * -1; other labels are copied. Ids are stored as strings if an id attribute exists.
 *
 * @param exampleSet the source examples
 * @param labelAttribute the label attribute, or null if labels are unknown
 * @param meanVariances mean and variance per attribute position used for scaling
 */
public SVMExamples(
    com.rapidminer.example.ExampleSet exampleSet,
    Attribute labelAttribute,
    Map<Integer, MeanVariance> meanVariances) {
  this(exampleSet.size(), 0.0d);
  this.meanVarianceMap = meanVariances;

  Attribute idAttribute = exampleSet.getAttributes().getId();
  int exampleCounter = 0;
  for (com.rapidminer.example.Example current : exampleSet) {
    // collect the non-default values in attribute order
    Map<Integer, Double> sparseValues = new LinkedHashMap<Integer, Double>();
    int position = 0;
    for (Attribute attribute : exampleSet.getAttributes()) {
      double raw = current.getValue(attribute);
      if (!com.rapidminer.example.Tools.isDefault(attribute.getDefault(), raw)) {
        sparseValues.put(position, raw);
      }
      position++;
      if (position > dim) {
        dim = position;
      }
    }

    atts[exampleCounter] = new double[sparseValues.size()];
    index[exampleCounter] = new int[sparseValues.size()];

    // copy (and scale) the kept values into the sparse arrays
    int slot = 0;
    for (Map.Entry<Integer, Double> entry : sparseValues.entrySet()) {
      Integer attributePosition = entry.getKey();
      index[exampleCounter][slot] = attributePosition.intValue();

      double value = entry.getValue().doubleValue();
      MeanVariance meanVariance = meanVarianceMap.get(attributePosition);
      if (meanVariance != null) {
        double variance = meanVariance.getVariance();
        if (variance == 0.0d) {
          value = 0.0d;
        } else {
          value = (value - meanVariance.getMean()) / Math.sqrt(variance);
        }
      }
      atts[exampleCounter][slot] = value;
      slot++;
    }

    if (labelAttribute != null) {
      double label = current.getValue(labelAttribute);
      if (!labelAttribute.isNominal()) {
        ys[exampleCounter] = label;
      } else if (label == labelAttribute.getMapping().getPositiveIndex()) {
        ys[exampleCounter] = 1;
      } else {
        ys[exampleCounter] = -1;
      }
    }
    if (idAttribute != null) {
      ids[exampleCounter] = current.getValueAsString(idAttribute);
    }
    exampleCounter++;
  }
}
/** * Iterates over all models and returns the class with maximum likelihood. * * @param origExampleSet the set of examples to be classified */ @Override public ExampleSet performPrediction(ExampleSet origExampleSet, Attribute predictedLabel) throws OperatorException { final String attributePrefix = "AdaBoostModelPrediction"; final int numLabels = predictedLabel.getMapping().size(); final Attribute[] specialAttributes = new Attribute[numLabels]; OperatorProgress progress = null; if (getShowProgress() && getOperator() != null && getOperator().getProgress() != null) { progress = getOperator().getProgress(); progress.setTotal(100); } for (int i = 0; i < numLabels; i++) { specialAttributes[i] = com.rapidminer.example.Tools.createSpecialAttribute( origExampleSet, attributePrefix + i, Ontology.NUMERICAL); if (progress != null) { progress.setCompleted((int) (25.0 * (i + 1) / numLabels)); } } Iterator<Example> reader = origExampleSet.iterator(); int progressCounter = 0; while (reader.hasNext()) { Example example = reader.next(); for (int i = 0; i < specialAttributes.length; i++) { example.setValue(specialAttributes[i], 0); } if (progress != null && ++progressCounter % OPERATOR_PROGRESS_STEPS == 0) { progress.setCompleted((int) (25.0 * progressCounter / origExampleSet.size()) + 25); } } reader = origExampleSet.iterator(); for (int modelNr = 0; modelNr < this.getNumberOfModels(); modelNr++) { Model model = this.getModel(modelNr); ExampleSet exampleSet = (ExampleSet) origExampleSet.clone(); exampleSet = model.apply(exampleSet); this.updateEstimates(exampleSet, modelNr, specialAttributes); PredictionModel.removePredictedLabel(exampleSet); if (progress != null) { progress.setCompleted((int) (25.0 * (modelNr + 1) / this.getNumberOfModels()) + 50); } } // Turn prediction weights into confidences and a crisp predcition: this.evaluateSpecialAttributes(origExampleSet, specialAttributes); // Clean up attributes: for (int i = 0; i < numLabels; i++) { 
origExampleSet.getAttributes().remove(specialAttributes[i]); origExampleSet.getExampleTable().removeAttribute(specialAttributes[i]); if (progress != null) { progress.setCompleted((int) (25.0 * (i + 1) / numLabels) + 75); } } return origExampleSet; }
/**
 * Creates an example set that is split into subsets according to the given split ratios, using
 * the given sampling type.
 *
 * <p>Delegates to the {@code (ExampleSet, Partition)} constructor with a {@code Partition} built
 * from the ratios, the size of the example set and the partition builder returned by {@code
 * createPartitionBuilder(exampleSet, samplingType, seed)}.
 *
 * @param exampleSet the example set to split
 * @param splitRatios the ratios handed to the {@code Partition}
 * @param samplingType the sampling type used to select the partition builder
 * @param seed the seed passed to the partition builder
 */
public SplittedExampleSet( ExampleSet exampleSet, double[] splitRatios, int samplingType, int seed) { this( exampleSet, new Partition( splitRatios, exampleSet.size(), createPartitionBuilder(exampleSet, samplingType, seed))); }
/**
 * Creates an example set that is split into <i>numberOfSubsets</i> parts with the given sampling
 * type.
 *
 * <p>Delegates to the {@code (ExampleSet, Partition)} constructor with a {@code Partition} built
 * from the number of subsets, the size of the example set and the partition builder returned by
 * {@code createPartitionBuilder(exampleSet, samplingType, seed)}.
 *
 * @param exampleSet the example set to split
 * @param numberOfSubsets the number of parts handed to the {@code Partition}
 * @param samplingType the sampling type used to select the partition builder
 * @param seed the seed passed to the partition builder
 */
public SplittedExampleSet( ExampleSet exampleSet, int numberOfSubsets, int samplingType, int seed) { this( exampleSet, new Partition( numberOfSubsets, exampleSet.size(), createPartitionBuilder(exampleSet, samplingType, seed))); }
@Override public ExampleSet apply(ExampleSet exampleSet) throws OperatorException { Attributes attributes = exampleSet.getAttributes(); // searching example by index int exampleIndex = getParameterAsInt(PARAMETER_EXAMPLE_INDEX); if (exampleIndex == 0) { throw new UserError( this, 207, new Object[] { "0", PARAMETER_EXAMPLE_INDEX, "only positive or negative indices are allowed" }); } if (getParameterAsBoolean(PARAMETER_COUNT_BACKWARDS)) { exampleIndex = exampleSet.size() - exampleIndex; } else { exampleIndex--; } if (exampleIndex >= exampleSet.size()) { throw new UserError(this, 110, exampleIndex); } Example example = exampleSet.getExample(exampleIndex); // now set single value of first parameter if (isParameterSet(PARAMETER_ATTRIBUTE_NAME) && isParameterSet(PARAMETER_VALUE)) { String attributeName = getParameter(PARAMETER_ATTRIBUTE_NAME); String value = getParameterAsString(PARAMETER_VALUE); setData(example, attributeName, value, attributes); } // now set each defined additional value. List<String[]> list = getParameterList(PARAMETER_ADDITIONAL_VALUES); for (String[] pair : list) { setData(example, pair[0], pair[1], attributes); } return exampleSet; }
@Override public void doWork() throws OperatorException { ExampleSet exampleSet = exampleSetInput.getData(ExampleSet.class); // only use numeric attributes Tools.onlyNumericalAttributes(exampleSet, "KernelPCA"); Tools.onlyNonMissingValues(exampleSet, getOperatorClassName(), this); Attributes attributes = exampleSet.getAttributes(); int numberOfExamples = exampleSet.size(); // calculating means for later zero centering exampleSet.recalculateAllAttributeStatistics(); double[] means = new double[exampleSet.getAttributes().size()]; int i = 0; for (Attribute attribute : exampleSet.getAttributes()) { means[i] = exampleSet.getStatistics(attribute, Statistics.AVERAGE); i++; } // kernel Kernel kernel = Kernel.createKernel(this); // copying zero centered exampleValues ArrayList<double[]> exampleValues = new ArrayList<double[]>(numberOfExamples); i = 0; for (Example columnExample : exampleSet) { double[] columnValues = getAttributeValues(columnExample, attributes, means); exampleValues.add(columnValues); i++; } // filling kernel matrix Matrix kernelMatrix = new Matrix(numberOfExamples, numberOfExamples); for (i = 0; i < numberOfExamples; i++) { for (int j = 0; j < numberOfExamples; j++) { kernelMatrix.set( i, j, kernel.calculateDistance(exampleValues.get(i), exampleValues.get(j))); } } // calculating eigenVectors EigenvalueDecomposition eig = kernelMatrix.eig(); Model model = new KernelPCAModel(exampleSet, means, eig.getV(), exampleValues, kernel); if (exampleSetOutput.isConnected()) { exampleSetOutput.deliver(model.apply(exampleSet)); } originalOutput.deliver(exampleSet); modelOutput.deliver(model); }
@Override public ExampleSet apply(ExampleSet exampleSet) throws OperatorException { int size = exampleSet.size(); // cannot bootstrap without any examples if (size < 1) { throw new UserError(this, 117); } RandomGenerator random = RandomGenerator.getRandomGenerator(this); switch (getParameterAsInt(PARAMETER_SAMPLE)) { case SAMPLE_ABSOLUTE: size = getParameterAsInt(PARAMETER_SAMPLE_SIZE); break; case SAMPLE_RELATIVE: size = (int) Math.round(exampleSet.size() * getParameterAsDouble(PARAMETER_SAMPLE_RATIO)); break; } int[] mapping = null; if (getParameterAsBoolean(PARAMETER_USE_WEIGHTS) && exampleSet.getAttributes().getWeight() != null) { mapping = MappedExampleSet.createWeightedBootstrappingMapping(exampleSet, size, random); } else { mapping = MappedExampleSet.createBootstrappingMapping(exampleSet, size, random); } // create and materialize example set ExampleSet mappedExampleSet = new MappedExampleSet(exampleSet, mapping, true); if (getCompatibilityLevel().isAbove(VERSION_6_4_0)) { int type = DataRowFactory.TYPE_DOUBLE_ARRAY; if (exampleSet.size() > 0) { type = exampleSet.getExampleTable().getDataRow(0).getType(); } mappedExampleSet = MaterializeDataInMemory.materializeExampleSet(mappedExampleSet, type); } return mappedExampleSet; }
/**
 * Reads the value of one attribute of one example into {@code currentValue}.
 *
 * <p>The example is selected by the 1-based {@code PARAMETER_EXAMPLE_INDEX}; negative values
 * count from the end of the example set. The resolved zero-based index must lie inside the
 * example set: both a too large index and a negative index whose magnitude exceeds the number of
 * examples are rejected with user error 110 instead of reaching {@code getExample} with an
 * invalid position. Nominal and date-time attributes are read as strings ({@code isNominal} is
 * set to true), all others as {@code Double}. The input example set is delivered unchanged.
 *
 * @throws OperatorException with user error 111 if the attribute does not exist, 207 if the
 *     index is 0, or 110 if the index is outside the example set
 */
@Override
public void doWork() throws OperatorException {
  ExampleSet exampleSet = exampleSetInput.getData(ExampleSet.class);

  String attributeName = getParameterAsString(PARAMETER_ATTRIBUTE_NAME);
  Attribute attribute = exampleSet.getAttributes().get(attributeName);
  if (attribute == null) {
    throw new UserError(this, 111, getParameterAsString(PARAMETER_ATTRIBUTE_NAME));
  }

  int index = getParameterAsInt(PARAMETER_EXAMPLE_INDEX);
  if (index == 0) {
    throw new UserError(
        this, 207, "0", PARAMETER_EXAMPLE_INDEX, "only positive or negative indices are allowed");
  }

  // translate the 1-based (or negative, counted from the end) parameter to a position
  if (index < 0) {
    index = exampleSet.size() + index;
  } else {
    index--;
  }
  // a negative parameter beyond the start of the set is still negative here
  if (index < 0 || index >= exampleSet.size()) {
    throw new UserError(this, 110, index);
  }

  Example example = exampleSet.getExample(index);
  if (attribute.isNominal()
      || Ontology.ATTRIBUTE_VALUE_TYPE.isA(attribute.getValueType(), Ontology.DATE_TIME)) {
    currentValue = example.getValueAsString(attribute);
    isNominal = true;
  } else {
    currentValue = Double.valueOf(example.getValue(attribute));
    isNominal = false;
  }

  exampleSetOutput.deliver(exampleSet);
}
/**
 * Splits an example set into two subsets by comparing an attribute with a threshold.
 *
 * <p>Examples whose attribute value is less than or equal to {@code value} go to subset 0, all
 * others (including those for which the comparison is false because the value is NaN) to
 * subset 1.
 *
 * @param exampleSet the example set to split
 * @param attribute the attribute to compare
 * @param value the threshold
 * @return the example set partitioned into two subsets
 */
public static SplittedExampleSet splitByAttribute(
    ExampleSet exampleSet, Attribute attribute, double value) {
  int[] elements = new int[exampleSet.size()];
  int position = 0;
  for (Example example : exampleSet) {
    elements[position++] = (example.getValue(attribute) <= value) ? 0 : 1;
  }
  return new SplittedExampleSet(exampleSet, new Partition(elements, 2));
}
protected void prepareWeights(ExampleSet exampleSet) { Attribute weightAttr = exampleSet.getAttributes().getWeight(); if (weightAttr == null) { this.oldWeights = null; com.rapidminer.example.Tools.createWeightAttribute(exampleSet); } else { // Back up old weights this.oldWeights = new double[exampleSet.size()]; Iterator<Example> reader = exampleSet.iterator(); for (int i = 0; (reader.hasNext() && i < oldWeights.length); i++) { Example example = reader.next(); if (example != null) { this.oldWeights[i] = example.getWeight(); example.setWeight(1); } } } }
/**
 * Splits an example set into one subset per distinct value of an attribute.
 *
 * <p>The attribute value of every example is truncated to an {@code int}. Subset numbers are
 * handed out in order of first appearance of each such value, starting at 0.
 *
 * @param exampleSet the example set to split
 * @param attribute the attribute whose values define the subsets
 * @return the example set partitioned into as many subsets as distinct values were seen
 */
public static SplittedExampleSet splitByAttribute(ExampleSet exampleSet, Attribute attribute) {
  int[] elements = new int[exampleSet.size()];
  Map<Integer, Integer> subsetOfValue = new HashMap<Integer, Integer>();
  int nextSubset = 0;
  int position = 0;

  for (Example example : exampleSet) {
    Integer key = Integer.valueOf((int) example.getValue(attribute));
    Integer subset = subsetOfValue.get(key);
    if (subset == null) {
      // first time this value shows up: open a new subset for it
      subset = Integer.valueOf(nextSubset++);
      subsetOfValue.put(key, subset);
    }
    elements[position++] = subset.intValue();
  }

  Partition partition = new Partition(elements, subsetOfValue.size());
  return new SplittedExampleSet(exampleSet, partition);
}
/** * Calculates ranks for an attribute. * * <p>Ranks are returned as double precision values, with 1 as the rank of the smallest value. * Values within +/- fuzz of each other may be considered tied. Tied values receive identical * ranks. Missing values receive rank NaN. * * <p>Note that application of the "fuzz" factor is dependent on the order of the observations in * the example set. For instance, if the first three values encountered are x, x+fuzz and * x+2*fuzz, the first two will be considered tied but the third will not, since x+2*fuzz is not * within +/- fuzz of x. * * @param eSet the example set * @param att the attribute to rank * @param fuzz values within +/- fuzz may be considered tied * @return a double precision array of ranks */ public static double[] rank(ExampleSet eSet, Attribute att, Attribute mappingAtt, double fuzz) { TreeMap<Double, ArrayList<Integer>> map; if (fuzz == 0.0) map = new TreeMap<Double, ArrayList<Integer>>(); else { FuzzyComp fc = new FuzzyComp(fuzz); map = new TreeMap<Double, ArrayList<Integer>>(fc); } double[] rank = new double[eSet.size()]; Iterator<Example> reader = eSet.iterator(); int i = 0; // example index // iterate through the example set while (reader.hasNext()) { // get the attribute values from the next example Example e = reader.next(); double x = e.getValue(att); if (att.isNominal() && mappingAtt != null) { String xString = att.getMapping().mapIndex((int) x); x = mappingAtt.getMapping().getIndex(xString); } // punt if either is missing if (Double.isNaN(x)) rank[i++] = Double.NaN; else { // insert x into the tree if (!map.containsKey(x)) // new key -- create a new entry in the map map.put(x, new ArrayList<Integer>()); map.get(x).add(i++); // add the index to the list } } // convert the map to ranks double r = 0; for (double x : map.keySet()) { ArrayList<Integer> y = map.get(x); double v = r + (1.0 + y.size()) / 2.0; for (int j : y) rank[j] = v; r += y.size(); } return rank; }
@Override public ClusterModel generateClusterModel(ExampleSet exampleSet) throws OperatorException { int k = getParameterAsInt(PARAMETER_K); int maxOptimizationSteps = getParameterAsInt(PARAMETER_MAX_OPTIMIZATION_STEPS); boolean useExampleWeights = getParameterAsBoolean(PARAMETER_USE_WEIGHTS); Kernel kernel = Kernel.createKernel(this); // init operator progress getProgress().setTotal(maxOptimizationSteps); // checking and creating ids if necessary Tools.checkAndCreateIds(exampleSet); // additional checks Tools.onlyNonMissingValues(exampleSet, getOperatorClassName(), this, new String[0]); if (exampleSet.size() < k) { throw new UserError(this, 142, k); } // extracting attribute names Attributes attributes = exampleSet.getAttributes(); ArrayList<String> attributeNames = new ArrayList<String>(attributes.size()); for (Attribute attribute : attributes) { attributeNames.add(attribute.getName()); } Attribute weightAttribute = attributes.getWeight(); RandomGenerator generator = RandomGenerator.getRandomGenerator(this); ClusterModel model = new ClusterModel( exampleSet, k, getParameterAsBoolean(RMAbstractClusterer.PARAMETER_ADD_AS_LABEL), getParameterAsBoolean(RMAbstractClusterer.PARAMETER_REMOVE_UNLABELED)); // init centroids int[] clusterAssignments = new int[exampleSet.size()]; for (int i = 0; i < exampleSet.size(); i++) { clusterAssignments[i] = generator.nextIntInRange(0, k); } // run optimization steps boolean stable = false; for (int step = 0; step < maxOptimizationSteps && !stable; step++) { // calculating cluster kernel properties double[] clusterWeights = new double[k]; double[] clusterKernelCorrection = new double[k]; int i = 0; for (Example firstExample : exampleSet) { double firstExampleWeight = useExampleWeights ? 
firstExample.getValue(weightAttribute) : 1d; double[] firstExampleValues = getAsDoubleArray(firstExample, attributes); clusterWeights[clusterAssignments[i]] += firstExampleWeight; int j = 0; for (Example secondExample : exampleSet) { if (clusterAssignments[i] == clusterAssignments[j]) { double secondExampleWeight = useExampleWeights ? secondExample.getValue(weightAttribute) : 1d; clusterKernelCorrection[clusterAssignments[i]] += firstExampleWeight * secondExampleWeight * kernel.calculateDistance( firstExampleValues, getAsDoubleArray(secondExample, attributes)); } j++; } i++; } for (int z = 0; z < k; z++) { clusterKernelCorrection[z] /= clusterWeights[z] * clusterWeights[z]; } // assign examples to new centroids int[] newClusterAssignments = new int[exampleSet.size()]; i = 0; for (Example example : exampleSet) { double[] exampleValues = getAsDoubleArray(example, attributes); double exampleKernelValue = kernel.calculateDistance(exampleValues, exampleValues); double nearestDistance = Double.POSITIVE_INFINITY; int nearestIndex = 0; for (int clusterIndex = 0; clusterIndex < k; clusterIndex++) { double distance = 0; // iterating over all examples in cluster to get kernel distance int j = 0; for (Example clusterExample : exampleSet) { if (clusterAssignments[j] == clusterIndex) { distance += (useExampleWeights ? 
clusterExample.getValue(weightAttribute) : 1d) * kernel.calculateDistance( getAsDoubleArray(clusterExample, attributes), exampleValues); } j++; } distance *= -2d / clusterWeights[clusterIndex]; // copy in outer loop distance += exampleKernelValue; distance += clusterKernelCorrection[clusterIndex]; if (distance < nearestDistance) { nearestDistance = distance; nearestIndex = clusterIndex; } } newClusterAssignments[i] = nearestIndex; i++; } // finishing assignment stable = true; for (int j = 0; j < exampleSet.size() && stable; j++) { stable &= newClusterAssignments[j] == clusterAssignments[j]; } clusterAssignments = newClusterAssignments; // trigger operator progress getProgress().step(); } // setting last clustering into model model.setClusterAssignments(clusterAssignments, exampleSet); getProgress().complete(); if (addsClusterAttribute()) { Attribute cluster = AttributeFactory.createAttribute("cluster", Ontology.NOMINAL); exampleSet.getExampleTable().addAttribute(cluster); exampleSet.getAttributes().setCluster(cluster); int i = 0; for (Example example : exampleSet) { example.setValue(cluster, "cluster_" + clusterAssignments[i]); i++; } } return model; }
/**
 * Returns the size reported by the parent.
 *
 * @return the result of {@code parent.size()}
 */
public int size() { return parent.size(); }
/**
 * Builds a panel with a row of radio buttons at the top and one exchangeable view in the center.
 * The buttons switch between a similarity table, a graph view, a histogram of sampled similarity
 * values and a k-distance plot. The table view is shown initially.
 *
 * @param sim provides the distance measure used by all four views
 * @param exampleSet the examples whose pairwise similarities are visualized
 */
public SimilarityVisualization(SimilarityMeasureObject sim, ExampleSet exampleSet) {
  super();
  setLayout(new BorderLayout());

  DistanceMeasure measure = sim.getDistanceMeasure();
  ButtonGroup group = new ButtonGroup();
  JPanel togglePanel = new JPanel(new FlowLayout(FlowLayout.LEFT));

  // similarity table
  final JComponent tableView = new SimilarityTable(measure, exampleSet);
  final JRadioButton tableButton = new JRadioButton("Table View", true);
  tableButton.addActionListener(
      new ActionListener() {
        public void actionPerformed(ActionEvent e) {
          if (tableButton.isSelected()) {
            showCenterView(tableView);
          }
        }
      });
  group.add(tableButton);
  togglePanel.add(tableButton);

  // graph view
  final JComponent graphView =
      new GraphViewer<String, String>(new SimilarityGraphCreator(measure, exampleSet));
  final JRadioButton graphButton = new JRadioButton("Graph View", false);
  graphButton.addActionListener(
      new ActionListener() {
        public void actionPerformed(ActionEvent e) {
          if (graphButton.isSelected()) {
            showCenterView(graphView);
          }
        }
      });
  group.add(graphButton);
  togglePanel.add(graphButton);

  // histogram view: every ordered pair of distinct examples is sampled with probability
  // min(1, 500 / size) so that the histogram stays cheap to build for large example sets
  DataTable dataTable = new SimpleDataTable("Histogram", new String[] {"Histogram"});
  double sampleRatio = Math.min(1.0d, 500.0d / exampleSet.size());
  Random random = new Random();
  int i = 0;
  for (Example example : exampleSet) {
    int j = 0;
    for (Example compExample : exampleSet) {
      if (i != j && random.nextDouble() < sampleRatio) {
        double simValue = measure.calculateSimilarity(example, compExample);
        dataTable.add(new SimpleDataTableRow(new double[] {simValue}));
      }
      j++;
    }
    i++;
  }
  final PlotterConfigurationModel settings =
      new PlotterConfigurationModel(PlotterConfigurationModel.HISTOGRAM_PLOT, dataTable);
  settings.enablePlotColumn(0);
  settings.setParameterAsInt(HistogramChart.PARAMETER_NUMBER_OF_BINS, 100);
  final JRadioButton histogramButton = new JRadioButton("Histogram View", false);
  histogramButton.addActionListener(
      new ActionListener() {
        public void actionPerformed(ActionEvent e) {
          if (histogramButton.isSelected()) {
            // the plotter component is looked up only when the view is actually requested
            showCenterView(settings.getPlotter().getPlotter());
          }
        }
      });
  group.add(histogramButton);
  togglePanel.add(histogramButton);

  // K distance view
  final SimilarityKDistanceVisualization kDistancePlotter =
      new SimilarityKDistanceVisualization(measure, exampleSet);
  final JRadioButton kdistanceButton = new JRadioButton("k-Distance View", false);
  kdistanceButton.addActionListener(
      new ActionListener() {
        public void actionPerformed(ActionEvent e) {
          if (kdistanceButton.isSelected()) {
            showCenterView(kDistancePlotter);
          }
        }
      });
  group.add(kdistanceButton);
  togglePanel.add(kdistanceButton);

  // component 0 is the toggle panel, component 1 is the currently shown view
  add(togglePanel, BorderLayout.NORTH);
  add(tableView, BorderLayout.CENTER);
}

/**
 * Replaces the currently shown view (the component at index 1, directly after the toggle panel)
 * with the given one and triggers a fresh layout and repaint.
 *
 * @param view the component to show in the center of this panel
 */
private void showCenterView(java.awt.Component view) {
  remove(1);
  add(view, BorderLayout.CENTER);
  // adding to an already displayed container invalidates the hierarchy; without a new layout
  // pass the freshly added component is not guaranteed to receive bounds
  revalidate();
  repaint();
}
@Override public ExampleSet apply(ExampleSet exampleSet) throws OperatorException { // init char decimalPointCharacter = getParameterAsString(PARAMETER_DECIMAL_POINT_CHARACTER).charAt(0); Character groupingCharacter = null; if (isParameterSet(PARAMETER_NUMBER_GROUPING_CHARACTER)) { groupingCharacter = getParameterAsString(PARAMETER_NUMBER_GROUPING_CHARACTER).charAt(0); } Set<Attribute> attributeSet = attributeSelector.getAttributeSubset(exampleSet, false); int size = attributeSet.size(); int[] valueTypes = new int[size]; int index = 0; for (Attribute attribute : attributeSet) { valueTypes[index++] = attribute.getValueType(); } // guessing int[] guessedValueTypes = new int[valueTypes.length]; int checkedCounter = 0; for (Example example : exampleSet) { index = 0; for (Attribute attribute : attributeSet) { if (!attribute.isNominal() && !attribute.isNumerical()) { continue; } double originalValue = example.getValue(attribute); if (!Double.isNaN(originalValue)) { if (guessedValueTypes[index] != Ontology.NOMINAL) { try { String valueString = example.getValueAsString(attribute); if (!Attribute.MISSING_NOMINAL_VALUE.equals(valueString)) { if (groupingCharacter != null) { valueString = valueString.replace(groupingCharacter.toString(), ""); } valueString = valueString.replace(decimalPointCharacter, '.'); double value = Double.parseDouble(valueString); if (guessedValueTypes[index] != Ontology.REAL) { if (Tools.isEqual(Math.round(value), value)) { guessedValueTypes[index] = Ontology.INTEGER; } else { guessedValueTypes[index] = Ontology.REAL; } } } } catch (NumberFormatException e) { guessedValueTypes[index] = Ontology.NOMINAL; checkedCounter++; } } } index++; } if (checkedCounter >= guessedValueTypes.length) { break; } } // the example set contains at least one example and the guessing was performed if (exampleSet.size() > 0) { valueTypes = guessedValueTypes; // new attributes List<AttributeRole> newAttributes = new LinkedList<AttributeRole>(); index = 0; for (Attribute 
attribute : attributeSet) { if (!attribute.isNominal() && !attribute.isNumerical()) { continue; } AttributeRole role = exampleSet.getAttributes().getRole(attribute); Attribute newAttribute = AttributeFactory.createAttribute(valueTypes[index]); exampleSet.getExampleTable().addAttribute(newAttribute); AttributeRole newRole = new AttributeRole(newAttribute); newRole.setSpecial(role.getSpecialName()); newAttributes.add(newRole); // copy data for (Example e : exampleSet) { double oldValue = e.getValue(attribute); if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(valueTypes[index], Ontology.NUMERICAL)) { if (!Double.isNaN(oldValue)) { String valueString = e.getValueAsString(attribute); if (Attribute.MISSING_NOMINAL_VALUE.equals(valueString)) { e.setValue(newAttribute, Double.NaN); } else { if (groupingCharacter != null) { valueString = valueString.replace(groupingCharacter.toString(), ""); } valueString = valueString.replace(decimalPointCharacter, '.'); e.setValue(newAttribute, Double.parseDouble(valueString)); } } else { e.setValue(newAttribute, Double.NaN); } } else { if (!Double.isNaN(oldValue)) { String value = e.getValueAsString(attribute); e.setValue(newAttribute, newAttribute.getMapping().mapString(value)); } else { e.setValue(newAttribute, Double.NaN); } } } // delete attribute and rename the new attribute (due to deletion and data scans: no // more memory used :-) exampleSet.getExampleTable().removeAttribute(attribute); exampleSet.getAttributes().remove(role); newAttribute.setName(attribute.getName()); index++; } for (AttributeRole role : newAttributes) { if (role.isSpecial()) { exampleSet .getAttributes() .setSpecialAttribute(role.getAttribute(), role.getSpecialName()); } else { exampleSet.getAttributes().addRegular(role.getAttribute()); } } } return exampleSet; }
@Override public void doWork() throws OperatorException { ExampleSet exampleSet = exampleSetInput.getData(ExampleSet.class); exampleSet.recalculateAllAttributeStatistics(); List<String[]> attributeValueOptions = getParameterList(PARAMETER_ATTRIBUTES); LinkedHashMap<Attribute, Integer> attributeValueOptionsMap = new LinkedHashMap<Attribute, Integer>(); int[] valueOptions = new int[attributeValueOptions.size()]; Pattern[] attributeRegexPatterns = new Pattern[attributeValueOptions.size()]; Attribute[] attributes = new Attribute[attributeValueOptions.size()]; Iterator<String[]> iterator = attributeValueOptions.iterator(); int j = 0; while (iterator.hasNext()) { String[] pair = iterator.next(); String regex = pair[0]; try { attributeRegexPatterns[j] = Pattern.compile(regex); } catch (PatternSyntaxException e) { throw new UserError(this, 206, regex, e.getMessage()); } attributes[j] = exampleSet.getAttributes().get(pair[0]); valueOptions[j] = ((ParameterTypeCategory) ((ParameterTypeList) getParameterType(PARAMETER_ATTRIBUTES)).getValueType()) .getIndex(pair[1]); j++; } for (int i = 0; i < attributeRegexPatterns.length; i++) { Iterator<Attribute> a = exampleSet.getAttributes().allAttributes(); while (a.hasNext()) { Attribute attribute = a.next(); Matcher matcher = attributeRegexPatterns[i].matcher(attribute.getName()); if (matcher.matches()) { attributeValueOptionsMap.put(attribute, valueOptions[i]); } } } double p = getParameterAsDouble(PARAMETER_P); boolean filterAttribute = getParameterAsBoolean(PARAMETER_FILTER_ATTRIBUTE); String iterationMacro = getParameterAsString(PARAMETER_ITERATION_MACRO); // applying on complete set if (getParameterAsBoolean(PARAMETER_APPLY_ON_COMPLETE_SET)) { if (iterationMacro != null) { getProcess().getMacroHandler().addMacro(iterationMacro, "ALL"); } innerExampleSetSource.deliver(exampleSet); getSubprocess(0).execute(); } // applying on subgroups defined by attributes for (Entry<Attribute, Integer> attributeEntry : 
attributeValueOptionsMap.entrySet()) { Attribute attribute = attributeEntry.getKey(); if (!attribute.isNominal()) { continue; } List<String> values = null; switch (attributeEntry.getValue()) { case VALUE_OPTION_ALL: values = attribute.getMapping().getValues(); break; case VALUE_OPTION_ABOVE_P: values = new Vector<String>(); for (String value : attribute.getMapping().getValues()) { if (exampleSet.getStatistics(attribute, Statistics.COUNT, value) / exampleSet.size() >= p) { values.add(value); } } break; default: values = attribute.getMapping().getValues(); break; } for (String value : values) { if (exampleSet.getStatistics(attribute, Statistics.COUNT, value) > 0) { String className = "attribute_value_filter"; String parameter = attribute.getName() + "=" + value; log("Creating condition '" + className + "' with parameter '" + parameter + "'"); Condition condition = null; try { condition = ConditionedExampleSet.createCondition(className, exampleSet, parameter); } catch (ConditionCreationException e) { throw new UserError(this, 904, className, e.getMessage()); } ExampleSet subgroupSet = new ConditionedExampleSet(exampleSet, condition, false); if (filterAttribute) { subgroupSet.getAttributes().remove(attribute); } if (iterationMacro != null) { getProcess().getMacroHandler().addMacro(iterationMacro, parameter.replace(' ', '_')); } // applying subprocess innerExampleSetSource.deliver(subgroupSet); getSubprocess(0).execute(); if (filterAttribute) { subgroupSet.getAttributes().addRegular(attribute); } } inApplyLoop(); } } if (iterationMacro != null) { getProcess().getMacroHandler().addMacro(iterationMacro, null); } }
@Override public ExampleSet apply(ExampleSet exampleSet) throws OperatorException { // determine new value types int valueType = Ontology.REAL; Iterator<AttributeRole> a = exampleSet.getAttributes().allAttributeRoles(); while (a.hasNext()) { AttributeRole attributeRole = a.next(); if (!attributeRole.isSpecial() || !attributeRole.getSpecialName().equals(Attributes.ID_NAME)) { if (attributeRole.getAttribute().isNominal()) { valueType = Ontology.NOMINAL; break; } } } // create new attributes List<Attribute> newAttributes = new ArrayList<Attribute>(exampleSet.size()); Attribute newIdAttribute = AttributeFactory.createAttribute(Attributes.ID_NAME, Ontology.NOMINAL); newAttributes.add(newIdAttribute); Attribute oldIdAttribute = exampleSet.getAttributes().getId(); if (oldIdAttribute != null) { for (Example e : exampleSet) { double idValue = e.getValue(oldIdAttribute); String attributeName = "att_" + idValue; if (oldIdAttribute.isNominal()) { if (Double.isNaN(idValue)) { newAttributes.add(AttributeFactory.createAttribute(valueType)); } else { attributeName = oldIdAttribute.getMapping().mapIndex((int) idValue); newAttributes.add(AttributeFactory.createAttribute(attributeName, valueType)); } } else { newAttributes.add(AttributeFactory.createAttribute(attributeName, valueType)); } } } else { for (int i = 0; i < exampleSet.size(); i++) { newAttributes.add(AttributeFactory.createAttribute("att_" + (i + 1), valueType)); } } // create and fill table MemoryExampleTable table = new MemoryExampleTable(newAttributes); a = exampleSet.getAttributes().allAttributeRoles(); while (a.hasNext()) { AttributeRole attributeRole = a.next(); if (!attributeRole.isSpecial() || !attributeRole.getSpecialName().equals(Attributes.ID_NAME)) { Attribute attribute = attributeRole.getAttribute(); double[] data = new double[exampleSet.size() + 1]; data[0] = newIdAttribute.getMapping().mapString(attribute.getName()); int counter = 1; for (Example e : exampleSet) { double currentValue = 
e.getValue(attribute); data[counter] = currentValue; Attribute newAttribute = newAttributes.get(counter); if (newAttribute.isNominal()) { if (!Double.isNaN(currentValue)) { String currentValueString = currentValue + ""; if (attribute.isNominal()) currentValueString = attribute.getMapping().mapIndex((int) currentValue); data[counter] = newAttribute.getMapping().mapString(currentValueString); } } counter++; } table.addDataRow(new DoubleArrayDataRow(data)); } } // create and deliver example set ExampleSet result = table.createExampleSet(null, null, newIdAttribute); result.getAnnotations().addAll(exampleSet.getAnnotations()); return result; }