/**
 * Feeds a sampled part of the input example set into the inner subprocess and forwards the
 * object produced by that subprocess.
 *
 * <p>Steps: read the example set from {@code exampleSetInput}, validate the fraction
 * parameter, build a {@link SplittedExampleSet} from the configured fraction, sampling type
 * and random seed settings, select subset 0, deliver it to {@code exampleSubsetInnerSource},
 * execute subprocess 0 and pass whatever arrived at {@code modelInnerSink} on to
 * {@code modelOutput}.
 *
 * @throws OperatorException if reading data or parameters, or executing the subprocess, fails;
 *     in particular a {@link UserError} (code 207) when the fraction lies outside [0, 1]
 */
@Override
public void doWork() throws OperatorException {
    ExampleSet originalExampleSet = exampleSetInput.getData(ExampleSet.class);

    double fraction = getParameterAsDouble(PARAMETER_FRACTION);
    boolean outOfRange = fraction < 0 || fraction > 1.0;
    if (outOfRange) {
        Object[] errorArguments =
                new Object[] {
                    fraction, "fraction", "Cannot use fractions of less than 0.0 or more than 1.0"
                };
        throw new UserError(this, 207, errorArguments);
    }

    // Read the remaining parameters in the same order the split constructor consumes them.
    int samplingType = getParameterAsInt(PARAMETER_SAMPLING_TYPE);
    boolean useLocalSeed = getParameterAsBoolean(RandomGenerator.PARAMETER_USE_LOCAL_RANDOM_SEED);
    int localSeed = getParameterAsInt(RandomGenerator.PARAMETER_LOCAL_RANDOM_SEED);

    SplittedExampleSet sampledSet =
            new SplittedExampleSet(originalExampleSet, fraction, samplingType, useLocalSeed, localSeed);
    sampledSet.selectSingleSubset(0);

    // Hand the selected subset to the inner process and publish its result.
    exampleSubsetInnerSource.deliver(sampledSet);
    getSubprocess(0).execute();
    modelOutput.deliver(modelInnerSink.getData(IOObject.class));
}
@Override public void estimatePerformance(ExampleSet inputSet) throws OperatorException { // split by attribute Attribute batchAttribute = inputSet.getAttributes().getSpecial(Attributes.BATCH_NAME); if (batchAttribute == null) { throw new UserError(this, 113, Attributes.BATCH_NAME); } SplittedExampleSet splittedES = SplittedExampleSet.splitByAttribute(inputSet, batchAttribute); // start crossvalidation for (iteration = 0; iteration < splittedES.getNumberOfSubsets(); iteration++) { splittedES.selectAllSubsetsBut(iteration); learn(splittedES); splittedES.selectSingleSubset(iteration); evaluate(splittedES); inApplyLoop(); } // end crossvalidation }
private RuleModel createNumericalRuleModel(ExampleSet trainingSet, Attribute attribute) { RuleModel model = new RuleModel(trainingSet); // split by best attribute int oldSize = -1; while ((trainingSet.size() > 0) && (trainingSet.size() != oldSize)) { ExampleSet exampleSet = (ExampleSet) trainingSet.clone(); Split bestSplit = splitter.getBestSplit(exampleSet, attribute, null); double bestSplitValue = bestSplit.getSplitPoint(); if (!Double.isNaN(bestSplitValue)) { SplittedExampleSet splittedSet = SplittedExampleSet.splitByAttribute(exampleSet, attribute, bestSplitValue); Attribute label = splittedSet.getAttributes().getLabel(); splittedSet.selectSingleSubset(0); SplitCondition condition = new LessEqualsSplitCondition(attribute, bestSplitValue); splittedSet.recalculateAttributeStatistics(label); int labelValue = (int) splittedSet.getStatistics(label, Statistics.MODE); String labelName = label.getMapping().mapIndex(labelValue); Rule rule = new Rule(labelName, condition); int[] frequencies = new int[label.getMapping().size()]; int counter = 0; for (String value : label.getMapping().getValues()) frequencies[counter++] = (int) splittedSet.getStatistics(label, Statistics.COUNT, value); rule.setFrequencies(frequencies); model.addRule(rule); oldSize = trainingSet.size(); trainingSet = rule.removeCovered(trainingSet); } else { break; } } // add default rule if some examples were not yet covered if (trainingSet.size() > 0) { Attribute label = trainingSet.getAttributes().getLabel(); trainingSet.recalculateAttributeStatistics(label); int index = (int) trainingSet.getStatistics(label, Statistics.MODE); String defaultLabel = label.getMapping().mapIndex(index); Rule defaultRule = new Rule(defaultLabel); int[] frequencies = new int[label.getMapping().size()]; int counter = 0; for (String value : label.getMapping().getValues()) frequencies[counter++] = (int) (trainingSet.getStatistics(label, Statistics.COUNT, value)); defaultRule.setFrequencies(frequencies); 
model.addRule(defaultRule); } return model; }
/**
 * Builds a rule model for a nominal attribute with one rule per subset of the attribute split.
 *
 * <p>The example set is split by {@code attribute}. For subset {@code i} a rule is created
 * whose condition tests for the attribute value mapped to index {@code i} and whose prediction
 * is the mode of the label within that subset; the per-label-value counts of the subset are
 * stored on the rule as frequencies.
 *
 * @param exampleSet the examples to learn from
 * @param attribute the nominal attribute the rules are conditioned on
 * @return the model containing one rule per subset
 */
private RuleModel createNominalRuleModel(ExampleSet exampleSet, Attribute attribute) {
    RuleModel model = new RuleModel(exampleSet);
    SplittedExampleSet partitioned = SplittedExampleSet.splitByAttribute(exampleSet, attribute);
    Attribute label = partitioned.getAttributes().getLabel();

    int subsetIndex = 0;
    while (subsetIndex < partitioned.getNumberOfSubsets()) {
        partitioned.selectSingleSubset(subsetIndex);
        partitioned.recalculateAttributeStatistics(label);

        // Condition: attribute equals the nominal value mapped to this subset index.
        String attributeValue = attribute.getMapping().mapIndex(subsetIndex);
        SplitCondition term = new NominalSplitCondition(attribute, attributeValue);

        // Prediction: most frequent label value inside the subset.
        int modeIndex = (int) partitioned.getStatistics(label, Statistics.MODE);
        Rule rule = new Rule(label.getMapping().mapIndex(modeIndex), term);

        int[] frequencies = new int[label.getMapping().size()];
        int position = 0;
        for (String value : label.getMapping().getValues()) {
            frequencies[position] = (int) partitioned.getStatistics(label, Statistics.COUNT, value);
            position++;
        }
        rule.setFrequencies(frequencies);
        model.addRule(rule);

        subsetIndex++;
    }

    return model;
}
@Override public ExampleSet apply(ExampleSet exampleSet) throws OperatorException { // creating kernel and settings from Parameters int k = Math.min(100, exampleSet.getAttributes().size() * 2); int size = exampleSet.size(); switch (getParameterAsInt(PARAMETER_SAMPLE)) { case SAMPLE_ABSOLUTE: size = getParameterAsInt(PARAMETER_SAMPLE_SIZE); break; case SAMPLE_RELATIVE: size = (int) Math.round(exampleSet.size() * getParameterAsDouble(PARAMETER_SAMPLE_RATIO)); break; } DistanceMeasure distanceMeasure = new EuclideanDistance(); distanceMeasure.init(exampleSet); // finding farthest and nearest example to mean Vector double[] meanVector = getMeanVector(exampleSet); Candidate min = new Candidate(meanVector, Double.POSITIVE_INFINITY, 0); Candidate max = new Candidate(meanVector, Double.NEGATIVE_INFINITY, 0); int i = 0; for (Example example : exampleSet) { double[] exampleValues = getExampleValues(example); Candidate current = new Candidate( exampleValues, Math.abs(distanceMeasure.calculateDistance(meanVector, exampleValues)), i); if (current.compareTo(min) < 0) { min = current; } if (current.compareTo(max) > 0) { max = current; } i++; } ArrayList<Candidate> recentlySelected = new ArrayList<Candidate>(10); int[] partition = new int[exampleSet.size()]; int numberOfSelectedExamples = 2; recentlySelected.add(min); recentlySelected.add(max); partition[min.getExampleIndex()] = 1; partition[max.getExampleIndex()] = 1; double[] minimalDistances = new double[exampleSet.size()]; Arrays.fill(minimalDistances, Double.POSITIVE_INFINITY); // running now through examples, checking for smallest distance to one of the candidates while (numberOfSelectedExamples < size) { TreeSet<Candidate> candidates = new TreeSet<Candidate>(); i = 0; // check distance only for candidates recently selected. 
for (Example example : exampleSet) { // if example not has been selected allready if (partition[i] == 0) { double[] exampleValues = getExampleValues(example); for (Candidate candidate : recentlySelected) { minimalDistances[i] = Math.min( minimalDistances[i], Math.abs( distanceMeasure.calculateDistance(exampleValues, candidate.getValues()))); } Candidate newCandidate = new Candidate(exampleValues, minimalDistances[i], i); candidates.add(newCandidate); if (candidates.size() > k) { Iterator<Candidate> iterator = candidates.iterator(); iterator.next(); iterator.remove(); } } i++; } // clearing recently selected since now new ones will be selected recentlySelected.clear(); // now running in descending order through candidates and adding to selected // IM: descendingIterator() is not available in Java versions less than 6 !!! // IM: Bad workaround for now by adding all candidates into a list and using a listIterator() // and hasPrevious... /* Iterator<Candidate> descendingIterator = candidates.descendingIterator(); while (descendingIterator.hasNext() && numberOfSelectedExamples < desiredNumber) { Candidate candidate = descendingIterator.next(); */ List<Candidate> reverseCandidateList = new LinkedList<Candidate>(); Iterator<Candidate> it = candidates.iterator(); while (it.hasNext()) { reverseCandidateList.add(it.next()); } ListIterator<Candidate> lit = reverseCandidateList.listIterator(reverseCandidateList.size() - 1); while (lit.hasPrevious()) { Candidate candidate = lit.previous(); // IM: end of workaround boolean existSmallerDistance = false; Iterator<Candidate> addedIterator = recentlySelected.iterator(); // test if a distance to recently selected is smaller than previously calculated minimal // distance // if one exists: This is not selected while (addedIterator.hasNext()) { double distance = Math.abs( distanceMeasure.calculateDistance( addedIterator.next().getValues(), candidate.getValues())); existSmallerDistance = existSmallerDistance || distance < 
candidate.getDistance(); } if (!existSmallerDistance) { recentlySelected.add(candidate); partition[candidate.getExampleIndex()] = 1; numberOfSelectedExamples++; } else break; } } // building new exampleSet containing only Examples with indices in selectedExamples SplittedExampleSet sample = new SplittedExampleSet(exampleSet, new Partition(partition, 2)); sample.selectSingleSubset(1); return sample; }