@Override
  public void doWork() throws OperatorException {
    ExampleSet originalExampleSet = exampleSetInput.getData(ExampleSet.class);

    double fraction = getParameterAsDouble(PARAMETER_FRACTION);
    if (fraction < 0 || fraction > 1.0)
      throw new UserError(
          this,
          207,
          new Object[] {
            fraction, "fraction", "Cannot use fractions of less than 0.0 or more than 1.0"
          });
    SplittedExampleSet splitted =
        new SplittedExampleSet(
            originalExampleSet,
            fraction,
            getParameterAsInt(PARAMETER_SAMPLING_TYPE),
            getParameterAsBoolean(RandomGenerator.PARAMETER_USE_LOCAL_RANDOM_SEED),
            getParameterAsInt(RandomGenerator.PARAMETER_LOCAL_RANDOM_SEED));
    splitted.selectSingleSubset(0);

    exampleSubsetInnerSource.deliver(splitted);
    getSubprocess(0).execute();
    modelOutput.deliver(modelInnerSink.getData(IOObject.class));
  }
  private RuleModel createNumericalRuleModel(ExampleSet trainingSet, Attribute attribute) {
    RuleModel model = new RuleModel(trainingSet);

    // split by best attribute
    int oldSize = -1;
    while ((trainingSet.size() > 0) && (trainingSet.size() != oldSize)) {
      ExampleSet exampleSet = (ExampleSet) trainingSet.clone();
      Split bestSplit = splitter.getBestSplit(exampleSet, attribute, null);
      double bestSplitValue = bestSplit.getSplitPoint();
      if (!Double.isNaN(bestSplitValue)) {
        SplittedExampleSet splittedSet =
            SplittedExampleSet.splitByAttribute(exampleSet, attribute, bestSplitValue);
        Attribute label = splittedSet.getAttributes().getLabel();
        splittedSet.selectSingleSubset(0);
        SplitCondition condition = new LessEqualsSplitCondition(attribute, bestSplitValue);

        splittedSet.recalculateAttributeStatistics(label);
        int labelValue = (int) splittedSet.getStatistics(label, Statistics.MODE);
        String labelName = label.getMapping().mapIndex(labelValue);
        Rule rule = new Rule(labelName, condition);

        int[] frequencies = new int[label.getMapping().size()];
        int counter = 0;
        for (String value : label.getMapping().getValues())
          frequencies[counter++] = (int) splittedSet.getStatistics(label, Statistics.COUNT, value);
        rule.setFrequencies(frequencies);
        model.addRule(rule);
        oldSize = trainingSet.size();
        trainingSet = rule.removeCovered(trainingSet);
      } else {
        break;
      }
    }

    // add default rule if some examples were not yet covered
    if (trainingSet.size() > 0) {
      Attribute label = trainingSet.getAttributes().getLabel();
      trainingSet.recalculateAttributeStatistics(label);
      int index = (int) trainingSet.getStatistics(label, Statistics.MODE);
      String defaultLabel = label.getMapping().mapIndex(index);
      Rule defaultRule = new Rule(defaultLabel);
      int[] frequencies = new int[label.getMapping().size()];
      int counter = 0;
      for (String value : label.getMapping().getValues())
        frequencies[counter++] = (int) (trainingSet.getStatistics(label, Statistics.COUNT, value));
      defaultRule.setFrequencies(frequencies);
      model.addRule(defaultRule);
    }

    return model;
  }
Beispiel #3
0
  @Override
  public void estimatePerformance(ExampleSet inputSet) throws OperatorException {
    // split by attribute
    Attribute batchAttribute = inputSet.getAttributes().getSpecial(Attributes.BATCH_NAME);
    if (batchAttribute == null) {
      throw new UserError(this, 113, Attributes.BATCH_NAME);
    }
    SplittedExampleSet splittedES = SplittedExampleSet.splitByAttribute(inputSet, batchAttribute);

    // start crossvalidation
    for (iteration = 0; iteration < splittedES.getNumberOfSubsets(); iteration++) {

      splittedES.selectAllSubsetsBut(iteration);
      learn(splittedES);

      splittedES.selectSingleSubset(iteration);
      evaluate(splittedES);

      inApplyLoop();
    }
    // end crossvalidation
  }
  private RuleModel createNominalRuleModel(ExampleSet exampleSet, Attribute attribute) {
    RuleModel model = new RuleModel(exampleSet);
    SplittedExampleSet splittedSet = SplittedExampleSet.splitByAttribute(exampleSet, attribute);
    Attribute label = splittedSet.getAttributes().getLabel();
    for (int i = 0; i < splittedSet.getNumberOfSubsets(); i++) {
      splittedSet.selectSingleSubset(i);
      splittedSet.recalculateAttributeStatistics(label);
      SplitCondition term =
          new NominalSplitCondition(attribute, attribute.getMapping().mapIndex(i));

      int labelValue = (int) splittedSet.getStatistics(label, Statistics.MODE);
      String labelName = label.getMapping().mapIndex(labelValue);
      Rule rule = new Rule(labelName, term);

      int[] frequencies = new int[label.getMapping().size()];
      int counter = 0;
      for (String value : label.getMapping().getValues())
        frequencies[counter++] = (int) splittedSet.getStatistics(label, Statistics.COUNT, value);
      rule.setFrequencies(frequencies);
      model.addRule(rule);
    }
    return model;
  }
  @Override
  public ExampleSet apply(ExampleSet exampleSet) throws OperatorException {
    // creating kernel and settings from Parameters
    int k = Math.min(100, exampleSet.getAttributes().size() * 2);
    int size = exampleSet.size();
    switch (getParameterAsInt(PARAMETER_SAMPLE)) {
      case SAMPLE_ABSOLUTE:
        size = getParameterAsInt(PARAMETER_SAMPLE_SIZE);
        break;
      case SAMPLE_RELATIVE:
        size = (int) Math.round(exampleSet.size() * getParameterAsDouble(PARAMETER_SAMPLE_RATIO));
        break;
    }

    DistanceMeasure distanceMeasure = new EuclideanDistance();
    distanceMeasure.init(exampleSet);

    // finding farthest and nearest example to mean Vector
    double[] meanVector = getMeanVector(exampleSet);
    Candidate min = new Candidate(meanVector, Double.POSITIVE_INFINITY, 0);
    Candidate max = new Candidate(meanVector, Double.NEGATIVE_INFINITY, 0);
    int i = 0;
    for (Example example : exampleSet) {
      double[] exampleValues = getExampleValues(example);
      Candidate current =
          new Candidate(
              exampleValues,
              Math.abs(distanceMeasure.calculateDistance(meanVector, exampleValues)),
              i);
      if (current.compareTo(min) < 0) {
        min = current;
      }
      if (current.compareTo(max) > 0) {
        max = current;
      }
      i++;
    }
    ArrayList<Candidate> recentlySelected = new ArrayList<Candidate>(10);
    int[] partition = new int[exampleSet.size()];
    int numberOfSelectedExamples = 2;
    recentlySelected.add(min);
    recentlySelected.add(max);
    partition[min.getExampleIndex()] = 1;
    partition[max.getExampleIndex()] = 1;
    double[] minimalDistances = new double[exampleSet.size()];
    Arrays.fill(minimalDistances, Double.POSITIVE_INFINITY);

    // running now through examples, checking for smallest distance to one of the candidates
    while (numberOfSelectedExamples < size) {
      TreeSet<Candidate> candidates = new TreeSet<Candidate>();

      i = 0;
      // check distance only for candidates recently selected.
      for (Example example : exampleSet) {
        // if example not has been selected allready
        if (partition[i] == 0) {
          double[] exampleValues = getExampleValues(example);
          for (Candidate candidate : recentlySelected) {
            minimalDistances[i] =
                Math.min(
                    minimalDistances[i],
                    Math.abs(
                        distanceMeasure.calculateDistance(exampleValues, candidate.getValues())));
          }
          Candidate newCandidate = new Candidate(exampleValues, minimalDistances[i], i);
          candidates.add(newCandidate);
          if (candidates.size() > k) {
            Iterator<Candidate> iterator = candidates.iterator();
            iterator.next();
            iterator.remove();
          }
        }
        i++;
      }
      // clearing recently selected since now new ones will be selected
      recentlySelected.clear();

      // now running in descending order through candidates and adding to selected
      // IM: descendingIterator() is not available in Java versions less than 6 !!!
      // IM: Bad workaround for now by adding all candidates into a list and using a listIterator()
      // and hasPrevious...
      /*
      Iterator<Candidate> descendingIterator = candidates.descendingIterator();
      while (descendingIterator.hasNext() && numberOfSelectedExamples < desiredNumber) {
      	Candidate candidate = descendingIterator.next();
       */

      List<Candidate> reverseCandidateList = new LinkedList<Candidate>();
      Iterator<Candidate> it = candidates.iterator();
      while (it.hasNext()) {
        reverseCandidateList.add(it.next());
      }

      ListIterator<Candidate> lit =
          reverseCandidateList.listIterator(reverseCandidateList.size() - 1);
      while (lit.hasPrevious()) {
        Candidate candidate = lit.previous();
        // IM: end of workaround

        boolean existSmallerDistance = false;
        Iterator<Candidate> addedIterator = recentlySelected.iterator();
        // test if a distance to recently selected is smaller than previously calculated minimal
        // distance
        // if one exists: This is not selected
        while (addedIterator.hasNext()) {
          double distance =
              Math.abs(
                  distanceMeasure.calculateDistance(
                      addedIterator.next().getValues(), candidate.getValues()));
          existSmallerDistance = existSmallerDistance || distance < candidate.getDistance();
        }
        if (!existSmallerDistance) {
          recentlySelected.add(candidate);
          partition[candidate.getExampleIndex()] = 1;
          numberOfSelectedExamples++;
        } else break;
      }
    }

    // building new exampleSet containing only Examples with indices in selectedExamples

    SplittedExampleSet sample = new SplittedExampleSet(exampleSet, new Partition(partition, 2));
    sample.selectSingleSubset(1);
    return sample;
  }