/**
 * Learns a model that predicts a single default value for the label.
 *
 * <p>The default value is chosen according to the method parameter: the median or average of a
 * numerical label, the mode of a nominal label, a user-supplied constant, or the value of another
 * attribute. If the configured method does not fit the label type (median/average on a nominal
 * label, mode on a numerical label), a warning is logged and the method is switched to mode or
 * average respectively.
 *
 * <p>The median is taken as the element at index {@code size / 2} of the sorted label values, so
 * for an even number of examples the upper of the two middle values is used. For an empty example
 * set the median is reported as {@code Double.NaN} instead of failing with an
 * {@link ArrayIndexOutOfBoundsException}.
 *
 * @param exampleSet the training data; its label attribute determines the applicable methods
 * @return an {@code AttributeDefaultModel} for the attribute method, otherwise a
 *     {@code DefaultModel} carrying the default value and, for the mode method, the relative
 *     frequency of every label value as confidences
 * @throws OperatorException if the method parameter holds an unknown method index
 */
public Model learn(ExampleSet exampleSet) throws OperatorException {
  double value = 0.0;
  double[] confidences = null;
  int method = getParameterAsInt(PARAMETER_METHOD);
  Attribute label = exampleSet.getAttributes().getLabel();
  if (label.isNominal() && (method == MEDIAN || method == AVERAGE)) {
    logWarning(
        "Cannot use method '" + METHODS[method] + "' for nominal labels: changing to 'mode'!");
    method = MODE;
  } else if (!label.isNominal() && method == MODE) {
    logWarning(
        "Cannot use method '"
            + METHODS[method]
            + "' for numerical labels: changing to 'average'!");
    method = AVERAGE;
  }
  switch (method) {
    case MEDIAN:
      {
        int numberOfExamples = exampleSet.size();
        if (numberOfExamples == 0) {
          // no label values to take a median of: report "unknown" rather than index into an
          // empty array
          value = Double.NaN;
          break;
        }
        double[] labels = new double[numberOfExamples];
        int counter = 0;
        for (Example example : exampleSet) {
          labels[counter++] = example.getValue(label);
        }
        java.util.Arrays.sort(labels);
        value = labels[numberOfExamples / 2];
        break;
      }
    case AVERAGE:
      exampleSet.recalculateAttributeStatistics(label);
      value = exampleSet.getStatistics(label, Statistics.AVERAGE);
      break;
    case MODE:
      exampleSet.recalculateAttributeStatistics(label);
      value = exampleSet.getStatistics(label, Statistics.MODE);
      // confidence of each label value = its count divided by the number of examples
      confidences = new double[label.getMapping().size()];
      for (int i = 0; i < confidences.length; i++) {
        confidences[i] =
            exampleSet.getStatistics(label, Statistics.COUNT, label.getMapping().mapIndex(i))
                / exampleSet.size();
      }
      break;
    case CONSTANT:
      value = getParameterAsDouble(PARAMETER_CONSTANT);
      break;
    case ATTRIBUTE:
      return new AttributeDefaultModel(
          exampleSet, getParameterAsString(PARAMETER_ATTRIBUTE_NAME));
    default:
      // cannot happen
      throw new OperatorException("DefaultLearner: Unknown default method '" + method + "'!");
  }
  log(
      "Default value is '"
          + (label.isNominal() ? label.getMapping().mapIndex((int) value) : value + "")
          + "'.");
  return new DefaultModel(exampleSet, value, confidences);
}
  /**
   * Builds a rule model for a single numerical attribute by repeatedly splitting on it.
   *
   * <p>In every round the best split point for {@code attribute} is requested from the splitter.
   * The examples of the first subset produced by the split become a "less or equals" rule that
   * predicts the mode of the label within that subset. Examples covered by the new rule are
   * removed and the procedure continues until nothing is left, the set stops shrinking, or no
   * split point is found (NaN). Any examples that remain uncovered are handled by a final default
   * rule predicting their label mode.
   *
   * @param trainingSet the examples to learn from
   * @param attribute the numerical attribute all rule conditions refer to
   * @return the rule model containing the learned rules
   */
  private RuleModel createNumericalRuleModel(ExampleSet trainingSet, Attribute attribute) {
    RuleModel model = new RuleModel(trainingSet);

    // carve off one rule per round until no progress is made
    ExampleSet uncovered = trainingSet;
    int previousSize = -1;
    while (uncovered.size() > 0 && uncovered.size() != previousSize) {
      ExampleSet workingCopy = (ExampleSet) uncovered.clone();
      double splitPoint = splitter.getBestSplit(workingCopy, attribute, null).getSplitPoint();
      if (Double.isNaN(splitPoint)) {
        // no usable split point: stop growing rules
        break;
      }

      SplittedExampleSet partition =
          SplittedExampleSet.splitByAttribute(workingCopy, attribute, splitPoint);
      Attribute label = partition.getAttributes().getLabel();
      partition.selectSingleSubset(0);
      SplitCondition condition = new LessEqualsSplitCondition(attribute, splitPoint);

      // the rule predicts the most frequent label of the selected subset
      partition.recalculateAttributeStatistics(label);
      int modeIndex = (int) partition.getStatistics(label, Statistics.MODE);
      Rule rule = new Rule(label.getMapping().mapIndex(modeIndex), condition);

      int[] frequencies = new int[label.getMapping().size()];
      int slot = 0;
      for (String labelValue : label.getMapping().getValues()) {
        frequencies[slot] = (int) partition.getStatistics(label, Statistics.COUNT, labelValue);
        slot++;
      }
      rule.setFrequencies(frequencies);
      model.addRule(rule);

      previousSize = uncovered.size();
      uncovered = rule.removeCovered(uncovered);
    }

    // whatever is still uncovered gets a default rule
    if (uncovered.size() > 0) {
      Attribute label = uncovered.getAttributes().getLabel();
      uncovered.recalculateAttributeStatistics(label);
      int modeIndex = (int) uncovered.getStatistics(label, Statistics.MODE);
      Rule defaultRule = new Rule(label.getMapping().mapIndex(modeIndex));

      int[] frequencies = new int[label.getMapping().size()];
      int slot = 0;
      for (String labelValue : label.getMapping().getValues()) {
        frequencies[slot] = (int) (uncovered.getStatistics(label, Statistics.COUNT, labelValue));
        slot++;
      }
      defaultRule.setFrequencies(frequencies);
      model.addRule(defaultRule);
    }

    return model;
  }
 /**
  * Computes one representative value per attribute of the given example set.
  *
  * <p>Statistics are recalculated first. Date-time attributes contribute their minimum, nominal
  * attributes their mode and all remaining attributes their average.
  *
  * @param exampleSet the data to summarize
  * @return the representative values, in the iteration order of the attributes
  */
 private double[] getMeanVector(ExampleSet exampleSet) {
   exampleSet.recalculateAllAttributeStatistics();
   Attributes attributes = exampleSet.getAttributes();
   double[] meanVector = new double[attributes.size()];
   int position = 0;
   for (Attribute attribute : attributes) {
     boolean isDateTime =
         Ontology.ATTRIBUTE_VALUE_TYPE.isA(attribute.getValueType(), Ontology.DATE_TIME);
     meanVector[position++] =
         isDateTime
             ? exampleSet.getStatistics(attribute, Statistics.MINIMUM)
             : attribute.isNominal()
                 ? exampleSet.getStatistics(attribute, Statistics.MODE)
                 : exampleSet.getStatistics(attribute, Statistics.AVERAGE);
   }
   return meanVector;
 }
  /**
   * Converts the example set into a neural data set with min/max-scaled inputs.
   *
   * <p>As a side effect the per-attribute minimum and maximum as well as the label minimum and
   * maximum are stored in {@code attributeMin}, {@code attributeMax}, {@code labelMin} and
   * {@code labelMax}. Each attribute value is shifted by its minimum and, if minimum and maximum
   * differ, divided by their difference. Numerical labels are treated the same way; nominal label
   * values are copied unchanged.
   *
   * @param exampleSet the training examples
   * @return the data set holding one input row and one single-element output row per example
   */
  private NeuralDataSet getTraining(ExampleSet exampleSet) {
    final int numberOfExamples = exampleSet.size();
    final int numberOfAttributes = exampleSet.getAttributes().size();
    double[][] data = new double[numberOfExamples][numberOfAttributes];
    double[][] labels = new double[numberOfExamples][1];
    Attribute label = exampleSet.getAttributes().getLabel();

    // remember the value ranges used for scaling
    this.attributeMin = new double[numberOfAttributes];
    this.attributeMax = new double[attributeMin.length];
    exampleSet.recalculateAllAttributeStatistics();
    int column = 0;
    for (Attribute attribute : exampleSet.getAttributes()) {
      this.attributeMin[column] = exampleSet.getStatistics(attribute, Statistics.MINIMUM);
      this.attributeMax[column] = exampleSet.getStatistics(attribute, Statistics.MAXIMUM);
      column++;
    }

    this.labelMin = exampleSet.getStatistics(label, Statistics.MINIMUM);
    this.labelMax = exampleSet.getStatistics(label, Statistics.MAXIMUM);

    int row = 0;
    for (Example example : exampleSet) {
      // inputs: shift by the minimum, scale only when the range is non-degenerate
      column = 0;
      for (Attribute attribute : exampleSet.getAttributes()) {
        double shifted = example.getValue(attribute) - attributeMin[column];
        data[row][column] =
            attributeMin[column] != attributeMax[column]
                ? shifted / (attributeMax[column] - attributeMin[column])
                : shifted;
        column++;
      }

      // output: nominal labels stay as they are, numerical labels are scaled like the inputs
      if (label.isNominal()) {
        labels[row][0] = example.getValue(label);
      } else {
        double shiftedLabel = example.getValue(label) - labelMin;
        labels[row][0] = labelMax != labelMin ? shiftedLabel / (labelMax - labelMin) : shiftedLabel;
      }

      row++;
    }

    return new BasicNeuralDataSet(data, labels);
  }
Exemple #5
0
  /**
   * Creates a PCA model from an eigen decomposition.
   *
   * <p>The names and averages of all attributes of {@code eSet} are recorded (statistics are
   * recalculated for this). Eigenvector {@code i} is read from column {@code i} of
   * {@code eigenvectors} and paired with {@code eigenvalues[i]}; the resulting list is sorted by
   * the natural ordering of {@code Eigenvector} before the cumulative variance is calculated.
   *
   * @param eSet the example set the decomposition was computed on
   * @param eigenvalues one eigenvalue per component
   * @param eigenvectors the eigenvectors stored column-wise, with one row per attribute
   */
  public PCAModel(ExampleSet eSet, double[] eigenvalues, double[][] eigenvectors) {
    super(eSet);

    final int numberOfAttributes = eSet.getAttributes().size();
    this.keepAttributes = false;
    this.attributeNames = new String[numberOfAttributes];
    this.means = new double[numberOfAttributes];

    eSet.recalculateAllAttributeStatistics(); // ensures that the statistics were created
    int position = 0;
    for (Attribute attribute : eSet.getAttributes()) {
      attributeNames[position] = attribute.getName();
      means[position] = eSet.getStatistics(attribute, Statistics.AVERAGE);
      position++;
    }

    // extract each component as one column of the eigenvector matrix
    this.eigenVectors = new ArrayList<Eigenvector>(eigenvalues.length);
    for (int component = 0; component < eigenvalues.length; component++) {
      double[] column = new double[numberOfAttributes];
      for (int row = 0; row < column.length; row++) {
        column[row] = eigenvectors[row][component];
      }
      this.eigenVectors.add(new Eigenvector(column, eigenvalues[component]));
    }

    // order the eigenvectors by the eigenvalues
    Collections.sort(this.eigenVectors);

    calculateCumulativeVariance();
  }
  /**
   * Creates a model that removes nominal values which do not occur in the given example set.
   *
   * <p>For every nominal attribute, the values with a count greater than zero are registered in
   * the new mapping of a {@code MappingTranslation}. A translation is only kept when the new
   * mapping ended up smaller than the attribute's current mapping, i.e. when at least one value is
   * unused; optionally the new mapping is sorted. Non-nominal attributes are skipped before their
   * mapping is requested, so no mapping is fetched or cloned for them.
   *
   * @param exampleSet the data whose nominal attributes are inspected; its statistics are
   *     recalculated
   * @return the model holding the translations keyed by attribute name
   * @throws OperatorException declared by the overridden method
   */
  @Override
  public PreprocessingModel createPreprocessingModel(ExampleSet exampleSet)
      throws OperatorException {
    boolean sortMappings = getParameterAsBoolean(PARAMETER_SORT_MAPPING_ALPHABETICALLY);

    Map<String, MappingTranslation> translations = new HashMap<String, MappingTranslation>();

    exampleSet.recalculateAllAttributeStatistics();
    for (Attribute attribute : exampleSet.getAttributes()) {
      if (!attribute.isNominal()) {
        // only nominal attributes carry a value mapping worth translating
        continue;
      }
      MappingTranslation translation =
          new MappingTranslation((NominalMapping) attribute.getMapping().clone());
      for (String value : attribute.getMapping().getValues()) {
        double count = exampleSet.getStatistics(attribute, Statistics.COUNT, value);
        if (count > 0) {
          translation.newMapping.mapString(value);
        }
      }
      // keep the translation only if some value was dropped
      if (translation.newMapping.size() < attribute.getMapping().size()) {
        if (sortMappings) {
          translation.newMapping.sortMappings();
        }
        translations.put(attribute.getName(), translation);
      }
    }
    return new RemoveUnusedNominalValuesModel(exampleSet, translations);
  }
  /**
   * Performs a kernel PCA on the input example set.
   *
   * <p>The input must pass the numerical-attributes and no-missing-values checks. The attribute
   * averages are computed, every example is turned into a value array via
   * {@code getAttributeValues} using those averages, and the kernel is evaluated for every pair of
   * examples to fill a square kernel matrix. The eigenvectors of that matrix, the means, the
   * example values and the kernel make up the resulting model. The transformed example set is only
   * computed if its output port is connected; the original example set and the model are always
   * delivered.
   *
   * @throws OperatorException if the input is unavailable or one of the checks fails
   */
  @Override
  public void doWork() throws OperatorException {
    ExampleSet exampleSet = exampleSetInput.getData(ExampleSet.class);

    // only use numeric attributes
    Tools.onlyNumericalAttributes(exampleSet, "KernelPCA");
    Tools.onlyNonMissingValues(exampleSet, getOperatorClassName(), this);

    Attributes attributes = exampleSet.getAttributes();
    int numberOfExamples = exampleSet.size();

    // calculating means for later zero centering
    exampleSet.recalculateAllAttributeStatistics();
    double[] means = new double[exampleSet.getAttributes().size()];
    int meanIndex = 0;
    for (Attribute attribute : exampleSet.getAttributes()) {
      means[meanIndex++] = exampleSet.getStatistics(attribute, Statistics.AVERAGE);
    }

    // kernel
    Kernel kernel = Kernel.createKernel(this);

    // copying zero centered exampleValues
    ArrayList<double[]> exampleValues = new ArrayList<double[]>(numberOfExamples);
    for (Example example : exampleSet) {
      exampleValues.add(getAttributeValues(example, attributes, means));
    }

    // filling kernel matrix
    Matrix kernelMatrix = new Matrix(numberOfExamples, numberOfExamples);
    for (int row = 0; row < numberOfExamples; row++) {
      for (int column = 0; column < numberOfExamples; column++) {
        double kernelValue =
            kernel.calculateDistance(exampleValues.get(row), exampleValues.get(column));
        kernelMatrix.set(row, column, kernelValue);
      }
    }

    // calculating eigenVectors
    EigenvalueDecomposition decomposition = kernelMatrix.eig();
    Model model =
        new KernelPCAModel(exampleSet, means, decomposition.getV(), exampleValues, kernel);

    if (exampleSetOutput.isConnected()) {
      exampleSetOutput.deliver(model.apply(exampleSet));
    }
    originalOutput.deliver(exampleSet);
    modelOutput.deliver(model);
  }
  /**
   * Refreshes the displayed statistics (missing count, first and last value, duration) of this
   * model's attribute from the given example set and notifies the listeners.
   *
   * <p>Minimum and maximum are read as milliseconds. Depending on the attribute's value type the
   * duration between them is rendered as whole days (date), whole hours (time) or as days, hours,
   * minutes and seconds (date-time), and the boundaries are formatted with the matching formatter.
   * For any other value type the duration stays empty and both boundaries stay {@code null}.
   *
   * @param exampleSet the example set providing the statistics of the attribute
   */
  @Override
  public void updateStatistics(final ExampleSet exampleSet) {
    final String days =
        WHITESPACE
            + I18N.getMessage(
                I18N.getGUIBundle(), "gui.label.attribute_statistics.statistics.days.label");
    final String hours =
        WHITESPACE
            + I18N.getMessage(
                I18N.getGUIBundle(), "gui.label.attribute_statistics.statistics.hours.label");

    long minMilliseconds = (long) exampleSet.getStatistics(getAttribute(), Statistics.MINIMUM);
    long maxMilliseconds = (long) exampleSet.getStatistics(getAttribute(), Statistics.MAXIMUM);
    long difference = maxMilliseconds - minMilliseconds;

    final int valueType = getAttribute().getValueType();
    StringBuilder dura = new StringBuilder();
    String minResult = null;
    String maxResult = null;

    if (valueType == Ontology.DATE) {
      // days
      dura.append(
              com.rapidminer.tools.Tools.formatIntegerIfPossible(
                  Math.floor(difference / (H_IN_D * M_IN_H * S_IN_M * MS_IN_S)), 3))
          .append(days);
      minResult = FORMAT_DATE.format(new Date(minMilliseconds));
      maxResult = FORMAT_DATE.format(new Date(maxMilliseconds));
    } else if (valueType == Ontology.TIME) {
      // hours
      dura.append(
              com.rapidminer.tools.Tools.formatIntegerIfPossible(
                  Math.floor(difference / (M_IN_H * S_IN_M * MS_IN_S)), 3))
          .append(hours);
      minResult = FORMAT_TIME.format(new Date(minMilliseconds));
      maxResult = FORMAT_TIME.format(new Date(maxMilliseconds));
    } else if (valueType == Ontology.DATE_TIME) {
      // days + hours + minutes + seconds, each computed from what the previous unit left over
      dura.append(
              com.rapidminer.tools.Tools.formatIntegerIfPossible(
                  Math.floor(difference / (H_IN_D * M_IN_H * S_IN_M * MS_IN_S)), 3))
          .append(SHORT_DAY)
          .append(WHITESPACE);
      double leftoverMilliSeconds = difference % (H_IN_D * M_IN_H * S_IN_M * MS_IN_S);
      dura.append(
              com.rapidminer.tools.Tools.formatIntegerIfPossible(
                  Math.floor(leftoverMilliSeconds / (M_IN_H * S_IN_M * MS_IN_S)), 3))
          .append(SHORT_HOUR)
          .append(WHITESPACE);
      leftoverMilliSeconds = leftoverMilliSeconds % (M_IN_H * S_IN_M * MS_IN_S);
      dura.append(
              com.rapidminer.tools.Tools.formatIntegerIfPossible(
                  Math.floor(leftoverMilliSeconds / (S_IN_M * MS_IN_S)), 3))
          .append(SHORT_MINUTE)
          .append(WHITESPACE);
      leftoverMilliSeconds = leftoverMilliSeconds % (S_IN_M * MS_IN_S);
      dura.append(
              com.rapidminer.tools.Tools.formatIntegerIfPossible(
                  Math.floor(leftoverMilliSeconds / MS_IN_S), 3))
          .append(SHORT_SECOND);
      minResult = FORMAT_DATE_TIME.format(new Date(minMilliseconds));
      maxResult = FORMAT_DATE_TIME.format(new Date(maxMilliseconds));
    }

    missing = exampleSet.getStatistics(getAttribute(), Statistics.UNKNOWN);
    from = minResult;
    until = maxResult;
    duration = dura.toString();

    fireStatisticsChangedEvent();
  }
  /**
   * Executes the inner subprocess once per selected value of the configured nominal attributes.
   *
   * <p>The attribute parameter list holds pairs of an attribute-name regular expression and a
   * value option. Every attribute (regular and special) whose name matches an expression is
   * processed with the corresponding option: either all of its values, or only those whose
   * relative frequency is at least {@code p}. For each value that actually occurs, the example set
   * is filtered to that value and delivered to the subprocess. Optionally the grouping attribute
   * is hidden from the subgroup and the current condition is published through an iteration macro.
   * If requested, the subprocess is additionally executed once on the complete set beforehand.
   *
   * <p>When the grouping attribute is hidden, it is restored even if the subprocess fails, so that
   * a failure does not leave the attribute removed.
   *
   * @throws OperatorException if a regular expression is invalid (user error 206), a filter
   *     condition cannot be created (user error 904) or the subprocess fails
   */
  @Override
  public void doWork() throws OperatorException {
    ExampleSet exampleSet = exampleSetInput.getData(ExampleSet.class);
    exampleSet.recalculateAllAttributeStatistics();

    List<String[]> attributeValueOptions = getParameterList(PARAMETER_ATTRIBUTES);

    LinkedHashMap<Attribute, Integer> attributeValueOptionsMap =
        new LinkedHashMap<Attribute, Integer>();
    int[] valueOptions = new int[attributeValueOptions.size()];
    Pattern[] attributeRegexPatterns = new Pattern[attributeValueOptions.size()];

    // the category type translating option names to indices is the same for every list entry
    ParameterTypeCategory valueOptionType =
        (ParameterTypeCategory)
            ((ParameterTypeList) getParameterType(PARAMETER_ATTRIBUTES)).getValueType();

    // compile the name patterns and resolve the value option of each list entry
    int j = 0;
    for (String[] pair : attributeValueOptions) {
      String regex = pair[0];
      try {
        attributeRegexPatterns[j] = Pattern.compile(regex);
      } catch (PatternSyntaxException e) {
        throw new UserError(this, 206, regex, e.getMessage());
      }
      valueOptions[j] = valueOptionType.getIndex(pair[1]);
      j++;
    }

    // collect all attributes matching a pattern; a later pattern overrides the option of an
    // attribute that was already matched by an earlier one
    for (int i = 0; i < attributeRegexPatterns.length; i++) {
      Iterator<Attribute> a = exampleSet.getAttributes().allAttributes();
      while (a.hasNext()) {
        Attribute attribute = a.next();
        Matcher matcher = attributeRegexPatterns[i].matcher(attribute.getName());
        if (matcher.matches()) {
          attributeValueOptionsMap.put(attribute, valueOptions[i]);
        }
      }
    }

    double p = getParameterAsDouble(PARAMETER_P);
    boolean filterAttribute = getParameterAsBoolean(PARAMETER_FILTER_ATTRIBUTE);
    String iterationMacro = getParameterAsString(PARAMETER_ITERATION_MACRO);

    // applying on complete set
    if (getParameterAsBoolean(PARAMETER_APPLY_ON_COMPLETE_SET)) {
      if (iterationMacro != null) {
        getProcess().getMacroHandler().addMacro(iterationMacro, "ALL");
      }

      innerExampleSetSource.deliver(exampleSet);
      getSubprocess(0).execute();
    }

    // applying on subgroups defined by attributes
    for (Entry<Attribute, Integer> attributeEntry : attributeValueOptionsMap.entrySet()) {
      Attribute attribute = attributeEntry.getKey();
      if (!attribute.isNominal()) {
        continue;
      }
      List<String> values = null;
      switch (attributeEntry.getValue()) {
        case VALUE_OPTION_ABOVE_P:
          // keep only values whose relative frequency reaches p
          values = new Vector<String>();
          for (String value : attribute.getMapping().getValues()) {
            if (exampleSet.getStatistics(attribute, Statistics.COUNT, value) / exampleSet.size()
                >= p) {
              values.add(value);
            }
          }
          break;
        case VALUE_OPTION_ALL:
        default:
          values = attribute.getMapping().getValues();
          break;
      }

      for (String value : values) {
        if (exampleSet.getStatistics(attribute, Statistics.COUNT, value) > 0) {
          String className = "attribute_value_filter";
          String parameter = attribute.getName() + "=" + value;
          log("Creating condition '" + className + "' with parameter '" + parameter + "'");
          Condition condition = null;
          try {
            condition = ConditionedExampleSet.createCondition(className, exampleSet, parameter);
          } catch (ConditionCreationException e) {
            throw new UserError(this, 904, className, e.getMessage());
          }
          ExampleSet subgroupSet = new ConditionedExampleSet(exampleSet, condition, false);
          if (filterAttribute) {
            subgroupSet.getAttributes().remove(attribute);
          }
          try {
            if (iterationMacro != null) {
              getProcess().getMacroHandler().addMacro(iterationMacro, parameter.replace(' ', '_'));
            }

            // applying subprocess
            innerExampleSetSource.deliver(subgroupSet);
            getSubprocess(0).execute();
          } finally {
            // restore the hidden attribute even when the subprocess fails
            // NOTE(review): the attribute is re-added as a regular attribute; if it was matched
            // as a special attribute its role is not restored - confirm this is intended
            if (filterAttribute) {
              subgroupSet.getAttributes().addRegular(attribute);
            }
          }
        }
        inApplyLoop();
      }
    }

    if (iterationMacro != null) {
      getProcess().getMacroHandler().addMacro(iterationMacro, null);
    }
  }
  /**
   * Learns an ordered list of rules, one label value at a time.
   *
   * <p>For each value of the label, rules predicting that value are grown as long as the remaining
   * training set is non-empty, still shrinking, and still contains examples of that value. A rule
   * is grown by repeatedly adding the best term found by the term determination until the covered
   * examples are pure enough (pureness parameter), no attributes are left, the covered set stops
   * shrinking or no further term is found. Examples covered by an accepted rule are removed from
   * the training set. Examples left over at the end are handled by a default rule predicting their
   * label mode.
   *
   * @param exampleSet the training data; it is cloned before examples are removed
   * @return the rule model containing the learned rules in the order they were created
   * @throws OperatorException declared by the overridden method; parameter access and
   *     {@code checkForStop()} are the calls made here that may raise it (presumably)
   */
  @Override
  public Model learn(ExampleSet exampleSet) throws OperatorException {
    Attribute label = exampleSet.getAttributes().getLabel();
    RuleModel ruleModel = new RuleModel(exampleSet);

    double pureness = getParameterAsDouble(PARAMETER_PURENESS);
    TermDetermination termDetermination = new TermDetermination(new AccuracyCriterion(), 0.5d);
    ExampleSet trainingSet = (ExampleSet) exampleSet.clone();

    for (String labelName : label.getMapping().getValues()) {
      // the label counts drive the loop condition below, so they must reflect the current set
      trainingSet.recalculateAttributeStatistics(label);
      int oldSize = -1;
      // keep learning rules for this class while examples of it remain and progress is made
      while (trainingSet.size() > 0
          && trainingSet.size() != oldSize
          && trainingSet.getStatistics(label, Statistics.COUNT, labelName) > 0) {
        Rule rule = new Rule(labelName);
        // snapshot taken before growing; covered examples are removed from this copy later
        ExampleSet oldTrainingSet = (ExampleSet) trainingSet.clone();

        // grow rule
        int growOldSize = -1;
        // growing works on its own copy because attributes are removed from it term by term
        ExampleSet growSet = (ExampleSet) trainingSet.clone();
        while (growSet.size() > 0
            && growSet.size() != growOldSize
            && !rule.isPure(growSet, pureness)
            && growSet.getAttributes().size() > 0) {
          SplitCondition term = termDetermination.getBestTerm(growSet, labelName);
          if (term == null) {
            break;
          }

          rule.addTerm(term);

          // an attribute that was used in a term is not offered again for this rule
          Attribute splitAttribute = growSet.getAttributes().get(term.getAttributeName());
          growSet.getAttributes().remove(splitAttribute);
          growOldSize = growSet.size();
          growSet = rule.getCovered(growSet);
        }

        // add rule if not empty
        if (rule.getTerms().size() > 0) {
          // frequencies are counted on the examples of the training set covered by the final rule
          growSet = rule.getCovered(trainingSet);
          growSet.recalculateAttributeStatistics(label);
          int[] frequencies = new int[label.getMapping().size()];
          int counter = 0;
          for (String value : label.getMapping().getValues()) {
            frequencies[counter++] = (int) growSet.getStatistics(label, Statistics.COUNT, value);
          }
          rule.setFrequencies(frequencies);
          ruleModel.addRule(rule);
          oldSize = trainingSet.size();

          trainingSet = rule.removeCovered(oldTrainingSet);
        } else {
          break; // no other terms found for this class --> next class
        }

        // refresh the counts for the next evaluation of the loop condition
        trainingSet.recalculateAttributeStatistics(label);
      }
      checkForStop();
    }

    // training set not empty? add default rule
    if (trainingSet.size() > 0) {
      trainingSet.recalculateAttributeStatistics(label);
      int index = (int) trainingSet.getStatistics(label, Statistics.MODE);
      String defaultLabel = label.getMapping().mapIndex(index);
      Rule defaultRule = new Rule(defaultLabel);
      int[] frequencies = new int[label.getMapping().size()];
      int counter = 0;
      for (String value : label.getMapping().getValues()) {
        frequencies[counter++] = (int) trainingSet.getStatistics(label, Statistics.COUNT, value);
      }
      defaultRule.setFrequencies(frequencies);
      ruleModel.addRule(defaultRule);
    }

    return ruleModel;
  }
  /**
   * Creates a discretization model whose bins hold roughly the same number of examples.
   *
   * <p>The number of bins is either taken from the corresponding parameter (and must then be
   * smaller than the number of examples minus one) or, in square-root mode, derived per attribute
   * as the rounded square root of the number of non-missing values. For each attribute the
   * examples are visited in increasing order of the attribute value; whenever the current bin is
   * full and the value changes, the midpoint between the previous and the current value becomes
   * the upper border of the bin. The last border is always positive infinity.
   *
   * @param exampleSet the data to derive the bin borders from
   * @return the discretization model configured with the computed ranges
   * @throws OperatorException as user error 116 if the fixed number of bins is too large, or as
   *     user error 944 if a bin still has no room after moving on to the next one
   */
  @Override
  public PreprocessingModel createPreprocessingModel(ExampleSet exampleSet)
      throws OperatorException {
    HashMap<Attribute, double[]> ranges = new HashMap<Attribute, double[]>();

    // Get and check parametervalues
    boolean useSqrt = getParameterAsBoolean(PARAMETER_USE_SQRT_OF_EXAMPLES);
    int numberOfBins = 0;
    if (useSqrt) {
      // automatic sizing needs the missing-value counts
      exampleSet.recalculateAllAttributeStatistics();
    } else {
      // if not automatic sizing of bins, use parametervalue
      numberOfBins = getParameterAsInt(PARAMETER_NUMBER_OF_BINS);
      if (numberOfBins >= (exampleSet.size() - 1)) {
        throw new UserError(
            this,
            116,
            PARAMETER_NUMBER_OF_BINS,
            "number of bins must be smaller than number of examples (here: "
                + exampleSet.size()
                + ")");
      }
    }

    for (Attribute currentAttribute : exampleSet.getAttributes()) {
      if (useSqrt) {
        int missingCount = (int) exampleSet.getStatistics(currentAttribute, Statistics.UNKNOWN);
        numberOfBins = (int) Math.round(Math.sqrt(exampleSet.size() - missingCount));
      }
      double[] borders = new double[numberOfBins];
      ExampleSet sortedSet =
          new SortedExampleSet(exampleSet, currentAttribute, SortedExampleSet.INCREASING);

      // finding ranges
      double examplesPerBin = exampleSet.size() / (double) numberOfBins;
      double remainingSpace = examplesPerBin;
      double previousValue = Double.NaN;
      int bin = 0;

      for (Example example : sortedSet) {
        double value = example.getValue(currentAttribute);
        if (Double.isNaN(value)) {
          // missing values do not occupy bin space
          continue;
        }
        // change bin if full and not last
        boolean binExhausted =
            remainingSpace < 1 && bin < numberOfBins && value != previousValue;
        if (binExhausted && !Double.isNaN(previousValue)) {
          borders[bin] = (previousValue + value) / 2;
          bin++;
          // adding because same values might cause binspace to be negative
          remainingSpace += examplesPerBin;
          if (remainingSpace < 1) {
            throw new UserError(this, 944, currentAttribute.getName());
          }
        }
        remainingSpace--;
        previousValue = value;
      }
      borders[numberOfBins - 1] = Double.POSITIVE_INFINITY;
      ranges.put(currentAttribute, borders);
    }
    DiscretizationModel model = new DiscretizationModel(exampleSet);

    // determine number of digits
    int numberOfDigits =
        getParameterAsBoolean(PARAMETER_AUTOMATIC_NUMBER_OF_DIGITS)
            ? -1
            : getParameterAsInt(PARAMETER_NUMBER_OF_DIGITS);

    model.setRanges(ranges, "range", getParameterAsInt(PARAMETER_RANGE_NAME_TYPE), numberOfDigits);
    return model;
  }