Пример #1
0
  /**
   * Builds a set of synthetic instances, drawing every attribute value from the per-attribute
   * statistics held in {@code m_AttributeStats}.
   *
   * <p>Nominal attributes get an index picked by {@code selectIndexProbabilistically} from the
   * stored statistics; numeric attributes get a Gaussian sample scaled by {@code stats[1]} and
   * shifted by {@code stats[0]}. Any other attribute type is reported on standard error and its
   * value is left at 0.
   *
   * @param artSize number of artificial instances to create
   * @param data training data; supplies the header and attribute types for the new set
   * @return the newly created artificial instances
   */
  protected Instances generateArtificialData(int artSize, Instances data) {
    final int attributeCount = data.numAttributes();
    final Instances generated = new Instances(data, artSize);

    int created = 0;
    while (created < artSize) {
      double[] values = new double[attributeCount];
      for (int col = 0; col < attributeCount; col++) {
        boolean nominal = data.attribute(col).isNominal();
        if (!nominal && !data.attribute(col).isNumeric()) {
          // Unsupported attribute type: warn and keep the default value of 0
          System.err.println("Decorate can only handle numeric and nominal values.");
          continue;
        }
        double[] stats = (double[]) m_AttributeStats.get(col);
        if (nominal) {
          // Pick one of the nominal values according to the stored distribution
          values[col] = (double) selectIndexProbabilistically(stats);
        } else {
          // Sample from the Gaussian described by stats[0] (shift) and stats[1] (scale)
          values[col] = (m_Random.nextGaussian() * stats[1]) + stats[0];
        }
      }
      generated.add(new Instance(1.0, values));
      created++;
    }
    return generated;
  }
Пример #2
0
 /**
  * Lists the names of the additional measures that can be queried.
  *
  * @return an enumeration over the measure names, in the order tree size, number of leaves,
  *     number of rules
  */
 public Enumeration enumerateMeasures() {
   String[] measureNames = {"measureTreeSize", "measureNumLeaves", "measureNumRules"};
   Vector measures = new Vector(measureNames.length);
   for (int idx = 0; idx < measureNames.length; idx++) {
     measures.addElement(measureNames[idx]);
   }
   return measures.elements();
 }
Пример #3
0
  /**
   * Compute and store statistics required for generating artificial data.
   *
   * <p>One entry per attribute is appended to {@code m_AttributeStats}, so that the entry at
   * index {@code j} always belongs to attribute {@code j}:
   *
   * <ul>
   *   <li>nominal attribute: the cumulative probabilities of all values except the last one,
   *       computed from Laplace-smoothed value counts;
   *   <li>numeric attribute: a two-element array holding the mean and the standard deviation;
   *   <li>any other attribute type: a warning is printed to standard error and {@code null} is
   *       stored as a placeholder.
   * </ul>
   *
   * @param data training instances
   * @exception Exception if a nominal attribute has fewer than two distinct values
   */
  protected void computeStats(Instances data) throws Exception {
    int numAttributes = data.numAttributes();
    m_AttributeStats = new Vector(numAttributes); // use to map attributes to their stats

    for (int j = 0; j < numAttributes; j++) {
      double[] stats = null;
      if (data.attribute(j).isNominal()) {
        // Compute the probability of occurrence of each distinct value
        int[] nomCounts = (data.attributeStats(j)).nominalCounts;
        double[] counts = new double[nomCounts.length];
        if (counts.length < 2) {
          throw new Exception("Nominal attribute has less than two distinct values!");
        }
        // Perform Laplace smoothing
        for (int i = 0; i < counts.length; i++) {
          counts[i] = nomCounts[i] + 1;
        }
        Utils.normalize(counts);
        // Calculate cumulative probabilities; the last value is implied by the remainder
        stats = new double[counts.length - 1];
        stats[0] = counts[0];
        for (int i = 1; i < stats.length; i++) {
          stats[i] = stats[i - 1] + counts[i];
        }
      } else if (data.attribute(j).isNumeric()) {
        // Get mean and standard deviation from the training data
        stats = new double[2];
        stats[0] = data.meanOrMode(j);
        stats[1] = Math.sqrt(data.variance(j));
      } else {
        System.err.println("Decorate can only handle numeric and nominal values.");
      }
      // Always append exactly one entry per attribute. Skipping unsupported attributes would
      // leave a gap, and inserting at a later index would then fail because it exceeds the
      // vector's size.
      m_AttributeStats.add(stats);
    }
  }
Пример #4
0
  /**
   * Returns an enumeration describing the available options.
   *
   * <p>Valid options are:
   *
   * <p>{@code -N <num>} <br>
   * Number of clusters (the option text states a default of 2).
   *
   * <p>{@code -S <num>} <br>
   * Random number seed (the option text states a default of 10).
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Option clusterCount = new Option("\tnumber of clusters. (default = 2).", "N", 1, "-N <num>");
    Option randomSeed = new Option("\trandom number seed.\n (default 10)", "S", 1, "-S <num>");

    Vector available = new Vector(2);
    available.addElement(clusterCount);
    available.addElement(randomSeed);
    return available.elements();
  }
Пример #5
0
  /**
   * Describes the Decorate ensemble: the textual form of every committee member followed by the
   * committee size, or a placeholder message when no committee exists yet.
   *
   * @return description of the Decorate classifier as a string
   */
  public String toString() {
    if (m_Committee == null) {
      return "Decorate: No model built yet.";
    }

    StringBuilder description = new StringBuilder("Decorate base classifiers: \n\n");
    int member = 0;
    while (member < m_Committee.size()) {
      Classifier current = (Classifier) m_Committee.get(member);
      description.append(current.toString()).append("\n\n");
      member++;
    }
    description
        .append("Number of classifier in the ensemble: ")
        .append(m_Committee.size())
        .append("\n");
    return description.toString();
  }
 /** Registers the supported command-line options in the shared options vector at class load. */
 static {
   Option[] supported = {
     new Option("\tAlpha star. (default = 0.5).", "A", 1, "-A <0-1>"),
     new Option("\tSigma. (default = 1.0).", "S", 1, "-S <num>"),
     new Option(
         "\tR. All points that are far away more than this value have a zero similarity. (default = -1).",
         "R",
         1,
         "-R <num>"),
     new Option("\tUse sparse matrix representation. (default = false).", "M", 0, "-M")
   };
   // Keep the registration order: A, S, R, M
   for (int idx = 0; idx < supported.length; idx++) {
     options.addElement(supported[idx]);
   }
 }
Пример #7
0
  /**
   * Calculates the class membership probabilities for the given test instance by summing the
   * distributions predicted by every committee member and normalizing the totals.
   *
   * @param instance the instance to be classified
   * @return predicted class probability distribution; the raw (all-zero) totals are returned
   *     unchanged when they sum to zero
   * @exception Exception if the class is numeric or a committee member fails to predict
   */
  public double[] distributionForInstance(Instance instance) throws Exception {
    if (instance.classAttribute().isNumeric()) {
      throw new UnsupportedClassTypeException("Decorate can't handle a numeric class!");
    }

    final double[] totals = new double[instance.numClasses()];
    for (int member = 0; member < m_Committee.size(); member++) {
      Classifier voter = (Classifier) m_Committee.get(member);
      double[] memberProbs = voter.distributionForInstance(instance);
      for (int cls = 0; cls < memberProbs.length; cls++) {
        totals[cls] += memberProbs[cls];
      }
    }

    // Only normalize when there is something to normalize; zero totals are returned as-is
    if (!Utils.eq(Utils.sum(totals), 0)) {
      Utils.normalize(totals);
    }
    return totals;
  }
Пример #8
0
  /**
   * Returns an enumeration describing the available options.
   *
   * <p>The scheme's own options (-S, -X, -E, -I, -R) are listed first. When the configured
   * search method implements {@code OptionHandler}, a header line naming the search class and
   * the search method's own options are appended.
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {

    Vector newVector = new Vector(7);

    newVector.addElement(
        new Option(
            "\tFull class name of search method, followed\n"
                + "\tby its options.\n"
                + "\teg: \"weka.attributeSelection.BestFirst -D 1\"\n"
                + "\t(default weka.attributeSelection.BestFirst)",
            "S",
            1,
            "-S <search method specification>"));

    newVector.addElement(
        new Option(
            "\tUse cross validation to evaluate features.\n"
                + "\tUse number of folds = 1 for leave one out CV.\n"
                + "\t(Default = leave one out CV)",
            "X",
            1,
            "-X <number of folds>"));

    newVector.addElement(
        new Option(
            "\tPerformance evaluation measure to use for selecting attributes.\n"
                + "\t(Default = accuracy for discrete class and rmse for numeric class)",
            "E",
            1,
            "-E <acc | rmse | mae | auc>"));

    newVector.addElement(
        new Option("\tUse nearest neighbour instead of global table majority.", "I", 0, "-I"));

    newVector.addElement(new Option("\tDisplay decision table rules.\n", "R", 0, "-R"));

    // Only advertise search-specific options when the search method actually exposes them.
    // instanceof is false for null, so this also covers an unset search method and avoids the
    // NullPointerException / ClassCastException an unconditional cast would raise.
    if (m_search instanceof OptionHandler) {
      newVector.addElement(
          new Option(
              "",
              "",
              0,
              "\nOptions specific to search method " + m_search.getClass().getName() + ":"));
      Enumeration enu = ((OptionHandler) m_search).listOptions();
      while (enu.hasMoreElements()) {
        newVector.addElement(enu.nextElement());
      }
    }
    return newVector.elements();
  }
Пример #9
0
  /**
   * Returns an enumeration describing the available options: the ensemble size (-E) and the
   * artificial-data factor (-R), followed by every option reported by the superclass.
   *
   * @return an enumeration of all the available options
   */
  public Enumeration listOptions() {
    Option ensembleSize =
        new Option("\tDesired size of ensemble.\n" + "\t(default 10)", "E", 1, "-E");
    Option artificialFactor =
        new Option(
            "\tFactor that determines number of artificial examples to generate.\n"
                + "\tSpecified proportional to training set size.\n"
                + "\t(default 1.0)",
            "R",
            1,
            "-R");

    Vector collected = new Vector(8);
    collected.addElement(ensembleSize);
    collected.addElement(artificialFactor);

    // Append whatever the superclass offers, keeping its order
    for (Enumeration inherited = super.listOptions(); inherited.hasMoreElements(); ) {
      collected.addElement(inherited.nextElement());
    }
    return collected.elements();
  }
Пример #10
0
  /**
   * Returns an enumeration describing the available options: debugging output (-D), classifier
   * specification (-B), random number seed (-S) and cross-validation folds (-X).
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Option[] supported = {
      new Option("\tTurn on debugging output.", "D", 0, "-D"),
      new Option(
          "\tFull class name of classifier to include, followed\n"
              + "\tby scheme options. May be specified multiple times,\n"
              + "\trequired at least twice.\n"
              + "\teg: \"weka.classifiers.bayes.NaiveBayes -D\"",
          "B",
          1,
          "-B <classifier specification>"),
      new Option("\tSets the random number seed (default 1).", "S", 1, "-S <random number seed>"),
      new Option(
          "\tUse cross validation for model selection using the\n"
              + "\tgiven number of folds. (default 0, is to\n"
              + "\tuse training error)",
          "X",
          1,
          "-X <number of folds>")
    };

    Vector collected = new Vector(4);
    for (int idx = 0; idx < supported.length; idx++) {
      collected.addElement(supported[idx]);
    }
    return collected.elements();
  }
Пример #11
0
  /**
   * Classifies the instance by majority vote over the classifiers in {@code class_Array}.
   *
   * <p>Each classifier votes for the class (or classes, on an exact tie) to which it assigns the
   * highest probability. The class with the most votes wins; ties between classes are broken
   * uniformly at random using {@code m_Random}.
   *
   * @param instance the instance to be classified
   * @return a distribution with 1 for the winning class and 0 for every other class; always a
   *     freshly allocated array, never one returned by a base classifier
   * @exception Exception if a base classifier fails to compute its distribution
   */
  protected double[] distributionForInstanceMajorityVoting(Instance instance) throws Exception {

    double[] votes = new double[instance.classAttribute().numValues()];

    for (int i = 0; i < class_Array.length; i++) {
      double[] probs = class_Array[i].distributionForInstance(instance);

      int maxIndex = 0;
      for (int j = 1; j < probs.length; j++) {
        if (probs[j] > probs[maxIndex]) {
          maxIndex = j;
        }
      }

      // Consider the cases when multiple classes happen to have the same probability
      for (int j = 0; j < probs.length; j++) {
        if (probs[j] == probs[maxIndex]) {
          votes[j]++;
        }
      }
    }

    int tmpMajorityIndex = 0;
    for (int k = 1; k < votes.length; k++) {
      if (votes[k] > votes[tmpMajorityIndex]) {
        tmpMajorityIndex = k;
      }
    }

    // Consider the cases when multiple classes receive the same amount of votes
    Vector<Integer> majorityIndexes = new Vector<Integer>();
    for (int k = 0; k < votes.length; k++) {
      if (votes[k] == votes[tmpMajorityIndex]) {
        majorityIndexes.add(k);
      }
    }

    // Resolve the ties according to a uniform random distribution
    int majorityIndex = majorityIndexes.get(m_Random.nextInt(majorityIndexes.size()));

    // Build the result in a fresh array (all zeros by default) instead of overwriting the
    // array handed back by the last base classifier, which this method does not own.
    double[] result = new double[votes.length];
    // the class that has been voted the most receives 1
    result[majorityIndex] = 1;

    return result;
  }
Пример #12
0
  /**
   * Builds the Decorate ensemble.
   *
   * <p>The first committee member is trained on the training data alone. Each later iteration
   * generates {@code artSize} artificial instances, labels them via {@code labelData}, trains a
   * candidate on the training data plus the artificial instances, and keeps the candidate only
   * if the ensemble error on the training data does not increase. The loop stops when the
   * committee reaches {@code m_DesiredSize} members or {@code m_NumIterations} trials are used.
   *
   * @param data the training data to be used for generating the classifier
   * @exception Exception if no base classifier is set, the data contains string attributes, the
   *     class is numeric, {@code m_NumIterations < m_DesiredSize}, or any called helper fails
   */
  public void buildClassifier(Instances data) throws Exception {
    // Validate configuration and data before doing any work
    if (m_Classifier == null) {
      throw new Exception("A base classifier has not been specified!");
    }
    if (data.checkForStringAttributes()) {
      throw new UnsupportedAttributeTypeException("Cannot handle string attributes!");
    }
    if (data.classAttribute().isNumeric()) {
      throw new UnsupportedClassTypeException("Decorate can't handle a numeric class!");
    }
    if (m_NumIterations < m_DesiredSize)
      throw new Exception("Max number of iterations must be >= desired ensemble size!");

    // Initialize the random number generator; a seed of -1 selects an unseeded generator
    if (m_Seed == -1) m_Random = new Random();
    else m_Random = new Random(m_Seed);

    int i = 1; // current committee size
    int numTrials = 1; // number of Decorate iterations
    Instances divData = new Instances(data); // local copy of data - diversity data
    divData.deleteWithMissingClass();
    Instances artData = null; // artificial data

    // Compute the number of artificial instances to add at each iteration:
    // |m_ArtSize| times the number of instances in divData, truncated to an int
    int artSize = (int) (Math.abs(m_ArtSize) * divData.numInstances());
    if (artSize == 0) artSize = 1; // add at least one random example
    // Statistics are computed on the original data argument, not on the divData copy
    computeStats(data); // Compute training data stats for creating artificial examples

    // Initialize the new committee
    m_Committee = new Vector();
    // NOTE(review): the first member is m_Classifier itself, not a copy; the copies made in
    // the loop below are therefore taken from an already-built classifier — confirm intended
    Classifier newClassifier = m_Classifier;
    newClassifier.buildClassifier(divData);
    m_Committee.add(newClassifier);
    double eComm = computeError(divData); // compute ensemble error
    if (m_Debug)
      System.out.println(
          "Initialize:\tClassifier " + i + " added to ensemble. Ensemble error = " + eComm);

    // Repeat until the desired committee size is reached OR the max number of iterations is
    // used up
    while (i < m_DesiredSize && numTrials < m_NumIterations) {
      // Generate artificial training examples
      artData = generateArtificialData(artSize, data);

      // Label artificial examples (labeling strategy lives in labelData, not visible here)
      labelData(artData);
      addInstances(divData, artData); // Add new artificial data

      // Build new classifier on the training data plus the artificial data
      Classifier tmp[] = Classifier.makeCopies(m_Classifier, 1);
      newClassifier = tmp[0];
      newClassifier.buildClassifier(divData);
      // Remove all the artificial data
      // (presumably drops the artSize instances just appended — verify against removeInstances)
      removeInstances(divData, artSize);

      // Test if the new classifier should be added to the ensemble: add it tentatively so
      // that computeError evaluates the committee including the candidate
      m_Committee.add(newClassifier); // add new classifier to current committee
      double currError = computeError(divData);
      if (currError <= eComm) { // adding the new member did not increase the error
        i++;
        eComm = currError;
        if (m_Debug)
          System.out.println(
              "Iteration: "
                  + (1 + numTrials)
                  + "\tClassifier "
                  + i
                  + " added to ensemble. Ensemble error = "
                  + eComm);
      } else { // reject the current classifier because it increased the ensemble error
        m_Committee.removeElementAt(m_Committee.size() - 1); // pop the last member
      }
      numTrials++;
    }
  }
 /**
  * Exposes the options held in the shared {@code options} vector.
  *
  * @return an enumeration of all the available options
  */
 public Enumeration listOptions() {
   final Enumeration available = options.elements();
   return available;
 }
Пример #14
0
  /**
   * Returns an enumeration describing the available options.
   *
   * <p>Valid options are:
   *
   * <ul>
   *   <li>{@code -U}: use unpruned tree.
   *   <li>{@code -C <pruning confidence>}: set confidence threshold for pruning (default 0.25).
   *   <li>{@code -M <minimum number of instances>}: set minimum number of instances per leaf
   *       (default 2).
   *   <li>{@code -R}: use reduced error pruning.
   *   <li>{@code -N <number of folds>}: set number of folds for reduced error pruning; one fold
   *       is used as the pruning set (default 3).
   *   <li>{@code -B}: use binary splits only.
   *   <li>{@code -S}: don't perform subtree raising.
   *   <li>{@code -L}: do not clean up after the tree has been built.
   *   <li>{@code -A}: Laplace smoothing for predicted probabilities.
   * </ul>
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Option[] supported = {
      new Option("\tUse unpruned tree.", "U", 0, "-U"),
      new Option(
          "\tSet confidence threshold for pruning.\n" + "\t(default 0.25)",
          "C",
          1,
          "-C <pruning confidence>"),
      new Option(
          "\tSet minimum number of instances per leaf.\n" + "\t(default 2)",
          "M",
          1,
          "-M <minimum number of instances>"),
      new Option("\tUse reduced error pruning.", "R", 0, "-R"),
      new Option(
          "\tSet number of folds for reduced error\n"
              + "\tpruning. One fold is used as pruning set.\n"
              + "\t(default 3)",
          "N",
          1,
          "-N <number of folds>"),
      new Option("\tUse binary splits only.", "B", 0, "-B"),
      new Option("\tDon't perform subtree raising.", "S", 0, "-S"),
      new Option("\tDo not clean up after the tree has been built.", "L", 0, "-L"),
      new Option("\tLaplace smoothing for predicted probabilities.", "A", 0, "-A")
    };

    Vector collected = new Vector(9);
    for (int idx = 0; idx < supported.length; idx++) {
      collected.addElement(supported[idx]);
    }
    return collected.elements();
  }