示例#1
0
  /**
   * Creates a set of artificial examples drawn from the per-attribute statistics held in
   * {@code m_AttributeStats}.
   *
   * <p>Nominal attributes get a value index picked by {@code selectIndexProbabilistically} from
   * the stored statistics; numeric attributes get a Gaussian sample scaled by the stored standard
   * deviation ({@code stats[1]}) and shifted by the stored mean ({@code stats[0]}). Any other
   * attribute type is reported on {@code System.err} and its value is left at 0.
   *
   * @param artSize number of artificial examples to create
   * @param data training data whose header and attribute types define the new examples
   * @return the generated examples, each with weight 1.0
   */
  protected Instances generateArtificialData(int artSize, Instances data) {
    final int attributeCount = data.numAttributes();
    Instances generated = new Instances(data, artSize);

    for (int row = 0; row < artSize; row++) {
      double[] values = new double[attributeCount];

      for (int col = 0; col < attributeCount; col++) {
        if (data.attribute(col).isNominal()) {
          // Pick a nominal value index according to the stored distribution.
          double[] distribution = (double[]) m_AttributeStats.get(col);
          values[col] = (double) selectIndexProbabilistically(distribution);
          continue;
        }
        if (data.attribute(col).isNumeric()) {
          // Sample from the Gaussian described by the stored mean and standard deviation.
          double[] meanAndStdDev = (double[]) m_AttributeStats.get(col);
          double mean = meanAndStdDev[0];
          double stdDev = meanAndStdDev[1];
          values[col] = (m_Random.nextGaussian() * stdDev) + mean;
          continue;
        }
        System.err.println("Decorate can only handle numeric and nominal values.");
      }

      generated.add(new Instance(1.0, values));
    }
    return generated;
  }
示例#2
0
  /**
   * Compute and store the statistics required for generating artificial data.
   *
   * <p>After this call {@code m_AttributeStats} holds exactly one entry per attribute, at the
   * attribute's own index:
   *
   * <ul>
   *   <li>nominal attribute: the Laplace-smoothed cumulative probabilities of its first
   *       {@code k - 1} values (the last cumulative value is omitted);
   *   <li>numeric attribute: a two-element array {@code {mean, standard deviation}};
   *   <li>any other attribute type: {@code null}, after a warning on {@code System.err}.
   * </ul>
   *
   * @param data training instances
   * @exception Exception if a nominal attribute has fewer than two values or the statistics
   *     could not be calculated successfully
   */
  protected void computeStats(Instances data) throws Exception {
    int numAttributes = data.numAttributes();
    m_AttributeStats = new Vector(numAttributes); // maps attribute index -> its stats

    for (int j = 0; j < numAttributes; j++) {
      if (data.attribute(j).isNominal()) {
        // Compute the probability of occurrence of each distinct value
        int[] nomCounts = (data.attributeStats(j)).nominalCounts;
        double[] counts = new double[nomCounts.length];
        if (counts.length < 2) {
          throw new Exception("Nominal attribute has less than two distinct values!");
        }
        // Perform Laplace smoothing so that unseen values keep a non-zero probability
        for (int i = 0; i < counts.length; i++) {
          counts[i] = nomCounts[i] + 1;
        }
        Utils.normalize(counts);
        // Only the first k-1 cumulative probabilities are stored
        double[] stats = new double[counts.length - 1];
        stats[0] = counts[0];
        // Calculate cumulative probabilities
        for (int i = 1; i < stats.length; i++) {
          stats[i] = stats[i - 1] + counts[i];
        }
        m_AttributeStats.add(j, stats);
      } else if (data.attribute(j).isNumeric()) {
        // Get mean and standard deviation from the training data
        double[] stats = new double[2];
        stats[0] = data.meanOrMode(j);
        stats[1] = Math.sqrt(data.variance(j));
        m_AttributeStats.add(j, stats);
      } else {
        System.err.println("Decorate can only handle numeric and nominal values.");
        // Keep the vector aligned with attribute indices. Without a placeholder the next
        // add(j, ...) would be called with j > size() and throw ArrayIndexOutOfBoundsException.
        m_AttributeStats.add(j, null);
      }
    }
  }
示例#3
0
文件: J48.java 项目: bigbigbug/wekax
 /**
  * Lists the names of the additional measures this classifier exposes.
  *
  * @return an enumeration over the measure names, in the order tree size, number of leaves,
  *     number of rules
  */
 public Enumeration enumerateMeasures() {
   String[] measureNames = {"measureTreeSize", "measureNumLeaves", "measureNumRules"};
   Vector measures = new Vector(3);
   for (int k = 0; k < measureNames.length; k++) {
     measures.addElement(measureNames[k]);
   }
   return measures.elements();
 }
示例#4
0
  /**
   * Describes the Decorate ensemble as text.
   *
   * <p>When no committee exists yet a fixed "no model" message is returned; otherwise the
   * description of every committee member is listed, followed by the committee size.
   *
   * @return description of the Decorate classifier as a string
   */
  public String toString() {
    if (m_Committee == null) {
      return "Decorate: No model built yet.";
    }

    StringBuffer description = new StringBuffer();
    description.append("Decorate base classifiers: \n\n");

    int member = 0;
    while (member < m_Committee.size()) {
      Classifier current = (Classifier) m_Committee.get(member);
      description.append(current.toString()).append("\n\n");
      member++;
    }

    description
        .append("Number of classifier in the ensemble: ")
        .append(m_Committee.size())
        .append("\n");
    return description.toString();
  }
示例#5
0
  /**
   * Computes the class membership probabilities for a test instance by summing the
   * distributions predicted by every committee member and normalizing the result.
   *
   * <p>If the summed distribution is all zero it is returned as is, without normalization.
   *
   * @param instance the instance to be classified
   * @return predicted class probability distribution
   * @exception Exception if the class is numeric or the distribution can't be computed
   *     successfully
   */
  public double[] distributionForInstance(Instance instance) throws Exception {
    if (instance.classAttribute().isNumeric()) {
      throw new UnsupportedClassTypeException("Decorate can't handle a numeric class!");
    }

    double[] totals = new double[instance.numClasses()];

    // Accumulate the votes of every committee member, class by class.
    for (int member = 0; member < m_Committee.size(); member++) {
      Classifier voter = (Classifier) m_Committee.get(member);
      double[] vote = voter.distributionForInstance(instance);
      for (int cls = 0; cls < vote.length; cls++) {
        totals[cls] += vote[cls];
      }
    }

    // Normalize only when there is some probability mass to distribute.
    if (!Utils.eq(Utils.sum(totals), 0)) {
      Utils.normalize(totals);
    }
    return totals;
  }
示例#6
0
  /**
   * Lists the command-line options understood by this classifier: the ensemble size
   * ({@code -E}) and the artificial-data factor ({@code -R}), followed by every option
   * reported by the superclass.
   *
   * @return an enumeration of all the available options
   */
  public Enumeration listOptions() {
    Vector options = new Vector(8);

    Option ensembleSize =
        new Option("\tDesired size of ensemble.\n" + "\t(default 10)", "E", 1, "-E");
    options.addElement(ensembleSize);

    Option artificialFactor =
        new Option(
            "\tFactor that determines number of artificial examples to generate.\n"
                + "\tSpecified proportional to training set size.\n"
                + "\t(default 1.0)",
            "R",
            1,
            "-R");
    options.addElement(artificialFactor);

    // Append whatever the superclass offers after the Decorate-specific options.
    for (Enumeration inherited = super.listOptions(); inherited.hasMoreElements(); ) {
      options.addElement(inherited.nextElement());
    }
    return options.elements();
  }
示例#7
0
  /**
   * Builds the Decorate ensemble from the given training data.
   *
   * <p>The first committee member is trained on a copy of the training data from which
   * instances with a missing class have been deleted. Each following iteration generates
   * artificial examples, labels them via {@code labelData}, temporarily adds them to the
   * training copy, trains a fresh copy of the base classifier on the result, and removes the
   * artificial examples again. The new classifier is kept only if the ensemble error reported
   * by {@code computeError} on the training copy did not increase. The loop stops once the
   * committee reaches {@code m_DesiredSize} members or {@code m_NumIterations} trials have run.
   *
   * @param data the training data to be used for generating the classifier
   * @exception Exception if no base classifier is set, the data contains string attributes,
   *     the class is numeric, the iteration limit is smaller than the desired ensemble size, or
   *     the classifier could not be built successfully
   */
  public void buildClassifier(Instances data) throws Exception {
    if (m_Classifier == null) {
      throw new Exception("A base classifier has not been specified!");
    }
    if (data.checkForStringAttributes()) {
      throw new UnsupportedAttributeTypeException("Cannot handle string attributes!");
    }
    if (data.classAttribute().isNumeric()) {
      throw new UnsupportedClassTypeException("Decorate can't handle a numeric class!");
    }
    if (m_NumIterations < m_DesiredSize)
      throw new Exception("Max number of iterations must be >= desired ensemble size!");

    // initialize random number generator; a seed of -1 selects an unseeded generator
    if (m_Seed == -1) m_Random = new Random();
    else m_Random = new Random(m_Seed);

    int i = 1; // current committee size
    int numTrials = 1; // number of Decorate iterations
    Instances divData = new Instances(data); // local copy of data - diversity data
    divData.deleteWithMissingClass();
    Instances artData = null; // artificial data

    // compute number of artificial instances to add at each iteration
    int artSize = (int) (Math.abs(m_ArtSize) * divData.numInstances());
    if (artSize == 0) artSize = 1; // at least add one random example
    // Compute training data stats for creating artificial examples.
    // Note: the stats come from the original data, not the filtered copy divData.
    computeStats(data);

    // initialize new committee
    m_Committee = new Vector();
    // NOTE(review): the first member is the m_Classifier object itself, not a copy, so the
    // template that is copied in the loop below has already been trained here.
    Classifier newClassifier = m_Classifier;
    newClassifier.buildClassifier(divData);
    m_Committee.add(newClassifier);
    double eComm = computeError(divData); // compute ensemble error
    if (m_Debug)
      System.out.println(
          "Initialize:\tClassifier " + i + " added to ensemble. Ensemble error = " + eComm);

    // repeat till desired committee size is reached OR the max number of iterations is exceeded
    while (i < m_DesiredSize && numTrials < m_NumIterations) {
      // Generate artificial training examples
      artData = generateArtificialData(artSize, data);

      // Label artificial examples
      labelData(artData);
      addInstances(divData, artData); // Add new artificial data

      // Build new classifier
      Classifier tmp[] = Classifier.makeCopies(m_Classifier, 1);
      newClassifier = tmp[0];
      newClassifier.buildClassifier(divData);
      // Remove all the artificial data so the error below is measured on training data only
      // (presumably removeInstances drops the artSize instances just added; verify against helper)
      removeInstances(divData, artSize);

      // Test if the new classifier should be added to the ensemble:
      // it is added tentatively so that computeError sees the enlarged committee
      m_Committee.add(newClassifier); // add new classifier to current committee
      double currError = computeError(divData);
      if (currError <= eComm) { // adding the new member did not increase the error
        i++;
        eComm = currError;
        if (m_Debug)
          System.out.println(
              "Iteration: "
                  + (1 + numTrials)
                  + "\tClassifier "
                  + i
                  + " added to ensemble. Ensemble error = "
                  + eComm);
      } else { // reject the current classifier because it increased the ensemble error
        m_Committee.removeElementAt(m_Committee.size() - 1); // pop the last member
      }
      numTrials++;
    }
  }
示例#8
0
文件: J48.java 项目: bigbigbug/wekax
  /**
   * Lists the command-line options understood by this classifier.
   *
   * <p>The options, in the order they are returned:
   *
   * <p>-U <br>
   * Use unpruned tree.
   *
   * <p>-C confidence <br>
   * Set confidence threshold for pruning. (Default: 0.25)
   *
   * <p>-M number <br>
   * Set minimum number of instances per leaf. (Default: 2)
   *
   * <p>-R <br>
   * Use reduced error pruning.
   *
   * <p>-N number <br>
   * Set number of folds for reduced error pruning. One fold is used as the pruning set.
   * (Default: 3)
   *
   * <p>-B <br>
   * Use binary splits only.
   *
   * <p>-S <br>
   * Don't perform subtree raising.
   *
   * <p>-L <br>
   * Do not clean up after the tree has been built.
   *
   * <p>-A <br>
   * Use Laplace smoothing for predicted probabilities.
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Vector options = new Vector(9);

    Option unpruned = new Option("\tUse unpruned tree.", "U", 0, "-U");
    options.addElement(unpruned);

    Option confidence =
        new Option(
            "\tSet confidence threshold for pruning.\n" + "\t(default 0.25)",
            "C",
            1,
            "-C <pruning confidence>");
    options.addElement(confidence);

    Option minInstances =
        new Option(
            "\tSet minimum number of instances per leaf.\n" + "\t(default 2)",
            "M",
            1,
            "-M <minimum number of instances>");
    options.addElement(minInstances);

    Option reducedError = new Option("\tUse reduced error pruning.", "R", 0, "-R");
    options.addElement(reducedError);

    Option folds =
        new Option(
            "\tSet number of folds for reduced error\n"
                + "\tpruning. One fold is used as pruning set.\n"
                + "\t(default 3)",
            "N",
            1,
            "-N <number of folds>");
    options.addElement(folds);

    Option binarySplits = new Option("\tUse binary splits only.", "B", 0, "-B");
    options.addElement(binarySplits);

    Option noSubtreeRaising = new Option("\tDon't perform subtree raising.", "S", 0, "-S");
    options.addElement(noSubtreeRaising);

    Option noCleanup =
        new Option("\tDo not clean up after the tree has been built.", "L", 0, "-L");
    options.addElement(noCleanup);

    Option laplace =
        new Option("\tLaplace smoothing for predicted probabilities.", "A", 0, "-A");
    options.addElement(laplace);

    return options.elements();
  }