/** * Generate artificial training examples. * * @param artSize size of examples set to create * @param data training data * @return the set of unlabeled artificial examples */ protected Instances generateArtificialData(int artSize, Instances data) { int numAttributes = data.numAttributes(); Instances artData = new Instances(data, artSize); double[] att; Instance artInstance; for (int i = 0; i < artSize; i++) { att = new double[numAttributes]; for (int j = 0; j < numAttributes; j++) { if (data.attribute(j).isNominal()) { // Select nominal value based on the frequency of occurence in the training data double[] stats = (double[]) m_AttributeStats.get(j); att[j] = (double) selectIndexProbabilistically(stats); } else if (data.attribute(j).isNumeric()) { // Generate numeric value from the Guassian distribution // defined by the mean and std dev of the attribute double[] stats = (double[]) m_AttributeStats.get(j); att[j] = (m_Random.nextGaussian() * stats[1]) + stats[0]; } else System.err.println("Decorate can only handle numeric and nominal values."); } artInstance = new Instance(1.0, att); artData.add(artInstance); } return artData; }
/** * Compute and store statistics required for generating artificial data. * * @param data training instances * @exception Exception if statistics could not be calculated successfully */ protected void computeStats(Instances data) throws Exception { int numAttributes = data.numAttributes(); m_AttributeStats = new Vector(numAttributes); // use to map attributes to their stats for (int j = 0; j < numAttributes; j++) { if (data.attribute(j).isNominal()) { // Compute the probability of occurence of each distinct value int[] nomCounts = (data.attributeStats(j)).nominalCounts; double[] counts = new double[nomCounts.length]; if (counts.length < 2) throw new Exception("Nominal attribute has less than two distinct values!"); // Perform Laplace smoothing for (int i = 0; i < counts.length; i++) counts[i] = nomCounts[i] + 1; Utils.normalize(counts); double[] stats = new double[counts.length - 1]; stats[0] = counts[0]; // Calculate cumulative probabilities for (int i = 1; i < stats.length; i++) stats[i] = stats[i - 1] + counts[i]; m_AttributeStats.add(j, stats); } else if (data.attribute(j).isNumeric()) { // Get mean and standard deviation from the training data double[] stats = new double[2]; stats[0] = data.meanOrMode(j); stats[1] = Math.sqrt(data.variance(j)); m_AttributeStats.add(j, stats); } else System.err.println("Decorate can only handle numeric and nominal values."); } }
/** * Returns an enumeration of the additional measure names * * @return an enumeration of the measure names */ public Enumeration enumerateMeasures() { Vector newVector = new Vector(3); newVector.addElement("measureTreeSize"); newVector.addElement("measureNumLeaves"); newVector.addElement("measureNumRules"); return newVector.elements(); }
/** * Returns description of the Decorate classifier. * * @return description of the Decorate classifier as a string */ public String toString() { if (m_Committee == null) { return "Decorate: No model built yet."; } StringBuffer text = new StringBuffer(); text.append("Decorate base classifiers: \n\n"); for (int i = 0; i < m_Committee.size(); i++) text.append(((Classifier) m_Committee.get(i)).toString() + "\n\n"); text.append("Number of classifier in the ensemble: " + m_Committee.size() + "\n"); return text.toString(); }
/** * Calculates the class membership probabilities for the given test instance. * * @param instance the instance to be classified * @return predicted class probability distribution * @exception Exception if distribution can't be computed successfully */ public double[] distributionForInstance(Instance instance) throws Exception { if (instance.classAttribute().isNumeric()) { throw new UnsupportedClassTypeException("Decorate can't handle a numeric class!"); } double[] sums = new double[instance.numClasses()], newProbs; Classifier curr; for (int i = 0; i < m_Committee.size(); i++) { curr = (Classifier) m_Committee.get(i); newProbs = curr.distributionForInstance(instance); for (int j = 0; j < newProbs.length; j++) sums[j] += newProbs[j]; } if (Utils.eq(Utils.sum(sums), 0)) { return sums; } else { Utils.normalize(sums); return sums; } }
/** * Returns an enumeration describing the available options * * @return an enumeration of all the available options */ public Enumeration listOptions() { Vector newVector = new Vector(8); newVector.addElement( new Option("\tDesired size of ensemble.\n" + "\t(default 10)", "E", 1, "-E")); newVector.addElement( new Option( "\tFactor that determines number of artificial examples to generate.\n" + "\tSpecified proportional to training set size.\n" + "\t(default 1.0)", "R", 1, "-R")); Enumeration enu = super.listOptions(); while (enu.hasMoreElements()) { newVector.addElement(enu.nextElement()); } return newVector.elements(); }
/** * Build Decorate classifier * * @param data the training data to be used for generating the classifier * @exception Exception if the classifier could not be built successfully */ public void buildClassifier(Instances data) throws Exception { if (m_Classifier == null) { throw new Exception("A base classifier has not been specified!"); } if (data.checkForStringAttributes()) { throw new UnsupportedAttributeTypeException("Cannot handle string attributes!"); } if (data.classAttribute().isNumeric()) { throw new UnsupportedClassTypeException("Decorate can't handle a numeric class!"); } if (m_NumIterations < m_DesiredSize) throw new Exception("Max number of iterations must be >= desired ensemble size!"); // initialize random number generator if (m_Seed == -1) m_Random = new Random(); else m_Random = new Random(m_Seed); int i = 1; // current committee size int numTrials = 1; // number of Decorate iterations Instances divData = new Instances(data); // local copy of data - diversity data divData.deleteWithMissingClass(); Instances artData = null; // artificial data // compute number of artficial instances to add at each iteration int artSize = (int) (Math.abs(m_ArtSize) * divData.numInstances()); if (artSize == 0) artSize = 1; // atleast add one random example computeStats(data); // Compute training data stats for creating artificial examples // initialize new committee m_Committee = new Vector(); Classifier newClassifier = m_Classifier; newClassifier.buildClassifier(divData); m_Committee.add(newClassifier); double eComm = computeError(divData); // compute ensemble error if (m_Debug) System.out.println( "Initialize:\tClassifier " + i + " added to ensemble. Ensemble error = " + eComm); // repeat till desired committee size is reached OR the max number of iterations is exceeded while (i < m_DesiredSize && numTrials < m_NumIterations) { // Generate artificial training examples artData = generateArtificialData(artSize, data); // Label artificial examples labelData(artData); addInstances(divData, artData); // Add new artificial data // Build new classifier Classifier tmp[] = Classifier.makeCopies(m_Classifier, 1); newClassifier = tmp[0]; newClassifier.buildClassifier(divData); // Remove all the artificial data removeInstances(divData, artSize); // Test if the new classifier should be added to the ensemble m_Committee.add(newClassifier); // add new classifier to current committee double currError = computeError(divData); if (currError <= eComm) { // adding the new member did not increase the error i++; eComm = currError; if (m_Debug) System.out.println( "Iteration: " + (1 + numTrials) + "\tClassifier " + i + " added to ensemble. Ensemble error = " + eComm); } else { // reject the current classifier because it increased the ensemble error m_Committee.removeElementAt(m_Committee.size() - 1); // pop the last member } numTrials++; } }
/** * Returns an enumeration describing the available options. * * <p>Valid options are: * * <p>-U <br> * Use unpruned tree. * * <p>-C confidence <br> * Set confidence threshold for pruning. (Default: 0.25) * * <p>-M number <br> * Set minimum number of instances per leaf. (Default: 2) * * <p>-R <br> * Use reduced error pruning. No subtree raising is performed. * * <p>-N number <br> * Set number of folds for reduced error pruning. One fold is used as the pruning set. (Default: * 3) * * <p>-B <br> * Use binary splits for nominal attributes. * * <p>-S <br> * Don't perform subtree raising. * * <p>-L <br> * Do not clean up after the tree has been built. * * <p>-A <br> * If set, Laplace smoothing is used for predicted probabilites. * * <p> * * @return an enumeration of all the available options. */ public Enumeration listOptions() { Vector newVector = new Vector(9); newVector.addElement(new Option("\tUse unpruned tree.", "U", 0, "-U")); newVector.addElement( new Option( "\tSet confidence threshold for pruning.\n" + "\t(default 0.25)", "C", 1, "-C <pruning confidence>")); newVector.addElement( new Option( "\tSet minimum number of instances per leaf.\n" + "\t(default 2)", "M", 1, "-M <minimum number of instances>")); newVector.addElement(new Option("\tUse reduced error pruning.", "R", 0, "-R")); newVector.addElement( new Option( "\tSet number of folds for reduced error\n" + "\tpruning. One fold is used as pruning set.\n" + "\t(default 3)", "N", 1, "-N <number of folds>")); newVector.addElement(new Option("\tUse binary splits only.", "B", 0, "-B")); newVector.addElement(new Option("\tDon't perform subtree raising.", "S", 0, "-S")); newVector.addElement( new Option("\tDo not clean up after the tree has been built.", "L", 0, "-L")); newVector.addElement( new Option("\tLaplace smoothing for predicted probabilities.", "A", 0, "-A")); return newVector.elements(); }