/** * Generate artificial training examples. * * @param artSize size of examples set to create * @param data training data * @return the set of unlabeled artificial examples */ protected Instances generateArtificialData(int artSize, Instances data) { int numAttributes = data.numAttributes(); Instances artData = new Instances(data, artSize); double[] att; Instance artInstance; for (int i = 0; i < artSize; i++) { att = new double[numAttributes]; for (int j = 0; j < numAttributes; j++) { if (data.attribute(j).isNominal()) { // Select nominal value based on the frequency of occurence in the training data double[] stats = (double[]) m_AttributeStats.get(j); att[j] = (double) selectIndexProbabilistically(stats); } else if (data.attribute(j).isNumeric()) { // Generate numeric value from the Guassian distribution // defined by the mean and std dev of the attribute double[] stats = (double[]) m_AttributeStats.get(j); att[j] = (m_Random.nextGaussian() * stats[1]) + stats[0]; } else System.err.println("Decorate can only handle numeric and nominal values."); } artInstance = new Instance(1.0, att); artData.add(artInstance); } return artData; }
/**
 * Lists the names of the additional measures.
 *
 * @return an enumeration of the measure names, in declaration order
 */
public Enumeration enumerateMeasures() {
  String[] measureNames = {"measureTreeSize", "measureNumLeaves", "measureNumRules"};
  Vector names = new Vector(measureNames.length);
  for (int k = 0; k < measureNames.length; k++) {
    names.addElement(measureNames[k]);
  }
  return names.elements();
}
/** * Compute and store statistics required for generating artificial data. * * @param data training instances * @exception Exception if statistics could not be calculated successfully */ protected void computeStats(Instances data) throws Exception { int numAttributes = data.numAttributes(); m_AttributeStats = new Vector(numAttributes); // use to map attributes to their stats for (int j = 0; j < numAttributes; j++) { if (data.attribute(j).isNominal()) { // Compute the probability of occurence of each distinct value int[] nomCounts = (data.attributeStats(j)).nominalCounts; double[] counts = new double[nomCounts.length]; if (counts.length < 2) throw new Exception("Nominal attribute has less than two distinct values!"); // Perform Laplace smoothing for (int i = 0; i < counts.length; i++) counts[i] = nomCounts[i] + 1; Utils.normalize(counts); double[] stats = new double[counts.length - 1]; stats[0] = counts[0]; // Calculate cumulative probabilities for (int i = 1; i < stats.length; i++) stats[i] = stats[i - 1] + counts[i]; m_AttributeStats.add(j, stats); } else if (data.attribute(j).isNumeric()) { // Get mean and standard deviation from the training data double[] stats = new double[2]; stats[0] = data.meanOrMode(j); stats[1] = Math.sqrt(data.variance(j)); m_AttributeStats.add(j, stats); } else System.err.println("Decorate can only handle numeric and nominal values."); } }
/**
 * Lists the options declared by this scheme.
 *
 * <ul>
 *   <li>{@code -N <num>}: number of clusters; the option text advertises a default of 2.
 *   <li>{@code -S <num>}: random number seed; the option text advertises a default of 10.
 * </ul>
 *
 * <p>NOTE(review): what happens when {@code -N} is omitted is not visible in this method; only
 * the advertised default above is. Confirm against the option parsing code.
 *
 * @return an enumeration of all the available options
 */
public Enumeration listOptions() {
  Vector declared = new Vector(2);

  Option clusterCount = new Option("\tnumber of clusters. (default = 2).", "N", 1, "-N <num>");
  declared.addElement(clusterCount);

  Option seed = new Option("\trandom number seed.\n (default 10)", "S", 1, "-S <num>");
  declared.addElement(seed);

  return declared.elements();
}
/**
 * Describes the Decorate classifier.
 *
 * @return the string form of every committee member followed by the committee size, or a
 *     placeholder message when {@code m_Committee} is {@code null}
 */
public String toString() {
  if (m_Committee == null) {
    return "Decorate: No model built yet.";
  }

  StringBuffer description = new StringBuffer();
  description.append("Decorate base classifiers: \n\n");

  int position = 0;
  while (position < m_Committee.size()) {
    Classifier member = (Classifier) m_Committee.get(position);
    description.append(member.toString()).append("\n\n");
    position++;
  }

  description
      .append("Number of classifier in the ensemble: ")
      .append(m_Committee.size())
      .append("\n");
  return description.toString();
}
/** Static initializer: registers every supported option in the {@code options} vector. */
static {
  Option[] declared = {
    new Option("\tAlpha star. (default = 0.5).", "A", 1, "-A <0-1>"),
    new Option("\tSigma. (default = 1.0).", "S", 1, "-S <num>"),
    new Option(
        "\tR. All points that are far away more than this value have a zero similarity. (default = -1).",
        "R",
        1,
        "-R <num>"),
    new Option("\tUse sparse matrix representation. (default = false).", "M", 0, "-M")
  };
  for (int k = 0; k < declared.length; k++) {
    options.addElement(declared[k]);
  }
}
/**
 * Computes the class membership probabilities for a test instance.
 *
 * <p>The distributions predicted by all committee members are summed per class. The sums are
 * normalized unless their total equals zero (per {@code Utils.eq}), in which case they are
 * returned as they are.
 *
 * @param instance the instance to be classified
 * @return predicted class probability distribution
 * @exception Exception if the class attribute is numeric or a committee member fails to predict
 */
public double[] distributionForInstance(Instance instance) throws Exception {
  if (instance.classAttribute().isNumeric()) {
    throw new UnsupportedClassTypeException("Decorate can't handle a numeric class!");
  }

  double[] totals = new double[instance.numClasses()];

  int member = 0;
  while (member < m_Committee.size()) {
    Classifier voter = (Classifier) m_Committee.get(member);
    double[] memberProbs = voter.distributionForInstance(instance);
    for (int c = 0; c < memberProbs.length; c++) {
      totals[c] += memberProbs[c];
    }
    member++;
  }

  boolean sumsToZero = Utils.eq(Utils.sum(totals), 0);
  if (!sumsToZero) {
    Utils.normalize(totals);
  }
  return totals;
}
/**
 * Returns an enumeration describing the available options.
 *
 * <p>The scheme's own options ({@code -S}, {@code -X}, {@code -E}, {@code -I}, {@code -R}) come
 * first. If the configured search method {@code m_search} implements {@code OptionHandler}, a
 * header entry naming its class and then its own options are appended; otherwise (including when
 * {@code m_search} is {@code null}) only the scheme's options are listed.
 *
 * @return an enumeration of all the available options.
 */
public Enumeration listOptions() {
  Vector newVector = new Vector(7);

  newVector.addElement(
      new Option(
          "\tFull class name of search method, followed\n"
              + "\tby its options.\n"
              + "\teg: \"weka.attributeSelection.BestFirst -D 1\"\n"
              + "\t(default weka.attributeSelection.BestFirst)",
          "S",
          1,
          "-S <search method specification>"));
  newVector.addElement(
      new Option(
          "\tUse cross validation to evaluate features.\n"
              + "\tUse number of folds = 1 for leave one out CV.\n"
              + "\t(Default = leave one out CV)",
          "X",
          1,
          "-X <number of folds>"));
  newVector.addElement(
      new Option(
          "\tPerformance evaluation measure to use for selecting attributes.\n"
              + "\t(Default = accuracy for discrete class and rmse for numeric class)",
          "E",
          1,
          "-E <acc | rmse | mae | auc>"));
  newVector.addElement(
      new Option("\tUse nearest neighbour instead of global table majority.", "I", 0, "-I"));
  newVector.addElement(new Option("\tDisplay decision table rules.\n", "R", 0, "-R"));

  // Guard the cast: instanceof is false for null and for search methods that do not expose
  // options, so neither case can raise NullPointerException/ClassCastException here.
  if (m_search instanceof OptionHandler) {
    newVector.addElement(
        new Option(
            "",
            "",
            0,
            "\nOptions specific to search method " + m_search.getClass().getName() + ":"));
    Enumeration enu = ((OptionHandler) m_search).listOptions();
    while (enu.hasMoreElements()) {
      newVector.addElement(enu.nextElement());
    }
  }
  return newVector.elements();
}
/**
 * Lists the available options: this scheme's {@code -E} (desired ensemble size) and {@code -R}
 * (artificial-example factor) options, followed by everything the superclass lists.
 *
 * @return an enumeration of all the available options
 */
public Enumeration listOptions() {
  Vector collected = new Vector(8);

  Option ensembleSize =
      new Option("\tDesired size of ensemble.\n" + "\t(default 10)", "E", 1, "-E");
  collected.addElement(ensembleSize);

  Option artificialFactor =
      new Option(
          "\tFactor that determines number of artificial examples to generate.\n"
              + "\tSpecified proportional to training set size.\n"
              + "\t(default 1.0)",
          "R",
          1,
          "-R");
  collected.addElement(artificialFactor);

  for (Enumeration inherited = super.listOptions(); inherited.hasMoreElements(); ) {
    collected.addElement(inherited.nextElement());
  }
  return collected.elements();
}
/**
 * Lists the available options: {@code -D} (debugging output), {@code -B} (classifier
 * specification), {@code -S} (random number seed) and {@code -X} (number of cross-validation
 * folds for model selection).
 *
 * @return an enumeration of all the available options.
 */
public Enumeration listOptions() {
  Option[] declared = {
    new Option("\tTurn on debugging output.", "D", 0, "-D"),
    new Option(
        "\tFull class name of classifier to include, followed\n"
            + "\tby scheme options. May be specified multiple times,\n"
            + "\trequired at least twice.\n"
            + "\teg: \"weka.classifiers.bayes.NaiveBayes -D\"",
        "B",
        1,
        "-B <classifier specification>"),
    new Option("\tSets the random number seed (default 1).", "S", 1, "-S <random number seed>"),
    new Option(
        "\tUse cross validation for model selection using the\n"
            + "\tgiven number of folds. (default 0, is to\n"
            + "\tuse training error)",
        "X",
        1,
        "-X <number of folds>")
  };

  Vector listed = new Vector(4);
  for (int k = 0; k < declared.length; k++) {
    listed.addElement(declared[k]);
  }
  return listed.elements();
}
// this method MajorityVoting to decide the probs of the Instance; // protected double[] distributionForInstanceMajorityVoting(Instance instance) throws Exception { double[] probs = new double[instance.classAttribute().numValues()]; double[] votes = new double[probs.length]; for (int i = 0; i < class_Array.length; i++) { probs = class_Array[i].distributionForInstance(instance); int maxIndex = 0; for (int j = 0; j < probs.length; j++) { if (probs[j] > probs[maxIndex]) maxIndex = j; } // Consider the cases when multiple classes happen to have the same probability for (int j = 0; j < probs.length; j++) { if (probs[j] == probs[maxIndex]) votes[j]++; } } int tmpMajorityIndex = 0; for (int k = 1; k < votes.length; k++) { if (votes[k] > votes[tmpMajorityIndex]) tmpMajorityIndex = k; } // Consider the cases when multiple classes receive the same amount of votes Vector<Integer> majorityIndexes = new Vector<Integer>(); for (int k = 0; k < votes.length; k++) { if (votes[k] == votes[tmpMajorityIndex]) majorityIndexes.add(k); } // System.out.println("forth"); // Resolve the ties according to a uniform random distribution int majorityIndex = majorityIndexes.get(m_Random.nextInt(majorityIndexes.size())); // set the probs of the classes which have not been voted to 0 for (int k = 0; k < probs.length; k++) probs[k] = 0; // the class that have been voted the most receives 1 probs[majorityIndex] = 1; return probs; }
/** * Build Decorate classifier * * @param data the training data to be used for generating the classifier * @exception Exception if the classifier could not be built successfully */ public void buildClassifier(Instances data) throws Exception { if (m_Classifier == null) { throw new Exception("A base classifier has not been specified!"); } if (data.checkForStringAttributes()) { throw new UnsupportedAttributeTypeException("Cannot handle string attributes!"); } if (data.classAttribute().isNumeric()) { throw new UnsupportedClassTypeException("Decorate can't handle a numeric class!"); } if (m_NumIterations < m_DesiredSize) throw new Exception("Max number of iterations must be >= desired ensemble size!"); // initialize random number generator if (m_Seed == -1) m_Random = new Random(); else m_Random = new Random(m_Seed); int i = 1; // current committee size int numTrials = 1; // number of Decorate iterations Instances divData = new Instances(data); // local copy of data - diversity data divData.deleteWithMissingClass(); Instances artData = null; // artificial data // compute number of artficial instances to add at each iteration int artSize = (int) (Math.abs(m_ArtSize) * divData.numInstances()); if (artSize == 0) artSize = 1; // atleast add one random example computeStats(data); // Compute training data stats for creating artificial examples // initialize new committee m_Committee = new Vector(); Classifier newClassifier = m_Classifier; newClassifier.buildClassifier(divData); m_Committee.add(newClassifier); double eComm = computeError(divData); // compute ensemble error if (m_Debug) System.out.println( "Initialize:\tClassifier " + i + " added to ensemble. 
Ensemble error = " + eComm); // repeat till desired committee size is reached OR the max number of iterations is exceeded while (i < m_DesiredSize && numTrials < m_NumIterations) { // Generate artificial training examples artData = generateArtificialData(artSize, data); // Label artificial examples labelData(artData); addInstances(divData, artData); // Add new artificial data // Build new classifier Classifier tmp[] = Classifier.makeCopies(m_Classifier, 1); newClassifier = tmp[0]; newClassifier.buildClassifier(divData); // Remove all the artificial data removeInstances(divData, artSize); // Test if the new classifier should be added to the ensemble m_Committee.add(newClassifier); // add new classifier to current committee double currError = computeError(divData); if (currError <= eComm) { // adding the new member did not increase the error i++; eComm = currError; if (m_Debug) System.out.println( "Iteration: " + (1 + numTrials) + "\tClassifier " + i + " added to ensemble. Ensemble error = " + eComm); } else { // reject the current classifier because it increased the ensemble error m_Committee.removeElementAt(m_Committee.size() - 1); // pop the last member } numTrials++; } }
/**
 * Lists the available options.
 *
 * @return an enumeration of all the available options, obtained from {@code options}
 */
public Enumeration listOptions() {
  Enumeration available = options.elements();
  return available;
}
/**
 * Lists the available options.
 *
 * <ul>
 *   <li>{@code -U}: use unpruned tree.
 *   <li>{@code -C <pruning confidence>}: confidence threshold for pruning (default 0.25).
 *   <li>{@code -M <minimum number of instances>}: minimum number of instances per leaf (default
 *       2).
 *   <li>{@code -R}: use reduced error pruning.
 *   <li>{@code -N <number of folds>}: number of folds for reduced error pruning; one fold is used
 *       as the pruning set (default 3).
 *   <li>{@code -B}: use binary splits only.
 *   <li>{@code -S}: don't perform subtree raising.
 *   <li>{@code -L}: do not clean up after the tree has been built.
 *   <li>{@code -A}: Laplace smoothing for predicted probabilities.
 * </ul>
 *
 * @return an enumeration of all the available options.
 */
public Enumeration listOptions() {
  Option[] declared = {
    new Option("\tUse unpruned tree.", "U", 0, "-U"),
    new Option(
        "\tSet confidence threshold for pruning.\n" + "\t(default 0.25)",
        "C",
        1,
        "-C <pruning confidence>"),
    new Option(
        "\tSet minimum number of instances per leaf.\n" + "\t(default 2)",
        "M",
        1,
        "-M <minimum number of instances>"),
    new Option("\tUse reduced error pruning.", "R", 0, "-R"),
    new Option(
        "\tSet number of folds for reduced error\n"
            + "\tpruning. One fold is used as pruning set.\n"
            + "\t(default 3)",
        "N",
        1,
        "-N <number of folds>"),
    new Option("\tUse binary splits only.", "B", 0, "-B"),
    new Option("\tDon't perform subtree raising.", "S", 0, "-S"),
    new Option("\tDo not clean up after the tree has been built.", "L", 0, "-L"),
    new Option("\tLaplace smoothing for predicted probabilities.", "A", 0, "-A")
  };

  Vector listed = new Vector(9);
  for (int k = 0; k < declared.length; k++) {
    listed.addElement(declared[k]);
  }
  return listed.elements();
}