// 构造一个tri-trainer分类器。 public Tritrainer( String classifier, String trainingIns_File, String testIns_File, double precentage) { try { this.classifier1 = (Classifier) Class.forName(classifier).newInstance(); this.classifier2 = (Classifier) Class.forName(classifier).newInstance(); this.classifier3 = (Classifier) Class.forName(classifier).newInstance(); Instances trainingInstances = Util.getInstances(trainingIns_File); // 将trainIns_File按照precentage和(1-precentage)的比例切割成labeledIns和unlabeledIns; int length = trainingInstances.numInstances(); int i = new Double(length * precentage).intValue(); labeledIns = new Instances(trainingInstances, 0); for (int j = 0; j < i; j++) { labeledIns.add(trainingInstances.firstInstance()); trainingInstances.delete(0); } unlabeledIns = trainingInstances; testIns = Util.getInstances(testIns_File); Init(); } catch (Exception e) { } }
// 计算h1,h2分类器共同的分类错误率; public double measureBothError(Classifier h1, Classifier h2, Instances test) { int m = test.numInstances(); double value1, value2, value; int error = 0, total = 0; try { for (int i = 0; i < m; i++) { value = test.instance(i).classValue(); value1 = h1.classifyInstance(test.instance(i)); value2 = h2.classifyInstance(test.instance(i)); // 两分类器做出相同决策 if (value1 == value2) { // 两分类器做出相同决策的样本数量 total++; // 两分类器做出相同错误决策 if (value != value1) { // 两分类器做出相同错误决策的样本数量 error++; } } } } catch (Exception e) { System.out.println(e); } // System.out.println("m:=" + m); // System.out.println("error:=" + error +"; total:=" + total); // 两个分类器的分类错误率= 两分类器做出相同错误决策的样本数量/两分类器做出相同决策的样本数量 return (error * 1.0) / total; }
/** * Generate artificial training examples. * * @param artSize size of examples set to create * @param data training data * @return the set of unlabeled artificial examples */ protected Instances generateArtificialData(int artSize, Instances data) { int numAttributes = data.numAttributes(); Instances artData = new Instances(data, artSize); double[] att; Instance artInstance; for (int i = 0; i < artSize; i++) { att = new double[numAttributes]; for (int j = 0; j < numAttributes; j++) { if (data.attribute(j).isNominal()) { // Select nominal value based on the frequency of occurence in the training data double[] stats = (double[]) m_AttributeStats.get(j); att[j] = (double) selectIndexProbabilistically(stats); } else if (data.attribute(j).isNumeric()) { // Generate numeric value from the Guassian distribution // defined by the mean and std dev of the attribute double[] stats = (double[]) m_AttributeStats.get(j); att[j] = (m_Random.nextGaussian() * stats[1]) + stats[0]; } else System.err.println("Decorate can only handle numeric and nominal values."); } artInstance = new Instance(1.0, att); artData.add(artInstance); } return artData; }
// 将样本集中裁剪提取成m个样本组成的集合; public void SubSample(Instances inst, int m) { inst.randomize(new Random()); while (inst.numInstances() != m) { inst.delete(0); } // System.out.println("subsample:=" + inst.numInstances() + " m:=" + m ); }
/** * GetKs - return [K_1,K_2,...,K_L] where each Y_j \in {1,...,K_j}. In the multi-label case, K[j] * = 2 for all j = 1,...,L. * * @param D a dataset * @return an array of the number of values that each label can take */ private static int[] getKs(Instances D) { int L = D.classIndex(); int K[] = new int[L]; for (int k = 0; k < L; k++) { K[k] = D.attribute(k).numValues(); } return K; }
public int getClusterNumber(String objectID) { int datasetIndex = -1; for (int i = 0; i < m_Sequences.numInstances(); i++) { if (objectID.equals(m_Sequences.instance(i).stringValue(0))) datasetIndex = i; } return cluster[datasetIndex]; }
protected void initMinMax(Instances data) { m_Min = new double[data.numAttributes()]; m_Max = new double[data.numAttributes()]; for (int i = 0; i < data.numAttributes(); i++) { m_Min[i] = m_Max[i] = Double.NaN; } for (int i = 0; i < data.numInstances(); i++) { updateMinMax(data.instance(i)); } }
/** * Labels the artificially generated data. * * @param artData the artificially generated instances * @exception Exception if instances cannot be labeled successfully */ protected void labelData(Instances artData) throws Exception { Instance curr; double[] probs; for (int i = 0; i < artData.numInstances(); i++) { curr = artData.instance(i); // compute the class membership probs predicted by the current ensemble probs = distributionForInstance(curr); // select class label inversely proportional to the ensemble predictions curr.setClassValue(inverseLabel(probs)); } }
/** * Computes the error in classification on the given data. * * @param data the instances to be classified * @return classification error * @exception Exception if error can not be computed successfully */ protected double computeError(Instances data) throws Exception { double error = 0.0; int numInstances = data.numInstances(); Instance curr; for (int i = 0; i < numInstances; i++) { curr = data.instance(i); // Check if the instance has been misclassified if (curr.classValue() != ((int) classifyInstance(curr))) error++; } return (error / numInstances); }
/** * Inserts an instance into the hash table * * @param inst instance to be inserted * @param instA to create the hash key from * @throws Exception if the instance can't be inserted */ private void insertIntoTable(Instance inst, double[] instA) throws Exception { double[] tempClassDist2; double[] newDist; DecisionTableHashKey thekey; if (instA != null) { thekey = new DecisionTableHashKey(instA); } else { thekey = new DecisionTableHashKey(inst, inst.numAttributes(), false); } // see if this one is already in the table tempClassDist2 = (double[]) m_entries.get(thekey); if (tempClassDist2 == null) { if (m_classIsNominal) { newDist = new double[m_theInstances.classAttribute().numValues()]; // Leplace estimation for (int i = 0; i < m_theInstances.classAttribute().numValues(); i++) { newDist[i] = 1.0; } newDist[(int) inst.classValue()] = inst.weight(); // add to the table m_entries.put(thekey, newDist); } else { newDist = new double[2]; newDist[0] = inst.classValue() * inst.weight(); newDist[1] = inst.weight(); // add to the table m_entries.put(thekey, newDist); } } else { // update the distribution for this instance if (m_classIsNominal) { tempClassDist2[(int) inst.classValue()] += inst.weight(); // update the table m_entries.put(thekey, tempClassDist2); } else { tempClassDist2[0] += (inst.classValue() * inst.weight()); tempClassDist2[1] += inst.weight(); // update the table m_entries.put(thekey, tempClassDist2); } } }
/** * Generates a clusterer by the mean of spectral clustering algorithm. * * @param data set of instances serving as training data * @exception Exception if the clusterer has not been generated successfully */ public void buildClusterer(Instances data) throws java.lang.Exception { m_Sequences = new Instances(data); int n = data.numInstances(); int k = data.numAttributes(); DoubleMatrix2D w; if (useSparseMatrix) w = DoubleFactory2D.sparse.make(n, n); else w = DoubleFactory2D.dense.make(n, n); double[][] v1 = new double[n][]; for (int i = 0; i < n; i++) v1[i] = data.instance(i).toDoubleArray(); v = DoubleFactory2D.dense.make(v1); double sigma_sq = sigma * sigma; // Sets up similarity matrix for (int i = 0; i < n; i++) for (int j = i; j < n; j++) { /*double dist = distnorm2(v.viewRow(i), v.viewRow(j)); if((r == -1) || (dist < r)) { double sim = Math.exp(- (dist * dist) / (2 * sigma_sq)); w.set(i, j, sim); w.set(j, i, sim); }*/ /* String [] key = {data.instance(i).stringValue(0), data.instance(j).stringValue(0)}; System.out.println(key[0]); System.out.println(key[1]); System.out.println(simScoreMap.containsKey(key)); Double simValue = simScoreMap.get(key);*/ double sim = sim_matrix[i][j]; w.set(i, j, sim); w.set(j, i, sim); } // Partitions points int[][] p = partition(w, alpha_star); // Deploys results numOfClusters = p.length; cluster = new int[n]; for (int i = 0; i < p.length; i++) for (int j = 0; j < p[i].length; j++) cluster[p[i][j]] = i; // System.out.println("Final partition:"); // UtilsJS.printMatrix(p); // System.out.println("Cluster:\n"); // UtilsJS.printArray(cluster); this.numOfClusters = cluster[Utils.maxIndex(cluster)] + 1; // System.out.println("Num clusters:\t"+this.numOfClusters); }
protected void updateMinDistance( double[] minDistance, boolean[] selected, Instances data, Instance center) { for (int i = 0; i < selected.length; i++) if (!selected[i]) { double d = distance(center, data.instance(i)); if (d < minDistance[i]) minDistance[i] = d; } }
// 通过h1,h2分类器学习样本集,将h1,h2分类决策相同的样本放入L中,得到标记集合; public void updateL(Classifier h1, Classifier h2, Instances L, Instances test) { int length = unlabeledIns.numInstances(); double value1 = 0.0, value2 = 0.0; try { for (int i = 0; i < length; i++) { value1 = h1.classifyInstance(test.instance(i)); value2 = h2.classifyInstance(test.instance(i)); if (value1 == value2) { // 当两个分类器做出相同决策时重新标记样本的类别; test.instance(i).setClassValue(value1); L.add(test.instance(i)); } } } catch (Exception e) { System.out.println(e); } // return false; }
/** * loads the given dataset and prints the Capabilities necessary to process it. * * <p>Valid parameters: * * <p>-file filename <br> * the file to load * * <p>-c index the explicit index of the class attribute (default: none) * * @param args the commandline arguments * @throws Exception if something goes wrong */ public static void main(String[] args) throws Exception { String tmpStr; String filename; DataSource source; Instances data; int classIndex; Capabilities cap; Iterator iter; if (args.length == 0) { System.out.println( "\nUsage: " + Capabilities.class.getName() + " -file <dataset> [-c <class index>]\n"); return; } // get parameters tmpStr = Utils.getOption("file", args); if (tmpStr.length() == 0) throw new Exception("No file provided with option '-file'!"); else filename = tmpStr; tmpStr = Utils.getOption("c", args); if (tmpStr.length() != 0) { if (tmpStr.equals("first")) classIndex = 0; else if (tmpStr.equals("last")) classIndex = -2; // last else classIndex = Integer.parseInt(tmpStr) - 1; } else { classIndex = -3; // not set } // load data source = new DataSource(filename); if (classIndex == -3) data = source.getDataSet(); else if (classIndex == -2) data = source.getDataSet(source.getStructure().numAttributes() - 1); else data = source.getDataSet(classIndex); // determine and print capabilities cap = forInstances(data); System.out.println("File: " + filename); System.out.println( "Class index: " + ((data.classIndex() == -1) ? "not set" : "" + (data.classIndex() + 1))); System.out.println("Capabilities:"); iter = cap.capabilities(); while (iter.hasNext()) System.out.println("- " + iter.next()); }
/** * Calculates the distance between two instances * * @param test the first instance * @param train the second instance * @return the distance between the two given instances, between 0 and 1 */ protected double distance(Instance first, Instance second) { double distance = 0; int firstI, secondI; for (int p1 = 0, p2 = 0; p1 < first.numValues() || p2 < second.numValues(); ) { if (p1 >= first.numValues()) { firstI = m_instances.numAttributes(); } else { firstI = first.index(p1); } if (p2 >= second.numValues()) { secondI = m_instances.numAttributes(); } else { secondI = second.index(p2); } if (firstI == m_instances.classIndex()) { p1++; continue; } if (secondI == m_instances.classIndex()) { p2++; continue; } double diff; if (firstI == secondI) { diff = difference(firstI, first.valueSparse(p1), second.valueSparse(p2)); p1++; p2++; } else if (firstI > secondI) { diff = difference(secondI, 0, second.valueSparse(p2)); p2++; } else { diff = difference(firstI, first.valueSparse(p1), 0); p1++; } distance += diff * diff; } return Math.sqrt(distance / m_instances.numAttributes()); }
/** * clusters an instance that has been through the filters * * @param instance the instance to assign a cluster to * @return a cluster number */ protected int clusterProcessedInstance(Instance instance) { double minDist = Double.MAX_VALUE; int bestCluster = 0; for (int i = 0; i < m_NumClusters; i++) { double dist = distance(instance, m_ClusterCentroids.instance(i)); if (dist < minDist) { minDist = dist; bestCluster = i; } } return bestCluster; }
/** * Compute and store statistics required for generating artificial data. * * @param data training instances * @exception Exception if statistics could not be calculated successfully */ protected void computeStats(Instances data) throws Exception { int numAttributes = data.numAttributes(); m_AttributeStats = new Vector(numAttributes); // use to map attributes to their stats for (int j = 0; j < numAttributes; j++) { if (data.attribute(j).isNominal()) { // Compute the probability of occurence of each distinct value int[] nomCounts = (data.attributeStats(j)).nominalCounts; double[] counts = new double[nomCounts.length]; if (counts.length < 2) throw new Exception("Nominal attribute has less than two distinct values!"); // Perform Laplace smoothing for (int i = 0; i < counts.length; i++) counts[i] = nomCounts[i] + 1; Utils.normalize(counts); double[] stats = new double[counts.length - 1]; stats[0] = counts[0]; // Calculate cumulative probabilities for (int i = 1; i < stats.length; i++) stats[i] = stats[i - 1] + counts[i]; m_AttributeStats.add(j, stats); } else if (data.attribute(j).isNumeric()) { // Get mean and standard deviation from the training data double[] stats = new double[2]; stats[0] = data.meanOrMode(j); stats[1] = Math.sqrt(data.variance(j)); m_AttributeStats.add(j, stats); } else System.err.println("Decorate can only handle numeric and nominal values."); } }
/** * Buildclassifier selects a classifier from the set of classifiers by minimising error on the * training data. * * @param data the training data to be used for generating the boosted classifier. * @exception Exception if the classifier could not be built successfully */ public void buildClassifier(Instances data) throws Exception { if (m_Classifiers.length == 0) { throw new Exception("No base classifiers have been set!"); } Instances newData = new Instances(data); newData.deleteWithMissingClass(); newData.randomize(new Random(m_Seed)); if (newData.classAttribute().isNominal() && (m_NumXValFolds > 1)) newData.stratify(m_NumXValFolds); Instances train = newData; // train on all data by default Instances test = newData; // test on training data by default Classifier bestClassifier = null; int bestIndex = -1; double bestPerformance = Double.NaN; int numClassifiers = m_Classifiers.length; for (int i = 0; i < numClassifiers; i++) { Classifier currentClassifier = getClassifier(i); Evaluation evaluation; if (m_NumXValFolds > 1) { evaluation = new Evaluation(newData); for (int j = 0; j < m_NumXValFolds; j++) { train = newData.trainCV(m_NumXValFolds, j); test = newData.testCV(m_NumXValFolds, j); currentClassifier.buildClassifier(train); evaluation.setPriors(train); evaluation.evaluateModel(currentClassifier, test); } } else { currentClassifier.buildClassifier(train); evaluation = new Evaluation(train); evaluation.evaluateModel(currentClassifier, test); } double error = evaluation.errorRate(); if (m_Debug) { System.err.println( "Error rate: " + Utils.doubleToString(error, 6, 4) + " for classifier " + currentClassifier.getClass().getName()); } if ((i == 0) || (error < bestPerformance)) { bestClassifier = currentClassifier; bestPerformance = error; bestIndex = i; } } m_ClassifierIndex = bestIndex; m_Classifier = bestClassifier; if (m_NumXValFolds > 1) { m_Classifier.buildClassifier(newData); } }
/** * Gets the subset of instances that apply to a particluar branch of the split. If the branch * index is -1, the subset will consist of those instances that don't apply to any branch. * * @param branch the index of the branch * @param sourceInstances the instances from which to find the subset * @return the set of instances that apply */ public ReferenceInstances instancesDownBranch(int branch, Instances instances) { ReferenceInstances filteredInstances = new ReferenceInstances(instances, 1); if (branch == -1) { for (Enumeration e = instances.enumerateInstances(); e.hasMoreElements(); ) { Instance inst = (Instance) e.nextElement(); if (inst.isMissing(attIndex)) filteredInstances.addReference(inst); } } else if (branch == 0) { for (Enumeration e = instances.enumerateInstances(); e.hasMoreElements(); ) { Instance inst = (Instance) e.nextElement(); if (!inst.isMissing(attIndex) && inst.value(attIndex) < splitPoint) filteredInstances.addReference(inst); } } else { for (Enumeration e = instances.enumerateInstances(); e.hasMoreElements(); ) { Instance inst = (Instance) e.nextElement(); if (!inst.isMissing(attIndex) && inst.value(attIndex) >= splitPoint) filteredInstances.addReference(inst); } } return filteredInstances; }
/** * return a string describing this clusterer * * @return a description of the clusterer as a string */ public String toString() { StringBuffer temp = new StringBuffer(); temp.append("\n FarthestFirst\n==============\n"); temp.append("\nCluster centroids:\n"); for (int i = 0; i < m_NumClusters; i++) { temp.append("\nCluster " + i + "\n\t"); for (int j = 0; j < m_ClusterCentroids.numAttributes(); j++) { if (m_ClusterCentroids.attribute(j).isNominal()) { temp.append( " " + m_ClusterCentroids .attribute(j) .value((int) m_ClusterCentroids.instance(i).value(j))); } else { temp.append(" " + m_ClusterCentroids.instance(i).value(j)); } } } temp.append("\n\n"); return temp.toString(); }
/** @param args */ private void Init() { testIns.setClassIndex(testIns.numAttributes() - 1); labeledIns.setClassIndex(labeledIns.numAttributes() - 1); unlabeledIns.setClassIndex(unlabeledIns.numAttributes() - 1); class_Array[0] = classifier1; class_Array[1] = classifier2; class_Array[2] = classifier3; }
public void buildClusterer(ArrayList<String> seqDB, double[][] sm) { seqList = seqDB; this.setSimMatrix(sm); Attribute seqString = new Attribute("sequence", (FastVector) null); FastVector attrInfo = new FastVector(); attrInfo.addElement(seqString); Instances data = new Instances("data", attrInfo, 0); for (int i = 0; i < seqList.size(); i++) { Instance currentInst = new Instance(1); currentInst.setDataset(data); currentInst.setValue(0, seqList.get(i)); data.add(currentInst); } try { buildClusterer(data); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } }
@Test public void testSplittable() throws SchedulerException { List<VM> vms = Arrays.asList(vm1, vm2, vm3); Collection<Collection<Node>> parts = new ArrayList<>(); parts.add(Arrays.asList(n1, n2)); parts.add(Arrays.asList(n3)); parts.add(Arrays.asList(n4)); Among single = new Among(vms, parts); /* N1 v1 v2 N2 v3 --- N3 v4 -- N4 v5 */ FixedNodeSetsPartitioning partitionner = new FixedNodeSetsPartitioning(parts); partitionner.setPartitions(parts); List<Instance> instances = partitionner.split( new DefaultParameters(), new Instance(mo, Collections.<SatConstraint>emptyList(), new MinMTTR())); TIntIntHashMap vmIndex = Instances.makeVMIndex(instances); TIntIntHashMap nodeIndex = Instances.makeNodeIndex(instances); splitter.split(single, new Instance(mo, new MinMTTR()), instances, vmIndex, nodeIndex); Among a = (Among) instances.get(0).getSatConstraints().iterator().next(); Assert.assertEquals(a.getGroupsOfNodes().size(), 1); Assert.assertEquals(a.getInvolvedNodes(), Arrays.asList(n1, n2)); for (Instance i : instances) { System.out.println(i.getSatConstraints()); } }
/** * Generates a clusterer. Has to initialize all fields of the clusterer that are not being set via * options. * * @param data set of instances serving as training data * @exception Exception if the clusterer has not been generated successfully */ public void buildClusterer(Instances data) throws Exception { // long start = System.currentTimeMillis(); if (data.checkForStringAttributes()) { throw new Exception("Can't handle string attributes!"); } m_ReplaceMissingFilter = new ReplaceMissingValues(); m_ReplaceMissingFilter.setInputFormat(data); m_instances = Filter.useFilter(data, m_ReplaceMissingFilter); initMinMax(m_instances); m_ClusterCentroids = new Instances(m_instances, m_NumClusters); int n = m_instances.numInstances(); Random r = new Random(m_Seed); boolean[] selected = new boolean[n]; double[] minDistance = new double[n]; for (int i = 0; i < n; i++) minDistance[i] = Double.MAX_VALUE; int firstI = r.nextInt(n); m_ClusterCentroids.add(m_instances.instance(firstI)); selected[firstI] = true; updateMinDistance(minDistance, selected, m_instances, m_instances.instance(firstI)); if (m_NumClusters > n) m_NumClusters = n; for (int i = 1; i < m_NumClusters; i++) { int nextI = farthestAway(minDistance, selected); m_ClusterCentroids.add(m_instances.instance(nextI)); selected[nextI] = true; updateMinDistance(minDistance, selected, m_instances, m_instances.instance(nextI)); } m_instances = new Instances(m_instances, 0); // long end = System.currentTimeMillis(); // System.out.println("Clustering Time = " + (end-start)); }
/** Computes the difference between two given attribute values. */ protected double difference(int index, double val1, double val2) { switch (m_instances.attribute(index).type()) { case Attribute.NOMINAL: // If attribute is nominal if (Instance.isMissingValue(val1) || Instance.isMissingValue(val2) || ((int) val1 != (int) val2)) { return 1; } else { return 0; } case Attribute.NUMERIC: // If attribute is numeric if (Instance.isMissingValue(val1) || Instance.isMissingValue(val2)) { if (Instance.isMissingValue(val1) && Instance.isMissingValue(val2)) { return 1; } else { double diff; if (Instance.isMissingValue(val2)) { diff = norm(val1, index); } else { diff = norm(val2, index); } if (diff < 0.5) { diff = 1.0 - diff; } return diff; } } else { return norm(val1, index) - norm(val2, index); } default: return 0; } }
/** * Add new instances to the given set of instances. * * @param data given instances * @param newData set of instances to add to given instances */ protected void addInstances(Instances data, Instances newData) { for (int i = 0; i < newData.numInstances(); i++) data.add(newData.instance(i)); }
/** * Removes a specified number of instances from the given set of instances. * * @param data given instances * @param numRemove number of instances to delete from the given instances */ protected void removeInstances(Instances data, int numRemove) { int num = data.numInstances(); for (int i = num - 1; i > num - 1 - numRemove; i--) { data.delete(i); } }
/** * Tests the given data, whether it can be processed by the handler, given its capabilities. * Classifiers implementing the <code>MultiInstanceCapabilitiesHandler</code> interface are * checked automatically for their multi-instance Capabilities (if no bags, then only the * bag-structure, otherwise only the first bag). * * @param data the data to test * @return true if all the tests succeeded * @see #test(Instances, int, int) */ public boolean test(Instances data) { return test(data, 0, data.numAttributes() - 1); }
/** * Build Decorate classifier * * @param data the training data to be used for generating the classifier * @exception Exception if the classifier could not be built successfully */ public void buildClassifier(Instances data) throws Exception { if (m_Classifier == null) { throw new Exception("A base classifier has not been specified!"); } if (data.checkForStringAttributes()) { throw new UnsupportedAttributeTypeException("Cannot handle string attributes!"); } if (data.classAttribute().isNumeric()) { throw new UnsupportedClassTypeException("Decorate can't handle a numeric class!"); } if (m_NumIterations < m_DesiredSize) throw new Exception("Max number of iterations must be >= desired ensemble size!"); // initialize random number generator if (m_Seed == -1) m_Random = new Random(); else m_Random = new Random(m_Seed); int i = 1; // current committee size int numTrials = 1; // number of Decorate iterations Instances divData = new Instances(data); // local copy of data - diversity data divData.deleteWithMissingClass(); Instances artData = null; // artificial data // compute number of artficial instances to add at each iteration int artSize = (int) (Math.abs(m_ArtSize) * divData.numInstances()); if (artSize == 0) artSize = 1; // atleast add one random example computeStats(data); // Compute training data stats for creating artificial examples // initialize new committee m_Committee = new Vector(); Classifier newClassifier = m_Classifier; newClassifier.buildClassifier(divData); m_Committee.add(newClassifier); double eComm = computeError(divData); // compute ensemble error if (m_Debug) System.out.println( "Initialize:\tClassifier " + i + " added to ensemble. Ensemble error = " + eComm); // repeat till desired committee size is reached OR the max number of iterations is exceeded while (i < m_DesiredSize && numTrials < m_NumIterations) { // Generate artificial training examples artData = generateArtificialData(artSize, data); // Label artificial examples labelData(artData); addInstances(divData, artData); // Add new artificial data // Build new classifier Classifier tmp[] = Classifier.makeCopies(m_Classifier, 1); newClassifier = tmp[0]; newClassifier.buildClassifier(divData); // Remove all the artificial data removeInstances(divData, artSize); // Test if the new classifier should be added to the ensemble m_Committee.add(newClassifier); // add new classifier to current committee double currError = computeError(divData); if (currError <= eComm) { // adding the new member did not increase the error i++; eComm = currError; if (m_Debug) System.out.println( "Iteration: " + (1 + numTrials) + "\tClassifier " + i + " added to ensemble. Ensemble error = " + eComm); } else { // reject the current classifier because it increased the ensemble error m_Committee.removeElementAt(m_Committee.size() - 1); // pop the last member } numTrials++; } }
/** * Tests a certain range of attributes of the given data, whether it can be processed by the * handler, given its capabilities. Classifiers implementing the <code> * MultiInstanceCapabilitiesHandler</code> interface are checked automatically for their * multi-instance Capabilities (if no bags, then only the bag-structure, otherwise only the first * bag). * * @param data the data to test * @param fromIndex the range of attributes - start (incl.) * @param toIndex the range of attributes - end (incl.) * @return true if all the tests succeeded * @see MultiInstanceCapabilitiesHandler * @see #m_InstancesTest * @see #m_MissingValuesTest * @see #m_MissingClassValuesTest * @see #m_MinimumNumberInstancesTest */ public boolean test(Instances data, int fromIndex, int toIndex) { int i; int n; int m; Attribute att; Instance inst; boolean testClass; Capabilities cap; boolean missing; Iterator iter; // shall we test the data? if (!m_InstancesTest) return true; // no Capabilities? -> warning if ((m_Capabilities.size() == 0) || ((m_Capabilities.size() == 1) && handles(Capability.NO_CLASS))) System.err.println(createMessage("No capabilities set!")); // any attributes? if (toIndex - fromIndex < 0) { m_FailReason = new WekaException(createMessage("No attributes!")); return false; } // do wee need to test the class attribute, i.e., is the class attribute // within the range of attributes? testClass = (data.classIndex() > -1) && (data.classIndex() >= fromIndex) && (data.classIndex() <= toIndex); // attributes for (i = fromIndex; i <= toIndex; i++) { att = data.attribute(i); // class is handled separately if (i == data.classIndex()) continue; // check attribute types if (!test(att)) return false; } // class if (!handles(Capability.NO_CLASS) && (data.classIndex() == -1)) { m_FailReason = new UnassignedClassException(createMessage("Class attribute not set!")); return false; } // special case: no class attribute can be handled if (handles(Capability.NO_CLASS) && (data.classIndex() > -1)) { cap = getClassCapabilities(); cap.disable(Capability.NO_CLASS); iter = cap.capabilities(); if (!iter.hasNext()) { m_FailReason = new WekaException(createMessage("Cannot handle any class attribute!")); return false; } } if (testClass && !handles(Capability.NO_CLASS)) { att = data.classAttribute(); if (!test(att, true)) return false; // special handling of RELATIONAL class // TODO: store additional Capabilities for this case // missing class labels if (m_MissingClassValuesTest) { if (!handles(Capability.MISSING_CLASS_VALUES)) { for (i = 0; i < data.numInstances(); i++) { if (data.instance(i).classIsMissing()) { m_FailReason = new WekaException(createMessage("Cannot handle missing class values!")); return false; } } } else { if (m_MinimumNumberInstancesTest) { int hasClass = 0; for (i = 0; i < data.numInstances(); i++) { if (!data.instance(i).classIsMissing()) hasClass++; } // not enough instances with class labels? if (hasClass < getMinimumNumberInstances()) { m_FailReason = new WekaException( createMessage( "Not enough training instances with class labels (required: " + getMinimumNumberInstances() + ", provided: " + hasClass + ")!")); return false; } } } } } // missing values if (m_MissingValuesTest) { if (!handles(Capability.MISSING_VALUES)) { missing = false; for (i = 0; i < data.numInstances(); i++) { inst = data.instance(i); if (inst instanceof SparseInstance) { for (m = 0; m < inst.numValues(); m++) { n = inst.index(m); // out of scope? if (n < fromIndex) continue; if (n > toIndex) break; // skip class if (n == inst.classIndex()) continue; if (inst.isMissing(n)) { missing = true; break; } } } else { for (n = fromIndex; n <= toIndex; n++) { // skip class if (n == inst.classIndex()) continue; if (inst.isMissing(n)) { missing = true; break; } } } if (missing) { m_FailReason = new NoSupportForMissingValuesException( createMessage("Cannot handle missing values!")); return false; } } } } // instances if (m_MinimumNumberInstancesTest) { if (data.numInstances() < getMinimumNumberInstances()) { m_FailReason = new WekaException( createMessage( "Not enough training instances (required: " + getMinimumNumberInstances() + ", provided: " + data.numInstances() + ")!")); return false; } } // Multi-Instance? -> check structure (regardless of attribute range!) if (handles(Capability.ONLY_MULTIINSTANCE)) { // number of attributes? if (data.numAttributes() != 3) { m_FailReason = new WekaException( createMessage("Incorrect Multi-Instance format, must be 'bag-id, bag, class'!")); return false; } // type of attributes and position of class? if (!data.attribute(0).isNominal() || !data.attribute(1).isRelationValued() || (data.classIndex() != data.numAttributes() - 1)) { m_FailReason = new WekaException( createMessage( "Incorrect Multi-Instance format, must be 'NOMINAL att, RELATIONAL att, CLASS att'!")); return false; } // check data immediately if (getOwner() instanceof MultiInstanceCapabilitiesHandler) { MultiInstanceCapabilitiesHandler handler = (MultiInstanceCapabilitiesHandler) getOwner(); cap = handler.getMultiInstanceCapabilities(); boolean result; if (data.numInstances() > 0) result = cap.test(data.attribute(1).relation(0)); else result = cap.test(data.attribute(1).relation()); if (!result) { m_FailReason = cap.m_FailReason; return false; } } } // passed all tests! return true; }