/**
 * Installs the model that will be used for scoring.
 *
 * <p>Clears the missing/mismatch record, wraps {@code model} in a scorer, hands it the
 * training header and, for batch predictors, allocates an empty batch buffer and resolves
 * the batch size. Finally builds the attribute map between the two headers.
 *
 * @param model the model to use
 * @param modelHeader the header of the training data used to train the model
 * @param dataHeader the header of the incoming data
 * @throws DistributedWekaException if either header is {@code null}, if anything fails while
 *     configuring the scorer (the cause is wrapped), or if more than 50% of the attributes
 *     expected by the model are missing or have a type mismatch with the incoming data
 */
public void setModel(Object model, Instances modelHeader, Instances dataHeader)
    throws DistributedWekaException {

  m_missingMismatch.clear();

  // Both headers are mandatory; nothing below can work without them.
  if (modelHeader == null || dataHeader == null) {
    throw new DistributedWekaException(
        "Can't continue without a header for the model and incoming data");
  }

  try {
    m_isUsingStringAttributes = modelHeader.checkForStringAttributes();

    m_model = ScoringModel.createScorer(model);
    // modelHeader is known to be non-null here, so it is always handed to the scorer.
    m_model.setHeader(modelHeader);

    if (m_model.isBatchPredicor()) {
      m_batchScoringData = new Instances(modelHeader, 0);

      Environment systemEnv = Environment.getSystemWide();
      String configuredSize = ((BatchPredictor) model).getBatchSize();

      if (DistributedJobConfig.isEmpty(configuredSize)) {
        // No batch size configured on the model: fall back to 1000.
        m_batchSize = 1000;
      } else {
        // The configured value may contain environment variables; resolve before parsing.
        m_batchSize = Integer.parseInt(systemEnv.substitute(configuredSize));
      }
    }
  } catch (Exception ex) {
    throw new DistributedWekaException(ex);
  }

  buildAttributeMap(modelHeader, dataHeader);
}
/** * Generates a clusterer. Has to initialize all fields of the clusterer that are not being set via * options. * * @param data set of instances serving as training data * @exception Exception if the clusterer has not been generated successfully */ public void buildClusterer(Instances data) throws Exception { // long start = System.currentTimeMillis(); if (data.checkForStringAttributes()) { throw new Exception("Can't handle string attributes!"); } m_ReplaceMissingFilter = new ReplaceMissingValues(); m_ReplaceMissingFilter.setInputFormat(data); m_instances = Filter.useFilter(data, m_ReplaceMissingFilter); initMinMax(m_instances); m_ClusterCentroids = new Instances(m_instances, m_NumClusters); int n = m_instances.numInstances(); Random r = new Random(m_Seed); boolean[] selected = new boolean[n]; double[] minDistance = new double[n]; for (int i = 0; i < n; i++) minDistance[i] = Double.MAX_VALUE; int firstI = r.nextInt(n); m_ClusterCentroids.add(m_instances.instance(firstI)); selected[firstI] = true; updateMinDistance(minDistance, selected, m_instances, m_instances.instance(firstI)); if (m_NumClusters > n) m_NumClusters = n; for (int i = 1; i < m_NumClusters; i++) { int nextI = farthestAway(minDistance, selected); m_ClusterCentroids.add(m_instances.instance(nextI)); selected[nextI] = true; updateMinDistance(minDistance, selected, m_instances, m_instances.instance(nextI)); } m_instances = new Instances(m_instances, 0); // long end = System.currentTimeMillis(); // System.out.println("Clustering Time = " + (end-start)); }
/** * Build Decorate classifier * * @param data the training data to be used for generating the classifier * @exception Exception if the classifier could not be built successfully */ public void buildClassifier(Instances data) throws Exception { if (m_Classifier == null) { throw new Exception("A base classifier has not been specified!"); } if (data.checkForStringAttributes()) { throw new UnsupportedAttributeTypeException("Cannot handle string attributes!"); } if (data.classAttribute().isNumeric()) { throw new UnsupportedClassTypeException("Decorate can't handle a numeric class!"); } if (m_NumIterations < m_DesiredSize) throw new Exception("Max number of iterations must be >= desired ensemble size!"); // initialize random number generator if (m_Seed == -1) m_Random = new Random(); else m_Random = new Random(m_Seed); int i = 1; // current committee size int numTrials = 1; // number of Decorate iterations Instances divData = new Instances(data); // local copy of data - diversity data divData.deleteWithMissingClass(); Instances artData = null; // artificial data // compute number of artficial instances to add at each iteration int artSize = (int) (Math.abs(m_ArtSize) * divData.numInstances()); if (artSize == 0) artSize = 1; // atleast add one random example computeStats(data); // Compute training data stats for creating artificial examples // initialize new committee m_Committee = new Vector(); Classifier newClassifier = m_Classifier; newClassifier.buildClassifier(divData); m_Committee.add(newClassifier); double eComm = computeError(divData); // compute ensemble error if (m_Debug) System.out.println( "Initialize:\tClassifier " + i + " added to ensemble. 
Ensemble error = " + eComm); // repeat till desired committee size is reached OR the max number of iterations is exceeded while (i < m_DesiredSize && numTrials < m_NumIterations) { // Generate artificial training examples artData = generateArtificialData(artSize, data); // Label artificial examples labelData(artData); addInstances(divData, artData); // Add new artificial data // Build new classifier Classifier tmp[] = Classifier.makeCopies(m_Classifier, 1); newClassifier = tmp[0]; newClassifier.buildClassifier(divData); // Remove all the artificial data removeInstances(divData, artSize); // Test if the new classifier should be added to the ensemble m_Committee.add(newClassifier); // add new classifier to current committee double currError = computeError(divData); if (currError <= eComm) { // adding the new member did not increase the error i++; eComm = currError; if (m_Debug) System.out.println( "Iteration: " + (1 + numTrials) + "\tClassifier " + i + " added to ensemble. Ensemble error = " + eComm); } else { // reject the current classifier because it increased the ensemble error m_Committee.removeElementAt(m_Committee.size() - 1); // pop the last member } numTrials++; } }
/** * Method that generates all large itemsets with a minimum support, and from these all association * rules. * * @param instances the instances to be used for generating the associations * @exception Exception if rules can't be built successfully */ public void buildAssociations(Instances instances) throws Exception { int temp = m_premiseCount, exactNumber = m_numRules - 5; if (instances.checkForStringAttributes()) { throw new Exception("Can't handle string attributes!"); } m_instances = instances; m_instances.setClassIndex(m_instances.numAttributes() - 1); // prior estimation m_priorEstimator = new PriorEstimation(m_instances, m_numRandRules, m_numIntervals, false); m_priors = m_priorEstimator.estimatePrior(); m_midPoints = m_priorEstimator.getMidPoints(); m_Ls = new FastVector(); m_hashtables = new FastVector(); for (int i = 1; i < m_instances.numAttributes(); i++) { m_bestChanged = false; // find large item sets findLargeItemSets(i); // find association rules (rule generation procedure) findRulesQuickly(); if (m_bestChanged) { temp = m_premiseCount; while (RuleGeneration.expectation(m_premiseCount, m_premiseCount, m_midPoints, m_priors) <= m_expectation) { m_premiseCount++; if (m_premiseCount > m_instances.numInstances()) break; } } if (m_premiseCount > m_instances.numInstances()) { // Reserve space for variables m_allTheRules = new FastVector[3]; m_allTheRules[0] = new FastVector(); m_allTheRules[1] = new FastVector(); m_allTheRules[2] = new FastVector(); int k = 0; while (m_best.size() > 0 && exactNumber > 0) { m_allTheRules[0].insertElementAt((ItemSet) ((RuleItem) m_best.last()).premise(), k); m_allTheRules[1].insertElementAt((ItemSet) ((RuleItem) m_best.last()).consequence(), k); m_allTheRules[2].insertElementAt(new Double(((RuleItem) m_best.last()).accuracy()), k); boolean remove = m_best.remove(m_best.last()); k++; exactNumber--; } return; } if (temp != m_premiseCount && m_Ls.size() > 0) { FastVector kSets = (FastVector) m_Ls.lastElement(); 
m_Ls.removeElementAt(m_Ls.size() - 1); kSets = ItemSet.deleteItemSets(kSets, m_premiseCount, Integer.MAX_VALUE); m_Ls.addElement(kSets); } } // Reserve space for variables m_allTheRules = new FastVector[3]; m_allTheRules[0] = new FastVector(); m_allTheRules[1] = new FastVector(); m_allTheRules[2] = new FastVector(); int k = 0; while (m_best.size() > 0 && exactNumber > 0) { m_allTheRules[0].insertElementAt((ItemSet) ((RuleItem) m_best.last()).premise(), k); m_allTheRules[1].insertElementAt((ItemSet) ((RuleItem) m_best.last()).consequence(), k); m_allTheRules[2].insertElementAt(new Double(((RuleItem) m_best.last()).accuracy()), k); boolean remove = m_best.remove(m_best.last()); k++; exactNumber--; } }
/** * Carry out the bias-variance decomposition * * @throws Exception if the decomposition couldn't be carried out */ public void decompose() throws Exception { Reader dataReader = new BufferedReader(new FileReader(m_DataFileName)); Instances data = new Instances(dataReader); if (m_ClassIndex < 0) { data.setClassIndex(data.numAttributes() - 1); } else { data.setClassIndex(m_ClassIndex); } if (data.classAttribute().type() != Attribute.NOMINAL) { throw new Exception("Class attribute must be nominal"); } int numClasses = data.numClasses(); data.deleteWithMissingClass(); if (data.checkForStringAttributes()) { throw new Exception("Can't handle string attributes!"); } if (data.numInstances() < 2 * m_TrainPoolSize) { throw new Exception( "The dataset must contain at least " + (2 * m_TrainPoolSize) + " instances"); } Random random = new Random(m_Seed); data.randomize(random); Instances trainPool = new Instances(data, 0, m_TrainPoolSize); Instances test = new Instances(data, m_TrainPoolSize, data.numInstances() - m_TrainPoolSize); int numTest = test.numInstances(); double[][] instanceProbs = new double[numTest][numClasses]; m_Error = 0; for (int i = 0; i < m_TrainIterations; i++) { if (m_Debug) { System.err.println("Iteration " + (i + 1)); } trainPool.randomize(random); Instances train = new Instances(trainPool, 0, m_TrainPoolSize / 2); Classifier current = AbstractClassifier.makeCopy(m_Classifier); current.buildClassifier(train); //// Evaluate the classifier on test, updating BVD stats for (int j = 0; j < numTest; j++) { int pred = (int) current.classifyInstance(test.instance(j)); if (pred != test.instance(j).classValue()) { m_Error++; } instanceProbs[j][pred]++; } } m_Error /= (m_TrainIterations * numTest); // Average the BV over each instance in test. 
m_Bias = 0; m_Variance = 0; m_Sigma = 0; for (int i = 0; i < numTest; i++) { Instance current = test.instance(i); double[] predProbs = instanceProbs[i]; double pActual, pPred; double bsum = 0, vsum = 0, ssum = 0; for (int j = 0; j < numClasses; j++) { pActual = (current.classValue() == j) ? 1 : 0; // Or via 1NN from test data? pPred = predProbs[j] / m_TrainIterations; bsum += (pActual - pPred) * (pActual - pPred) - pPred * (1 - pPred) / (m_TrainIterations - 1); vsum += pPred * pPred; ssum += pActual * pActual; } m_Bias += bsum; m_Variance += (1 - vsum); m_Sigma += (1 - ssum); } m_Bias /= (2 * numTest); m_Variance /= (2 * numTest); m_Sigma /= (2 * numTest); if (m_Debug) { System.err.println("Decomposition finished"); } }