/**
 * Normalize the weights for the next iteration.
 *
 * @param training the training instances
 * @param oldSumOfWeights the sum of weights before reweighting
 * @throws Exception if something goes wrong
 */
protected void normalizeWeights(Instances training, double oldSumOfWeights)
  throws Exception {

  // Renormalize weights
  double newSumOfWeights = training.sumOfWeights();
  for (Instance instance : training) {
    instance.setWeight(instance.weight() * oldSumOfWeights / newSumOfWeights);
  }
}
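/*
 * A minimal standalone sketch (hypothetical class, not part of the source
 * above) of the invariant normalizeWeights() maintains: rescaling every
 * weight by oldSum / newSum leaves the total weight of the data unchanged.
 */
public class RenormalizeSketch {

  public static void main(String[] args) {
    double[] weights = {0.5, 2.0, 1.5}; // weights after some reweighting step
    double oldSum = 3.0;                // sum of weights before reweighting

    double newSum = 0;
    for (double w : weights) {
      newSum += w;                      // 4.0
    }
    for (int i = 0; i < weights.length; i++) {
      weights[i] *= oldSum / newSum;    // same rescaling as normalizeWeights()
    }

    double check = 0;
    for (double w : weights) {
      check += w;
    }
    System.out.println(check);          // 3.0 (up to rounding): total restored
  }
}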
/**
 * Sets the weights for the next iteration.
 *
 * @param training the data to train with
 * @param reweight the reweighting factor
 * @throws Exception in case of an error
 */
protected void setWeights(Instances training, double reweight) throws Exception {

  int subCmtySize = m_Classifiers.length / m_NumSubCmtys;

  if ((m_NumIterationsPerformed + 1) % subCmtySize == 0) {

    if (getDebug()) {
      System.err.println(m_NumIterationsPerformed + " " + subCmtySize);
    }

    double oldSumOfWeights = training.sumOfWeights();

    // Randomly set the weights of the training instances according to a
    // Poisson distribution
    for (int i = 0; i < training.numInstances(); i++) {
      training.instance(i).setWeight(
        -Math.log((m_Random.nextDouble() * 9999) / 10000));
    }

    // Renormalise the weights
    double sumProbs = training.sumOfWeights();
    for (int i = 0; i < training.numInstances(); i++) {
      training.instance(i).setWeight(
        training.instance(i).weight() * oldSumOfWeights / sumProbs);
    }
  } else {
    super.setWeights(training, reweight);
  }
}
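/*
 * A standalone sketch (hypothetical class) of the weight draw used in
 * setWeights(): for U ~ Uniform(0,1), -log(U) is an Exponential(1) sample with
 * mean 1, a continuous stand-in for Poisson(1) resampling weights. Scaling U
 * by 9999/10000 keeps the argument of the log strictly below 1, so the drawn
 * weights stay positive; the renormalisation that follows then only needs to
 * make a small correction, since the draws average out to roughly 1.
 */
import java.util.Random;

public class PoissonWeightSketch {

  public static void main(String[] args) {
    Random rnd = new Random(42);
    int n = 100000;
    double sum = 0;
    for (int i = 0; i < n; i++) {
      // the same draw as in setWeights()
      sum += -Math.log((rnd.nextDouble() * 9999) / 10000);
    }
    System.out.println(sum / n); // close to 1.0, the Exponential(1) mean
  }
}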
/**
 * Recomputes the class distributions of the nodes in the tree for the given
 * data.
 *
 * @param data the data to recompute the distributions from
 * @throws Exception if something goes wrong
 */
private void newDistribution(Instances data) throws Exception {

  Instances[] localInstances;

  m_localModel.resetDistribution(data);
  m_train = data;
  if (!m_isLeaf) {
    localInstances = (Instances[]) m_localModel.split(data);
    for (int i = 0; i < m_sons.length; i++) {
      m_sons[i].newDistribution(localInstances[i]);
    }
  } else {
    // Check whether there are some instances at the leaf now!
    if (!Utils.eq(data.sumOfWeights(), 0)) {
      m_isEmpty = false;
    }
  }
}
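/*
 * A minimal sketch (hypothetical SketchNode class, plain weights instead of
 * Instances) of the recursion pattern newDistribution() follows: interior
 * nodes split their data among the children and recurse; leaves merely record
 * whether any weight still reaches them.
 */
class SketchNode {

  SketchNode[] sons;     // null at a leaf
  boolean isEmpty = true;

  void newDistribution(double weight) {
    if (sons != null) {
      double share = weight / sons.length; // stand-in for m_localModel.split(data)
      for (SketchNode son : sons) {
        son.newDistribution(share);
      }
    } else {
      isEmpty = (weight == 0);             // leaf: any instances left here?
    }
  }
}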
/**
 * Boosting method.
 *
 * @param data the training data to be used for generating the boosted classifier
 * @throws Exception if the classifier could not be built successfully
 */
public void buildClassifier(Instances data) throws Exception {

  super.buildClassifier(data);

  // can classifier handle the data?
  getCapabilities().testWithFail(data);

  // remove instances with missing class
  data = new Instances(data);
  data.deleteWithMissingClass();

  m_SumOfWeights = data.sumOfWeights();

  if ((!m_UseResampling) && (m_Classifier instanceof WeightedInstancesHandler)) {
    buildClassifierWithWeights(data);
  } else {
    buildClassifierUsingResampling(data);
  }
}
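/*
 * A hedged usage sketch, assuming this buildClassifier() sits in a booster in
 * the style of weka.classifiers.meta.AdaBoostM1. The dataset file name is
 * hypothetical.
 */
import weka.classifiers.meta.AdaBoostM1;
import weka.classifiers.trees.DecisionStump;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class BoostingSketch {

  public static void main(String[] args) throws Exception {
    Instances data = DataSource.read("weather.arff"); // hypothetical ARFF file
    data.setClassIndex(data.numAttributes() - 1);

    AdaBoostM1 booster = new AdaBoostM1();
    // DecisionStump implements WeightedInstancesHandler, so with resampling
    // switched off the weights-based code path above is taken
    booster.setClassifier(new DecisionStump());
    booster.setUseResampling(false);
    booster.buildClassifier(data);
    System.out.println(booster);
  }
}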
/**
 * Builds the tree structure from the given data.
 *
 * @param data the data to build the tree from
 * @throws Exception if the tree cannot be built successfully
 */
public void buildTree(Instances data) throws Exception {

  Instances[] localInstances;

  // m_test = null;
  m_isLeaf = false;
  m_isEmpty = false;
  m_sons = null;
  m_localModel = m_toSelectModel.selectModel(data);
  if (m_localModel.numSubsets() > 1) {
    localInstances = m_localModel.split(data);
    data = null;
    m_sons = new myJ48ClassifierTree[m_localModel.numSubsets()];
    for (int i = 0; i < m_sons.length; i++) {
      m_sons[i] = getNewTree(localInstances[i]);
      localInstances[i] = null;
    }
  } else {
    m_isLeaf = true;
    if (Utils.eq(data.sumOfWeights(), 0)) {
      m_isEmpty = true;
    }
    data = null;
  }
}
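/*
 * A minimal sketch (hypothetical TreeSketch class, plain arrays instead of
 * Instances) of the construction pattern buildTree() follows: choose a split;
 * with more than one subset, recurse into each child and null out consumed
 * references so the garbage collector can reclaim them; otherwise mark the
 * node a leaf, and empty if no data reached it.
 */
import java.util.Arrays;

class TreeSketch {

  TreeSketch[] sons;        // null for a leaf
  boolean isLeaf, isEmpty;

  void build(double[][] data) {
    if (data.length > 1) {  // stand-in for m_localModel.numSubsets() > 1
      int mid = data.length / 2;
      double[][][] parts = {  // stand-in for m_localModel.split(data)
        Arrays.copyOfRange(data, 0, mid),
        Arrays.copyOfRange(data, mid, data.length)
      };
      data = null;          // release the parent's reference, as above
      sons = new TreeSketch[parts.length];
      for (int i = 0; i < sons.length; i++) {
        sons[i] = new TreeSketch();
        sons[i].build(parts[i]);
        parts[i] = null;
      }
    } else {
      isLeaf = true;
      isEmpty = (data.length == 0);
    }
  }
}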
/**
 * Compute the combined DL of the ruleset in this class, i.e. the theory DL
 * plus the data DL. Note that this procedure computes the combined DL
 * according to the current status of the ruleset in this class.
 *
 * @param expFPRate expected FP/(FP+FN), used in the dataDL calculation
 * @param predicted the default classification if the ruleset covers no instances
 * @return the combined DL of the ruleset
 */
public double combinedDL(double expFPRate, double predicted) {

  double rt = 0;

  if (getRulesetSize() > 0) {
    double[] stats = (double[]) m_SimpleStats.lastElement();
    for (int j = getRulesetSize() - 2; j >= 0; j--) {
      stats[0] += getSimpleStats(j)[0];
      stats[2] += getSimpleStats(j)[2];
      stats[4] += getSimpleStats(j)[4];
    }
    rt += dataDL(expFPRate, stats[0], stats[1], stats[4], stats[5]); // Data DL
  } else { // Null coverage ruleset
    double fn = 0.0;
    for (int j = 0; j < m_Data.numInstances(); j++) {
      if ((int) m_Data.instance(j).classValue() == (int) predicted) {
        fn += m_Data.instance(j).weight();
      }
    }
    rt += dataDL(expFPRate, 0.0, m_Data.sumOfWeights(), 0.0, fn);
  }

  for (int i = 0; i < getRulesetSize(); i++) { // Theory DL
    rt += theoryDL(i);
  }

  return rt;
}
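/*
 * A hedged sketch of the aggregation loop above, assuming the six-element
 * simple-stats layout of Weka's RuleStats: {coverage, uncoverage, TP, TN, FP,
 * FN}. Starting from the last rule's stats, only coverage (index 0), TP
 * (index 2) and FP (index 4) are summed over the earlier rules, since the
 * last rule's uncoverage and FN already describe what the ruleset as a whole
 * leaves uncovered. The CombinedStatsSketch class and its ruleStats parameter
 * are hypothetical.
 */
import java.util.List;

class CombinedStatsSketch {

  // Aggregate per-rule simple stats into ruleset-wide stats, as combinedDL()
  // does (but on a clone rather than in place).
  static double[] combine(List<double[]> ruleStats) {
    double[] combined = ruleStats.get(ruleStats.size() - 1).clone();
    for (int j = ruleStats.size() - 2; j >= 0; j--) {
      combined[0] += ruleStats.get(j)[0]; // total coverage of the ruleset
      combined[2] += ruleStats.get(j)[2]; // total true positives
      combined[4] += ruleStats.get(j)[4]; // total false positives
    }
    // indices 0 (coverage), 1 (uncoverage), 4 (FP), 5 (FN) then feed dataDL()
    return combined;
  }
}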
/**
 * Generates the classifier.
 *
 * @param instances set of instances serving as training data
 * @throws Exception if the classifier has not been generated successfully
 */
public void buildClassifier(Instances instances) throws Exception {

  // can classifier handle the data?
  getCapabilities().testWithFail(instances);

  // remove instances with missing class
  instances = new Instances(instances);
  instances.deleteWithMissingClass();

  m_headerInfo = new Instances(instances, 0);
  m_numClasses = instances.numClasses();
  m_numAttributes = instances.numAttributes();
  m_probOfWordGivenClass = new double[m_numClasses][];

  /*
   * Initialising the matrix of word counts. NOTE: a Laplace estimator is
   * introduced in case a word that does not appear for a class in the
   * training set does so for the test set.
   */
  for (int c = 0; c < m_numClasses; c++) {
    m_probOfWordGivenClass[c] = new double[m_numAttributes];
    for (int att = 0; att < m_numAttributes; att++) {
      m_probOfWordGivenClass[c][att] = 1;
    }
  }

  // enumerate through the instances
  Instance instance;
  int classIndex;
  double numOccurrences;
  double[] docsPerClass = new double[m_numClasses];
  double[] wordsPerClass = new double[m_numClasses];

  java.util.Enumeration enumInsts = instances.enumerateInstances();
  while (enumInsts.hasMoreElements()) {
    instance = (Instance) enumInsts.nextElement();
    classIndex = (int) instance.value(instance.classIndex());
    docsPerClass[classIndex] += instance.weight();

    for (int a = 0; a < instance.numValues(); a++) {
      if (instance.index(a) != instance.classIndex()) {
        if (!instance.isMissing(a)) {
          numOccurrences = instance.valueSparse(a) * instance.weight();
          if (numOccurrences < 0) {
            throw new Exception(
              "Numeric attribute values must all be greater or equal to zero.");
          }
          wordsPerClass[classIndex] += numOccurrences;
          m_probOfWordGivenClass[classIndex][instance.index(a)] += numOccurrences;
        }
      }
    }
  }

  /*
   * Normalising the probOfWordGivenClass values and saving each value as the
   * log of that value.
   */
  for (int c = 0; c < m_numClasses; c++) {
    for (int v = 0; v < m_numAttributes; v++) {
      m_probOfWordGivenClass[c][v] = Math.log(
        m_probOfWordGivenClass[c][v] / (wordsPerClass[c] + m_numAttributes - 1));
    }
  }

  /*
   * Calculating Pr(H). NOTE: a Laplace estimator is introduced in case a class
   * does not get mentioned in the set of training instances.
   */
  final double numDocs = instances.sumOfWeights() + m_numClasses;
  m_probOfClass = new double[m_numClasses];
  for (int h = 0; h < m_numClasses; h++) {
    m_probOfClass[h] = (docsPerClass[h] + 1) / numDocs;
  }
}
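/*
 * A hedged sketch (hypothetical class and method, not Weka's API) of how the
 * quantities estimated above are used at prediction time: a document's score
 * for class c is its log prior plus the word counts weighted by the stored
 * log conditionals, i.e. log P(c) + sum over words w of f_w * log P(w|c).
 */
class MultinomialScoreSketch {

  static double[] logScores(double[] wordCounts, double[] probOfClass,
                            double[][] logProbOfWordGivenClass) {
    double[] scores = new double[probOfClass.length];
    for (int c = 0; c < probOfClass.length; c++) {
      scores[c] = Math.log(probOfClass[c]);  // log prior, from m_probOfClass
      for (int w = 0; w < wordCounts.length; w++) {
        // m_probOfWordGivenClass already stores log probabilities
        scores[c] += wordCounts[w] * logProbOfWordGivenClass[c][w];
      }
    }
    return scores;  // the argmax over these scores gives the predicted class
  }
}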