Example #1
  /**
   * Normalize the weights for the next iteration.
   *
   * @param training the training instances
   * @param oldSumOfWeights the sum of the instance weights before the current iteration
   * @throws Exception if something goes wrong
   */
  protected void normalizeWeights(Instances training, double oldSumOfWeights) throws Exception {

    // Renormalize weights
    double newSumOfWeights = training.sumOfWeights();
    for (Instance instance : training) {
      instance.setWeight(instance.weight() * oldSumOfWeights / newSumOfWeights);
    }
  }
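The renormalization preserves the total weight across an iteration: every weight is scaled by oldSumOfWeights/newSumOfWeights, so the sum returns to its previous value. A minimal standalone sketch of the same idea, assuming plain double[] weights instead of Weka's Instances (class and method names here are illustrative):

  class RenormalizeSketch {
    /** Scales the weights so their sum returns to oldSumOfWeights. */
    static void renormalize(double[] weights, double oldSumOfWeights) {
      double newSumOfWeights = 0;
      for (double w : weights) {
        newSumOfWeights += w;
      }
      for (int i = 0; i < weights.length; i++) {
        weights[i] *= oldSumOfWeights / newSumOfWeights;
      }
    }

    public static void main(String[] args) {
      double[] weights = {0.5, 1.5, 2.0}; // sums to 4.0
      renormalize(weights, 3.0); // total becomes 3.0 again: {0.375, 1.125, 1.5}
      System.out.println(java.util.Arrays.toString(weights));
    }
  }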
Example #2
  /**
   * Sets the weights for the next iteration.
   *
   * @param training the data to train with
   * @param reweight the reweighting factor
   * @throws Exception in case of an error
   */
  protected void setWeights(Instances training, double reweight) throws Exception {

    int subCmtySize = m_Classifiers.length / m_NumSubCmtys;

    if ((m_NumIterationsPerformed + 1) % subCmtySize == 0) {

      if (getDebug()) System.err.println(m_NumIterationsPerformed + " " + subCmtySize);

      double oldSumOfWeights = training.sumOfWeights();

      // Randomly set the instance weights by sampling -ln(U), the continuous Poisson distribution
      for (int i = 0; i < training.numInstances(); i++) {
        training.instance(i).setWeight(-Math.log((m_Random.nextDouble() * 9999) / 10000));
      }

      // Renormalise weights
      double sumProbs = training.sumOfWeights();
      for (int i = 0; i < training.numInstances(); i++) {
        training.instance(i).setWeight(training.instance(i).weight() * oldSumOfWeights / sumProbs);
      }
    } else {
      super.setWeights(training, reweight);
    }
  }
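The draw above is the exponential(1) distribution, -ln(U) for uniform U, which MultiBoosting-style committees use as a continuous stand-in for Poisson(1) resampling weights; the 9999/10000 scaling caps the argument at 0.9999 so every weight stays bounded away from zero. A hedged sketch of the draw in isolation (plain java.util.Random; the class name is illustrative):

  import java.util.Random;

  class ContinuousPoissonSketch {
    /** Draws -ln(U) with U capped at 0.9999, mirroring the weight draw above. */
    static double drawWeight(Random random) {
      return -Math.log((random.nextDouble() * 9999) / 10000);
    }

    public static void main(String[] args) {
      Random random = new Random(42);
      for (int i = 0; i < 5; i++) {
        System.out.println(drawWeight(random)); // mean is roughly 1
      }
    }
  }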
Example #3
  /**
   * Recomputes the class distributions stored in the tree from the given data.
   *
   * @param data the instances to redistribute over the tree
   * @throws Exception if the distributions cannot be recomputed
   */
  private void newDistribution(Instances data) throws Exception {
    Instances[] localInstances;
    m_localModel.resetDistribution(data);
    m_train = data;
    if (!m_isLeaf) {
      localInstances = (Instances[]) m_localModel.split(data);
      for (int i = 0; i < m_sons.length; i++) {
        m_sons[i].newDistribution(localInstances[i]);
      }
    } else {
      // Check whether there are some instances at the leaf now!
      if (!Utils.eq(data.sumOfWeights(), 0)) {
        m_isEmpty = false;
      }
    }
  }
Example #4
  /**
   * Boosting method: builds the boosted classifier from the given data.
   *
   * @param data the training data to be used for generating the boosted classifier.
   * @throws Exception if the classifier could not be built successfully
   */
  public void buildClassifier(Instances data) throws Exception {

    super.buildClassifier(data);

    // can classifier handle the data?
    getCapabilities().testWithFail(data);

    // remove instances with missing class
    data = new Instances(data);
    data.deleteWithMissingClass();

    m_SumOfWeights = data.sumOfWeights();

    if ((!m_UseResampling) && (m_Classifier instanceof WeightedInstancesHandler)) {
      buildClassifierWithWeights(data);
    } else {
      buildClassifierUsingResampling(data);
    }
  }
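If this buildClassifier belongs to a booster like Weka's AdaBoostM1 (which the m_UseResampling field and the WeightedInstancesHandler check suggest), a typical call looks roughly like the following; the dataset path is illustrative:

  import weka.classifiers.meta.AdaBoostM1;
  import weka.classifiers.trees.DecisionStump;
  import weka.core.Instances;
  import weka.core.converters.ConverterUtils.DataSource;

  class BoostingUsageSketch {
    public static void main(String[] args) throws Exception {
      Instances data = DataSource.read("train.arff"); // illustrative path
      data.setClassIndex(data.numAttributes() - 1);
      AdaBoostM1 booster = new AdaBoostM1();
      booster.setClassifier(new DecisionStump()); // implements WeightedInstancesHandler
      booster.buildClassifier(data); // takes the weighted branch, no resampling
    }
  }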
Example #5
  /**
   * Builds the tree structure for the given data.
   *
   * @param data the data for which the tree structure is to be generated
   * @throws Exception if the tree cannot be built
   */
  public void buildTree(Instances data) throws Exception {
    Instances[] localInstances;
    // m_test = null;
    m_isLeaf = false;
    m_isEmpty = false;
    m_sons = null;
    m_localModel = m_toSelectModel.selectModel(data);
    if (m_localModel.numSubsets() > 1) {
      localInstances = m_localModel.split(data);
      data = null;
      m_sons = new myJ48ClassifierTree[m_localModel.numSubsets()];
      for (int i = 0; i < m_sons.length; i++) {
        m_sons[i] = getNewTree(localInstances[i]);
        localInstances[i] = null;
      }
    } else {
      m_isLeaf = true;
      if (Utils.eq(data.sumOfWeights(), 0)) {
        m_isEmpty = true;
      }
      data = null;
    }
  }
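Stripped of the Weka types, the method is a plain recursive partition: ask the model for a split, recurse into each subset while the split yields more than one, and stop at a leaf otherwise. A minimal sketch under that reading, with a hypothetical split function passed in (all names are illustrative):

  import java.util.List;
  import java.util.function.Function;

  class TreeBuildSketch {
    TreeBuildSketch[] sons;
    boolean isLeaf;

    /** Recursive skeleton: split while the model yields more than one subset, else become a leaf. */
    void build(List<Object> data, Function<List<Object>, List<List<Object>>> split) {
      List<List<Object>> subsets = split.apply(data);
      if (subsets.size() > 1) {
        sons = new TreeBuildSketch[subsets.size()];
        for (int i = 0; i < sons.length; i++) {
          sons[i] = new TreeBuildSketch();
          sons[i].build(subsets.get(i), split);
        }
      } else {
        isLeaf = true; // a leaf with no remaining data corresponds to m_isEmpty above
      }
    }

    public static void main(String[] args) {
      // hypothetical split: halve the data until single elements remain
      Function<List<Object>, List<List<Object>>> split = data ->
          data.size() > 1
              ? List.of(data.subList(0, data.size() / 2), data.subList(data.size() / 2, data.size()))
              : List.of(data);
      new TreeBuildSketch().build(List.<Object>of(1, 2, 3, 4), split);
    }
  }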
Example #6
  /**
   * Computes the combined DL of the ruleset in this class, i.e. the theory DL plus the data DL.
   * Note that this procedure computes the combined DL according to the current status of the
   * ruleset in this class.
   *
   * @param expFPRate expected FP/(FP+FN), used in the dataDL calculation
   * @param predicted the default classification if the ruleset has null coverage
   * @return the combined DL of the ruleset
   */
  public double combinedDL(double expFPRate, double predicted) {
    double rt = 0;

    if (getRulesetSize() > 0) {
      double[] stats = (double[]) m_SimpleStats.lastElement();
      // Fold the earlier rules' stats into the last rule's; in Weka's RuleStats convention,
      // index 0 is coverage, 2 is true positives and 4 is false positives
      for (int j = getRulesetSize() - 2; j >= 0; j--) {
        stats[0] += getSimpleStats(j)[0];
        stats[2] += getSimpleStats(j)[2];
        stats[4] += getSimpleStats(j)[4];
      }
      rt += dataDL(expFPRate, stats[0], stats[1], stats[4], stats[5]); // Data DL
    } else { // Null coverage ruleset
      double fn = 0.0;
      for (int j = 0; j < m_Data.numInstances(); j++)
        if ((int) m_Data.instance(j).classValue() == (int) predicted)
          fn += m_Data.instance(j).weight();
      rt += dataDL(expFPRate, 0.0, m_Data.sumOfWeights(), 0.0, fn);
    }

    for (int i = 0; i < getRulesetSize(); i++) { // Theory DL
      rt += theoryDL(i);
    }

    return rt;
  }
Example #7
  /**
   * Generates the classifier.
   *
   * @param instances set of instances serving as training data
   * @throws Exception if the classifier has not been generated successfully
   */
  public void buildClassifier(Instances instances) throws Exception {
    // can classifier handle the data?
    getCapabilities().testWithFail(instances);

    // remove instances with missing class
    instances = new Instances(instances);
    instances.deleteWithMissingClass();

    m_headerInfo = new Instances(instances, 0);
    m_numClasses = instances.numClasses();
    m_numAttributes = instances.numAttributes();
    m_probOfWordGivenClass = new double[m_numClasses][];

    /*
      initialising the matrix of word counts
      NOTE: a Laplace estimator is used so that a word that never appears with a class in the
      training set does not zero out that class when it appears in the test set
    */
    for (int c = 0; c < m_numClasses; c++) {
      m_probOfWordGivenClass[c] = new double[m_numAttributes];
      for (int att = 0; att < m_numAttributes; att++) {
        m_probOfWordGivenClass[c][att] = 1;
      }
    }

    // enumerate through the instances
    Instance instance;
    int classIndex;
    double numOccurrences;
    double[] docsPerClass = new double[m_numClasses];
    double[] wordsPerClass = new double[m_numClasses];

    java.util.Enumeration enumInsts = instances.enumerateInstances();
    while (enumInsts.hasMoreElements()) {
      instance = (Instance) enumInsts.nextElement();
      classIndex = (int) instance.value(instance.classIndex());
      docsPerClass[classIndex] += instance.weight();

      for (int a = 0; a < instance.numValues(); a++)
        if (instance.index(a) != instance.classIndex()) {
          if (!instance.isMissing(a)) {
            numOccurrences = instance.valueSparse(a) * instance.weight();
            if (numOccurrences < 0)
              throw new Exception("Numeric attribute values must all be greater than or equal to zero.");
            wordsPerClass[classIndex] += numOccurrences;
            m_probOfWordGivenClass[classIndex][instance.index(a)] += numOccurrences;
          }
        }
    }

    /*
      normalising the probOfWordGivenClass values
      and storing each value as its logarithm
    */
    for (int c = 0; c < m_numClasses; c++)
      for (int v = 0; v < m_numAttributes; v++)
        m_probOfWordGivenClass[c][v] =
            Math.log(m_probOfWordGivenClass[c][v] / (wordsPerClass[c] + m_numAttributes - 1));

    /*
      calculating Pr(H)
      NOTE: Laplace estimator introduced in case a class does not get mentioned in the set of
      training instances
    */
    final double numDocs = instances.sumOfWeights() + m_numClasses;
    m_probOfClass = new double[m_numClasses];
    for (int h = 0; h < m_numClasses; h++)
      m_probOfClass[h] = (docsPerClass[h] + 1) / numDocs;
  }
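To see the add-one (Laplace) smoothing arithmetic outside Weka, here is a minimal sketch with hard-coded word counts; every number and name is illustrative, and the denominator follows the standard count-plus-vocabulary-size form that the code above mirrors:

  class LaplaceSmoothingSketch {
    public static void main(String[] args) {
      // raw word counts per class: 2 classes x 3 vocabulary words
      double[][] counts = {{3, 0, 2}, {1, 4, 0}};
      int vocabSize = counts[0].length;
      for (int c = 0; c < counts.length; c++) {
        double wordsInClass = 0;
        for (double v : counts[c]) {
          wordsInClass += v;
        }
        for (int w = 0; w < vocabSize; w++) {
          // add-one smoothing: (count + 1) / (total + vocabSize), stored as a log
          double logProb = Math.log((counts[c][w] + 1) / (wordsInClass + vocabSize));
          System.out.printf("log P(word%d | class%d) = %.4f%n", w, c, logProb);
        }
      }
    }
  }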