Code example #1
  /**
   * Computes class distribution for an attribute. No longer used as of 0.99. Based on the
   * splitData function from "weka.classifiers.trees.RandomTree", with the following changes:
   *
   * <ul>
   *   <li>entropy pre-split is not computed at this point as the only thing relevant for the
   *       (comparative) goodness of a split is entropy after splitting
   *   <li>dist[][] is now computed only after the split point has been found, and not updated
   *       continually by copying from currDist
   *   <li>also, in Weka's RandomTree it was possible to create a split 'in the middle' of instance
   *       0, which would result in empty nodes after the split; this is now fixed
   *   <li>instance 0 is now generally skipped when looking for split points, as the split point
   *       'before instance 0' is not sensible; in versions prior to 0.96 this change introduced a
   *       bug where attributes with all missing values had their dists computed wrongly, which
   *       might result in useless (but harmless) branches being added to the tree
   * </ul>
   *
   * @param props gets filled with relative sizes of branches (total = 1), indexed first per
   *     attribute
   * @param dists these are the contingency matrices, indexed first per attribute
   * @param att the attribute index (which one to change)
   * @param sortedIndices the indices of the instances, sorted by this attribute's values
   */
  protected double distribution(
      double[][] props, double[][][] dists, int att, int[] sortedIndices) {

    double splitPoint = -Double.MAX_VALUE;
    double[][] dist = null; // a contingency table of the split point vs class
    int i;

    if (data.isAttrNominal(att)) { // ====================== nominal attributes

      dist = new double[data.attNumVals[att]][data.numClasses];
      for (i = 0; i < sortedIndices.length; i++) {
        int inst = sortedIndices[i];
        if (data.isValueMissing(att, inst)) break;
        dist[(int) data.vals[att][inst]][data.instClassValues[inst]] += data.instWeights[inst];
      }

      splitPoint = 0; // signals we've found a sensible split point; by
      // definition, a split on a nominal attribute is sensible

    } else { // ============================================ numeric attributes

      double[][] currDist = new double[2][data.numClasses];
      dist = new double[2][data.numClasses];

      // begin with moving all instances into second subset
      for (int j = 0; j < sortedIndices.length; j++) {
        int inst = sortedIndices[j];
        if (data.isValueMissing(att, inst)) break;
        currDist[1][data.instClassValues[inst]] += data.instWeights[inst];
      }
      copyDists(currDist, dist);
      // for (int j = 0; j < currDist.length; j++)
      //  System.arraycopy(currDist[j], 0, dist[j], 0, dist[j].length);

      double currVal = -Double.MAX_VALUE; // current value of splitting criterion
      double bestVal = -Double.MAX_VALUE; // best value of splitting criterion
      int bestI = 0; // the value of "i" BEFORE which the splitpoint is placed

      for (i = 1; i < sortedIndices.length; i++) { // --- try all split points

        int inst = sortedIndices[i];
        if (data.isValueMissing(att, inst)) break;

        int prevInst = sortedIndices[i - 1];

        currDist[0][data.instClassValues[prevInst]] += data.instWeights[prevInst];
        currDist[1][data.instClassValues[prevInst]] -= data.instWeights[prevInst];

        // do not allow splitting between two instances with the same value
        if (data.vals[att][inst] > data.vals[att][prevInst]) {

          // we want the lowest impurity after the split; at this point, we don't
          // really care what we had before splitting
          currVal = -SplitCriteria.entropyConditionedOnRows(currDist);

          if (currVal > bestVal) {
            bestVal = currVal;
            bestI = i;
          }
        }
      } // ------- end split points

      /*
       * Determine the best split point:
       * bestI == 0 only if all instances had missing values, there were fewer
       * than 2 instances, or all non-missing values were equal; splitPoint
       * will then remain set to -Double.MAX_VALUE. This is not really a useful
       * split, as all of the instances are 'below' the split line, but at
       * least it's formally correct. And the dists[] also has a default value
       * set previously.
       */
      if (bestI > 0) { // ...at least one valid splitpoint was found

        int instJustBeforeSplit = sortedIndices[bestI - 1];
        int instJustAfterSplit = sortedIndices[bestI];
        splitPoint =
            (data.vals[att][instJustAfterSplit] + data.vals[att][instJustBeforeSplit]) / 2.0;

        // Now make the correct dist[] from the default dist[] (all instances
        // in the second branch) by iterating through instances until we reach
        // bestI, and then stopping.
        for (int ii = 0; ii < bestI; ii++) {
          int inst = sortedIndices[ii];
          dist[0][data.instClassValues[inst]] += data.instWeights[inst];
          dist[1][data.instClassValues[inst]] -= data.instWeights[inst];
        }
      }
    } // ================================================== nominal or numeric?

    // compute total weights for each branch (= props)
    props[att] = countsToFreqs(dist);

    // distribute counts of instances with missing values

    // ver 0.96 - check for special case when *all* instances have missing vals
    if (data.isValueMissing(att, sortedIndices[0])) i = 0;

    while (i < sortedIndices.length) {
      int inst = sortedIndices[i];
      for (int branch = 0; branch < dist.length; branch++) {
        dist[branch][data.instClassValues[inst]] += props[att][branch] * data.instWeights[inst];
      }
      i++;
    }

    // return distribution after split and best split point
    dists[att] = dist;
    return splitPoint;
  }
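
For reference, a minimal sketch of the two helpers the method above relies on, inferred from how they are called here; the actual FastRandomTree implementations may differ in detail (e.g. in how an all-zero table is normalized, and code example #2 uses an overload of countsToFreqs that writes into a supplied array instead of allocating one).

  /** Sketch: copies one contingency table into another of the same shape. */
  protected static void copyDists(double[][] source, double[][] dest) {
    for (int b = 0; b < source.length; b++) {
      System.arraycopy(source[b], 0, dest[b], 0, source[b].length);
    }
  }

  /** Sketch: converts per-branch counts into relative branch sizes (total = 1). */
  protected static double[] countsToFreqs(double[][] dist) {
    double[] props = new double[dist.length];
    double total = 0;
    for (int b = 0; b < dist.length; b++) {
      for (double w : dist[b]) props[b] += w; // total weight in branch b
      total += props[b];
    }
    for (int b = 0; b < props.length; b++) {
      // assumption: an empty table yields uniform branch sizes rather than NaNs
      props[b] = (total > 0) ? props[b] / total : 1.0 / props.length;
    }
    return props;
  }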
Code example #2
  /**
   * Computes class distribution for an attribute. New in FastRF 0.99, main changes:
   *
   * <ul>
   *   <li>now reuses the temporary counting arrays (this.tempDists, this.tempDistsOthers) instead
   *       of creating/destroying arrays
   *   <li>does not create a new "dists" for each attribute it examines; instead it replaces the
   *       existing "dists" (supplied as a parameter) but only if the split is better than the
   *       previous best split
   *   <li>always creates binary splits, even for categorical variables; thus might give slightly
   *       different classification results than the old RandomForest
   * </ul>
   *
   * @param propsBestAtt gets filled with relative sizes of branches (total = 1) for the best
   *     examined attribute so far; updated ONLY if the current attribute is better than the
   *     previous best
   * @param distsBestAtt these are the contingency matrices for the best examined attribute so far;
   *     updated ONLY if the current attribute is better than the previous best
   * @param scoreBestAtt Checked against the score of the attToExamine to determine if the
   *     propsBestAtt and distsBestAtt need to be updated.
   * @param attToExamine the attribute index (which one to examine, and change the above matrices if
   *     the attribute is better than the previous one)
   * @param sortedIndicesOfAtt the indices of the instances, sorted by the values of attToExamine.
   * @param startAt Index in sortedIndicesOfAtt; do not touch anything below this index.
   * @param endAt Index in sortedIndicesOfAtt; do not touch anything after this index.
   */
  protected double distributionSequentialAtt(
      double[] propsBestAtt,
      double[][] distsBestAtt,
      double scoreBestAtt,
      int attToExamine,
      int[] sortedIndicesOfAtt,
      int startAt,
      int endAt) {

    double splitPoint = -Double.MAX_VALUE;

    // a contingency table of the split point vs class.
    double[][] dist = this.tempDists;
    Arrays.fill(dist[0], 0.0);
    Arrays.fill(dist[1], 0.0);
    double[][] currDist = this.tempDistsOther;
    Arrays.fill(currDist[0], 0.0);
    Arrays.fill(currDist[1], 0.0);
    // double[][] dist = new double[2][data.numClasses];
    // double[][] currDist = new double[2][data.numClasses];

    int i;
    int sortedIndicesOfAttLength = endAt - startAt + 1;

    // find how many missing values we have for this attribute (they're always at the end)
    int lastNonmissingValIdx = endAt;
    for (int j = endAt; j >= startAt; j--) {
      if (data.isValueMissing(attToExamine, sortedIndicesOfAtt[j])) {
        lastNonmissingValIdx = j - 1;
      } else {
        break;
      }
    }
    if (lastNonmissingValIdx < startAt) { // only missing values in this feature??
      return Double.NaN; // we cannot split on it
    }

    if (data.isAttrNominal(attToExamine)) { // ====================== nominal attributes

      // 0.99: new routine - makes a one-vs-all split on categorical attributes

      int numLvls = data.attNumVals[attToExamine];
      int bestLvl = 0; // the index of the category which is best to "split out"

      // note: if we have only two levels, it doesn't matter which one we "split out"
      // we can thus safely check only the first one
      if (numLvls <= 2) {

        bestLvl = 0; // this means that the category with index 0 always
        // goes 'above' the split and category with index 1 goes 'below' the split
        for (i = startAt; i <= lastNonmissingValIdx; i++) {
          int inst = sortedIndicesOfAtt[i];
          dist[(int) data.vals[attToExamine][inst]][data.instClassValues[inst]] +=
              data.instWeights[inst];
        }

      } else { // for >2 levels, we have to search different splits

        // begin with moving all instances into second subset ("below split")
        for (int j = startAt; j <= lastNonmissingValIdx; j++) {
          int inst = sortedIndicesOfAtt[j];
          currDist[1][data.instClassValues[inst]] += data.instWeights[inst];
        }
        // create a default dist[] which we'll modify after we find the best class to split out
        copyDists(currDist, dist);

        double currVal = -Double.MAX_VALUE; // current value of splitting criterion
        double bestVal = -Double.MAX_VALUE; // best value of splitting criterion
        int lastSeen = startAt; // used to avoid looping through all instances for every lvl

        for (int lvl = 0; lvl < numLvls; lvl++) {

          // reset the currDist to the default (everything "below split") - conveniently stored in
          // dist[][]
          copyDists(dist, currDist);

          for (i = lastSeen; i <= lastNonmissingValIdx; i++) {

            lastSeen = i;
            int inst = sortedIndicesOfAtt[i];
            if ((int) data.vals[attToExamine][inst] < lvl) {
              continue;
            } else if ((int) data.vals[attToExamine][inst] == lvl) {
              // move to "above split" from "below split"
              currDist[0][data.instClassValues[inst]] += data.instWeights[inst];
              currDist[1][data.instClassValues[inst]] -= data.instWeights[inst];
            } else {
              break; // no need to loop forward, no more instances of this category
            }
          }

          // we filled "currDist" for the current level; compute its score and see if we like it
          currVal = -SplitCriteria.entropyConditionedOnRows(currDist);
          if (currVal > bestVal) {
            bestVal = currVal;
            bestLvl = lvl;
          }
        } // examine how well "splitting out" of individual levels works for us

        // remember the contingency table from the best "lvl" and store it in "dist"
        for (i = startAt; i <= lastNonmissingValIdx; i++) {

          int inst = sortedIndicesOfAtt[i];
          if ((int) data.vals[attToExamine][inst] == bestLvl) {
            // move to "above split" from "below split"
            dist[0][data.instClassValues[inst]] += data.instWeights[inst];
            dist[1][data.instClassValues[inst]] -= data.instWeights[inst];
          } else if ((int) data.vals[attToExamine][inst] > bestLvl) {
            // values are sorted, so no more instances of bestLvl can follow
            break;
          } // instances with values below bestLvl are skipped, not split out
        }
      }

      splitPoint = bestLvl; // signals we've found a sensible split point; by
      // definition, a split on a nominal attribute
      // will always be sensible

    } else { // ============================================ numeric attributes

      // re-use the 2 x nClass temporary arrays created when tree was initialized
      // Arrays.fill( dist[0], 0.0 );
      // Arrays.fill( dist[1], 0.0 );

      // begin with moving all instances into second subset ("below split")
      for (int j = startAt; j <= lastNonmissingValIdx; j++) {
        int inst = sortedIndicesOfAtt[j];
        currDist[1][data.instClassValues[inst]] += data.instWeights[inst];
      }
      copyDists(currDist, dist);

      double currVal = -Double.MAX_VALUE; // current value of splitting criterion
      double bestVal = -Double.MAX_VALUE; // best value of splitting criterion
      int bestI = 0; // the value of "i" BEFORE which the splitpoint is placed

      for (i = startAt + 1; i <= lastNonmissingValIdx; i++) { // --- try all split points

        int inst = sortedIndicesOfAtt[i];

        int prevInst = sortedIndicesOfAtt[i - 1];

        currDist[0][data.instClassValues[prevInst]] += data.instWeights[prevInst];
        currDist[1][data.instClassValues[prevInst]] -= data.instWeights[prevInst];

        // do not allow splitting between two instances with the same value
        if (data.vals[attToExamine][inst] > data.vals[attToExamine][prevInst]) {

          // we want the lowest impurity after the split; at this point, we don't
          // really care what we had before splitting
          currVal = -SplitCriteria.entropyConditionedOnRows(currDist);

          if (currVal > bestVal) {
            bestVal = currVal;
            bestI = i;
          }
        }
      } // ------- end trying split points

      /*
       * Determine the best split point:
       * bestI remains 0 (and thus <= startAt) only if no valid split point was
       * found: fewer than 2 non-missing instances, or all non-missing values
       * equal. splitPoint will then remain set to -Double.MAX_VALUE. This is
       * not really a useful split, as all of the instances are 'below' the
       * split line, but at least it's formally correct. And the dists[] also
       * has a default value set previously.
       */
      if (bestI > startAt) { // ...at least one valid splitpoint was found

        int instJustBeforeSplit = sortedIndicesOfAtt[bestI - 1];
        int instJustAfterSplit = sortedIndicesOfAtt[bestI];
        splitPoint =
            (data.vals[attToExamine][instJustAfterSplit]
                    + data.vals[attToExamine][instJustBeforeSplit])
                / 2.0;

        // now make the correct dist[] (for the best split point) from the
        // default dist[] (all instances in the second branch) by iterating
        // through instances until we reach bestI, and then stopping.
        for (int ii = startAt; ii < bestI; ii++) {
          int inst = sortedIndicesOfAtt[ii];
          dist[0][data.instClassValues[inst]] += data.instWeights[inst];
          dist[1][data.instClassValues[inst]] -= data.instWeights[inst];
        }
      }
    } // ================================================== nominal or numeric?

    // compute total weights for each branch (= props)
    // again, we reuse the tempProps of the tree not to create/destroy new arrays
    double[] props = this.tempProps;
    countsToFreqs(dist, props); // props gets overwritten, its previous contents don't matter

    // distribute *counts* of instances with missing values using the "props"
    i = lastNonmissingValIdx + 1; // start one past the last non-missing value (if any remain)
    while (i <= endAt) {
      int inst = sortedIndicesOfAtt[i];
      dist[0][data.instClassValues[inst]] += props[0] * data.instWeights[inst];
      dist[1][data.instClassValues[inst]] += props[1] * data.instWeights[inst];
      i++;
    }

    // update the distribution after split and best split point
    // but ONLY if better than the previous one -- we need to recalculate the
    // entropy (because this changes after redistributing the instances with
    // missing values in the current attribute). Also, for categorical variables
    // it was not calculated before.
    double curScore = -SplitCriteria.entropyConditionedOnRows(dist);
    if (curScore > scoreBestAtt && splitPoint > -Double.MAX_VALUE) {
      // overwrite the "distsBestAtt" and "propsBestAtt" with current values
      copyDists(dist, distsBestAtt);
      System.arraycopy(props, 0, propsBestAtt, 0, props.length);
      return splitPoint;
    } else {
      // returns a NaN instead of the splitpoint if the attribute was not better than a previous
      // one.
      return Double.NaN;
    }
  }
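
Both versions score a candidate split as -SplitCriteria.entropyConditionedOnRows(currDist), i.e. they minimize the weighted average class entropy over the two branches. Below is a sketch of that criterion, modeled on Weka's ContingencyTables.entropyConditionedOnRows() from which FastRF derives; the actual SplitCriteria class is assumed to behave equivalently.

  /** Sketch: weighted average entropy of the rows (branches) of a contingency table, in bits. */
  static double entropyConditionedOnRows(double[][] matrix) {
    double sum = 0, total = 0;
    for (double[] row : matrix) {
      double rowSum = 0;
      for (double cell : row) {
        sum += lnFunc(cell);
        rowSum += cell;
      }
      sum -= lnFunc(rowSum);
      total += rowSum;
    }
    if (total <= 0) return 0;
    return -sum / (total * Math.log(2)); // convert from nats to bits
  }

  /** x * ln(x), defined as 0 for x <= 0. */
  static double lnFunc(double num) {
    return (num <= 0) ? 0 : num * Math.log(num);
  }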
Code example #3
  /**
   * Recursively generates a tree. A derivative of the buildTree function from the
   * "weka.classifiers.trees.RandomTree" class, with the following changes made:
   *
   * <ul>
   *   <li>m_ClassProbs are now remembered only in leaves, not in every node of the tree
   *   <li>m_Distribution has been removed
   *   <li>members of dists, splits, props and vals arrays which are not used are dereferenced prior
   *       to recursion to reduce memory requirements
   *   <li>a check for "branch with no training instances" is now (FastRF 0.98) made before
   *       recursion; with the current implementation of splitData(), empty branches can appear only
   *       with nominal attributes with more than two categories
   *   <li>each new 'tree' (i.e. node or leaf) is passed a reference to its 'mother forest',
   *       necessary to look up parameters such as maxDepth and K
   *   <li>pre-split entropy is not recalculated unnecessarily
   *   <li>uses DataCache instead of weka.core.Instances, the reference to the DataCache is stored
   *       as a field in FastRandomTree class and not passed recursively down new buildTree() calls
   *   <li>similarly, a reference to the random number generator is stored in a field of the
   *       DataCache
   *   <li>m_ClassProbs are now normalized by dividing with number of instances in leaf, instead of
   *       forcing the sum of class probabilities to 1.0; this has a large effect when
   *       class/instance weights are set by user
   *   <li>a little imprecision is allowed in checking whether there was a decrease in entropy after
   *       splitting
   *   <li>0.99: the temporary arrays splits, props, vals are no longer as wide as the full number
   *       of attributes in the dataset (of which only "k" columns of randomly chosen attributes
   *       were filled). Now there is just a single array, which gets replaced as the k features
   *       are evaluated sequentially, and only if the next feature is better than a previous one.
   *   <li>0.99: the SortedIndices are now not cut up into smaller arrays on every split, but rather
   *       re-sorted within the same array in the splitDataNew(), and passed down to buildTree() as
   *       the original large matrix, but with start and end points explicitly specified
   * </ul>
   *
   * @param sortedIndices the indices of the instances of the whole bootstrap replicate
   * @param startAt First index of the instance to consider in this split; inclusive.
   * @param endAt Last index of the instance to consider; inclusive.
   * @param classProbs the class distribution
   * @param debug whether debugging is on
   * @param attIndicesWindow the attribute window to choose attributes from
   * @param depth the current depth
   */
  protected void buildTree(
      int[][] sortedIndices,
      int startAt,
      int endAt,
      double[] classProbs,
      boolean debug,
      int[] attIndicesWindow,
      int depth) {

    m_Debug = debug;
    int sortedIndicesLength = endAt - startAt + 1;

    // Check if node doesn't contain enough instances or is pure
    // or maximum depth reached, make leaf.
    if ((sortedIndicesLength < Math.max(2, getMinNum())) // small
        || Utils.eq(classProbs[Utils.maxIndex(classProbs)], Utils.sum(classProbs)) // pure
        || ((getMaxDepth() > 0) && (depth >= getMaxDepth())) // deep
    ) {
      m_Attribute = -1; // indicates leaf (no useful attribute to split on)

      // normalize by dividing with the number of instances (as of ver. 0.97)
      // unless leaf is empty - this can happen with splits on nominal
      // attributes with more than two categories
      if (sortedIndicesLength != 0)
        for (int c = 0; c < classProbs.length; c++) {
          classProbs[c] /= sortedIndicesLength;
        }
      m_ClassProbs = classProbs;
      this.data = null;
      return;
    } // (leaf making)

    // new 0.99: all the following are for the best attribute only! they're updated while iterating
    // sequentially through the attributes
    double val = Double.NaN; // value of splitting criterion
    // class distributions (contingency table), indexed first by branch, then by class
    double[][] dist = new double[2][data.numClasses];
    double[] prop = new double[2]; // the branch sizes (as fraction)
    double split = Double.NaN; // split point

    // Investigate K random attributes
    int attIndex = 0;
    int windowSize = attIndicesWindow.length;
    int k = getKValue();
    boolean sensibleSplitFound = false;
    double prior = Double.NaN;
    double bestNegPosterior = -Double.MAX_VALUE;
    int bestAttIdx = -1;

    while ((windowSize > 0) && (k-- > 0 || !sensibleSplitFound)) {

      int chosenIndex = data.reusableRandomGenerator.nextInt(windowSize);
      attIndex = attIndicesWindow[chosenIndex];

      // shift chosen attIndex out of window
      attIndicesWindow[chosenIndex] = attIndicesWindow[windowSize - 1];
      attIndicesWindow[windowSize - 1] = attIndex;
      windowSize--;

      // new: 0.99
      double candidateSplit =
          distributionSequentialAtt(
              prop, dist, bestNegPosterior, attIndex, sortedIndices[attIndex], startAt, endAt);

      if (Double.isNaN(candidateSplit)) {
        continue; // we did not improve over a previous attribute! "dist" is unchanged from before
      }
      // by this point we know we have an improvement, so we keep the new split point
      split = candidateSplit;
      bestAttIdx = attIndex;

      // the prior needs to be computed only once per branch - it is the same for all attributes,
      // even regardless of missing values
      if (Double.isNaN(prior)) {
        prior = SplitCriteria.entropyOverColumns(dist);
      }

      double negPosterior =
          -SplitCriteria.entropyConditionedOnRows(dist); // this is an updated dist
      if (negPosterior > bestNegPosterior) {
        bestNegPosterior = negPosterior;
      } else {
        throw new IllegalArgumentException("Very strange!");
      }

      val = prior - (-negPosterior); // we want the greatest reduction in entropy
      if (val > 1e-2) { // we allow some leeway here to compensate
        sensibleSplitFound = true; // for imprecision in entropy computation
      }
    } // feature by feature in window

    if (sensibleSplitFound) {

      m_Attribute = bestAttIdx; // the best attribute found
      m_SplitPoint = split;
      m_Prop = prop;
      prop = null; // can be GC'ed

      // int[][][] subsetIndices =
      //        new int[dist.length][data.numAttributes][];
      // splitData( subsetIndices, m_Attribute,
      //        m_SplitPoint, sortedIndices );
      // int numInstancesBeforeSplit = sortedIndices[0].length;

      int belowTheSplitStartsAt =
          splitDataNew(m_Attribute, m_SplitPoint, sortedIndices, startAt, endAt);

      m_Successors = new FastRandomTree[dist.length]; // dist.length now always == 2
      for (int i = 0; i < dist.length; i++) {
        m_Successors[i] = new FastRandomTree();
        m_Successors[i].m_MotherForest = this.m_MotherForest;
        m_Successors[i].data = this.data;
        // new in 0.99 - used in distributionSequentialAtt()
        m_Successors[i].tempDists = this.tempDists;
        m_Successors[i].tempDistsOther = this.tempDistsOther;
        m_Successors[i].tempProps = this.tempProps;

        // check if we're about to make an empty branch - this can happen with
        // nominal attributes with more than two categories (as of ver. 0.98)
        if (belowTheSplitStartsAt - startAt == 0) {
          // in this case, modify the chosenAttDists[i] so that it contains
          // the current, before-split class probabilities, properly normalized
          // by the number of instances (as we won't be able to normalize
          // after the split)
          for (int j = 0; j < dist[i].length; j++) dist[i][j] = classProbs[j] / sortedIndicesLength;
        }

        if (i == 0) { // before split
          m_Successors[i].buildTree(
              sortedIndices,
              startAt,
              belowTheSplitStartsAt - 1,
              dist[i],
              m_Debug,
              attIndicesWindow,
              depth + 1);
        } else { // after split
          m_Successors[i].buildTree(
              sortedIndices,
              belowTheSplitStartsAt,
              endAt,
              dist[i],
              m_Debug,
              attIndicesWindow,
              depth + 1);
        }

        dist[i] = null;
      }
      sortedIndices = null;

    } else { // ------ make leaf --------

      m_Attribute = -1;

      // normalize by dividing with the number of instances (as of ver. 0.97)
      // unless leaf is empty - this can happen with splits on nominal attributes
      if (sortedIndicesLength != 0)
        for (int c = 0; c < classProbs.length; c++) {
          classProbs[c] /= sortedIndicesLength;
        }

      m_ClassProbs = classProbs;
    }

    this.data = null; // dereference all pointers so data can be GC'd after tree is built
  }
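
Not shown in these excerpts is how a finished node routes instances at prediction time. The following is a hedged sketch consistent with the binary splits produced above, an illustration rather than the verbatim FastRandomTree prediction code: for a nominal attribute, m_SplitPoint stores the index of the category that was "split out" into branch 0, while for a numeric attribute it stores the midpoint threshold. An instance with a missing value would instead be sent down both branches, weighted by m_Prop.

  /** Sketch: which successor an instance with a non-missing value falls into. */
  int branchOf(double attVal, boolean attrIsNominal, double m_SplitPoint) {
    if (attrIsNominal) {
      // the "split out" category goes above the split (branch 0), all others below
      return ((int) attVal == (int) m_SplitPoint) ? 0 : 1;
    } else {
      // numeric: values strictly below the midpoint threshold go to branch 0
      return (attVal < m_SplitPoint) ? 0 : 1;
    }
  }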