Code example #1
  /**
   * Computes class distribution of an instance using the FastRandomTree.
   *
   * <p>Works correctly only if the DataCache has the same attributes as the one used to train the
   * FastRandomTree - but this function does not check for that!
   *
   * <p>Main use of this is to compute out-of-bag error (also when finding feature importances).
   *
   * @param instance the instance to compute the distribution for
   * @return the computed class distribution
   * @throws Exception if computation fails
   */
  /**
   * Computes the class distribution for an instance stored in a DataCache by recursively
   * descending this FastRandomTree.
   *
   * <p>Works correctly only if the DataCache has the same attributes as the one used to train the
   * FastRandomTree - but this function does not check for that!
   *
   * <p>Main use of this is to compute out-of-bag error (also when finding feature importances).
   *
   * @param data the DataCache holding the instance's attribute values
   * @param instIdx index of the instance within the DataCache
   * @return the computed class distribution
   */
  public double[] distributionForInstanceInDataCache(DataCache data, int instIdx) {

    // leaf node: the stored class probabilities are the answer
    if (m_Attribute <= -1) {
      return m_ClassProbs;
    }

    // interior node: route the instance according to the split attribute
    if (data.isValueMissing(m_Attribute, instIdx)) {
      // missing split value: send the instance down every branch and blend the
      // resulting distributions, weighted by the branch proportions (m_Prop)
      double[] combined = new double[m_MotherForest.getM_Info().numClasses()];
      for (int branch = 0; branch < m_Successors.length; branch++) {
        double[] sub = m_Successors[branch].distributionForInstanceInDataCache(data, instIdx);
        if (sub == null) {
          continue;
        }
        for (int cls = 0; cls < sub.length; cls++) {
          combined[cls] += m_Prop[branch] * sub[cls];
        }
      }
      return combined;
    }

    // 0.99: binary splits also for nominal attributes - the category equal to
    // m_SplitPoint goes to branch 0, every other category to branch 1;
    // numeric values below m_SplitPoint go to branch 0, the rest to branch 1
    final int branch;
    if (data.isAttrNominal(m_Attribute)) {
      branch = (data.vals[m_Attribute][instIdx] == m_SplitPoint) ? 0 : 1;
    } else {
      branch = (data.vals[m_Attribute][instIdx] < m_SplitPoint) ? 0 : 1;
    }
    return m_Successors[branch].distributionForInstanceInDataCache(data, instIdx);
  }
Code example #2
  /**
   * Computes class distribution for an attribute. New in FastRF 0.99, main changes:
   *
   * <ul>
   *   <li>now reuses the temporary counting arrays (this.tempDists, this.tempDistsOthers) instead
   *       of creating/destroying arrays
   *   <li>does not create a new "dists" for each attribute it examines; instead it replaces the
   *       existing "dists" (supplied as a parameter) but only if the split is better than the
   *       previous best split
   *   <li>always creates binary splits, even for categorical variables; thus might give slightly
   *       different classification results than the old RandomForest
   * </ul>
   *
   * @param propsBestAtt gets filled with relative sizes of branches (total = 1) for the best
   *     examined attribute so far; updated ONLY if current attribute is better that the previous
   *     best
   * @param distsBestAtt these are the contingency matrices for the best examined attribute so far;
   *     updated ONLY if current attribute is better that the previous best
   * @param scoreBestAtt Checked against the score of the attToExamine to determine if the
   *     propsBestAtt and distsBestAtt need to be updated.
   * @param attToExamine the attribute index (which one to examine, and change the above matrices if
   *     the attribute is better than the previous one)
   * @param sortedIndicesOfAtt the sorted indices of the vals for the attToExamine.
   * @param startAt Index in sortedIndicesOfAtt; do not touch anything below this index.
   * @param endAt Index in sortedIndicesOfAtt; do not touch anything after this index.
   * @return the split point of the best binary split found on attToExamine (for nominal
   *     attributes, the index of the category split out into branch 0), or Double.NaN if the
   *     attribute has only missing values or its score does not beat scoreBestAtt
   */
  protected double distributionSequentialAtt(
      double[] propsBestAtt,
      double[][] distsBestAtt,
      double scoreBestAtt,
      int attToExamine,
      int[] sortedIndicesOfAtt,
      int startAt,
      int endAt) {

    double splitPoint = -Double.MAX_VALUE;

    // a contingency table of the split point vs class.
    double[][] dist = this.tempDists;
    Arrays.fill(dist[0], 0.0);
    Arrays.fill(dist[1], 0.0);
    double[][] currDist = this.tempDistsOther;
    Arrays.fill(currDist[0], 0.0);
    Arrays.fill(currDist[1], 0.0);
    // double[][] dist = new double[2][data.numClasses];
    // double[][] currDist = new double[2][data.numClasses];

    int i;
    // NOTE(review): appears unused in this method - candidate for removal
    int sortedIndicesOfAttLength = endAt - startAt + 1;

    // find how many missing values we have for this attribute (they're always at the end)
    int lastNonmissingValIdx = endAt;
    for (int j = endAt; j >= startAt; j--) {
      if (data.isValueMissing(attToExamine, sortedIndicesOfAtt[j])) {
        lastNonmissingValIdx = j - 1;
      } else {
        break;
      }
    }
    if (lastNonmissingValIdx < startAt) { // only missing values in this feature??
      return Double.NaN; // we cannot split on it
    }

    if (data.isAttrNominal(attToExamine)) { // ====================== nominal attributes

      // 0.99: new routine - makes a one-vs-all split on categorical attributes

      int numLvls = data.attNumVals[attToExamine];
      int bestLvl = 0; // the index of the category which is best to "split out"

      // note: if we have only two levels, it doesn't matter which one we "split out"
      // we can thus safely check only the first one
      if (numLvls <= 2) {

        bestLvl = 0; // this means that the category with index 0 always
        // goes 'above' the split and category with index 1 goes 'below' the split
        for (i = startAt; i <= lastNonmissingValIdx; i++) {
          int inst = sortedIndicesOfAtt[i];
          dist[(int) data.vals[attToExamine][inst]][data.instClassValues[inst]] +=
              data.instWeights[inst];
        }

      } else { // for >2 levels, we have to search different splits

        // begin with moving all instances into second subset ("below split")
        for (int j = startAt; j <= lastNonmissingValIdx; j++) {
          int inst = sortedIndicesOfAtt[j];
          currDist[1][data.instClassValues[inst]] += data.instWeights[inst];
        }
        // create a default dist[] which we'll modify after we find the best class to split out
        copyDists(currDist, dist);

        double currVal = -Double.MAX_VALUE; // current value of splitting criterion
        double bestVal = -Double.MAX_VALUE; // best value of splitting criterion
        int lastSeen = startAt; // used to avoid looping through all instances for every lvl

        // try "splitting out" each level in turn; relies on sortedIndicesOfAtt
        // being sorted by category value so each pass resumes at lastSeen
        for (int lvl = 0; lvl < numLvls; lvl++) {

          // reset the currDist to the default (everything "below split") - conveniently stored in
          // dist[][]
          copyDists(dist, currDist);

          for (i = lastSeen; i <= lastNonmissingValIdx; i++) {

            lastSeen = i;
            int inst = sortedIndicesOfAtt[i];
            if ((int) data.vals[attToExamine][inst] < lvl) {
              continue;
            } else if ((int) data.vals[attToExamine][inst] == lvl) {
              // move to "above split" from "below split"
              currDist[0][data.instClassValues[inst]] += data.instWeights[inst];
              currDist[1][data.instClassValues[inst]] -= data.instWeights[inst];
            } else {
              break; // no need to loop forward, no more instances of this category
            }
          }

          // we filled the "dist" for the current level, find score and see if we like it
          currVal = -SplitCriteria.entropyConditionedOnRows(currDist);
          if (currVal > bestVal) {
            bestVal = currVal;
            bestLvl = lvl;
          }
        } // examine how well "splitting out" of individual levels works for us

        // remember the contingency table from the best "lvl" and store it in "dist"
        // NOTE(review): the early "break" below assumes instances of bestLvl form a
        // contiguous run starting at startAt - confirm this holds for bestLvl > 0
        for (i = startAt; i <= lastNonmissingValIdx; i++) {

          int inst = sortedIndicesOfAtt[i];
          if ((int) data.vals[attToExamine][inst] == bestLvl) {
            // move to "above split" from "below split"
            dist[0][data.instClassValues[inst]] += data.instWeights[inst];
            dist[1][data.instClassValues[inst]] -= data.instWeights[inst];
          } else {
            break; // no need to loop forward, no more instances of this category
          }
        }
      }

      splitPoint = bestLvl; // signals we've found a sensible split point; by
      // definition, a split on a nominal attribute
      // will always be sensible

    } else { // ============================================ numeric attributes

      // re-use the 2 x nClass temporary arrays created when tree was initialized
      // Arrays.fill( dist[0], 0.0 );
      // Arrays.fill( dist[1], 0.0 );

      // begin with moving all instances into second subset ("below split")
      for (int j = startAt; j <= lastNonmissingValIdx; j++) {
        int inst = sortedIndicesOfAtt[j];
        currDist[1][data.instClassValues[inst]] += data.instWeights[inst];
      }
      copyDists(currDist, dist);

      double currVal = -Double.MAX_VALUE; // current value of splitting criterion
      double bestVal = -Double.MAX_VALUE; // best value of splitting criterion
      int bestI = 0; // the value of "i" BEFORE which the splitpoint is placed

      for (i = startAt + 1; i <= lastNonmissingValIdx; i++) { // --- try all split points

        int inst = sortedIndicesOfAtt[i];

        int prevInst = sortedIndicesOfAtt[i - 1];

        // move the previous instance from "below" to "above" the candidate split
        currDist[0][data.instClassValues[prevInst]] += data.instWeights[prevInst];
        currDist[1][data.instClassValues[prevInst]] -= data.instWeights[prevInst];

        // do not allow splitting between two instances with the same value
        if (data.vals[attToExamine][inst] > data.vals[attToExamine][prevInst]) {

          // we want the lowest impurity after split; at this point, we don't
          // really care what we've had before spliting
          currVal = -SplitCriteria.entropyConditionedOnRows(currDist);

          if (currVal > bestVal) {
            bestVal = currVal;
            bestI = i;
          }
        }
      } // ------- end trying split points

      /*
       * Determine the best split point:
       * bestI == 0 only if all instances had missing values, or there were
       * less than 2 instances; splitPoint will remain set as -Double.MAX_VALUE.
       * This is not really a useful split, as all of the instances are 'below'
       * the split line, but at least it's formally correct. And the dists[]
       * also has a default value set previously.
       */
      if (bestI > startAt) { // ...at least one valid splitpoint was found

        int instJustBeforeSplit = sortedIndicesOfAtt[bestI - 1];
        int instJustAfterSplit = sortedIndicesOfAtt[bestI];
        // place the split halfway between the two neighboring attribute values
        splitPoint =
            (data.vals[attToExamine][instJustAfterSplit]
                    + data.vals[attToExamine][instJustBeforeSplit])
                / 2.0;

        // now make the correct dist[] (for the best split point) from the
        // default dist[] (all instances in the second branch, by iterating
        // through instances until we reach bestI, and then stop.
        for (int ii = startAt; ii < bestI; ii++) {
          int inst = sortedIndicesOfAtt[ii];
          dist[0][data.instClassValues[inst]] += data.instWeights[inst];
          dist[1][data.instClassValues[inst]] -= data.instWeights[inst];
        }
      }
    } // ================================================== nominal or numeric?

    // compute total weights for each branch (= props)
    // again, we reuse the tempProps of the tree not to create/destroy new arrays
    double[] props = this.tempProps;
    countsToFreqs(dist, props); // props gets overwritten, previous contents don't matter

    // distribute *counts* of instances with missing values using the "props"
    i = lastNonmissingValIdx + 1; // / start 1 after the non-missing val (if there is anything)
    while (i <= endAt) {
      int inst = sortedIndicesOfAtt[i];
      dist[0][data.instClassValues[inst]] += props[0] * data.instWeights[inst];
      dist[1][data.instClassValues[inst]] += props[1] * data.instWeights[inst];
      i++;
    }

    // update the distribution after split and best split point
    // but ONLY if better than the previous one -- we need to recalculate the
    // entropy (because this changes after redistributing the instances with
    // missing values in the current attribute). Also, for categorical variables
    // it was not calculated before.
    double curScore = -SplitCriteria.entropyConditionedOnRows(dist);
    if (curScore > scoreBestAtt
        && splitPoint
            > -Double
                .MAX_VALUE) { // overwrite the "distsBestAtt" and "propsBestAtt" with current values
      copyDists(dist, distsBestAtt);
      System.arraycopy(props, 0, propsBestAtt, 0, props.length);
      return splitPoint;
    } else {
      // returns a NaN instead of the splitpoint if the attribute was not better than a previous
      // one.
      return Double.NaN;
    }
  }
Code example #3
  /**
   * Computes class distribution for an attribute. Not used anymore in 0.99. Based on the splitData
   * function from "weka.classifiers.trees.RandomTree", with the following changes:
   *
   * <ul>
   *   <li>entropy pre-split is not computed at this point as the only thing relevant for the
   *       (comparative) goodness of a split is entropy after splitting
   *   <li>dist[][] is now computed only after the split point has been found, and not updated
   *       continually by copying from currDist
   *   <li>also, in Weka's RandomTree it was possible to create a split 'in the middle' of instance
   *       0, which would result in empty nodes after the split; this is now fixed
   *   <li>instance 0 is now generally skipped when looking for split points, as the split point
   *       'before instance 0' is not sensible; in versions prior to 0.96 this change introduced a
   *       bug where attributes with all missing values had their dists computed wrongly, which
   *       might result in useless (but harmless) branches being added to the tree
   * </ul>
   *
   * @param props gets filled with relative sizes of branches (total = 1), indexed first per
   *     attribute
   * @param dists these are the contingency matrices, indexed first per attribute
   * @param att the attribute index (which one to change)
   * @param sortedIndices the sorted indices of the vals
   * @return the split point for numeric attributes (the midpoint between the two values around the
   *     best split, or -Double.MAX_VALUE if no valid split point was found), or 0 for nominal
   *     attributes (a nominal split is always sensible)
   */
  protected double distribution(
      double[][] props, double[][][] dists, int att, int[] sortedIndices) {

    double splitPoint = -Double.MAX_VALUE;
    double[][] dist = null; // a contingency table of the split point vs class
    int i;

    if (data.isAttrNominal(att)) { // ====================== nominal attributes

      // multiway contingency table: one row per category (pre-0.99 style)
      dist = new double[data.attNumVals[att]][data.numClasses];
      for (i = 0; i < sortedIndices.length; i++) {
        int inst = sortedIndices[i];
        if (data.isValueMissing(att, inst)) break;
        dist[(int) data.vals[att][inst]][data.instClassValues[inst]] += data.instWeights[inst];
      }

      splitPoint = 0; // signals we've found a sensible split point; by
      // definition, a split on a nominal attribute is sensible

    } else { // ============================================ numeric attributes

      double[][] currDist = new double[2][data.numClasses];
      dist = new double[2][data.numClasses];

      // begin with moving all instances into second subset
      for (int j = 0; j < sortedIndices.length; j++) {
        int inst = sortedIndices[j];
        if (data.isValueMissing(att, inst)) break;
        currDist[1][data.instClassValues[inst]] += data.instWeights[inst];
      }
      copyDists(currDist, dist);
      // for (int j = 0; j < currDist.length; j++)
      //  System.arraycopy(currDist[j], 0, dist[j], 0, dist[j].length);

      double currVal = -Double.MAX_VALUE; // current value of splitting criterion
      double bestVal = -Double.MAX_VALUE; // best value of splitting criterion
      int bestI = 0; // the value of "i" BEFORE which the splitpoint is placed

      for (i = 1; i < sortedIndices.length; i++) { // --- try all split points

        int inst = sortedIndices[i];
        if (data.isValueMissing(att, inst)) break;

        int prevInst = sortedIndices[i - 1];

        // move the previous instance from "below" to "above" the candidate split
        currDist[0][data.instClassValues[prevInst]] += data.instWeights[prevInst];
        currDist[1][data.instClassValues[prevInst]] -= data.instWeights[prevInst];

        // do not allow splitting between two instances with the same value
        if (data.vals[att][inst] > data.vals[att][prevInst]) {

          // we want the lowest impurity after split; at this point, we don't
          // really care what we've had before spliting
          currVal = -SplitCriteria.entropyConditionedOnRows(currDist);

          if (currVal > bestVal) {
            bestVal = currVal;
            bestI = i;
          }
        }
      } // ------- end split points

      /*
       * Determine the best split point:
       * bestI == 0 only if all instances had missing values, or there were
       * less than 2 instances; splitPoint will remain set as -Double.MAX_VALUE.
       * This is not really a useful split, as all of the instances are 'below'
       * the split line, but at least it's formally correct. And the dists[]
       * also has a default value set previously.
       */
      if (bestI > 0) { // ...at least one valid splitpoint was found

        int instJustBeforeSplit = sortedIndices[bestI - 1];
        int instJustAfterSplit = sortedIndices[bestI];
        splitPoint =
            (data.vals[att][instJustAfterSplit] + data.vals[att][instJustBeforeSplit]) / 2.0;

        // Now make the correct dist[] from the default dist[] (all instances
        // in the second branch, by iterating through instances until we reach
        // bestI, and then stop.
        for (int ii = 0; ii < bestI; ii++) {
          int inst = sortedIndices[ii];
          dist[0][data.instClassValues[inst]] += data.instWeights[inst];
          dist[1][data.instClassValues[inst]] -= data.instWeights[inst];
        }
      }
    } // ================================================== nominal or numeric?

    // compute total weights for each branch (= props)
    props[att] = countsToFreqs(dist);

    // distribute counts of instances with missing values
    // note: "i" is reused from the loops above - it points at the first
    // missing-value instance (missing values sort last), or at the array end

    // ver 0.96 - check for special case when *all* instances have missing vals
    // (the numeric loop starts at 1, so instance 0 would otherwise be skipped)
    if (data.isValueMissing(att, sortedIndices[0])) i = 0;

    while (i < sortedIndices.length) {
      int inst = sortedIndices[i];
      for (int branch = 0; branch < dist.length; branch++) {
        dist[branch][data.instClassValues[inst]] += props[att][branch] * data.instWeights[inst];
      }
      i++;
    }

    // return distribution after split and best split point
    dists[att] = dist;
    return splitPoint;
  }
Code example #4
  /**
   * Splits instances into subsets; new for FastRF 0.99. Does not create new arrays with split
   * indices, but rather reorganizes the indices within the supplied sortedIndices to conform with
   * the split. Works only within given boundaries.
   *
   * <p>Note: as of 0.99, all splits (incl. categorical) are always binary.
   *
   * @param att the attribute index the data is split on
   * @param splitPoint for numeric attributes, the split threshold; for nominal attributes, the
   *     index of the category that is "split out" into branch 0
   * @param sortedIndices the sorted indices of the whole set - gets overwritten!
   * @param startAt Inclusive, 0-based index. Does not touch anything before this value.
   * @param endAt Inclusive, 0-based index. Does not touch anything after this value.
   * @return the first index of the "below the split" instances
   */
  protected int splitDataNew(
      int att, double splitPoint, int[][] sortedIndices, int startAt, int endAt) {

    Random random = data.reusableRandomGenerator;
    int j;
    // 0.99: we have binary splits also for nominal data; num counts how many
    // instances go to each of the two branches
    int[] num = new int[2];

    // scratch buffer for reordering one attribute's slice of sortedIndices;
    // we might possibly want to recycle this array for the whole tree
    int[] tempArr = new int[endAt - startAt + 1];

    if (data.isAttrNominal(att)) { // ============================ if nominal

      for (j = startAt; j <= endAt; j++) {

        int inst = sortedIndices[att][j];

        if (data.isValueMissing(att, inst)) { // ---------- has missing value

          // decide where to put this instance randomly, with bigger branches
          // getting a higher chance (branch proportions are stored in m_Prop)
          double rn = random.nextDouble();
          int myBranch = -1;
          for (int k = 0; k < m_Prop.length; k++) {
            rn -= m_Prop[k];
            if ((rn <= 0) || k == (m_Prop.length - 1)) {
              myBranch = k;
              break;
            }
          }

          data.whatGoesWhere[inst] = myBranch;
          num[myBranch]++;

        } else { // ----------------------------- does not have missing value

          // if it matches the category to "split out", put above split;
          // all other categories go below split
          int subset = (data.vals[att][inst] == splitPoint) ? 0 : 1;
          data.whatGoesWhere[inst] = subset;
          num[subset]++;
        } // --------------------------------------- end if has missing value
      }

    } else { // =================================================== if numeric

      // (fix: the old code re-allocated "num" here - it is already zeroed above)

      for (j = startAt; j <= endAt; j++) {

        int inst = sortedIndices[att][j];

        if (data.isValueMissing(att, inst)) { // ---------- has missing value

          // decide if instance goes into subset 0 or 1 randomly, with bigger
          // subsets having a greater probability of getting the instance;
          // instances with missing values get processed LAST (sort order)
          // so branch sizes are known by now (and stored in m_Prop)
          double rn = random.nextDouble();
          int branch = (rn > m_Prop[0]) ? 1 : 0;
          data.whatGoesWhere[inst] = branch;
          num[branch]++;

        } else { // ----------------------------- does not have missing value

          int branch = (data.vals[att][inst] < splitPoint) ? 0 : 1;

          data.whatGoesWhere[inst] = branch;
          num[branch]++;
        } // --------------------------------------- end if has missing value
      } // end for instance by instance
    } // ============================================ end if nominal / numeric

    for (int a = 0; a < data.numAttributes; a++) { // xxxxxxxxxx attr by attr

      if (a == data.classIndex) continue;

      // next free slot in tempArr for "above split" (branch 0) indices, and
      // for "below split" (branch 1) indices; each of the endAt-startAt+1
      // instances writes exactly one slot, so tempArr needs no pre-clearing
      int startAbove = 0;
      int startBelow = num[0];

      // route every instance's index by looking at the whatGoesWhere array
      for (j = startAt; j <= endAt; j++) {

        int branch = data.whatGoesWhere[sortedIndices[a][j]]; // only 0 or 1

        if (branch == 0) {
          tempArr[startAbove++] = sortedIndices[a][j];
        } else {
          tempArr[startBelow++] = sortedIndices[a][j];
        }
      }

      // now copy the tempArr into the sortedIndices, thus overwriting it
      System.arraycopy(tempArr, 0, sortedIndices[a], startAt, endAt - startAt + 1);
    } // xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx end for attr by attr

    return startAt + num[0]; // the first index of "below the split" instances
  }
Code example #5
  /**
   * Splits instances into subsets. Not used anymore in 0.99. This is a derivative of the splitData
   * function from "weka.classifiers.trees.RandomTree", with the following changes:
   *
   * <p>- When handling instances with missing values in attribute chosen for the split, the
   * FastRandomTree assigns the instance to one of the branches at random, with bigger branches
   * having a higher probability of getting the instance.
   *
   * <p>- When splitting sortedIndices into two or more subsetIndices, FastRandomTree checks whether
   * an instance's split attribute value was above splitpoint only once per instance, and stores the
   * result into the DataCache's whatGoesWhere field, which is then read in splitting subsetIndices.
   *
   * <p>As a consequence of the above points, the exact branch sizes (even with instances having
   * unknowns in the split attribute) are known in advance so subsetIndices arrays don't have to be
   * 'resized' (i.e. a new shorter copy of each one created and the old one GCed).
   *
   * @param subsetIndices gets filled with each branch's sorted instance indices, indexed as
   *     [branch][attribute][position]
   * @param att the attribute index the data is split on
   * @param splitPoint for numeric attributes, the split threshold; for nominal attributes, the
   *     index of the category that is "split out" into branch 0
   * @param sortedIndices the sorted indices of the whole set
   */
  protected void splitData(
      int[][][] subsetIndices, int att, double splitPoint, int[][] sortedIndices) {

    Random random = data.reusableRandomGenerator;
    int j;
    // 0.99: we have binary splits also for nominal data; num counts how many
    // instances go to each branch
    int[] num = new int[2];

    if (data.isAttrNominal(att)) { // ============================ if nominal

      for (j = 0; j < sortedIndices[att].length; j++) {

        int inst = sortedIndices[att][j];

        if (data.isValueMissing(att, inst)) { // ---------- has missing value

          // decide where to put this instance randomly, with bigger branches
          // getting a higher chance (branch proportions are stored in m_Prop)
          double rn = random.nextDouble();
          int myBranch = -1;
          for (int k = 0; k < m_Prop.length; k++) {
            rn -= m_Prop[k];
            if ((rn <= 0) || k == (m_Prop.length - 1)) {
              myBranch = k;
              break;
            }
          }

          data.whatGoesWhere[inst] = myBranch;
          num[myBranch]++;

        } else { // ----------------------------- does not have missing value

          // if it matches the category to "split out", put above split;
          // all other categories go below split
          int subset = (data.vals[att][inst] == splitPoint) ? 0 : 1;
          data.whatGoesWhere[inst] = subset;
          num[subset]++;
        } // --------------------------------------- end if has missing value
      }

    } else { // =================================================== if numeric

      // (fix: the old code re-allocated "num" here - it is already zeroed above)

      for (j = 0; j < sortedIndices[att].length; j++) {

        int inst = sortedIndices[att][j];

        if (data.isValueMissing(att, inst)) { // ---------- has missing value

          // decide if instance goes into subset 0 or 1 randomly, with bigger
          // subsets having a greater probability of getting the instance;
          // instances with missing values get processed LAST (sort order)
          // so branch sizes are known by now (and stored in m_Prop)
          double rn = random.nextDouble();
          int branch = (rn > m_Prop[0]) ? 1 : 0;
          data.whatGoesWhere[inst] = branch;
          num[branch]++;

        } else { // ----------------------------- does not have missing value

          int branch = (data.vals[att][inst] < splitPoint) ? 0 : 1;

          data.whatGoesWhere[inst] = branch;
          num[branch]++;
        } // --------------------------------------- end if has missing value
      } // end for instance by instance
    } // ============================================ end if nominal / numeric

    // allocate the subset (branch) arrays with their exact final sizes
    for (int a = 0; a < data.numAttributes; a++) {
      if (a == data.classIndex) continue; // no need to sort this one
      for (int branch = 0; branch < num.length; branch++) {
        subsetIndices[branch][a] = new int[num[branch]];
      }
    }

    for (int a = 0; a < data.numAttributes; a++) { // xxxxxxxxxx attr by attr

      if (a == data.classIndex) continue;
      // reuse num[] as per-branch write cursors from here on
      for (int branch = 0; branch < num.length; branch++) {
        num[branch] = 0;
      }

      // fill them with stuff by looking at goesWhere array
      for (j = 0; j < sortedIndices[a].length; j++) {

        int inst = sortedIndices[a][j];
        int branch = data.whatGoesWhere[inst]; // can be 0 or 1

        subsetIndices[branch][a][num[branch]] = sortedIndices[a][j];
        num[branch]++;
      }
    } // xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx end for attr by attr
  }