コード例 #1
0
ファイル: BinC45Split.java プロジェクト: huangwen87/mdrill
  /**
   * Determines which of the two subsets of this binary split the given instance belongs to.
   *
   * <p>If the split attribute is nominal, the instance goes to subset 0 when its value, truncated
   * to an int, equals the int-truncated split point. Otherwise it goes to subset 0 when
   * {@code Utils.smOrEq} reports its value as smaller than or equal to the split point. In all
   * remaining cases it goes to subset 1.
   *
   * @param instance the instance to assign to a subset
   * @return 0 or 1 for the matching subset, or -1 if the instance's value for the split attribute
   *     is missing (so it cannot be assigned to a single subset)
   * @exception Exception if something goes wrong
   */
  public final int whichSubset(Instance instance) throws Exception {
    // Without a value for the split attribute there is no single subset to pick.
    if (instance.isMissing(m_attIndex)) {
      return -1;
    }

    final boolean inFirstSubset;
    if (instance.attribute(m_attIndex).isNominal()) {
      // Nominal values are compared by their integer index.
      inFirstSubset = (int) m_splitPoint == (int) instance.value(m_attIndex);
    } else {
      inFirstSubset = Utils.smOrEq(instance.value(m_attIndex), m_splitPoint);
    }
    return inFirstSubset ? 0 : 1;
  }
コード例 #2
0
 /**
  * Prunes this tree bottom-up using C4.5's pruning procedure.
  *
  * <p>All child subtrees are pruned first. The node is then turned into a leaf when the estimated
  * errors of a leaf are, within a margin of 0.1, no larger than both the estimated errors of the
  * whole subtree and those of the largest branch. Otherwise, when the largest branch's errors are
  * within the same margin of the subtree's errors, the largest branch replaces this node and
  * pruning is repeated on the result.
  *
  * <p>The error value for the largest branch is fixed at {@code Double.MAX_VALUE} here instead of
  * being estimated. NOTE(review): with finite error estimates this presumably means the
  * branch-replacement step never triggers; confirm against {@code Utils.smOrEq}.
  *
  * @throws Exception if something goes wrong
  */
 public void prune() throws Exception {
   // Leaves have nothing to prune.
   if (m_isLeaf) {
     return;
   }

   // Handle the children first so decisions here see already-pruned subtrees.
   for (myJ48ClassifierTree son : m_sons) {
     son.prune();
   }

   // Gather the three error figures the decisions below compare.
   final int largestBranchIndex = m_localModel.distribution().maxBag();
   final double largestBranchErrors = Double.MAX_VALUE;
   final double leafErrors = getEstimatedErrorsForDistribution(m_localModel.distribution());
   final double subtreeErrors = getEstimatedErrors();

   // Option 1: collapse this node into a leaf.
   final boolean leafNotWorseThanSubtree = Utils.smOrEq(leafErrors, subtreeErrors + 0.1);
   if (leafNotWorseThanSubtree && Utils.smOrEq(leafErrors, largestBranchErrors + 0.1)) {
     // Drop the children and switch to a no-split model over the same distribution.
     m_sons = null;
     m_isLeaf = true;
     m_localModel = new myJ48NoSplit(m_localModel.distribution());
     return;
   }

   // Option 2: replace this node by its largest branch and prune again.
   if (Utils.smOrEq(largestBranchErrors, subtreeErrors + 0.1)) {
     final myJ48ClassifierTree raised = m_sons[largestBranchIndex];
     m_sons = raised.m_sons;
     m_localModel = raised.m_localModel;
     m_isLeaf = raised.m_isLeaf;
     newDistribution(m_train);
     prune();
   }
 }
コード例 #3
0
ファイル: BinC45Split.java プロジェクト: huangwen87/mdrill
  /**
   * Lowers the split point to the greatest attribute value in the given data that is smaller than
   * or equal to the current split point (C4.5 does the same).
   *
   * <p>Nothing happens when the split attribute is nominal or when there is at most one subset.
   * Instances whose value for the split attribute is missing are skipped. If no value in the data
   * qualifies, the split point ends up as {@code -Double.MAX_VALUE}.
   *
   * @param allInstances the data whose attribute values are candidates for the new split point
   */
  public final void setSplitPoint(Instances allInstances) {
    // Only numeric attributes with an actual split are adjusted.
    if (allInstances.attribute(m_attIndex).isNominal() || m_numSubsets <= 1) {
      return;
    }

    double best = -Double.MAX_VALUE;
    for (Enumeration enu = allInstances.enumerateInstances(); enu.hasMoreElements(); ) {
      Instance current = (Instance) enu.nextElement();
      if (current.isMissing(m_attIndex)) {
        continue;
      }
      double candidate = current.value(m_attIndex);
      // Keep the largest value seen so far that does not exceed the old split point.
      if (Utils.gr(candidate, best) && Utils.smOrEq(candidate, m_splitPoint)) {
        best = candidate;
      }
    }
    m_splitPoint = best;
  }
コード例 #4
0
ファイル: BinC45Split.java プロジェクト: huangwen87/mdrill
  /**
   * Creates a binary split on a numeric attribute.
   *
   * <p>Scans the instances with a known value for the attribute at {@code m_attIndex}, evaluates
   * the information gain of every admissible cut between two neighbouring instances, and, if a
   * useful cut exists, stores the result in {@code m_numSubsets}, {@code m_splitPoint},
   * {@code m_distribution}, {@code m_infoGain} and {@code m_gainRatio}.
   *
   * <p>The method returns early, leaving {@code m_numSubsets} and {@code m_splitPoint} untouched,
   * when there are too few instances with known values, when no cut satisfies the minimum subset
   * size, or when the (possibly corrected) information gain is not positive. On those paths
   * {@code m_distribution} and {@code m_infoGain} may already have been modified.
   *
   * <p>NOTE(review): the scan compares neighbouring instances and stops counting at the first
   * missing value, so it presumably expects {@code trainInstances} to be sorted on the attribute
   * with missing values at the end; confirm against the caller.
   *
   * @param trainInstances the training data to search for a split
   * @exception Exception if something goes wrong
   */
  private void handleNumericAttribute(Instances trainInstances) throws Exception {

    // Number of leading instances with a known attribute value (index of the first missing one).
    int firstMiss;
    // [last, next) is the range of instances not yet shifted into bag 0.
    int next = 1;
    int last = 0;
    // Counts the candidate cuts that satisfied the minimum subset size.
    int index = 0;
    // Index of the last instance on the low side of the best cut found so far; -1 if none.
    int splitIndex = -1;
    double currentInfoGain;
    double defaultEnt;
    double minSplit;
    Instance instance;
    int i;

    // Current attribute is a numeric attribute.
    // Start with a two-bag distribution; everything known goes into bag 1 first.
    m_distribution = new Distribution(2, trainInstances.numClasses());

    // Only Instances with known values are relevant.
    // Counting stops at the first instance whose attribute value is missing.
    Enumeration enu = trainInstances.enumerateInstances();
    i = 0;
    while (enu.hasMoreElements()) {
      instance = (Instance) enu.nextElement();
      if (instance.isMissing(m_attIndex)) break;
      m_distribution.add(1, instance);
      i++;
    }
    firstMiss = i;

    // Compute minimum number of Instances required in each
    // subset: 10% of the distribution total divided by the number of classes,
    // raised to m_minNoObj if it is not larger than that, otherwise capped at 25.
    minSplit = 0.1 * (m_distribution.total()) / ((double) trainInstances.numClasses());
    if (Utils.smOrEq(minSplit, m_minNoObj)) minSplit = m_minNoObj;
    else if (Utils.gr(minSplit, 25)) minSplit = 25;

    // Enough Instances with known values?
    // Both subsets need at least minSplit, so fewer than 2 * minSplit cannot be split.
    if (Utils.sm((double) firstMiss, 2 * minSplit)) return;

    // Compute values of criteria for all possible split
    // indices.
    defaultEnt = m_infoGainCrit.oldEnt(m_distribution);
    while (next < firstMiss) {

      // A cut is only considered between two neighbours whose values differ by more than 1e-5.
      if (trainInstances.instance(next - 1).value(m_attIndex) + 1e-5
          < trainInstances.instance(next).value(m_attIndex)) {

        // Move class values for all Instances up to next
        // possible split point.
        // (presumably shifts instances last..next-1 from bag 1 to bag 0 -- confirm in
        // Distribution.shiftRange)
        m_distribution.shiftRange(1, 0, trainInstances, last, next);

        // Check if enough Instances in each subset and compute
        // values for criteria.
        if (Utils.grOrEq(m_distribution.perBag(0), minSplit)
            && Utils.grOrEq(m_distribution.perBag(1), minSplit)) {
          currentInfoGain =
              m_infoGainCrit.splitCritValue(m_distribution, m_sumOfWeights, defaultEnt);
          // Remember the cut only if it strictly improves on the best gain so far.
          if (Utils.gr(currentInfoGain, m_infoGain)) {
            m_infoGain = currentInfoGain;
            splitIndex = next - 1;
          }
          index++;
        }
        last = next;
      }
      next++;
    }

    // Was there any useful split?
    if (index == 0) return;

    // Compute modified information gain for best split.
    // The correction subtracts log2(number of candidate cuts) / m_sumOfWeights.
    if (m_useMDLcorrection) {
      m_infoGain = m_infoGain - (Utils.log2(index) / m_sumOfWeights);
    }
    if (Utils.smOrEq(m_infoGain, 0)) return;

    // Set instance variables' values to values for
    // best split. The split point is the midpoint between the two instances around the cut.
    m_numSubsets = 2;
    m_splitPoint =
        (trainInstances.instance(splitIndex + 1).value(m_attIndex)
                + trainInstances.instance(splitIndex).value(m_attIndex))
            / 2;

    // In case we have a numerical precision problem we need to choose the
    // smaller value
    if (m_splitPoint == trainInstances.instance(splitIndex + 1).value(m_attIndex)) {
      m_splitPoint = trainInstances.instance(splitIndex).value(m_attIndex);
    }

    // Restore distribution for best split: instances 0..splitIndex go to bag 0 and
    // instances splitIndex+1..firstMiss-1 go to bag 1.
    m_distribution = new Distribution(2, trainInstances.numClasses());
    m_distribution.addRange(0, trainInstances, 0, splitIndex + 1);
    m_distribution.addRange(1, trainInstances, splitIndex + 1, firstMiss);

    // Compute modified gain ratio for best split.
    m_gainRatio = m_gainRatioCrit.splitCritValue(m_distribution, m_sumOfWeights, m_infoGain);
  }