Пример #1
0
  /**
   * Creates split on enumerated attribute.
   *
   * @exception Exception if something goes wrong
   */
  private void handleEnumeratedAttribute(Instances trainInstances) throws Exception {

    Distribution newDistribution, secondDistribution;
    int numAttValues;
    double currIG, currGR;
    Instance instance;
    int i;

    numAttValues = trainInstances.attribute(m_attIndex).numValues();
    newDistribution = new Distribution(numAttValues, trainInstances.numClasses());

    // Only Instances with known values are relevant.
    Enumeration enu = trainInstances.enumerateInstances();
    while (enu.hasMoreElements()) {
      instance = (Instance) enu.nextElement();
      if (!instance.isMissing(m_attIndex))
        newDistribution.add((int) instance.value(m_attIndex), instance);
    }
    m_distribution = newDistribution;

    // For all values
    for (i = 0; i < numAttValues; i++) {

      if (Utils.grOrEq(newDistribution.perBag(i), m_minNoObj)) {
        secondDistribution = new Distribution(newDistribution, i);

        // Check if minimum number of Instances in the two
        // subsets.
        if (secondDistribution.check(m_minNoObj)) {
          m_numSubsets = 2;
          currIG = m_infoGainCrit.splitCritValue(secondDistribution, m_sumOfWeights);
          currGR = m_gainRatioCrit.splitCritValue(secondDistribution, m_sumOfWeights, currIG);
          if ((i == 0) || Utils.gr(currGR, m_gainRatio)) {
            m_gainRatio = currGR;
            m_infoGain = currIG;
            m_splitPoint = (double) i;
            m_distribution = secondDistribution;
          }
        }
      }
    }
  }
Пример #2
0
  /**
   * Creates split on enumerated attribute.
   *
   * @exception Exception if something goes wrong
   */
  private void handleEnumeratedAttribute(Instances trainInstances) throws Exception {

    Instance instance;

    m_distribution = new Distribution(m_complexityIndex, trainInstances.numClasses());

    // Only Instances with known values are relevant.
    Enumeration<Instance> enu = trainInstances.enumerateInstances();
    while (enu.hasMoreElements()) {
      instance = enu.nextElement();
      if (!instance.isMissing(m_attIndex)) {
        m_distribution.add((int) instance.value(m_attIndex), instance);
      }
    }

    // Check if minimum number of Instances in at least two
    // subsets.
    if (m_distribution.check(m_minNoObj)) {
      m_numSubsets = m_complexityIndex;
      m_infoGain = infoGainCrit.splitCritValue(m_distribution, m_sumOfWeights);
      m_gainRatio = gainRatioCrit.splitCritValue(m_distribution, m_sumOfWeights, m_infoGain);
    }
  }
Пример #3
0
  /**
   * Creates split on numeric attribute.
   *
   * @exception Exception if something goes wrong
   */
  private void handleNumericAttribute(Instances trainInstances) throws Exception {

    int firstMiss;
    int next = 1;
    int last = 0;
    int index = 0;
    int splitIndex = -1;
    double currentInfoGain;
    double defaultEnt;
    double minSplit;
    Instance instance;
    int i;

    // Current attribute is a numeric attribute.
    m_distribution = new Distribution(2, trainInstances.numClasses());

    // Only Instances with known values are relevant.
    Enumeration enu = trainInstances.enumerateInstances();
    i = 0;
    while (enu.hasMoreElements()) {
      instance = (Instance) enu.nextElement();
      if (instance.isMissing(m_attIndex)) break;
      m_distribution.add(1, instance);
      i++;
    }
    firstMiss = i;

    // Compute minimum number of Instances required in each
    // subset.
    minSplit = 0.1 * (m_distribution.total()) / ((double) trainInstances.numClasses());
    if (Utils.smOrEq(minSplit, m_minNoObj)) minSplit = m_minNoObj;
    else if (Utils.gr(minSplit, 25)) minSplit = 25;

    // Enough Instances with known values?
    if (Utils.sm((double) firstMiss, 2 * minSplit)) return;

    // Compute values of criteria for all possible split
    // indices.
    defaultEnt = m_infoGainCrit.oldEnt(m_distribution);
    while (next < firstMiss) {

      if (trainInstances.instance(next - 1).value(m_attIndex) + 1e-5
          < trainInstances.instance(next).value(m_attIndex)) {

        // Move class values for all Instances up to next
        // possible split point.
        m_distribution.shiftRange(1, 0, trainInstances, last, next);

        // Check if enough Instances in each subset and compute
        // values for criteria.
        if (Utils.grOrEq(m_distribution.perBag(0), minSplit)
            && Utils.grOrEq(m_distribution.perBag(1), minSplit)) {
          currentInfoGain =
              m_infoGainCrit.splitCritValue(m_distribution, m_sumOfWeights, defaultEnt);
          if (Utils.gr(currentInfoGain, m_infoGain)) {
            m_infoGain = currentInfoGain;
            splitIndex = next - 1;
          }
          index++;
        }
        last = next;
      }
      next++;
    }

    // Was there any useful split?
    if (index == 0) return;

    // Compute modified information gain for best split.
    if (m_useMDLcorrection) {
      m_infoGain = m_infoGain - (Utils.log2(index) / m_sumOfWeights);
    }
    if (Utils.smOrEq(m_infoGain, 0)) return;

    // Set instance variables' values to values for
    // best split.
    m_numSubsets = 2;
    m_splitPoint =
        (trainInstances.instance(splitIndex + 1).value(m_attIndex)
                + trainInstances.instance(splitIndex).value(m_attIndex))
            / 2;

    // In case we have a numerical precision problem we need to choose the
    // smaller value
    if (m_splitPoint == trainInstances.instance(splitIndex + 1).value(m_attIndex)) {
      m_splitPoint = trainInstances.instance(splitIndex).value(m_attIndex);
    }

    // Restore distributioN for best split.
    m_distribution = new Distribution(2, trainInstances.numClasses());
    m_distribution.addRange(0, trainInstances, 0, splitIndex + 1);
    m_distribution.addRange(1, trainInstances, splitIndex + 1, firstMiss);

    // Compute modified gain ratio for best split.
    m_gainRatio = m_gainRatioCrit.splitCritValue(m_distribution, m_sumOfWeights, m_infoGain);
  }