/** * Returns index of subset instance is assigned to. Returns -1 if instance is assigned to more * than one subset. * * @exception Exception if something goes wrong */ public final int whichSubset(Instance instance) throws Exception { if (instance.isMissing(m_attIndex)) return -1; else { if (instance.attribute(m_attIndex).isNominal()) { if ((int) m_splitPoint == (int) instance.value(m_attIndex)) return 0; else return 1; } else if (Utils.smOrEq(instance.value(m_attIndex), m_splitPoint)) return 0; else return 1; } }
/** * Prunes a tree using C4.5's pruning procedure. * * @throws Exception if something goes wrong */ public void prune() throws Exception { double errorsLargestBranch; double errorsLeaf; double errorsTree; int indexOfLargestBranch; myJ48ClassifierTree largestBranch; int i; if (!m_isLeaf) { // Prune all subtrees. for (i = 0; i < m_sons.length; i++) m_sons[i].prune(); // Compute error for largest branch indexOfLargestBranch = m_localModel.distribution().maxBag(); errorsLargestBranch = Double.MAX_VALUE; // Compute error if this Tree would be leaf errorsLeaf = getEstimatedErrorsForDistribution(m_localModel.distribution()); // Compute error for the whole subtree errorsTree = getEstimatedErrors(); // Decide if leaf is best choice. if (Utils.smOrEq(errorsLeaf, errorsTree + 0.1) && Utils.smOrEq(errorsLeaf, errorsLargestBranch + 0.1)) { // Free son Trees m_sons = null; m_isLeaf = true; // Get NoSplit Model for node. m_localModel = new myJ48NoSplit(m_localModel.distribution()); return; } // Decide if largest branch is better choice // than whole subtree. if (Utils.smOrEq(errorsLargestBranch, errorsTree + 0.1)) { largestBranch = m_sons[indexOfLargestBranch]; m_sons = largestBranch.m_sons; m_localModel = largestBranch.m_localModel; m_isLeaf = largestBranch.m_isLeaf; newDistribution(m_train); prune(); } } }
/** * Sets split point to greatest value in given data smaller or equal to old split point. (C4.5 * does this for some strange reason). */ public final void setSplitPoint(Instances allInstances) { double newSplitPoint = -Double.MAX_VALUE; double tempValue; Instance instance; if ((!allInstances.attribute(m_attIndex).isNominal()) && (m_numSubsets > 1)) { Enumeration enu = allInstances.enumerateInstances(); while (enu.hasMoreElements()) { instance = (Instance) enu.nextElement(); if (!instance.isMissing(m_attIndex)) { tempValue = instance.value(m_attIndex); if (Utils.gr(tempValue, newSplitPoint) && Utils.smOrEq(tempValue, m_splitPoint)) newSplitPoint = tempValue; } } m_splitPoint = newSplitPoint; } }
/** * Creates split on numeric attribute. * * @exception Exception if something goes wrong */ private void handleNumericAttribute(Instances trainInstances) throws Exception { int firstMiss; int next = 1; int last = 0; int index = 0; int splitIndex = -1; double currentInfoGain; double defaultEnt; double minSplit; Instance instance; int i; // Current attribute is a numeric attribute. m_distribution = new Distribution(2, trainInstances.numClasses()); // Only Instances with known values are relevant. Enumeration enu = trainInstances.enumerateInstances(); i = 0; while (enu.hasMoreElements()) { instance = (Instance) enu.nextElement(); if (instance.isMissing(m_attIndex)) break; m_distribution.add(1, instance); i++; } firstMiss = i; // Compute minimum number of Instances required in each // subset. minSplit = 0.1 * (m_distribution.total()) / ((double) trainInstances.numClasses()); if (Utils.smOrEq(minSplit, m_minNoObj)) minSplit = m_minNoObj; else if (Utils.gr(minSplit, 25)) minSplit = 25; // Enough Instances with known values? if (Utils.sm((double) firstMiss, 2 * minSplit)) return; // Compute values of criteria for all possible split // indices. defaultEnt = m_infoGainCrit.oldEnt(m_distribution); while (next < firstMiss) { if (trainInstances.instance(next - 1).value(m_attIndex) + 1e-5 < trainInstances.instance(next).value(m_attIndex)) { // Move class values for all Instances up to next // possible split point. m_distribution.shiftRange(1, 0, trainInstances, last, next); // Check if enough Instances in each subset and compute // values for criteria. if (Utils.grOrEq(m_distribution.perBag(0), minSplit) && Utils.grOrEq(m_distribution.perBag(1), minSplit)) { currentInfoGain = m_infoGainCrit.splitCritValue(m_distribution, m_sumOfWeights, defaultEnt); if (Utils.gr(currentInfoGain, m_infoGain)) { m_infoGain = currentInfoGain; splitIndex = next - 1; } index++; } last = next; } next++; } // Was there any useful split? if (index == 0) return; // Compute modified information gain for best split. if (m_useMDLcorrection) { m_infoGain = m_infoGain - (Utils.log2(index) / m_sumOfWeights); } if (Utils.smOrEq(m_infoGain, 0)) return; // Set instance variables' values to values for // best split. m_numSubsets = 2; m_splitPoint = (trainInstances.instance(splitIndex + 1).value(m_attIndex) + trainInstances.instance(splitIndex).value(m_attIndex)) / 2; // In case we have a numerical precision problem we need to choose the // smaller value if (m_splitPoint == trainInstances.instance(splitIndex + 1).value(m_attIndex)) { m_splitPoint = trainInstances.instance(splitIndex).value(m_attIndex); } // Restore distributioN for best split. m_distribution = new Distribution(2, trainInstances.numClasses()); m_distribution.addRange(0, trainInstances, 0, splitIndex + 1); m_distribution.addRange(1, trainInstances, splitIndex + 1, firstMiss); // Compute modified gain ratio for best split. m_gainRatio = m_gainRatioCrit.splitCritValue(m_distribution, m_sumOfWeights, m_infoGain); }