/** * Test using Fayyad and Irani's MDL criterion. * * @param priorCounts * @param bestCounts * @param numInstances * @param numCutPoints * @return true if the splits is acceptable */ private boolean FayyadAndIranisMDL( double[] priorCounts, double[][] bestCounts, double numInstances, int numCutPoints) { double priorEntropy, entropy, gain; double entropyLeft, entropyRight, delta; int numClassesTotal, numClassesRight, numClassesLeft; // Compute entropy before split. priorEntropy = ContingencyTables.entropy(priorCounts); // Compute entropy after split. entropy = ContingencyTables.entropyConditionedOnRows(bestCounts); // Compute information gain. gain = priorEntropy - entropy; // Number of classes occuring in the set numClassesTotal = 0; for (double priorCount : priorCounts) { if (priorCount > 0) { numClassesTotal++; } } // Number of classes occuring in the left subset numClassesLeft = 0; for (int i = 0; i < bestCounts[0].length; i++) { if (bestCounts[0][i] > 0) { numClassesLeft++; } } // Number of classes occuring in the right subset numClassesRight = 0; for (int i = 0; i < bestCounts[1].length; i++) { if (bestCounts[1][i] > 0) { numClassesRight++; } } // Entropy of the left and the right subsets entropyLeft = ContingencyTables.entropy(bestCounts[0]); entropyRight = ContingencyTables.entropy(bestCounts[1]); // Compute terms for MDL formula delta = Utils.log2(Math.pow(3, numClassesTotal) - 2) - ((numClassesTotal * priorEntropy) - (numClassesRight * entropyRight) - (numClassesLeft * entropyLeft)); // Check if split is to be accepted return (gain > (Utils.log2(numCutPoints) + delta) / numInstances); }
/**
 * Selects cutpoints for a sorted subset of instances via recursive
 * entropy-based (MDL) discretization.
 *
 * Instances in [first, lastPlusOne) are assumed sorted on attIndex
 * (presumably guaranteed by the caller — TODO confirm).
 *
 * @param instances the (sorted) instances to discretize
 * @param attIndex the attribute whose cut points are sought
 * @param first index of the first instance of the subset (inclusive)
 * @param lastPlusOne index one past the last instance of the subset
 * @return the accepted cut points in ascending order, or null if no
 *         acceptable split exists for this subset
 */
private double[] cutPointsForSubset(
  Instances instances, int attIndex, int first, int lastPlusOne) {
  double[][] counts, bestCounts;
  double[] priorCounts, left, right, cutPoints;
  double currentCutPoint = -Double.MAX_VALUE, bestCutPoint = -1, currentEntropy,
    bestEntropy, priorEntropy, gain;
  int bestIndex = -1, numCutPoints = 0;
  double numInstances = 0;

  // A subset of fewer than two instances cannot be split.
  if ((lastPlusOne - first) < 2) {
    return null;
  }

  // Compute class counts (row 1 initially holds the whole subset) and the
  // total instance weight.
  counts = new double[2][instances.numClasses()];
  for (int i = first; i < lastPlusOne; i++) {
    numInstances += instances.instance(i).weight();
    counts[1][(int) instances.instance(i).classValue()] += instances.instance(i)
      .weight();
  }

  // Save prior counts before the sweep below mutates counts in place.
  priorCounts = new double[instances.numClasses()];
  System.arraycopy(counts[1], 0, priorCounts, 0, instances.numClasses());

  // Entropy of the full (unsplit) subset.
  priorEntropy = ContingencyTables.entropy(priorCounts);
  bestEntropy = priorEntropy;

  // Sweep each boundary: shift instance i from the right part (row 1) to the
  // left part (row 0), then evaluate a cut point only where consecutive
  // attribute values differ.
  bestCounts = new double[2][instances.numClasses()];
  for (int i = first; i < (lastPlusOne - 1); i++) {
    counts[0][(int) instances.instance(i).classValue()] += instances
      .instance(i).weight();
    counts[1][(int) instances.instance(i).classValue()] -= instances
      .instance(i).weight();
    if (instances.instance(i).value(attIndex) < instances.instance(i + 1)
      .value(attIndex)) {
      // Candidate cut point: midpoint between the two distinct values.
      currentCutPoint = (instances.instance(i).value(attIndex) + instances
        .instance(i + 1).value(attIndex)) / 2.0;
      currentEntropy = ContingencyTables.entropyConditionedOnRows(counts);
      if (currentEntropy < bestEntropy) {
        bestCutPoint = currentCutPoint;
        bestEntropy = currentEntropy;
        bestIndex = i;
        // Snapshot the counts for the best split found so far.
        System
          .arraycopy(counts[0], 0, bestCounts[0], 0, instances.numClasses());
        System
          .arraycopy(counts[1], 0, bestCounts[1], 0, instances.numClasses());
      }
      numCutPoints++;
    }
  }

  // Use worse encoding? With the simpler encoding, every boundary counts as a
  // potential cut point, not just those between distinct values.
  if (!m_UseBetterEncoding) {
    numCutPoints = (lastPlusOne - first) - 1;
  }

  // No split can be accepted without positive information gain.
  gain = priorEntropy - bestEntropy;
  if (gain <= 0) {
    return null;
  }

  // Check if the split is to be accepted by the configured MDL criterion
  // (Kononenko's or Fayyad & Irani's).
  if ((m_UseKononenko && KononenkosMDL(priorCounts, bestCounts, numInstances,
    numCutPoints))
    || (!m_UseKononenko && FayyadAndIranisMDL(priorCounts, bestCounts,
      numInstances, numCutPoints))) {

    // Recursively select cut points for the left and right subsets.
    left = cutPointsForSubset(instances, attIndex, first, bestIndex + 1);
    right = cutPointsForSubset(instances, attIndex, bestIndex + 1, lastPlusOne);

    // Merge left cut points, this split's cut point, and right cut points
    // (already in ascending order by construction).
    if ((left == null) && (right) == null) {
      cutPoints = new double[1];
      cutPoints[0] = bestCutPoint;
    } else if (right == null) {
      cutPoints = new double[left.length + 1];
      System.arraycopy(left, 0, cutPoints, 0, left.length);
      cutPoints[left.length] = bestCutPoint;
    } else if (left == null) {
      cutPoints = new double[1 + right.length];
      cutPoints[0] = bestCutPoint;
      System.arraycopy(right, 0, cutPoints, 1, right.length);
    } else {
      cutPoints = new double[left.length + right.length + 1];
      System.arraycopy(left, 0, cutPoints, 0, left.length);
      cutPoints[left.length] = bestCutPoint;
      System.arraycopy(right, 0, cutPoints, left.length + 1, right.length);
    }

    return cutPoints;
  } else {
    return null;
  }
}
/**
 * Classifies the given test instance by accumulating, for each non-missing
 * attribute, a per-class vote derived from the counts of the interval (for
 * numeric attributes) or value (for nominal attributes) the instance falls
 * into, optionally weighted by the confidence (entropy) of that vote.
 *
 * @param instance the instance to be classified
 * @return the predicted class distribution for the instance (normalized, or
 *         uniform if no attribute contributed any evidence)
 * @throws Exception if the instance can't be classified
 */
public double[] distributionForInstance(Instance instance) throws Exception {

  double[] dist = new double[m_NumClasses];
  double[] temp = new double[m_NumClasses];
  double weight = 1.0;

  for (int i = 0; i < instance.numAttributes(); i++) {
    // Skip the class attribute and missing values.
    if (i != m_ClassIndex && !instance.isMissing(i)) {
      double val = instance.value(i);
      boolean ok = false;
      if (instance.attribute(i).isNumeric()) {
        int k;
        // Scan interval bounds from the top down to locate val's interval.
        for (k = m_intervalBounds[i].length - 1; k >= 0; k--) {
          if (val > m_intervalBounds[i][k]) {
            // val lies strictly inside interval k: laplace-smoothed
            // per-class frequency of that interval.
            for (int j = 0; j < m_NumClasses; j++) {
              if (m_globalCounts[j] > 0) {
                temp[j] = ((m_counts[i][k][j] + TINY) / (m_globalCounts[j] + TINY));
              }
            }
            ok = true;
            break;
          } else if (val == m_intervalBounds[i][k]) {
            // val sits exactly on a boundary: average the counts of the two
            // adjacent intervals.
            // NOTE(review): k - 1 underflows if val equals the lowest bound
            // (k == 0); presumably m_intervalBounds[i][0] is -MAX_VALUE so
            // equality can't occur — confirm against the training code.
            for (int j = 0; j < m_NumClasses; j++) {
              if (m_globalCounts[j] > 0) {
                temp[j] = ((m_counts[i][k][j] + m_counts[i][k - 1][j]) / 2.0) + TINY;
                temp[j] /= (m_globalCounts[j] + TINY);
              }
            }
            ok = true;
            break;
          }
        }
        if (!ok) {
          // Unreachable if the lowest bound is below every possible value.
          throw new Exception("This shouldn't happen");
        }
      } else { // nominal attribute: index counts directly by the value.
        ok = true;
        for (int j = 0; j < m_NumClasses; j++) {
          if (m_globalCounts[j] > 0) {
            temp[j] = ((m_counts[i][(int) val][j] + TINY) / (m_globalCounts[j] + TINY));
          }
        }
      }

      // Normalize this attribute's vote; fall back to uniform if it summed
      // to zero (e.g. all classes had zero global counts).
      double sum = Utils.sum(temp);
      if (sum <= 0) {
        for (int j = 0; j < temp.length; j++) {
          temp[j] = 1.0 / (double) temp.length;
        }
      } else {
        Utils.normalize(temp, sum);
      }

      if (m_weightByConfidence) {
        // Weight the vote by its entropy raised to the bias exponent,
        // floored at 1.0 so no vote is down-weighted below parity.
        weight = weka.core.ContingencyTables.entropy(temp);
        weight = Math.pow(weight, m_bias);
        if (weight < 1.0) {
          weight = 1.0;
        }
      }

      // Accumulate the (possibly weighted) vote into the final distribution.
      for (int j = 0; j < m_NumClasses; j++) {
        dist[j] += (temp[j] * weight);
      }
    }
  }

  // Normalize the accumulated votes; uniform distribution if nothing voted.
  double sum = Utils.sum(dist);
  if (sum <= 0) {
    for (int j = 0; j < dist.length; j++) {
      dist[j] = 1.0 / (double) dist.length;
    }
    return dist;
  } else {
    Utils.normalize(dist, sum);
    return dist;
  }
}
/** * evaluates an individual attribute by measuring the gain ratio of the class given the attribute. * * @param attribute the index of the attribute to be evaluated * @return the gain ratio * @throws Exception if the attribute could not be evaluated */ public double evaluateAttribute(int attribute) throws Exception { int i, j, ii, jj; int ni, nj; double sum = 0.0; ni = m_trainInstances.attribute(attribute).numValues() + 1; nj = m_numClasses + 1; double[] sumi, sumj; Instance inst; double temp = 0.0; sumi = new double[ni]; sumj = new double[nj]; double[][] counts = new double[ni][nj]; sumi = new double[ni]; sumj = new double[nj]; for (i = 0; i < ni; i++) { sumi[i] = 0.0; for (j = 0; j < nj; j++) { sumj[j] = 0.0; counts[i][j] = 0.0; } } // Fill the contingency table for (i = 0; i < m_numInstances; i++) { inst = m_trainInstances.instance(i); if (inst.isMissing(attribute)) { ii = ni - 1; } else { ii = (int) inst.value(attribute); } if (inst.isMissing(m_classIndex)) { jj = nj - 1; } else { jj = (int) inst.value(m_classIndex); } counts[ii][jj]++; } // get the row totals for (i = 0; i < ni; i++) { sumi[i] = 0.0; for (j = 0; j < nj; j++) { sumi[i] += counts[i][j]; sum += counts[i][j]; } } // get the column totals for (j = 0; j < nj; j++) { sumj[j] = 0.0; for (i = 0; i < ni; i++) { sumj[j] += counts[i][j]; } } // distribute missing counts if (m_missing_merge && (sumi[ni - 1] < m_numInstances) && (sumj[nj - 1] < m_numInstances)) { double[] i_copy = new double[sumi.length]; double[] j_copy = new double[sumj.length]; double[][] counts_copy = new double[sumi.length][sumj.length]; for (i = 0; i < ni; i++) { System.arraycopy(counts[i], 0, counts_copy[i], 0, sumj.length); } System.arraycopy(sumi, 0, i_copy, 0, sumi.length); System.arraycopy(sumj, 0, j_copy, 0, sumj.length); double total_missing = (sumi[ni - 1] + sumj[nj - 1] - counts[ni - 1][nj - 1]); // do the missing i's if (sumi[ni - 1] > 0.0) { for (j = 0; j < nj - 1; j++) { if (counts[ni - 1][j] > 0.0) { for (i = 0; i < ni - 
1; i++) { temp = ((i_copy[i] / (sum - i_copy[ni - 1])) * counts[ni - 1][j]); counts[i][j] += temp; sumi[i] += temp; } counts[ni - 1][j] = 0.0; } } } sumi[ni - 1] = 0.0; // do the missing j's if (sumj[nj - 1] > 0.0) { for (i = 0; i < ni - 1; i++) { if (counts[i][nj - 1] > 0.0) { for (j = 0; j < nj - 1; j++) { temp = ((j_copy[j] / (sum - j_copy[nj - 1])) * counts[i][nj - 1]); counts[i][j] += temp; sumj[j] += temp; } counts[i][nj - 1] = 0.0; } } } sumj[nj - 1] = 0.0; // do the both missing if (counts[ni - 1][nj - 1] > 0.0 && total_missing != sum) { for (i = 0; i < ni - 1; i++) { for (j = 0; j < nj - 1; j++) { temp = (counts_copy[i][j] / (sum - total_missing)) * counts_copy[ni - 1][nj - 1]; counts[i][j] += temp; sumi[i] += temp; sumj[j] += temp; } } counts[ni - 1][nj - 1] = 0.0; } } return ContingencyTables.gainRatio(counts); }