/**
 * Test using Fayyad and Irani's MDL criterion.
 *
 * @param priorCounts the class counts prior to the split
 * @param bestCounts the class counts for the two subsets of the best split
 * @param numInstances the number of instances in the set
 * @param numCutPoints the number of candidate cut points
 * @return true if the split is acceptable
 */
private boolean FayyadAndIranisMDL(double[] priorCounts, double[][] bestCounts,
    double numInstances, int numCutPoints) {

  double priorEntropy, entropy, gain;
  double entropyLeft, entropyRight, delta;
  int numClassesTotal, numClassesRight, numClassesLeft;

  // Compute entropy before split.
  priorEntropy = ContingencyTables.entropy(priorCounts);

  // Compute entropy after split.
  entropy = ContingencyTables.entropyConditionedOnRows(bestCounts);

  // Compute information gain.
  gain = priorEntropy - entropy;

  // Number of classes occurring in the set
  numClassesTotal = 0;
  for (double priorCount : priorCounts) {
    if (priorCount > 0) {
      numClassesTotal++;
    }
  }

  // Number of classes occurring in the left subset
  numClassesLeft = 0;
  for (int i = 0; i < bestCounts[0].length; i++) {
    if (bestCounts[0][i] > 0) {
      numClassesLeft++;
    }
  }

  // Number of classes occurring in the right subset
  numClassesRight = 0;
  for (int i = 0; i < bestCounts[1].length; i++) {
    if (bestCounts[1][i] > 0) {
      numClassesRight++;
    }
  }

  // Entropy of the left and the right subsets
  entropyLeft = ContingencyTables.entropy(bestCounts[0]);
  entropyRight = ContingencyTables.entropy(bestCounts[1]);

  // Compute terms for MDL formula
  delta = Utils.log2(Math.pow(3, numClassesTotal) - 2)
      - ((numClassesTotal * priorEntropy)
          - (numClassesRight * entropyRight)
          - (numClassesLeft * entropyLeft));

  // Check if split is to be accepted
  return (gain > (Utils.log2(numCutPoints) + delta) / numInstances);
}
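For intuition, the test above accepts a split only if gain > [log2(numCutPoints) + delta] / N. Below is a minimal, self-contained sketch of the same criterion on made-up counts, with entropy computed inline instead of via weka.core.ContingencyTables; the class name and all numbers are illustrative, not part of the original source.

public final class FayyadIraniSketch {

  // Entropy (in bits) of a vector of class counts.
  static double entropy(double[] counts) {
    double total = 0, e = 0;
    for (double c : counts) total += c;
    for (double c : counts) {
      if (c > 0) {
        e -= (c / total) * (Math.log(c / total) / Math.log(2));
      }
    }
    return e;
  }

  public static void main(String[] args) {
    double[] prior = {30, 20};              // class counts before the split
    double[][] split = {{25, 5}, {5, 15}};  // left / right subset counts
    double n = 50, numCutPoints = 10;
    int k = 2, kLeft = 2, kRight = 2;       // classes present overall / left / right

    // Information gain; subset weights are 30/50 and 20/50 here.
    double gain = entropy(prior)
        - (30.0 / n) * entropy(split[0])
        - (20.0 / n) * entropy(split[1]);

    double delta = Math.log(Math.pow(3, k) - 2) / Math.log(2)
        - (k * entropy(prior) - kLeft * entropy(split[0]) - kRight * entropy(split[1]));

    // Accept only if the gain pays for identifying the cut point plus the MDL penalty.
    boolean accept = gain > (Math.log(numCutPoints) / Math.log(2) + delta) / n;
    System.out.println("gain = " + gain + ", accept = " + accept);  // accept = true
  }
}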
/**
 * The description length of the theory for a given rule. Computed as:<br>
 * 0.5 * [||k|| + S(t, k, k/t)]<br>
 * where k is the number of antecedents of the rule; t is the total number of
 * possible antecedents that could appear in a rule; ||k|| is the universal
 * prior for k, log2*(k); and S(t, k, p) = -k*log2(p) - (t-k)*log2(1-p) is the
 * subset encoding length.
 *
 * <p>For details see Quinlan: "MDL and categorical theories (Continued)", ML95.
 *
 * @param index the index of the given rule (assumed to be valid)
 * @return the theory DL, weighted if weight != 1.0
 */
public double theoryDL(int index) {

  double k = ((Rule) m_Ruleset.elementAt(index)).size();
  if (k == 0) {
    return 0.0;
  }

  double tdl = Utils.log2(k);
  if (k > 1) { // Approximation of log2*(k)
    tdl += 2.0 * Utils.log2(tdl);
  }
  tdl += subsetDL(m_Total, k, k / m_Total); // S(t, k, k/t)
  // System.out.println("!!!theory: " + MDL_THEORY_WEIGHT * REDUNDANCY_FACTOR * tdl);

  return MDL_THEORY_WEIGHT * REDUNDANCY_FACTOR * tdl;
}
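The ||k|| term above is the universal prior log2*(k), which the code approximates as log2(k) + 2*log2(log2(k)) for k > 1. A minimal standalone sketch of the same computation, assuming the two weighting constants multiply out to the 0.5 of the Javadoc formula (theoryDLSketch and its helpers are hypothetical names, not Weka API):

// Sketch of the theory DL for a rule with k antecedents out of t possible.
static double theoryDLSketch(double k, double t) {
  if (k == 0) return 0.0;
  double dl = log2(k);               // ||k||: universal prior for k,
  if (k > 1) dl += 2.0 * log2(dl);   // approximated log2*(k)
  dl += subsetDLSketch(t, k, k / t); // S(t, k, k/t)
  return 0.5 * dl;                   // redundancy factor from the formula
}

static double log2(double x) {
  return Math.log(x) / Math.log(2);
}

// S(t,k,p); the p < 1 guard avoids 0 * log2(0) when k == t.
static double subsetDLSketch(double t, double k, double p) {
  double dl = (p > 0) ? -k * log2(p) : 0.0;
  if (p < 1) dl -= (t - k) * log2(1 - p);
  return dl;
}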
/**
 * Computes the entropy of a dataset.
 *
 * @param data the data for which entropy is to be computed
 * @return the entropy of the data's class distribution
 * @throws Exception if computation fails
 */
private double computeEntropy(Instances data) throws Exception {

  double[] classCounts = new double[data.numClasses()];
  Enumeration instEnum = data.enumerateInstances();
  while (instEnum.hasMoreElements()) {
    Instance inst = (Instance) instEnum.nextElement();
    classCounts[(int) inst.classValue()]++;
  }

  double entropy = 0;
  for (int j = 0; j < data.numClasses(); j++) {
    if (classCounts[j] > 0) {
      entropy -= classCounts[j] * Utils.log2(classCounts[j]);
    }
  }
  entropy /= (double) data.numInstances();
  return entropy + Utils.log2(data.numInstances());
}
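Note the rearrangement: the method returns log2 N - (1/N) * sum_j c_j*log2(c_j), which equals the textbook -sum_j p_j*log2(p_j) with p_j = c_j/N, but needs only one division. A quick numerical check on made-up counts:

// Check that the rearranged form equals -sum(p_j * log2(p_j)); counts {6, 2} are made up.
public static void main(String[] args) {
  double[] c = {6, 2};
  double n = 8;

  double rearranged = 0;
  for (double cj : c) {
    if (cj > 0) {
      rearranged -= cj * (Math.log(cj) / Math.log(2));
    }
  }
  rearranged = rearranged / n + Math.log(n) / Math.log(2);

  double textbook = 0;
  for (double cj : c) {
    if (cj > 0) {
      double p = cj / n;
      textbook -= p * (Math.log(p) / Math.log(2));
    }
  }

  // Both print ~0.8113 bits.
  System.out.println(rearranged + " == " + textbook);
}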
public double MyXLogX(double x) { // cf. Utils.xlogx(x)

  double precision = 1000000.0; // truncate to 6 decimal places
  double d = Math.floor(x * precision) / precision;
  double MyD = 0.0;

  // Check NaN/infinity before the range test; otherwise an infinite x would
  // be misreported as merely out of range.
  if (Double.isNaN(d))
    System.err.println("Error: MyXLogX(x): x = " + x + " is NaN.");
  else if (Double.isInfinite(d))
    System.err.println("Error: MyXLogX(x): x = " + x + " is infinite.");
  else if (d < 0.0 || d > 1.0)
    System.err.println("Error: MyXLogX(x): x = " + x + " is out of range.");
  else if (d == 0.0 || d == 1.0)
    return 0.0;
  else
    MyD = d * Utils.log2(d);

  return MyD;
}
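A brief usage sketch of the truncation behaviour (obj is a hypothetical instance of the enclosing class): flooring to six decimal places means inputs marginally above 1.0 collapse to exactly 1.0 and tiny positive inputs collapse to 0.0, both yielding 0 rather than an error.

double a = obj.MyXLogX(0.5);        // -0.5, since 0.5 * log2(0.5) = -0.5
double b = obj.MyXLogX(1e-9);       // 0.0: floors to exactly 0.0
double c = obj.MyXLogX(1.0000004);  // 0.0: floors to exactly 1.0, no range error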
@Override
protected void doForFile(File file) throws Exception {

  int numOfTokens = TokensTool.getInstance().getIntegerMap().get(currFileName);
  int numOfTypes = TypesTool.getInstance().getIntegerMap().get(currFileName);

  // Count the hapax legomena (types occurring exactly once).
  int v1 = 0;
  Map<String, Integer> unigrams =
      UnigramsTool.getInstance().getFilesData().get(currFileName);
  for (Integer count : unigrams.values()) {
    if (count == 1) {
      v1++;
    }
  }

  // Honoré's statistic: R = 100 * log(N) / (1 - V1/V). Note the denominator
  // is 1 minus the hapax proportion, not (1 - V1) / V.
  double numerator = 100 * Utils.log2(numOfTokens);
  double denominator = 1.0 - ((double) v1) / numOfTypes;
  double ttr = numerator / denominator;
  getIntegerMap().put(currFileName, normalize(ttr));
}
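A worked example of the statistic with made-up counts (N = 1000 tokens, V = 300 types, V1 = 150 hapax legomena):

// R = 100 * log2(1000) / (1 - 150/300) ~ 100 * 9.966 / 0.5 ~ 1993.2
double r = 100 * (Math.log(1000) / Math.log(2)) / (1.0 - 150.0 / 300.0);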
/**
 * Test using Kononenko's MDL criterion.
 *
 * @param priorCounts the class counts prior to the split
 * @param bestCounts the class counts for the two subsets of the best split
 * @param numInstances the number of instances in the set
 * @param numCutPoints the number of candidate cut points
 * @return true if the split is acceptable
 */
private boolean KononenkosMDL(double[] priorCounts, double[][] bestCounts,
    double numInstances, int numCutPoints) {

  double distPrior, instPrior, distAfter = 0, sum, instAfter = 0;
  double before, after;
  int numClassesTotal;

  // Number of classes occurring in the set
  numClassesTotal = 0;
  for (double priorCount : priorCounts) {
    if (priorCount > 0) {
      numClassesTotal++;
    }
  }

  // Encode distribution prior to split.
  distPrior =
      SpecialFunctions.log2Binomial(numInstances + numClassesTotal - 1, numClassesTotal - 1);

  // Encode instances prior to split.
  instPrior = SpecialFunctions.log2Multinomial(numInstances, priorCounts);

  before = instPrior + distPrior;

  // Encode distributions and instances after split.
  for (double[] bestCount : bestCounts) {
    sum = Utils.sum(bestCount);
    distAfter += SpecialFunctions.log2Binomial(sum + numClassesTotal - 1, numClassesTotal - 1);
    instAfter += SpecialFunctions.log2Multinomial(sum, bestCount);
  }

  // Coding cost after split
  after = Utils.log2(numCutPoints) + distAfter + instAfter;

  // Check if split is to be accepted
  return (before > after);
}
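In words: before is the cost of transmitting the class distribution plus the class labels for the whole set; after re-encodes each subset the same way and adds log2(numCutPoints) bits to identify the chosen cut point, and the split is kept only if it shortens the message. A toy, self-contained sketch with log2-factorials computed by direct summation in place of weka.core.SpecialFunctions (class name and counts are made up):

public final class KononenkoMDLSketch {

  static double log2Factorial(double n) {
    double r = 0;
    for (int i = 2; i <= (int) n; i++) {
      r += Math.log(i) / Math.log(2);
    }
    return r;
  }

  static double log2Binomial(double n, double k) {
    return log2Factorial(n) - log2Factorial(k) - log2Factorial(n - k);
  }

  static double log2Multinomial(double n, double[] counts) {
    double r = log2Factorial(n);
    for (double c : counts) {
      r -= log2Factorial(c);
    }
    return r;
  }

  public static void main(String[] args) {
    double[] prior = {30, 20};
    double[][] split = {{25, 5}, {5, 15}};
    double n = 50, numCutPoints = 10, numClasses = 2;

    // Cost of sending the class distribution and the labels before splitting.
    double before = log2Binomial(n + numClasses - 1, numClasses - 1)
        + log2Multinomial(n, prior);

    // After splitting: identify the cut point, then code each subset.
    double after = Math.log(numCutPoints) / Math.log(2);
    for (double[] bag : split) {
      double sum = bag[0] + bag[1];
      after += log2Binomial(sum + numClasses - 1, numClasses - 1)
          + log2Multinomial(sum, bag);
    }

    System.out.println("accept split: " + (before > after));
  }
}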
/**
 * The description length of data given the parameters of the data based on the ruleset.
 *
 * <p>For details see Quinlan: "MDL and categorical theories (Continued)", ML95.
 *
 * @param expFPOverErr the expected FP/(FP+FN)
 * @param cover the number of data covered by the ruleset
 * @param uncover the number of data not covered by the ruleset
 * @param fp the number of false positives
 * @param fn the number of false negatives
 * @return the description length
 */
public static double dataDL(
    double expFPOverErr, double cover, double uncover, double fp, double fn) {

  double totalBits = Utils.log2(cover + uncover + 1.0); // how many data?
  double coverBits, uncoverBits; // What's the error?
  double expErr; // Expected FP or FN

  if (Utils.gr(cover, uncover)) {
    expErr = expFPOverErr * (fp + fn);
    coverBits = subsetDL(cover, fp, expErr / cover);
    uncoverBits = Utils.gr(uncover, 0.0) ? subsetDL(uncover, fn, fn / uncover) : 0.0;
  } else {
    expErr = (1.0 - expFPOverErr) * (fp + fn);
    coverBits = Utils.gr(cover, 0.0) ? subsetDL(cover, fp, fp / cover) : 0.0;
    uncoverBits = subsetDL(uncover, fn, expErr / uncover);
  }

  /*
  System.err.println("!!!cover: " + cover + "|uncover" + uncover +
      "|coverBits: " + coverBits + "|uncBits: " + uncoverBits +
      "|FPRate: " + expFPOverErr + "|expErr: " + expErr +
      "|fp: " + fp + "|fn: " + fn + "|total: " + totalBits);
  */

  return (totalBits + coverBits + uncoverBits);
}
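A usage sketch on made-up statistics: 40 (weighted) instances covered with 5 false positives, 60 uncovered with 8 false negatives, and expFPOverErr = 0.5. Since cover < uncover here, the method codes the covered errors at their observed rate fp/cover and the uncovered ones at the expected rate expErr/uncover:

double dl = dataDL(0.5, 40.0, 60.0, 5.0, 8.0);
// totalBits = log2(101), coverBits = subsetDL(40, 5, 5/40),
// uncoverBits = subsetDL(60, 8, 6.5/60) with expErr = (1 - 0.5) * (5 + 8)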
/**
 * Subset description length: <br>
 * S(t,k,p) = -k*log2(p) - (t-k)*log2(1-p)
 *
 * <p>For details see Quinlan: "MDL and categorical theories (Continued)", ML95.
 *
 * @param t the number of elements in a known set
 * @param k the number of elements in a subset
 * @param p the expected proportion of subset known by recipient
 * @return the subset description length
 */
public static double subsetDL(double t, double k, double p) {
  double rt = Utils.gr(p, 0.0) ? (-k * Utils.log2(p)) : 0.0;
  rt -= (t - k) * Utils.log2(1 - p);
  return rt;
}
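For example, with t = 10 elements, k = 3 of them in the subset, and p = 0.3:

// S(10, 3, 0.3) = -3*log2(0.3) - 7*log2(0.7) ~ 5.21 + 3.60 ~ 8.81 bits
double bits = subsetDL(10, 3, 0.3);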
/**
 * Creates split on numeric attribute.
 *
 * @param trainInstances the training instances
 * @exception Exception if something goes wrong
 */
private void handleNumericAttribute(Instances trainInstances) throws Exception {

  int firstMiss;
  int next = 1;
  int last = 0;
  int index = 0;
  int splitIndex = -1;
  double currentInfoGain;
  double defaultEnt;
  double minSplit;
  Instance instance;
  int i;

  // Current attribute is a numeric attribute.
  m_distribution = new Distribution(2, trainInstances.numClasses());

  // Only Instances with known values are relevant.
  Enumeration enu = trainInstances.enumerateInstances();
  i = 0;
  while (enu.hasMoreElements()) {
    instance = (Instance) enu.nextElement();
    if (instance.isMissing(m_attIndex)) {
      break;
    }
    m_distribution.add(1, instance);
    i++;
  }
  firstMiss = i;

  // Compute minimum number of Instances required in each subset.
  minSplit = 0.1 * (m_distribution.total()) / ((double) trainInstances.numClasses());
  if (Utils.smOrEq(minSplit, m_minNoObj)) {
    minSplit = m_minNoObj;
  } else if (Utils.gr(minSplit, 25)) {
    minSplit = 25;
  }

  // Enough Instances with known values?
  if (Utils.sm((double) firstMiss, 2 * minSplit)) {
    return;
  }

  // Compute values of criteria for all possible split indices.
  defaultEnt = m_infoGainCrit.oldEnt(m_distribution);
  while (next < firstMiss) {
    if (trainInstances.instance(next - 1).value(m_attIndex) + 1e-5
        < trainInstances.instance(next).value(m_attIndex)) {

      // Move class values for all Instances up to next possible split point.
      m_distribution.shiftRange(1, 0, trainInstances, last, next);

      // Check if enough Instances in each subset and compute values for criteria.
      if (Utils.grOrEq(m_distribution.perBag(0), minSplit)
          && Utils.grOrEq(m_distribution.perBag(1), minSplit)) {
        currentInfoGain =
            m_infoGainCrit.splitCritValue(m_distribution, m_sumOfWeights, defaultEnt);
        if (Utils.gr(currentInfoGain, m_infoGain)) {
          m_infoGain = currentInfoGain;
          splitIndex = next - 1;
        }
        index++;
      }
      last = next;
    }
    next++;
  }

  // Was there any useful split?
  if (index == 0) {
    return;
  }

  // Compute modified information gain for best split.
  if (m_useMDLcorrection) {
    m_infoGain = m_infoGain - (Utils.log2(index) / m_sumOfWeights);
  }
  if (Utils.smOrEq(m_infoGain, 0)) {
    return;
  }

  // Set instance variables' values to values for best split.
  m_numSubsets = 2;
  m_splitPoint =
      (trainInstances.instance(splitIndex + 1).value(m_attIndex)
              + trainInstances.instance(splitIndex).value(m_attIndex))
          / 2;

  // In case we have a numerical precision problem we need to choose the
  // smaller value.
  if (m_splitPoint == trainInstances.instance(splitIndex + 1).value(m_attIndex)) {
    m_splitPoint = trainInstances.instance(splitIndex).value(m_attIndex);
  }

  // Restore distribution for best split.
  m_distribution = new Distribution(2, trainInstances.numClasses());
  m_distribution.addRange(0, trainInstances, 0, splitIndex + 1);
  m_distribution.addRange(1, trainInstances, splitIndex + 1, firstMiss);

  // Compute modified gain ratio for best split.
  m_gainRatio = m_gainRatioCrit.splitCritValue(m_distribution, m_sumOfWeights, m_infoGain);
}
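The m_useMDLcorrection branch applies the C4.5 Release 8 style MDL correction: having searched index candidate thresholds costs log2(index) bits, charged against the gain per unit of total instance weight before the split is accepted. As a standalone sketch (the function name is illustrative):

// MDL-corrected gain: charge log2(numCandidates) bits, spread over the total
// instance weight, for having searched numCandidates thresholds.
static double mdlCorrectedGain(double rawGain, int numCandidates, double totalWeight) {
  return rawGain - (Math.log(numCandidates) / Math.log(2)) / totalWeight;
}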
/** Returns coding cost for split (used in rule learner). */
@Override
public final double codingCost() {
  return Utils.log2(m_index);
}
/**
 * Updates all the statistics about a classifier's performance for the current test instance.
 *
 * @param predictedDistribution the probabilities assigned to each class
 * @param instance the instance to be classified
 * @throws Exception if the class of the instance is not set
 */
protected void updateStatsForClassifier(double[] predictedDistribution, Instance instance)
    throws Exception {

  int actualClass = (int) instance.classValue();

  if (!instance.classIsMissing()) {
    updateMargins(predictedDistribution, actualClass, instance.weight());

    // Collect all predictions and their corresponding classes.
    SortedMap<Double, Integer> predToClass = new TreeMap<Double, Integer>(descendingDouble);
    for (int i = 0; i < m_NumClasses; i++) {
      predToClass.put(predictedDistribution[i], i);
    }
    List<Integer> candidateClasses = new ArrayList<Integer>(relaxParam);
    int count = 0;
    for (Double pred : predToClass.keySet()) {
      candidateClasses.add(predToClass.get(pred));
      count++;
      if (count == relaxParam) {
        break;
      }
    }

    // If the relaxed set of candidates contains the actual class, credit the
    // prediction to that class; otherwise take the top prediction.
    int predictedClass = -1;
    if (candidateClasses.contains(actualClass)) {
      predictedClass = actualClass;
    } else {
      predictedClass = candidateClasses.get(0);
    }

    /*
    // Determine the predicted class (doesn't detect multiple classifications)
    int predictedClass = -1;
    double bestProb = 0.0;
    for (int i = 0; i < m_NumClasses; i++) {
      if (predictedDistribution[i] > bestProb) {
        predictedClass = i;
        bestProb = predictedDistribution[i];
      }
    }
    */

    m_WithClass += instance.weight();

    // Determine misclassification cost.
    if (m_CostMatrix != null) {
      if (predictedClass < 0) {
        // For missing predictions, we assume the worst possible cost.
        // This is pretty harsh.
        // Perhaps we could take the negative of the cost of a correct
        // prediction (-m_CostMatrix.getElement(actualClass, actualClass)),
        // although often this will be zero.
        m_TotalCost += instance.weight() * m_CostMatrix.getMaxCost(actualClass, instance);
      } else {
        m_TotalCost +=
            instance.weight() * m_CostMatrix.getElement(actualClass, predictedClass, instance);
      }
    }

    // Update counts when no class was predicted.
    if (predictedClass < 0) {
      m_Unclassified += instance.weight();
      return;
    }

    double predictedProb = Math.max(MIN_SF_PROB, predictedDistribution[actualClass]);
    double priorProb = Math.max(MIN_SF_PROB, m_ClassPriors[actualClass] / m_ClassPriorsSum);
    if (predictedProb >= priorProb) {
      m_SumKBInfo += (Utils.log2(predictedProb) - Utils.log2(priorProb)) * instance.weight();
    } else {
      m_SumKBInfo -=
          (Utils.log2(1.0 - predictedProb) - Utils.log2(1.0 - priorProb)) * instance.weight();
    }

    m_SumSchemeEntropy -= Utils.log2(predictedProb) * instance.weight();
    m_SumPriorEntropy -= Utils.log2(priorProb) * instance.weight();

    updateNumericScores(
        predictedDistribution, makeDistribution(instance.classValue()), instance.weight());

    // Update other stats.
    m_ConfusionMatrix[actualClass][predictedClass] += instance.weight();
    if (predictedClass != actualClass) {
      m_Incorrect += instance.weight();
    } else {
      m_Correct += instance.weight();
    }
  } else {
    m_MissingClass += instance.weight();
  }
}
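The m_SumKBInfo update is Kononenko and Bratko's information score: a prediction earns log2 p(actual) - log2 prior bits when it beats the prior, and is penalised on the complement probabilities otherwise. As a standalone sketch (the name kbInformation is illustrative):

// Kononenko-Bratko information score for one instance (in bits), mirroring
// the m_SumKBInfo update above.
static double kbInformation(double predictedProb, double priorProb) {
  double log2 = Math.log(2);
  if (predictedProb >= priorProb) {
    return (Math.log(predictedProb) - Math.log(priorProb)) / log2;
  }
  return -(Math.log(1.0 - predictedProb) - Math.log(1.0 - priorProb)) / log2;
}

// e.g. kbInformation(0.9, 0.5) ~ +0.848 bits earned;
//      kbInformation(0.2, 0.5) = -(log2(0.8) - log2(0.5)) ~ -0.678 bits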