Example no. 1
  /**
   * Test using Fayyad and Irani's MDL criterion.
   *
   * @param priorCounts the class counts prior to splitting
   * @param bestCounts the class counts for the two subsets of the best split
   * @param numInstances the number of instances
   * @param numCutPoints the number of candidate cut points
   * @return true if the split is acceptable
   */
  private boolean FayyadAndIranisMDL(
      double[] priorCounts, double[][] bestCounts, double numInstances, int numCutPoints) {

    double priorEntropy, entropy, gain;
    double entropyLeft, entropyRight, delta;
    int numClassesTotal, numClassesRight, numClassesLeft;

    // Compute entropy before split.
    priorEntropy = ContingencyTables.entropy(priorCounts);

    // Compute entropy after split.
    entropy = ContingencyTables.entropyConditionedOnRows(bestCounts);

    // Compute information gain.
    gain = priorEntropy - entropy;

    // Number of classes occurring in the set
    numClassesTotal = 0;
    for (double priorCount : priorCounts) {
      if (priorCount > 0) {
        numClassesTotal++;
      }
    }

    // Number of classes occurring in the left subset
    numClassesLeft = 0;
    for (int i = 0; i < bestCounts[0].length; i++) {
      if (bestCounts[0][i] > 0) {
        numClassesLeft++;
      }
    }

    // Number of classes occurring in the right subset
    numClassesRight = 0;
    for (int i = 0; i < bestCounts[1].length; i++) {
      if (bestCounts[1][i] > 0) {
        numClassesRight++;
      }
    }

    // Entropy of the left and the right subsets
    entropyLeft = ContingencyTables.entropy(bestCounts[0]);
    entropyRight = ContingencyTables.entropy(bestCounts[1]);

    // Compute terms for MDL formula
    delta =
        Utils.log2(Math.pow(3, numClassesTotal) - 2)
            - ((numClassesTotal * priorEntropy)
                - (numClassesRight * entropyRight)
                - (numClassesLeft * entropyLeft));

    // Check if split is to be accepted
    return (gain > (Utils.log2(numCutPoints) + delta) / numInstances);
  }
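
The test accepts the split only when the information gain pays for encoding the chosen cut point plus the change in model complexity, (log2(numCutPoints) + delta) / numInstances. A minimal standalone sketch of the same criterion, with log2 and entropy re-implemented in place of Weka's Utils and ContingencyTables, and all counts invented:

  // Standalone sketch of the Fayyad & Irani MDL stopping test (invented counts:
  // 10 instances of class A and 6 of class B, split into {8, 1} and {2, 5}).
  public class FayyadIraniSketch {

    static double log2(double x) { return Math.log(x) / Math.log(2); }

    // Entropy of a count vector in bits: log2(N) - (1/N) * sum(c * log2(c)).
    static double entropy(double[] counts) {
      double n = 0, sum = 0;
      for (double c : counts) {
        n += c;
        if (c > 0) sum += c * log2(c);
      }
      return n > 0 ? log2(n) - sum / n : 0;
    }

    public static void main(String[] args) {
      double[] prior = {10, 6};
      double[][] split = {{8, 1}, {2, 5}};
      double n = 16, numCutPoints = 9; // invented problem size

      double priorEntropy = entropy(prior);
      // Entropy after the split: subset entropies weighted by subset size.
      double after = (9.0 / n) * entropy(split[0]) + (7.0 / n) * entropy(split[1]);
      double gain = priorEntropy - after;

      int kTotal = 2, kLeft = 2, kRight = 2; // classes present in each set
      double delta = log2(Math.pow(3, kTotal) - 2)
          - (kTotal * priorEntropy - kRight * entropy(split[1]) - kLeft * entropy(split[0]));

      // Accept only if the gain pays for the cut point and the model change.
      System.out.println(gain > (log2(numCutPoints) + delta) / n);
    }
  }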
Example no. 2
  /**
   * The description length of the theory for a given rule. Computed as:<br>
   * 0.5 * [||k|| + S(t, k, k/t)]<br>
   * where k is the number of antecedents of the rule; t is the total number of possible
   * antecedents that could appear in a rule; ||k|| is the universal prior for k, log2*(k); and
   * S(t,k,p) = -k*log2(p)-(t-k)*log2(1-p) is the subset encoding length.
   *
   * <p>For details see Quinlan: "MDL and categorical theories (Continued)", ML95
   *
   * @param index the index of the given rule (assuming correct)
   * @return the theory DL, weighted if weight != 1.0
   */
  public double theoryDL(int index) {

    double k = ((Rule) m_Ruleset.elementAt(index)).size();

    if (k == 0) return 0.0;

    double tdl = Utils.log2(k);
    if (k > 1) { // approximation of log2*(k)
      tdl += 2.0 * Utils.log2(tdl);
    }
    tdl += subsetDL(m_Total, k, k / m_Total);
    // System.out.println("!!!theory: "+MDL_THEORY_WEIGHT * REDUNDANCY_FACTOR * tdl);
    return MDL_THEORY_WEIGHT * REDUNDANCY_FACTOR * tdl;
  }
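
A hedged standalone sketch of the same computation, with plain-Math helpers and invented values; the 0.5 factor is the one from the javadoc formula, standing in for MDL_THEORY_WEIGHT * REDUNDANCY_FACTOR:

  static double log2(double x) { return Math.log(x) / Math.log(2); }

  // S(t,k,p) = -k*log2(p) - (t-k)*log2(1-p), guarded at p = 0 and p = 1.
  static double subsetDL(double t, double k, double p) {
    double rt = p > 0 ? -k * log2(p) : 0.0;
    if (p < 1) rt -= (t - k) * log2(1 - p);
    return rt;
  }

  // Theory DL for a rule with k antecedents out of t possible ones.
  static double theoryDL(double k, double t) {
    if (k == 0) return 0.0;
    double tdl = log2(k);
    if (k > 1) tdl += 2.0 * log2(tdl); // approximation of log2*(k)
    tdl += subsetDL(t, k, k / t); // which k of the t antecedents were chosen
    return 0.5 * tdl;
  }

  // Example: theoryDL(3, 20) = 0.5 * (log2(3) + 2*log2(log2(3)) + S(20, 3, 0.15))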
Example no. 3
  /**
   * Computes the entropy of a dataset.
   *
   * @param data the data for which entropy is to be computed
   * @return the entropy of the data's class distribution
   * @throws Exception if computation fails
   */
  private double computeEntropy(Instances data) throws Exception {

    double[] classCounts = new double[data.numClasses()];
    Enumeration instEnum = data.enumerateInstances();
    while (instEnum.hasMoreElements()) {
      Instance inst = (Instance) instEnum.nextElement();
      classCounts[(int) inst.classValue()]++;
    }
    double entropy = 0;
    for (int j = 0; j < data.numClasses(); j++) {
      if (classCounts[j] > 0) {
        entropy -= classCounts[j] * Utils.log2(classCounts[j]);
      }
    }
    entropy /= (double) data.numInstances();
    return entropy + Utils.log2(data.numInstances());
  }
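
The last two lines use the identity -sum((c_j/N) * log2(c_j/N)) = log2(N) - (1/N) * sum(c_j * log2(c_j)), which avoids forming the class proportions explicitly. A quick standalone check with invented counts:

  // Both expressions give the entropy of the class counts {6, 2}: about 0.8113 bits.
  double[] counts = {6, 2};
  double n = 8;
  double ln2 = Math.log(2);

  double viaCounts = 0;
  for (double c : counts) if (c > 0) viaCounts -= c * (Math.log(c) / ln2);
  viaCounts = viaCounts / n + Math.log(n) / ln2; // as in computeEntropy

  double viaProbs = 0;
  for (double c : counts) viaProbs -= (c / n) * (Math.log(c / n) / ln2);

  System.out.println(viaCounts + " == " + viaProbs);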
Example no. 4
 public double MyXLogX(double x) {
   // Utils.xlogx(x);
   double precision = 1000000.0; // 6 decimal places
   double d = Math.floor(x * precision) / precision;
   double MyD = 0.0;
   // Check NaN and infinity first: the range comparison below would
   // otherwise report infinities as merely out of range.
   if (Double.isNaN(d)) System.err.println("Error: MyXLogX(x): x = " + x + " is NaN.");
   else if (Double.isInfinite(d))
     System.err.println("Error: MyXLogX(x): x = " + x + " is infinite.");
   else if (d < 0.0 || d > 1.0)
     System.err.println("Error: MyXLogX(x): x = " + x + " is out of range.");
   else if (d == 0.0 || d == 1.0) return 0.0;
   else MyD = d * Utils.log2(d);
   return MyD;
 }
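
Boundary behaviour of the guard chain above, for illustration:

  MyXLogX(0.0); // returns 0.0 (by convention, 0 * log2(0) = 0)
  MyXLogX(1.0); // returns 0.0
  MyXLogX(0.5); // returns 0.5 * log2(0.5) = -0.5
  MyXLogX(1.5); // prints an "out of range" error and returns 0.0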
Example no. 5
  @Override
  protected void doForFile(File file) throws Exception {
    int numOfTokens = TokensTool.getInstance().getIntegerMap().get(currFileName);
    int numOfTypes = TypesTool.getInstance().getIntegerMap().get(currFileName);
    int v1 = 0;

    Map<String, Integer> unigrams = UnigramsTool.getInstance().getFilesData().get(currFileName);
    for (Integer count : unigrams.values()) {
      if (count == 1) {
        v1++;
      }
    }
    double numerator = 100 * Utils.log2(numOfTokens);
    double denominator = 1.0 - ((double) v1) / numOfTypes; // 1 - V1/V, the share of non-hapax types
    double ttr = numerator / denominator;

    getIntegerMap().put(currFileName, normalize(ttr));
  }
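
With log2 in place of the natural log, the quantity has the shape of Honoré's statistic R = 100 * log(N) / (1 - V1/V), where V1 is the number of hapax legomena (types occurring exactly once) and V the number of types. A standalone sketch on invented counts; the *Tool singletons and normalize are project-specific and omitted here:

  import java.util.Map;

  public class HonoreSketch {
    public static void main(String[] args) {
      // Invented unigram counts for one file.
      Map<String, Integer> unigrams = Map.of("the", 5, "cat", 1, "sat", 1, "mat", 2);
      int numOfTokens = 9; // 5 + 1 + 1 + 2
      int numOfTypes = unigrams.size(); // 4
      int v1 = 0; // hapax legomena
      for (int count : unigrams.values()) {
        if (count == 1) {
          v1++;
        }
      }
      double numerator = 100 * Math.log(numOfTokens) / Math.log(2);
      double denominator = 1.0 - (double) v1 / numOfTypes; // 1 - 2/4 = 0.5
      System.out.println("R = " + numerator / denominator);
    }
  }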
Example no. 6
  /**
   * Test using Kononenko's MDL criterion.
   *
   * @param priorCounts the class counts prior to splitting
   * @param bestCounts the class counts for the two subsets of the best split
   * @param numInstances the number of instances
   * @param numCutPoints the number of candidate cut points
   * @return true if the split is acceptable
   */
  private boolean KononenkosMDL(
      double[] priorCounts, double[][] bestCounts, double numInstances, int numCutPoints) {

    double distPrior, instPrior, distAfter = 0, sum, instAfter = 0;
    double before, after;
    int numClassesTotal;

    // Number of classes occurring in the set
    numClassesTotal = 0;
    for (double priorCount : priorCounts) {
      if (priorCount > 0) {
        numClassesTotal++;
      }
    }

    // Encode distribution prior to split
    distPrior =
        SpecialFunctions.log2Binomial(numInstances + numClassesTotal - 1, numClassesTotal - 1);

    // Encode instances prior to split.
    instPrior = SpecialFunctions.log2Multinomial(numInstances, priorCounts);

    before = instPrior + distPrior;

    // Encode distributions and instances after split.
    for (double[] bestCount : bestCounts) {
      sum = Utils.sum(bestCount);
      distAfter += SpecialFunctions.log2Binomial(sum + numClassesTotal - 1, numClassesTotal - 1);
      instAfter += SpecialFunctions.log2Multinomial(sum, bestCount);
    }

    // Coding cost after split
    after = Utils.log2(numCutPoints) + distAfter + instAfter;

    // Check if split is to be accepted
    return (before > after);
  }
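
A standalone sketch of the same test on invented counts. Weka's SpecialFunctions computes log2-binomials via the log-gamma function; here they are re-derived through log2(n!) sums, which is adequate for small integer counts:

  public class KononenkoSketch {

    static double log2(double x) { return Math.log(x) / Math.log(2); }

    // log2(n!) by direct summation; fine for small integral n.
    static double log2Factorial(double n) {
      double s = 0;
      for (int i = 2; i <= (int) n; i++) s += log2(i);
      return s;
    }

    static double log2Binomial(double n, double k) {
      return log2Factorial(n) - log2Factorial(k) - log2Factorial(n - k);
    }

    static double log2Multinomial(double n, double[] counts) {
      double s = log2Factorial(n);
      for (double c : counts) s -= log2Factorial(c);
      return s;
    }

    public static void main(String[] args) {
      double[] prior = {10, 6};
      double[][] split = {{8, 1}, {2, 5}};
      double n = 16, numCutPoints = 9; // invented problem size
      int k = 2; // classes present in the set

      // Cost of encoding the class distribution and the labels before the split.
      double before = log2Binomial(n + k - 1, k - 1) + log2Multinomial(n, prior);

      // After the split: also pay log2(numCutPoints) to identify the cut point.
      double after = log2(numCutPoints);
      for (double[] bag : split) {
        double sum = bag[0] + bag[1];
        after += log2Binomial(sum + k - 1, k - 1) + log2Multinomial(sum, bag);
      }
      System.out.println("accept split: " + (before > after));
    }
  }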
Example no. 7
  /**
   * The description length of the data given the parameters of the data, based on the ruleset.
   *
   * <p>For details see Quinlan: "MDL and categorical theories (Continued)", ML95
   *
   * @param expFPOverErr the expected proportion FP/(FP+FN) among the errors
   * @param cover the number of instances covered by the ruleset
   * @param uncover the number of instances not covered by the ruleset
   * @param fp the number of false positives
   * @param fn the number of false negatives
   * @return the description length
   */
  public static double dataDL(
      double expFPOverErr, double cover, double uncover, double fp, double fn) {
    double totalBits = Utils.log2(cover + uncover + 1.0); // how many data?
    double coverBits, uncoverBits; // What's the error?
    double expErr; // Expected FP or FN

    if (Utils.gr(cover, uncover)) {
      expErr = expFPOverErr * (fp + fn);
      coverBits = subsetDL(cover, fp, expErr / cover);
      uncoverBits = Utils.gr(uncover, 0.0) ? subsetDL(uncover, fn, fn / uncover) : 0.0;
    } else {
      expErr = (1.0 - expFPOverErr) * (fp + fn);
      coverBits = Utils.gr(cover, 0.0) ? subsetDL(cover, fp, fp / cover) : 0.0;
      uncoverBits = subsetDL(uncover, fn, expErr / uncover);
    }

    /*
      System.err.println("!!!cover: " + cover + "|uncover" + uncover +
      "|coverBits: "+coverBits+"|uncBits: "+ uncoverBits+
      "|FPRate: "+expFPOverErr + "|expErr: "+expErr+
      "|fp: "+fp+"|fn: "+fn+"|total: "+totalBits);
    */
    return (totalBits + coverBits + uncoverBits);
  }
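
A hedged usage sketch with invented error counts (subsetDL is the method shown in the next example):

  // Hypothetical call: 40 covered instances with 4 false positives,
  // 10 uncovered instances with 2 false negatives, expected FP share 0.5.
  double dl = dataDL(0.5, 40.0, 10.0, 4.0, 2.0);
  // cover > uncover, so expErr = 0.5 * (4 + 2) = 3; the covered part is encoded
  // as subsetDL(40, 4, 3.0/40) and the uncovered part as subsetDL(10, 2, 0.2).
  System.out.println("data DL = " + dl + " bits");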
Example no. 8
 /**
  * Subset description length: <br>
  * S(t,k,p) = -k*log2(p)-(t-k)*log2(1-p)
  *
  * <p>For details see Quinlan: "MDL and categorical theories (Continued)", ML95
  *
  * @param t the number of elements in a known set
  * @param k the number of elements in a subset
  * @param p the expected proportion of the subset known to the recipient
  * @return the subset description length
  */
 public static double subsetDL(double t, double k, double p) {
   double rt = Utils.gr(p, 0.0) ? (-k * Utils.log2(p)) : 0.0;
   rt -= (t - k) * Utils.log2(1 - p);
   return rt;
 }
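
For instance, flagging 2 of 10 elements when the recipient expects a proportion of 0.2:

  double bits = subsetDL(10, 2, 0.2);
  // = -2*log2(0.2) - 8*log2(0.8) = 4.644 + 2.575, about 7.219 bits
  System.out.println(bits);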
Example no. 9
  /**
   * Creates split on numeric attribute.
   *
   * @exception Exception if something goes wrong
   */
  private void handleNumericAttribute(Instances trainInstances) throws Exception {

    int firstMiss;
    int next = 1;
    int last = 0;
    int index = 0;
    int splitIndex = -1;
    double currentInfoGain;
    double defaultEnt;
    double minSplit;
    Instance instance;
    int i;

    // Current attribute is a numeric attribute.
    m_distribution = new Distribution(2, trainInstances.numClasses());

    // Only Instances with known values are relevant.
    Enumeration enu = trainInstances.enumerateInstances();
    i = 0;
    while (enu.hasMoreElements()) {
      instance = (Instance) enu.nextElement();
      if (instance.isMissing(m_attIndex)) break;
      m_distribution.add(1, instance);
      i++;
    }
    firstMiss = i;

    // Compute minimum number of Instances required in each
    // subset.
    minSplit = 0.1 * (m_distribution.total()) / ((double) trainInstances.numClasses());
    if (Utils.smOrEq(minSplit, m_minNoObj)) minSplit = m_minNoObj;
    else if (Utils.gr(minSplit, 25)) minSplit = 25;

    // Enough Instances with known values?
    if (Utils.sm((double) firstMiss, 2 * minSplit)) return;

    // Compute values of criteria for all possible split
    // indices.
    defaultEnt = m_infoGainCrit.oldEnt(m_distribution);
    while (next < firstMiss) {

      if (trainInstances.instance(next - 1).value(m_attIndex) + 1e-5
          < trainInstances.instance(next).value(m_attIndex)) {

        // Move class values for all Instances up to next
        // possible split point.
        m_distribution.shiftRange(1, 0, trainInstances, last, next);

        // Check if enough Instances in each subset and compute
        // values for criteria.
        if (Utils.grOrEq(m_distribution.perBag(0), minSplit)
            && Utils.grOrEq(m_distribution.perBag(1), minSplit)) {
          currentInfoGain =
              m_infoGainCrit.splitCritValue(m_distribution, m_sumOfWeights, defaultEnt);
          if (Utils.gr(currentInfoGain, m_infoGain)) {
            m_infoGain = currentInfoGain;
            splitIndex = next - 1;
          }
          index++;
        }
        last = next;
      }
      next++;
    }

    // Was there any useful split?
    if (index == 0) return;

    // Compute modified information gain for best split.
    if (m_useMDLcorrection) {
      m_infoGain = m_infoGain - (Utils.log2(index) / m_sumOfWeights);
    }
    if (Utils.smOrEq(m_infoGain, 0)) return;

    // Set instance variables' values to values for
    // best split.
    m_numSubsets = 2;
    m_splitPoint =
        (trainInstances.instance(splitIndex + 1).value(m_attIndex)
                + trainInstances.instance(splitIndex).value(m_attIndex))
            / 2;

    // In case we have a numerical precision problem we need to choose the
    // smaller value
    if (m_splitPoint == trainInstances.instance(splitIndex + 1).value(m_attIndex)) {
      m_splitPoint = trainInstances.instance(splitIndex).value(m_attIndex);
    }

    // Restore distribution for best split.
    m_distribution = new Distribution(2, trainInstances.numClasses());
    m_distribution.addRange(0, trainInstances, 0, splitIndex + 1);
    m_distribution.addRange(1, trainInstances, splitIndex + 1, firstMiss);

    // Compute modified gain ratio for best split.
    m_gainRatio = m_gainRatioCrit.splitCritValue(m_distribution, m_sumOfWeights, m_infoGain);
  }
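
The split-point rule at the end (the midpoint of the two neighbouring attribute values, falling back to the smaller value when rounding pulls the midpoint up to the larger one) can be shown in isolation on invented values:

  // Two adjacent doubles whose exact midpoint is not representable:
  // the average rounds up to the larger value, so the fallback fires.
  double[] sorted = {1.0000000000000002, 1.0000000000000004, 3.0};
  int splitIndex = 0;
  double splitPoint = (sorted[splitIndex + 1] + sorted[splitIndex]) / 2;
  if (splitPoint == sorted[splitIndex + 1]) {
    splitPoint = sorted[splitIndex]; // choose the smaller value instead
  }
  System.out.println(splitPoint); // 1.0000000000000002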
Example no. 10
  /** Returns coding cost for split (used in rule learner). */
  @Override
  public final double codingCost() {

    return Utils.log2(m_index);
  }
Example no. 11
  /**
   * Updates all the statistics about a classifiers performance for the current test instance.
   *
   * @param predictedDistribution the probabilities assigned to each class
   * @param instance the instance to be classified
   * @throws Exception if the class of the instance is not set
   */
  protected void updateStatsForClassifier(double[] predictedDistribution, Instance instance)
      throws Exception {

    int actualClass = (int) instance.classValue();

    if (!instance.classIsMissing()) {
      updateMargins(predictedDistribution, actualClass, instance.weight());

      // collect all predictions and their corresponding classes
      SortedMap<Double, Integer> predToClass = new TreeMap<Double, Integer>(descendingDouble);
      for (int i = 0; i < m_NumClasses; i++) {
        predToClass.put(predictedDistribution[i], i);
      }
      List<Integer> candidateClasses = new ArrayList<Integer>(relaxParam);
      int count = 0;
      for (Double pred : predToClass.keySet()) {
        candidateClasses.add(predToClass.get(pred));
        count++;
        if (count == relaxParam) break;
      }
      // Check if the relaxed set of candidates contains the actual class;
      // if so, attribute that prediction to the classifier;
      // otherwise take the top prediction.
      int predictedClass = -1;
      if (candidateClasses.contains(actualClass)) predictedClass = actualClass;
      else predictedClass = candidateClasses.get(0);

      /*
      // Determine the predicted class (doesn't detect multiple
      // classifications)
      int predictedClass = -1;
      double bestProb = 0.0;
      for(int i = 0; i < m_NumClasses; i++) {
      	if (predictedDistribution[i] > bestProb) {
      		predictedClass = i;
      		bestProb = predictedDistribution[i];
      	}
      }
       */

      m_WithClass += instance.weight();

      // Determine misclassification cost
      if (m_CostMatrix != null) {
        if (predictedClass < 0) {
          // For missing predictions, we assume the worst possible cost.
          // This is pretty harsh.
          // Perhaps we could take the negative of the cost of a correct
          // prediction (-m_CostMatrix.getElement(actualClass,actualClass)),
          // although often this will be zero
          m_TotalCost += instance.weight() * m_CostMatrix.getMaxCost(actualClass, instance);
        } else {
          m_TotalCost +=
              instance.weight() * m_CostMatrix.getElement(actualClass, predictedClass, instance);
        }
      }

      // Update counts when no class was predicted
      if (predictedClass < 0) {
        m_Unclassified += instance.weight();
        return;
      }

      double predictedProb = Math.max(MIN_SF_PROB, predictedDistribution[actualClass]);
      double priorProb = Math.max(MIN_SF_PROB, m_ClassPriors[actualClass] / m_ClassPriorsSum);
      if (predictedProb >= priorProb) {
        m_SumKBInfo += (Utils.log2(predictedProb) - Utils.log2(priorProb)) * instance.weight();
      } else {
        m_SumKBInfo -=
            (Utils.log2(1.0 - predictedProb) - Utils.log2(1.0 - priorProb)) * instance.weight();
      }

      m_SumSchemeEntropy -= Utils.log2(predictedProb) * instance.weight();
      m_SumPriorEntropy -= Utils.log2(priorProb) * instance.weight();

      updateNumericScores(
          predictedDistribution, makeDistribution(instance.classValue()), instance.weight());

      // Update other stats
      m_ConfusionMatrix[actualClass][predictedClass] += instance.weight();
      if (predictedClass != actualClass) {
        m_Incorrect += instance.weight();
      } else {
        m_Correct += instance.weight();
      }
    } else {
      m_MissingClass += instance.weight();
    }
  }
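
The m_SumKBInfo update implements Kononenko and Bratko's information score: a prediction earns log2(p) - log2(prior) bits when it raises the probability of the actual class, and pays a symmetric penalty on the complement probabilities when it lowers it. A minimal sketch with invented probabilities:

  // Kononenko & Bratko information score for a single prediction.
  static double log2(double x) { return Math.log(x) / Math.log(2); }

  static double kbInfo(double predictedProb, double priorProb) {
    if (predictedProb >= priorProb) {
      // The classifier raised the probability of the true class: reward in bits.
      return log2(predictedProb) - log2(priorProb);
    }
    // The classifier lowered it: symmetric penalty on the complements.
    return -(log2(1.0 - predictedProb) - log2(1.0 - priorProb));
  }

  // kbInfo(0.8, 0.5) is about +0.678 bits; kbInfo(0.2, 0.5) is about -0.678 bits.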