Beispiel #1
0
  /**
   * Decides whether a candidate binary split passes Fayyad and Irani's MDL acceptance criterion.
   *
   * <p>The split is accepted when the information gain exceeds
   * {@code (log2(numCutPoints) + delta) / numInstances}, where {@code delta} is derived from the
   * number of classes present in the full set and in each of the two partitions, weighted by the
   * respective entropies.
   *
   * @param priorCounts per-class counts of the whole subset before splitting
   * @param bestCounts per-class counts of the two partitions; row 0 is the left subset and row 1
   *     the right subset
   * @param numInstances size of the subset, used as the divisor of the acceptance threshold
   * @param numCutPoints number of candidate cut points, charged as {@code log2(numCutPoints)}
   * @return true if the split is acceptable
   */
  private boolean FayyadAndIranisMDL(
      double[] priorCounts, double[][] bestCounts, double numInstances, int numCutPoints) {

    final double[] leftCounts = bestCounts[0];
    final double[] rightCounts = bestCounts[1];

    // Information gain: entropy before the split minus entropy after it.
    final double priorEntropy = ContingencyTables.entropy(priorCounts);
    final double splitEntropy = ContingencyTables.entropyConditionedOnRows(bestCounts);
    final double gain = priorEntropy - splitEntropy;

    // Number of classes actually present in the full set and in each partition.
    final int numClassesTotal = countPositiveEntries(priorCounts);
    final int numClassesLeft = countPositiveEntries(leftCounts);
    final int numClassesRight = countPositiveEntries(rightCounts);

    // Entropy of each partition taken on its own.
    final double entropyLeft = ContingencyTables.entropy(leftCounts);
    final double entropyRight = ContingencyTables.entropy(rightCounts);

    // Terms of the MDL formula.
    final double classEntropyTerm =
        (numClassesTotal * priorEntropy)
            - (numClassesRight * entropyRight)
            - (numClassesLeft * entropyLeft);
    final double delta = Utils.log2(Math.pow(3, numClassesTotal) - 2) - classEntropyTerm;
    final double threshold = (Utils.log2(numCutPoints) + delta) / numInstances;

    return gain > threshold;
  }

  /** Returns how many entries of {@code values} are strictly greater than zero. */
  private static int countPositiveEntries(double[] values) {
    int positives = 0;
    for (double value : values) {
      if (value > 0) {
        positives++;
      }
    }
    return positives;
  }
Beispiel #2
0
  /**
   * Recursively selects cut points for the sorted range of instances {@code [first, lastPlusOne)}.
   *
   * <p>The boundary between two consecutive instances with different attribute values that gives
   * the lowest class entropy is located first. If it improves on the entropy of the unsplit range
   * and passes the configured MDL test, the ranges on either side of it are processed recursively
   * and the cut points of the left range, the boundary itself, and the cut points of the right
   * range are returned in that order.
   *
   * @param instances the instances; the range is expected to be sorted on the attribute
   * @param attIndex index of the attribute whose values define the cut points
   * @param first index of the first instance in the range
   * @param lastPlusOne index one past the last instance in the range
   * @return the selected cut points, or null if the range is not split
   */
  private double[] cutPointsForSubset(
      Instances instances, int attIndex, int first, int lastPlusOne) {

    // A range holding fewer than two instances cannot be split.
    if ((lastPlusOne - first) < 2) {
      return null;
    }

    final int numClasses = instances.numClasses();

    // Row 0 holds the class weights left of the candidate boundary, row 1 those right of it.
    // Initially everything is on the right.
    final double[][] counts = new double[2][numClasses];
    double numInstances = 0;
    for (int i = first; i < lastPlusOne; i++) {
      final double weight = instances.instance(i).weight();
      final int classIndex = (int) instances.instance(i).classValue();
      numInstances += weight;
      counts[1][classIndex] += weight;
    }

    // Snapshot of the class weights of the whole range and its entropy.
    final double[] priorCounts = counts[1].clone();
    final double priorEntropy = ContingencyTables.entropy(priorCounts);

    // Sweep the boundary from left to right, remembering the lowest-entropy position.
    final double[][] bestCounts = new double[2][numClasses];
    double bestEntropy = priorEntropy;
    double bestCutPoint = -1;
    int bestIndex = -1;
    int numCutPoints = 0;
    for (int i = first; i < (lastPlusOne - 1); i++) {
      final double weight = instances.instance(i).weight();
      final int classIndex = (int) instances.instance(i).classValue();
      counts[0][classIndex] += weight;
      counts[1][classIndex] -= weight;

      // Only a position where the attribute value increases is a candidate cut point.
      final double here = instances.instance(i).value(attIndex);
      final double next = instances.instance(i + 1).value(attIndex);
      if (!(here < next)) {
        continue;
      }

      final double candidateCutPoint = (here + next) / 2.0;
      final double candidateEntropy = ContingencyTables.entropyConditionedOnRows(counts);
      if (candidateEntropy < bestEntropy) {
        bestCutPoint = candidateCutPoint;
        bestEntropy = candidateEntropy;
        bestIndex = i;
        System.arraycopy(counts[0], 0, bestCounts[0], 0, numClasses);
        System.arraycopy(counts[1], 0, bestCounts[1], 0, numClasses);
      }
      numCutPoints++;
    }

    // The worse encoding charges for every position in the range, not only the candidates.
    if (!m_UseBetterEncoding) {
      numCutPoints = (lastPlusOne - first) - 1;
    }

    // No boundary lowered the entropy: nothing to gain from splitting.
    if ((priorEntropy - bestEntropy) <= 0) {
      return null;
    }

    // Apply whichever MDL test is configured.
    final boolean accepted =
        m_UseKononenko
            ? KononenkosMDL(priorCounts, bestCounts, numInstances, numCutPoints)
            : FayyadAndIranisMDL(priorCounts, bestCounts, numInstances, numCutPoints);
    if (!accepted) {
      return null;
    }

    // Recurse into both sides of the accepted boundary.
    final double[] left = cutPointsForSubset(instances, attIndex, first, bestIndex + 1);
    final double[] right = cutPointsForSubset(instances, attIndex, bestIndex + 1, lastPlusOne);

    // Merge as left ++ [bestCutPoint] ++ right, treating a null side as empty.
    final int leftLength = (left == null) ? 0 : left.length;
    final int rightLength = (right == null) ? 0 : right.length;
    final double[] cutPoints = new double[leftLength + 1 + rightLength];
    if (left != null) {
      System.arraycopy(left, 0, cutPoints, 0, leftLength);
    }
    cutPoints[leftLength] = bestCutPoint;
    if (right != null) {
      System.arraycopy(right, 0, cutPoints, leftLength + 1, rightLength);
    }
    return cutPoints;
  }
Beispiel #3
0
  /**
   * Computes the class distribution for the given test instance.
   *
   * <p>Every attribute that is neither the class attribute nor missing casts a vote over the
   * classes. The vote is normalized, optionally weighted by a power of its entropy, and added to
   * a running total, which is normalized before being returned.
   *
   * @param instance the instance to be classified
   * @return the class distribution for the instance; uniform if the summed votes are not positive
   * @throws Exception if a numeric value is not greater than or equal to any interval bound of
   *     its attribute
   */
  public double[] distributionForInstance(Instance instance) throws Exception {
    double[] dist = new double[m_NumClasses];
    double[] temp = new double[m_NumClasses];
    double weight = 1.0;

    for (int i = 0; i < instance.numAttributes(); i++) {
      if (i == m_ClassIndex || instance.isMissing(i)) {
        continue;
      }

      double val = instance.value(i);
      if (instance.attribute(i).isNumeric()) {
        boolean ok = false;
        // Scan the bounds from the highest down to find where the value falls.
        for (int k = m_intervalBounds[i].length - 1; k >= 0; k--) {
          if (val > m_intervalBounds[i][k]) {
            // Strictly above bound k: vote with the counts stored at index k.
            for (int j = 0; j < m_NumClasses; j++) {
              if (m_globalCounts[j] > 0) {
                temp[j] = ((m_counts[i][k][j] + TINY) / (m_globalCounts[j] + TINY));
              }
            }
            ok = true;
            break;
          } else if (val == m_intervalBounds[i][k]) {
            // Exactly on bound k: average the counts at k and k - 1. The lowest bound (k == 0)
            // has no entry below it, so only the counts at k are used there; indexing k - 1
            // would otherwise be out of bounds.
            for (int j = 0; j < m_NumClasses; j++) {
              if (m_globalCounts[j] > 0) {
                double boundaryCount =
                    (k > 0)
                        ? (m_counts[i][k][j] + m_counts[i][k - 1][j]) / 2.0
                        : m_counts[i][k][j];
                temp[j] = boundaryCount + TINY;
                temp[j] /= (m_globalCounts[j] + TINY);
              }
            }
            ok = true;
            break;
          }
        }
        if (!ok) {
          throw new Exception("This shouldn't happen");
        }
      } else { // nominal attribute: the value itself indexes the counts
        for (int j = 0; j < m_NumClasses; j++) {
          if (m_globalCounts[j] > 0) {
            temp[j] = ((m_counts[i][(int) val][j] + TINY) / (m_globalCounts[j] + TINY));
          }
        }
      }

      // Turn this attribute's vote into a distribution.
      normalizeOrMakeUniform(temp);

      if (m_weightByConfidence) {
        weight = weka.core.ContingencyTables.entropy(temp);
        weight = Math.pow(weight, m_bias);
        // The weight is never allowed to drop below 1.
        if (weight < 1.0) {
          weight = 1.0;
        }
      }

      for (int j = 0; j < m_NumClasses; j++) {
        dist[j] += (temp[j] * weight);
      }
    }

    normalizeOrMakeUniform(dist);
    return dist;
  }

  /**
   * Normalizes {@code values} in place so that they sum to one; if their sum is not positive,
   * every entry is set to {@code 1 / values.length} instead.
   */
  private static void normalizeOrMakeUniform(double[] values) {
    double sum = Utils.sum(values);
    if (sum <= 0) {
      for (int j = 0; j < values.length; j++) {
        values[j] = 1.0 / (double) values.length;
      }
    } else {
      Utils.normalize(values, sum);
    }
  }