Example #1
  /**
   * Test using Fayyad and Irani's MDL criterion.
   *
   * @param priorCounts the class counts for the whole set
   * @param bestCounts the class counts for the two subsets of the best split
   * @param numInstances the (weighted) number of instances in the set
   * @param numCutPoints the number of candidate cut points
   * @return true if the split is acceptable
   */
  private boolean FayyadAndIranisMDL(
      double[] priorCounts, double[][] bestCounts, double numInstances, int numCutPoints) {

    double priorEntropy, entropy, gain;
    double entropyLeft, entropyRight, delta;
    int numClassesTotal, numClassesRight, numClassesLeft;

    // Compute entropy before split.
    priorEntropy = ContingencyTables.entropy(priorCounts);

    // Compute entropy after split.
    entropy = ContingencyTables.entropyConditionedOnRows(bestCounts);

    // Compute information gain.
    gain = priorEntropy - entropy;

    // Number of classes occurring in the set
    numClassesTotal = 0;
    for (double priorCount : priorCounts) {
      if (priorCount > 0) {
        numClassesTotal++;
      }
    }

    // Number of classes occurring in the left subset
    numClassesLeft = 0;
    for (int i = 0; i < bestCounts[0].length; i++) {
      if (bestCounts[0][i] > 0) {
        numClassesLeft++;
      }
    }

    // Number of classes occurring in the right subset
    numClassesRight = 0;
    for (int i = 0; i < bestCounts[1].length; i++) {
      if (bestCounts[1][i] > 0) {
        numClassesRight++;
      }
    }

    // Entropy of the left and the right subsets
    entropyLeft = ContingencyTables.entropy(bestCounts[0]);
    entropyRight = ContingencyTables.entropy(bestCounts[1]);

    // Compute terms for MDL formula
    delta =
        Utils.log2(Math.pow(3, numClassesTotal) - 2)
            - ((numClassesTotal * priorEntropy)
                - (numClassesRight * entropyRight)
                - (numClassesLeft * entropyLeft));

    // Check if split is to be accepted
    return (gain > (Utils.log2(numCutPoints) + delta) / numInstances);
  }
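For a concrete sense of the acceptance test, the following self-contained sketch recomputes the same criterion on a toy two-class split. Everything here is illustrative: the class name MdlSketch, the local entropy/log2 helpers (standing in for ContingencyTables.entropy and Utils.log2), and the made-up counts are not part of the original code.

import java.util.Arrays;

/** Illustrative re-implementation of the MDL acceptance test above (hypothetical class). */
public class MdlSketch {

  static double log2(double x) {
    return Math.log(x) / Math.log(2);
  }

  /** Shannon entropy (in bits) of a vector of class counts. */
  static double entropy(double[] counts) {
    double total = Arrays.stream(counts).sum();
    double e = 0;
    for (double c : counts) {
      if (c > 0) {
        e -= (c / total) * log2(c / total);
      }
    }
    return e;
  }

  public static void main(String[] args) {
    // Toy data: 16 instances, 8 per class; the candidate split separates them almost perfectly.
    double[] priorCounts = {8, 8};
    double[][] bestCounts = {{7, 1}, {1, 7}}; // class counts left and right of the cut point
    double numInstances = 16;
    int numCutPoints = 15; // N - 1, as used when m_UseBetterEncoding is false

    double priorEntropy = entropy(priorCounts);
    double entropyLeft = entropy(bestCounts[0]);
    double entropyRight = entropy(bestCounts[1]);

    // Weighted entropy after the split (what entropyConditionedOnRows computes).
    double nLeft = Arrays.stream(bestCounts[0]).sum();
    double nRight = Arrays.stream(bestCounts[1]).sum();
    double gain = priorEntropy - (nLeft * entropyLeft + nRight * entropyRight) / numInstances;

    // Classes present in the full, left and right sets (all 2 in this toy example).
    int numClassesTotal = 2, numClassesLeft = 2, numClassesRight = 2;
    double delta =
        log2(Math.pow(3, numClassesTotal) - 2)
            - ((numClassesTotal * priorEntropy)
                - (numClassesRight * entropyRight)
                - (numClassesLeft * entropyLeft));

    boolean accepted = gain > (log2(numCutPoints) + delta) / numInstances;
    System.out.println("gain = " + gain + ", accepted = " + accepted); // accepted = true
  }
}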
Example #2
  /**
   * Selects cutpoints for sorted subset.
   *
   * @param instances the instances, sorted on the attribute in question
   * @param attIndex the index of the attribute to discretize
   * @param first the index of the first instance in the subset
   * @param lastPlusOne the index one past the last instance in the subset
   * @return the selected cut points, or null if no acceptable split exists
   */
  private double[] cutPointsForSubset(
      Instances instances, int attIndex, int first, int lastPlusOne) {

    double[][] counts, bestCounts;
    double[] priorCounts, left, right, cutPoints;
    double currentCutPoint = -Double.MAX_VALUE,
        bestCutPoint = -1,
        currentEntropy,
        bestEntropy,
        priorEntropy,
        gain;
    int bestIndex = -1, numCutPoints = 0;
    double numInstances = 0;

    // A subset with fewer than two instances cannot be split
    if ((lastPlusOne - first) < 2) {
      return null;
    }

    // Compute class counts.
    counts = new double[2][instances.numClasses()];
    for (int i = first; i < lastPlusOne; i++) {
      numInstances += instances.instance(i).weight();
      counts[1][(int) instances.instance(i).classValue()] += instances.instance(i).weight();
    }

    // Save prior counts
    priorCounts = new double[instances.numClasses()];
    System.arraycopy(counts[1], 0, priorCounts, 0, instances.numClasses());

    // Entropy of the full set
    priorEntropy = ContingencyTables.entropy(priorCounts);
    bestEntropy = priorEntropy;

    // Find best entropy.
    bestCounts = new double[2][instances.numClasses()];
    for (int i = first; i < (lastPlusOne - 1); i++) {
      counts[0][(int) instances.instance(i).classValue()] += instances.instance(i).weight();
      counts[1][(int) instances.instance(i).classValue()] -= instances.instance(i).weight();
      if (instances.instance(i).value(attIndex) < instances.instance(i + 1).value(attIndex)) {
        currentCutPoint =
            (instances.instance(i).value(attIndex) + instances.instance(i + 1).value(attIndex))
                / 2.0;
        currentEntropy = ContingencyTables.entropyConditionedOnRows(counts);
        if (currentEntropy < bestEntropy) {
          bestCutPoint = currentCutPoint;
          bestEntropy = currentEntropy;
          bestIndex = i;
          System.arraycopy(counts[0], 0, bestCounts[0], 0, instances.numClasses());
          System.arraycopy(counts[1], 0, bestCounts[1], 0, instances.numClasses());
        }
        numCutPoints++;
      }
    }

    // Use worse encoding?
    if (!m_UseBetterEncoding) {
      numCutPoints = (lastPlusOne - first) - 1;
    }

    // Check whether the best split yields any information gain
    gain = priorEntropy - bestEntropy;
    if (gain <= 0) {
      return null;
    }

    // Check if split is to be accepted
    if ((m_UseKononenko && KononenkosMDL(priorCounts, bestCounts, numInstances, numCutPoints))
        || (!m_UseKononenko
            && FayyadAndIranisMDL(priorCounts, bestCounts, numInstances, numCutPoints))) {

      // Select split points for the left and right subsets
      left = cutPointsForSubset(instances, attIndex, first, bestIndex + 1);
      right = cutPointsForSubset(instances, attIndex, bestIndex + 1, lastPlusOne);

      // Merge cutpoints and return them
      if ((left == null) && (right == null)) {
        cutPoints = new double[1];
        cutPoints[0] = bestCutPoint;
      } else if (right == null) {
        cutPoints = new double[left.length + 1];
        System.arraycopy(left, 0, cutPoints, 0, left.length);
        cutPoints[left.length] = bestCutPoint;
      } else if (left == null) {
        cutPoints = new double[1 + right.length];
        cutPoints[0] = bestCutPoint;
        System.arraycopy(right, 0, cutPoints, 1, right.length);
      } else {
        cutPoints = new double[left.length + right.length + 1];
        System.arraycopy(left, 0, cutPoints, 0, left.length);
        cutPoints[left.length] = bestCutPoint;
        System.arraycopy(right, 0, cutPoints, left.length + 1, right.length);
      }

      return cutPoints;
    } else {
      return null;
    }
  }
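The two methods above appear to come from Weka's supervised Discretize filter; the m_UseKononenko and m_UseBetterEncoding fields match its options. Under that assumption, a minimal usage sketch could look like the following. The dataset path is a placeholder, and the setter names setUseKononenko and setUseBetterEncoding are inferred from Weka's bean-property convention, so treat them as assumptions.

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.supervised.attribute.Discretize;

public class DiscretizeSketch {
  public static void main(String[] args) throws Exception {
    // Any dataset with numeric attributes and a nominal class works; "iris.arff" is a placeholder.
    Instances data = DataSource.read("iris.arff");
    data.setClassIndex(data.numAttributes() - 1);

    Discretize filter = new Discretize();
    filter.setUseKononenko(false);      // use Fayyad & Irani's MDL test (Example #1)
    filter.setUseBetterEncoding(false); // numCutPoints falls back to N - 1
    filter.setInputFormat(data);

    Instances discretized = Filter.useFilter(data, filter);
    // Each numeric attribute is now nominal, with one bin per selected interval.
    System.out.println(discretized.attribute(0));
  }
}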
Example #3
  /**
   * Computes the class membership probabilities for the given test instance.
   *
   * @param instance the instance to be classified
   * @return the predicted class membership probabilities for the instance
   * @throws Exception if the instance can't be classified
   */
  public double[] distributionForInstance(Instance instance) throws Exception {
    double[] dist = new double[m_NumClasses];
    double[] temp = new double[m_NumClasses];
    double weight = 1.0;

    for (int i = 0; i < instance.numAttributes(); i++) {
      if (i != m_ClassIndex && !instance.isMissing(i)) {
        double val = instance.value(i);
        boolean ok = false;
        if (instance.attribute(i).isNumeric()) {
          int k;
          for (k = m_intervalBounds[i].length - 1; k >= 0; k--) {
            if (val > m_intervalBounds[i][k]) {
              for (int j = 0; j < m_NumClasses; j++) {
                if (m_globalCounts[j] > 0) {
                  temp[j] = ((m_counts[i][k][j] + TINY) / (m_globalCounts[j] + TINY));
                }
              }
              ok = true;
              break;
            } else if (val == m_intervalBounds[i][k]) {
              for (int j = 0; j < m_NumClasses; j++) {
                if (m_globalCounts[j] > 0) {
                  temp[j] = ((m_counts[i][k][j] + m_counts[i][k - 1][j]) / 2.0) + TINY;
                  temp[j] /= (m_globalCounts[j] + TINY);
                }
              }
              ok = true;
              break;
            }
          }
          if (!ok) {
            throw new Exception("This shouldn't happen");
          }
        } else { // nominal attribute
          ok = true;
          for (int j = 0; j < m_NumClasses; j++) {
            if (m_globalCounts[j] > 0) {
              temp[j] = ((m_counts[i][(int) val][j] + TINY) / (m_globalCounts[j] + TINY));
            }
          }
        }

        double sum = Utils.sum(temp);
        if (sum <= 0) {
          for (int j = 0; j < temp.length; j++) {
            temp[j] = 1.0 / (double) temp.length;
          }
        } else {
          Utils.normalize(temp, sum);
        }

        if (m_weightByConfidence) {
          weight = weka.core.ContingencyTables.entropy(temp);
          weight = Math.pow(weight, m_bias);
          if (weight < 1.0) {
            weight = 1.0;
          }
        }

        for (int j = 0; j < m_NumClasses; j++) {
          dist[j] += (temp[j] * weight);
        }
      }
    }

    double sum = Utils.sum(dist);
    if (sum <= 0) {
      for (int j = 0; j < dist.length; j++) {
        dist[j] = 1.0 / (double) dist.length;
      }
      return dist;
    } else {
      Utils.normalize(dist, sum);
      return dist;
    }
  }
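The field names here (m_intervalBounds, m_globalCounts, m_weightByConfidence, TINY) suggest the method belongs to Weka's VFI (voting feature intervals) classifier. Assuming that, a minimal sketch of exercising it through the public classifier API follows; the import path weka.classifiers.misc.VFI and the dataset path are assumptions.

import weka.classifiers.misc.VFI;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class VfiSketch {
  public static void main(String[] args) throws Exception {
    Instances data = DataSource.read("iris.arff"); // placeholder path
    data.setClassIndex(data.numAttributes() - 1);

    VFI vfi = new VFI();
    vfi.buildClassifier(data);

    // Class membership distribution for the first training instance.
    double[] dist = vfi.distributionForInstance(data.instance(0));
    for (int j = 0; j < dist.length; j++) {
      System.out.printf("%s: %.3f%n", data.classAttribute().value(j), dist[j]);
    }
  }
}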
  /**
   * Evaluates an individual attribute by measuring the gain ratio of the class given the attribute.
   *
   * @param attribute the index of the attribute to be evaluated
   * @return the gain ratio
   * @throws Exception if the attribute could not be evaluated
   */
  public double evaluateAttribute(int attribute) throws Exception {
    int i, j, ii, jj;
    int ni, nj;
    double sum = 0.0;
    ni = m_trainInstances.attribute(attribute).numValues() + 1;
    nj = m_numClasses + 1;
    double[] sumi, sumj;
    Instance inst;
    double temp = 0.0;
    sumi = new double[ni];
    sumj = new double[nj];
    double[][] counts = new double[ni][nj];

    for (i = 0; i < ni; i++) {
      sumi[i] = 0.0;

      for (j = 0; j < nj; j++) {
        sumj[j] = 0.0;
        counts[i][j] = 0.0;
      }
    }

    // Fill the contingency table
    for (i = 0; i < m_numInstances; i++) {
      inst = m_trainInstances.instance(i);

      if (inst.isMissing(attribute)) {
        ii = ni - 1;
      } else {
        ii = (int) inst.value(attribute);
      }

      if (inst.isMissing(m_classIndex)) {
        jj = nj - 1;
      } else {
        jj = (int) inst.value(m_classIndex);
      }

      counts[ii][jj]++;
    }

    // get the row totals
    for (i = 0; i < ni; i++) {
      sumi[i] = 0.0;

      for (j = 0; j < nj; j++) {
        sumi[i] += counts[i][j];
        sum += counts[i][j];
      }
    }

    // get the column totals
    for (j = 0; j < nj; j++) {
      sumj[j] = 0.0;

      for (i = 0; i < ni; i++) {
        sumj[j] += counts[i][j];
      }
    }

    // distribute missing counts
    if (m_missing_merge && (sumi[ni - 1] < m_numInstances) && (sumj[nj - 1] < m_numInstances)) {
      double[] i_copy = new double[sumi.length];
      double[] j_copy = new double[sumj.length];
      double[][] counts_copy = new double[sumi.length][sumj.length];

      for (i = 0; i < ni; i++) {
        System.arraycopy(counts[i], 0, counts_copy[i], 0, sumj.length);
      }

      System.arraycopy(sumi, 0, i_copy, 0, sumi.length);
      System.arraycopy(sumj, 0, j_copy, 0, sumj.length);
      double total_missing = (sumi[ni - 1] + sumj[nj - 1] - counts[ni - 1][nj - 1]);

      // do the missing i's
      if (sumi[ni - 1] > 0.0) {
        for (j = 0; j < nj - 1; j++) {
          if (counts[ni - 1][j] > 0.0) {
            for (i = 0; i < ni - 1; i++) {
              temp = ((i_copy[i] / (sum - i_copy[ni - 1])) * counts[ni - 1][j]);
              counts[i][j] += temp;
              sumi[i] += temp;
            }

            counts[ni - 1][j] = 0.0;
          }
        }
      }

      sumi[ni - 1] = 0.0;

      // do the missing j's
      if (sumj[nj - 1] > 0.0) {
        for (i = 0; i < ni - 1; i++) {
          if (counts[i][nj - 1] > 0.0) {
            for (j = 0; j < nj - 1; j++) {
              temp = ((j_copy[j] / (sum - j_copy[nj - 1])) * counts[i][nj - 1]);
              counts[i][j] += temp;
              sumj[j] += temp;
            }

            counts[i][nj - 1] = 0.0;
          }
        }
      }

      sumj[nj - 1] = 0.0;

      // do the both missing
      if (counts[ni - 1][nj - 1] > 0.0 && total_missing != sum) {
        for (i = 0; i < ni - 1; i++) {
          for (j = 0; j < nj - 1; j++) {
            temp = (counts_copy[i][j] / (sum - total_missing)) * counts_copy[ni - 1][nj - 1];
            counts[i][j] += temp;
            sumi[i] += temp;
            sumj[j] += temp;
          }
        }

        counts[ni - 1][nj - 1] = 0.0;
      }
    }

    return ContingencyTables.gainRatio(counts);
  }
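The method fills a class-by-attribute-value contingency table (with an extra row and column for missing values), optionally redistributes the missing-value counts in proportion to the observed ones, and passes the table to ContingencyTables.gainRatio. Assuming the surrounding class is Weka's GainRatioAttributeEval, a usage sketch on a nominal dataset might look as follows; the dataset path is a placeholder.

import weka.attributeSelection.GainRatioAttributeEval;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class GainRatioSketch {
  public static void main(String[] args) throws Exception {
    Instances data = DataSource.read("weather.nominal.arff"); // placeholder path
    data.setClassIndex(data.numAttributes() - 1);

    GainRatioAttributeEval eval = new GainRatioAttributeEval();
    eval.buildEvaluator(data); // prepares m_trainInstances, m_classIndex, m_numInstances, ...

    // Gain ratio of the class given each non-class attribute.
    for (int a = 0; a < data.numAttributes() - 1; a++) {
      System.out.printf("%s: %.4f%n", data.attribute(a).name(), eval.evaluateAttribute(a));
    }
  }
}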