Beispiel #1
0
  /**
   * Builds a subset of the data from the instances that carry the requested share of the total
   * weight.
   *
   * <p>Instances are visited in the order returned by {@code Utils.sort} on their weights, walking
   * from the last index to the first. Copies are added until the accumulated weight exceeds
   * {@code quantile} times the total weight. The walk does not stop between two instances of equal
   * weight, so a run of ties is always taken as a whole.
   *
   * @param data the input instances
   * @param quantile the fraction of the total weight mass to keep, e.g. 0.9 to keep 90%
   * @return a new set of instances holding copies of the selected instances
   */
  protected Instances selectWeightQuantile(Instances data, double quantile) {

    final int total = data.numInstances();

    // Collect the individual weights and their overall sum.
    final double[] instanceWeights = new double[total];
    double totalWeight = 0;
    for (int idx = 0; idx < total; idx++) {
      instanceWeights[idx] = data.instance(idx).weight();
      totalWeight += instanceWeights[idx];
    }

    final double massToSelect = totalWeight * quantile;
    final int[] order = Utils.sort(instanceWeights);

    // Walk the sorted order backwards, copying instances until enough mass is gathered.
    Instances selected = new Instances(data, total);
    double accumulated = 0;
    boolean enough = false;
    int rank = total - 1;
    while (rank >= 0 && !enough) {
      int current = order[rank];
      selected.add((Instance) data.instance(current).copy());
      accumulated += instanceWeights[current];
      // Only stop when the next candidate has a different weight, so ties are kept together.
      enough =
          (accumulated > massToSelect)
              && (rank > 0)
              && (instanceWeights[current] != instanceWeights[order[rank - 1]]);
      rank--;
    }

    if (m_Debug) {
      System.err.println("Selected " + selected.numInstances() + " out of " + total);
    }
    return selected;
  }
Beispiel #2
0
  /**
   * For every nominal attribute of the input format, computes the weighted mean class value of
   * each attribute value and stores, in {@code m_Indices}, the ordering of the attribute values
   * returned by {@code Utils.sort} on those means.
   *
   * <p>Instances with a missing class or a missing value for the attribute are ignored. Attribute
   * values that received no weight are given the overall weighted mean instead. Non-nominal
   * attributes keep an empty index array.
   */
  private void computeAverageClassValues() {

    final int numAtts = getInputFormat().numAttributes();
    double[][] classMeans = new double[numAtts][0];
    m_Indices = new int[numAtts][0];

    for (int attIdx = 0; attIdx < numAtts; attIdx++) {
      Attribute att = getInputFormat().attribute(attIdx);
      if (!att.isNominal()) {
        continue;
      }

      final int numValues = att.numValues();
      classMeans[attIdx] = new double[numValues];
      double[] weightPerValue = new double[numValues];

      // Accumulate weight and weighted class value per attribute value.
      final int numRows = getInputFormat().numInstances();
      for (int row = 0; row < numRows; row++) {
        Instance current = getInputFormat().instance(row);
        if (current.classIsMissing() || current.isMissing(attIdx)) {
          continue;
        }
        int valueIdx = (int) current.value(attIdx);
        weightPerValue[valueIdx] += current.weight();
        classMeans[attIdx][valueIdx] += current.weight() * current.classValue();
      }

      // Turn the sums into means; unseen values fall back to the overall mean.
      double weightedClassSum = Utils.sum(classMeans[attIdx]);
      double totalWeight = Utils.sum(weightPerValue);
      if (Utils.gr(totalWeight, 0)) {
        for (int v = 0; v < numValues; v++) {
          if (Utils.gr(weightPerValue[v], 0)) {
            classMeans[attIdx][v] /= weightPerValue[v];
          } else {
            classMeans[attIdx][v] = weightedClassSum / totalWeight;
          }
        }
      }

      m_Indices[attIdx] = Utils.sort(classMeans[attIdx]);
    }
  }
Beispiel #3
0
  /**
   * Picks the cut-off value stored in {@code m_Cutoff} by merging the sorted positive and negative
   * values and scoring each candidate split.
   *
   * <p>The score of a candidate is the number of negatives passed so far plus the number of
   * positives not yet passed. The candidate with the highest score wins; ties are broken in favour
   * of the candidate with the smallest absolute value. If every negative value is at most the
   * smallest positive value, the cut-off is the midpoint between the largest negative and the
   * smallest positive.
   *
   * @param pos the values of the positive examples (must not be empty)
   * @param neg the values of the negative examples (must not be empty)
   */
  private void findCutOff(double[] pos, double[] neg) {
    final int[] posOrder = Utils.sort(pos);
    final int[] negOrder = Utils.sort(neg);
    final int numPos = pos.length;
    final int numNeg = neg.length;

    int posIdx = 0;
    int negIdx = 0;
    double negativesPassed = 0.0;
    double positivesLeft = (double) numPos;
    double bestScore = 0;
    double bestDistTo0 = Double.MAX_VALUE;

    // Skip the leading negatives that do not exceed the smallest positive.
    while ((negIdx < numNeg) && (pos[posOrder[0]] >= neg[negOrder[negIdx]])) {
      negIdx++;
      negativesPassed++;
    }

    // Totally separate: no negative lies above the smallest positive.
    if (negIdx >= numNeg) {
      m_Cutoff = (neg[negOrder[numNeg - 1]] + pos[posOrder[0]]) / 2.0;
      return;
    }

    while ((posIdx < numPos) && (negIdx < numNeg)) {
      // Consume whichever list has the smaller next value; ties consume the negative.
      double candidate;
      if (pos[posOrder[posIdx]] >= neg[negOrder[negIdx]]) {
        negativesPassed += 1.0;
        candidate = neg[negOrder[negIdx]];
        negIdx++;
      } else {
        positivesLeft -= 1.0;
        candidate = pos[posOrder[posIdx]];
        posIdx++;
      }

      double score = negativesPassed + positivesLeft;
      boolean strictlyBetter = score > bestScore;
      boolean tieButCloserTo0 = (score == bestScore) && (Math.abs(candidate) < bestDistTo0);
      if (strictlyBetter || tieButCloserTo0) {
        bestScore = score;
        m_Cutoff = candidate;
        bestDistTo0 = Math.abs(candidate);
      }
    }
  }
Beispiel #4
0
  /**
   * Computes, for every numeric attribute, the median, the interquartile range and the thresholds
   * for outliers and extreme values.
   *
   * <p>The quartiles are the medians of the lower and upper halves of the sorted values. When the
   * number of values is odd, the overall median is excluded from both halves. With {@code n}
   * sorted values, {@code half = n / 2} and {@code quarter = half / 2}:
   *
   * <ul>
   *   <li>the median is element {@code half} for odd {@code n}, otherwise the mean of elements
   *       {@code half - 1} and {@code half};
   *   <li>for odd {@code half}, Q1 is element {@code quarter} and Q3 is element {@code n - quarter
   *       - 1};
   *   <li>for even {@code half}, Q1 is the mean of elements {@code quarter - 1} and {@code
   *       quarter}, and Q3 is the mean of elements {@code n - quarter - 1} and {@code n -
   *       quarter}.
   * </ul>
   *
   * <p>An attribute with a single value uses that value for all three quartiles. An attribute with
   * no values gets {@code NaN} statistics and thresholds.
   *
   * @param instances the data to work on
   */
  protected void computeThresholds(Instances instances) {
    final int numAtts = m_AttributeIndices.length;

    m_UpperExtremeValue = new double[numAtts];
    m_UpperOutlier = new double[numAtts];
    m_LowerOutlier = new double[numAtts];
    m_LowerExtremeValue = new double[numAtts];
    m_Median = new double[numAtts];
    m_IQR = new double[numAtts];

    for (int i = 0; i < numAtts; i++) {
      // non-numeric attribute?
      if (m_AttributeIndices[i] == NON_NUMERIC) {
        continue;
      }

      // sort attribute data
      double[] values = instances.attributeToDoubleArray(m_AttributeIndices[i]);
      int[] sortedIndices = Utils.sort(values);
      int n = sortedIndices.length;

      double q1;
      double q2;
      double q3;

      if (n == 0) {
        // nothing to measure: propagate NaN rather than indexing into an empty array
        q1 = Double.NaN;
        q2 = Double.NaN;
        q3 = Double.NaN;
      } else if (n == 1) {
        // a single value is its own median and both quartiles
        q1 = values[sortedIndices[0]];
        q2 = q1;
        q3 = q1;
      } else {
        int half = n / 2;
        int quarter = half / 2;

        if (n % 2 == 1) {
          q2 = values[sortedIndices[half]];
        } else {
          // even count: the two middle elements are half - 1 and half
          q2 = (values[sortedIndices[half - 1]] + values[sortedIndices[half]]) / 2;
        }

        if (half % 2 == 1) {
          q1 = values[sortedIndices[quarter]];
          q3 = values[sortedIndices[n - quarter - 1]];
        } else {
          // n >= 2 and half even implies half >= 2, so quarter >= 1 and quarter - 1 is valid;
          // the Q1 pair mirrors the Q3 pair around the centre of the sorted values
          q1 = (values[sortedIndices[quarter - 1]] + values[sortedIndices[quarter]]) / 2;
          q3 = (values[sortedIndices[n - quarter - 1]] + values[sortedIndices[n - quarter]]) / 2;
        }
      }

      // determine thresholds and other values
      m_Median[i] = q2;
      m_IQR[i] = q3 - q1;
      m_UpperExtremeValue[i] = q3 + getExtremeValuesFactor() * m_IQR[i];
      m_UpperOutlier[i] = q3 + getOutlierFactor() * m_IQR[i];
      m_LowerOutlier[i] = q1 - getOutlierFactor() * m_IQR[i];
      m_LowerExtremeValue[i] = q1 - getExtremeValuesFactor() * m_IQR[i];
    }
  }
  /**
   * Finds the curve point whose threshold value is closest to a target threshold.
   *
   * <p>The threshold is read from the last attribute of the curve data.
   *
   * @param tcurve a set of instances that have been generated by this class
   * @param threshold the target threshold
   * @return the index of the instance whose threshold is closest to the target, or -1 if the
   *     relation name does not match, there is no data, or the target is below 0 or above 1
   */
  public static int getThresholdInstance(Instances tcurve, double threshold) {

    // Reject data that was not produced by this class.
    if (!RELATION_NAME.equals(tcurve.relationName())) {
      return -1;
    }

    final int numPoints = tcurve.numInstances();
    if (numPoints == 0) {
      return -1;
    }
    if ((threshold < 0) || (threshold > 1.0)) {
      return -1;
    }

    // With a single point there is nothing to search.
    if (numPoints == 1) {
      return 0;
    }

    final int thresholdAtt = tcurve.numAttributes() - 1;
    double[] thresholds = tcurve.attributeToDoubleArray(thresholdAtt);
    int[] order = Utils.sort(thresholds);
    return binarySearch(order, thresholds, threshold);
  }
  /**
   * Computes the n-point precision: the precision averaged over n recall levels spaced evenly at
   * {@code 0, 1/(n-1), ..., 1}.
   *
   * <p>For each recall level the nearest curve point is located with {@code binarySearch}. Unless
   * that point is the first in recall order, the precision is linearly interpolated towards the
   * next point that has a different recall value, if one exists.
   *
   * <p>The spacing is computed as {@code 1.0 / (n - 1)}, so {@code n} should be at least 2.
   *
   * @param tcurve a previously extracted threshold curve Instances.
   * @param n the number of points to average over.
   * @return the n-point precision, or NaN if the relation name does not match or there is no data.
   */
  public static double getNPointPrecision(Instances tcurve, int n) {

    boolean producedHere = RELATION_NAME.equals(tcurve.relationName());
    if (!producedHere || (tcurve.numInstances() == 0)) {
      return Double.NaN;
    }

    final int recallInd = tcurve.attribute(RECALL_NAME).index();
    final int precisInd = tcurve.attribute(PRECISION_NAME).index();
    final double[] recallVals = tcurve.attributeToDoubleArray(recallInd);
    final int[] sorted = Utils.sort(recallVals);

    final double spacing = 1.0 / (n - 1);
    double precisionTotal = 0;

    for (int point = 0; point < n; point++) {
      final double targetRecall = point * spacing;

      int pos = binarySearch(sorted, recallVals, targetRecall);
      double recall = recallVals[sorted[pos]];
      double precis = tcurve.instance(sorted[pos]).value(precisInd);

      // Interpolate figures for non-endpoints: look ahead for the first point whose recall
      // differs and evaluate the connecting line at the target recall.
      if (pos != 0) {
        for (int next = pos + 1; next < sorted.length; next++) {
          double nextRecall = recallVals[sorted[next]];
          if (nextRecall != recall) {
            double nextPrecis = tcurve.instance(sorted[next]).value(precisInd);
            double slope = (nextPrecis - precis) / (nextRecall - recall);
            double offset = precis - recall * slope;
            precis = targetRecall * slope + offset;
            break;
          }
        }
      }

      precisionTotal += precis;
    }
    return precisionTotal / n;
  }
Beispiel #7
0
  /**
   * Produces the evaluated attributes ordered from highest merit to lowest.
   *
   * <p>Each row of the result holds the attribute index in column 0 and its merit in column 1.
   * When no explicit number of attributes was requested ({@code m_numToSelect <= 0}), the number
   * to select is also determined here: all attributes if no threshold is set, otherwise whatever
   * {@code determineNumToSelectFromThreshold} decides.
   *
   * @return an array of sorted (highest eval to lowest) attribute indexes and merits
   * @throws Exception if no search has been performed yet, or if more attributes were requested
   *     than exist in the data
   */
  public double[][] rankedAttributes() throws Exception {

    if (m_attributeList == null || m_attributeMerit == null) {
      throw new Exception(
          "Search must be performed before a ranked " + "attribute list can be obtained");
    }

    final int[] ascending = Utils.sort(m_attributeMerit);
    final int count = ascending.length;

    // Read the sort order back to front so that the best attribute comes first, translating
    // each position into the attribute index and merit it stands for.
    double[][] bestToWorst = new double[count][2];
    for (int row = 0; row < count; row++) {
      int source = ascending[count - 1 - row];
      bestToWorst[row][0] = m_attributeList[source];
      bestToWorst[row][1] = m_attributeMerit[source];
    }

    if (m_numToSelect > bestToWorst.length) {
      throw new Exception("More attributes requested than exist in the data");
    }

    if (m_numToSelect <= 0) {
      boolean noThreshold = (m_threshold == -Double.MAX_VALUE);
      if (noThreshold) {
        m_calculatedNumToSelect = bestToWorst.length;
      } else {
        determineNumToSelectFromThreshold(bestToWorst);
      }
    }

    return bestToWorst;
  }
Beispiel #8
0
  /**
   * Builds a polytree over the labels and returns the parents of every label.
   *
   * <p>Steps, in order: compute a pairwise dependence matrix (CD), build a tree skeleton from it,
   * orient edges via causalBasin, fall back to a directed tree (treeify) for labels that received
   * no parents, rank the labels to obtain chainOrder, and finally add extra parents among labels
   * that come earlier in chainOrder when their dependence exceeds a fixed threshold.
   *
   * <p>Side effects: overwrites the fields L, CD, numVisited, visited, flagCB and chainOrder.
   *
   * <p>Edge codes used in paPoly, as read by this method: paPoly[j][k] == 3 records k as a parent
   * of j, paPoly[j][k] == 2 records j as a parent of k, and paPoly[j][k] == 1 marks an edge that
   * triggers the treeify fallback (presumably a still-undirected edge; verify against
   * causalBasin).
   *
   * @param D the data set; used for the class index when non-null, and for the dependence matrix
   *     when newD is null or depMode is off
   * @param newD alternative data sets; newD[0] supplies the class index when D is null, and the
   *     array is used for the conditional dependence matrix when non-null and depMode is on
   * @return pa, where pa[j] lists the parent labels of label j
   * @throws Exception presumably propagated from the helper routines; verify against callers
   */
  protected int[][] polyTree(Instances D, Instances[] newD) throws Exception {
    // Number of labels is taken from the class index of whichever data set is available.
    L = (D == null) ? newD[0].classIndex() : D.classIndex();
    CD = new double[L][L];
    numVisited = 0;
    int root = 0;
    int[][] pa = new int[L][0];
    visited = new boolean[L];
    flagCB = new boolean[L];
    Arrays.fill(visited, false);
    Arrays.fill(flagCB, false);

    if (depMode) {
      // Calculate the conditional MI matrix
      if (newD == null) CD = conDepMatrix(D);
      if (newD != null) CD = conDepMatrix(newD);
    } else {
      // Calculate the marginal normalized MI matrix
      // NOTE(review): D is passed here even when it is null (i.e. only newD was supplied) -
      // confirm callers never combine depMode == false with D == null.
      CD = StatUtilsPro.NormMargDep(D);
    }

    // Build the tree skeleton
    int[][] paTree = skeleton(CD);

    // Find the causal basins
    int[][] paPoly = new int[L][L];
    causalBasin(root, paTree, paPoly);

    // If causal basin can't cover all labels, build a directed tree (paTemp)
    int[][] paTemp = new int[L][0];
    root = -1;
    // Only the upper triangle (k >= j) is scanned; the first edge coded 1 fixes the root.
    for (int j = 0; j < L; j++) {
      for (int k = j; k < L; k++) {
        if (paPoly[j][k] == 1) {
          root = j;
          Arrays.fill(visited, false);
          visited[root] = true;
          treeify(root, paPoly, paTemp);
          break;
        }
      }
      if (root != -1) break;
    }
    // Save the parents of every node in the polytree (pa)
    for (int j = 0; j < L; j++) {
      for (int k = j; k < L; k++) {
        if (paPoly[j][k] == 3) pa[j] = A.append(pa[j], k);
        if (paPoly[j][k] == 2) pa[k] = A.append(pa[k], j);
      }
    }
    // Labels still without parents take theirs from the fallback tree; the edge codes in paPoly
    // are updated in both directions to match.
    for (int j = 0; j < L; j++) {
      if (pa[j].length < 1) {
        for (int v : paTemp[j]) {
          pa[j] = A.append(pa[j], v);
          paPoly[j][v] = 3;
          paPoly[v][j] = 2;
        }
      }
    }

    // Rank the labels in the polytree (rank)
    root = 0;
    int[] rank = new int[L];
    Arrays.fill(rank, 0);
    Arrays.fill(visited, false);
    rankLabel(root, paPoly, rank);
    chainOrder = Utils.sort(rank);

    // Enhance the polytree
    // temp holds the labels already visited in chain order; each of them becomes an extra parent
    // of the current label j when it is not already recorded as one (code 3) and the dependence
    // value exceeds thCD. CD is read from its upper triangle only (smaller index first).
    int[] temp = new int[] {};
    double thCD = 0.0005;
    for (int j : chainOrder) {
      for (int k : temp) {
        if (paPoly[j][k] != 3) {
          if (j < k && CD[j][k] > thCD) pa[j] = A.append(pa[j], k);
          if (j > k && CD[k][j] > thCD) pa[j] = A.append(pa[j], k);
        }
      }
      temp = A.append(temp, j);
    }
    return pa;
  }
Beispiel #9
0
 /*
   Computes the ordering of the meanDistance array with Utils.sort and stores the
   returned index array in indexes. This method itself does not reorder any array;
   callers read the values in sorted order through indexes.
 */
 public void sort() {
   indexes = Utils.sort(meanDistance);
 }
  /**
   * Calculates the performance stats for the desired class and returns the results as a set of
   * Instances.
   *
   * <p>The predictions are visited in the order returned by {@code Utils.sort} on the predicted
   * probability of the class of interest. A data point is emitted for the first prediction and
   * each time the probability rises above the current threshold. If the last emitted point does
   * not already have all weight counted as negative predictions, a final point with a threshold
   * slightly above the largest probability is appended.
   *
   * <p>Predictions with a missing actual class or a negative weight are skipped and reported on
   * {@code System.err}.
   *
   * @param predictions the predictions to base the curve on
   * @param classIndex index of the class of interest.
   * @return datapoints as a set of instances, or null if there are no predictions or if {@code
   *     classIndex} is not a valid index into the distribution of the first prediction.
   */
  public Instances getCurve(FastVector predictions, int classIndex) {

    // An empty list has no first element to inspect, so return before touching element 0.
    if (predictions.size() == 0) {
      return null;
    }

    int distributionLength = ((NominalPrediction) predictions.elementAt(0)).distribution().length;
    if (distributionLength <= classIndex) {
      System.out.println(
          "Foooobared " + predictions.size() + " " + distributionLength + " " + classIndex);
      return null;
    }

    double totPos = 0, totNeg = 0;
    double[] probs = getProbabilities(predictions, classIndex);

    // Get distribution of positive/negatives
    for (int i = 0; i < probs.length; i++) {
      NominalPrediction pred = (NominalPrediction) predictions.elementAt(i);
      if (pred.actual() == Prediction.MISSING_VALUE) {
        System.err.println(getClass().getName() + " Skipping prediction with missing class value");
        continue;
      }
      if (pred.weight() < 0) {
        System.err.println(getClass().getName() + " Skipping prediction with negative weight");
        continue;
      }
      if (pred.actual() == classIndex) {
        totPos += pred.weight();
      } else {
        totNeg += pred.weight();
      }
    }

    Instances insts = makeHeader();
    int[] sorted = Utils.sort(probs);
    TwoClassStats tc = new TwoClassStats(totPos, totNeg, 0, 0);
    double threshold = 0;
    double cumulativePos = 0;
    double cumulativeNeg = 0;

    for (int i = 0; i < sorted.length; i++) {

      // A new threshold starts here: move the weight gathered since the previous threshold from
      // the positive-prediction counts to the negative-prediction counts, then emit a point.
      if ((i == 0) || (probs[sorted[i]] > threshold)) {
        tc.setTruePositive(tc.getTruePositive() - cumulativePos);
        tc.setFalseNegative(tc.getFalseNegative() + cumulativePos);
        tc.setFalsePositive(tc.getFalsePositive() - cumulativeNeg);
        tc.setTrueNegative(tc.getTrueNegative() + cumulativeNeg);
        threshold = probs[sorted[i]];
        insts.add(makeInstance(tc, threshold));
        cumulativePos = 0;
        cumulativeNeg = 0;
        if (i == sorted.length - 1) {
          break;
        }
      }

      NominalPrediction pred = (NominalPrediction) predictions.elementAt(sorted[i]);

      if (pred.actual() == Prediction.MISSING_VALUE) {
        System.err.println(getClass().getName() + " Skipping prediction with missing class value");
        continue;
      }
      if (pred.weight() < 0) {
        System.err.println(getClass().getName() + " Skipping prediction with negative weight");
        continue;
      }
      if (pred.actual() == classIndex) {
        cumulativePos += pred.weight();
      } else {
        cumulativeNeg += pred.weight();
      }
    }

    // make sure a zero point gets into the curve
    if (tc.getFalseNegative() != totPos || tc.getTrueNegative() != totNeg) {
      tc = new TwoClassStats(0, 0, totNeg, totPos);
      threshold = probs[sorted[sorted.length - 1]] + 10e-6;
      insts.add(makeInstance(tc, threshold));
    }

    return insts;
  }
Beispiel #11
0
  /**
   * Generates the classifier.
   *
   * <p>Steps, in order: work on a copy of the data without missing-class instances; for each
   * numeric attribute record the minimum and maximum value seen per class; sort and de-duplicate
   * those bounds into interval boundaries; then accumulate per-class weight counts for every
   * interval (numeric attributes) or attribute value (nominal attributes), plus global class
   * counts.
   *
   * <p>Fields written: TINY (only when m_weightByConfidence is off), m_ClassIndex, m_NumClasses,
   * m_globalCounts, m_maxEntrop, m_Instances, m_intervalBounds and m_counts.
   *
   * @param instances set of instances serving as training data
   * @throws Exception if the classifier has not been generated successfully
   */
  public void buildClassifier(Instances instances) throws Exception {

    // NOTE(review): TINY is overwritten here and never restored in this method - confirm that it
    // is not shared state that a later build with m_weightByConfidence == true relies on.
    if (!m_weightByConfidence) {
      TINY = 0.0;
    }

    // can classifier handle the data?
    getCapabilities().testWithFail(instances);

    // remove instances with missing class
    instances = new Instances(instances);
    instances.deleteWithMissingClass();

    m_ClassIndex = instances.classIndex();
    m_NumClasses = instances.numClasses();
    m_globalCounts = new double[m_NumClasses];
    // log base 2 of the number of classes
    m_maxEntrop = Math.log(m_NumClasses) / Math.log(2);

    m_Instances = new Instances(instances, 0); // Copy the structure for ref

    // Per attribute: slot 0 and the last slot are the -inf/+inf sentinels; in between, each class
    // c owns slot 2c+1 (running minimum, starts at +inf) and slot 2c+2 (running maximum, starts
    // at -inf).
    m_intervalBounds = new double[instances.numAttributes()][2 + (2 * m_NumClasses)];

    for (int j = 0; j < instances.numAttributes(); j++) {
      boolean alt = false;
      for (int i = 0; i < m_NumClasses * 2 + 2; i++) {
        if (i == 0) {
          m_intervalBounds[j][i] = Double.NEGATIVE_INFINITY;
        } else if (i == m_NumClasses * 2 + 1) {
          m_intervalBounds[j][i] = Double.POSITIVE_INFINITY;
        } else {
          // alternate +inf (odd slots) and -inf (even slots)
          if (alt) {
            m_intervalBounds[j][i] = Double.NEGATIVE_INFINITY;
            alt = false;
          } else {
            m_intervalBounds[j][i] = Double.POSITIVE_INFINITY;
            alt = true;
          }
        }
      }
    }

    // find upper and lower bounds for numeric attributes
    for (int j = 0; j < instances.numAttributes(); j++) {
      if (j != m_ClassIndex && instances.attribute(j).isNumeric()) {
        for (int i = 0; i < instances.numInstances(); i++) {
          Instance inst = instances.instance(i);
          if (!inst.isMissing(j)) {
            // lower bound of this instance's class
            if (inst.value(j) < m_intervalBounds[j][((int) inst.classValue() * 2 + 1)]) {
              m_intervalBounds[j][((int) inst.classValue() * 2 + 1)] = inst.value(j);
            }
            // upper bound of this instance's class
            if (inst.value(j) > m_intervalBounds[j][((int) inst.classValue() * 2 + 2)]) {
              m_intervalBounds[j][((int) inst.classValue() * 2 + 2)] = inst.value(j);
            }
          }
        }
      }
    }

    m_counts = new double[instances.numAttributes()][][];

    // sort intervals
    for (int i = 0; i < instances.numAttributes(); i++) {
      if (instances.attribute(i).isNumeric()) {
        int[] sortedIntervals = Utils.sort(m_intervalBounds[i]);
        // remove any duplicate bounds
        // first pass: count the distinct bound values
        int count = 1;
        for (int j = 1; j < sortedIntervals.length; j++) {
          if (m_intervalBounds[i][sortedIntervals[j]]
              != m_intervalBounds[i][sortedIntervals[j - 1]]) {
            count++;
          }
        }
        // second pass: copy the distinct bounds, in sorted order, into a right-sized array
        double[] reordered = new double[count];
        count = 1;
        reordered[0] = m_intervalBounds[i][sortedIntervals[0]];
        for (int j = 1; j < sortedIntervals.length; j++) {
          if (m_intervalBounds[i][sortedIntervals[j]]
              != m_intervalBounds[i][sortedIntervals[j - 1]]) {
            reordered[count] = m_intervalBounds[i][sortedIntervals[j]];
            count++;
          }
        }
        // from here on m_intervalBounds[i] holds sorted, distinct boundaries and no longer the
        // per-class min/max layout
        m_intervalBounds[i] = reordered;
        m_counts[i] = new double[count][m_NumClasses];
      } else if (i != m_ClassIndex) { // nominal attribute
        m_counts[i] = new double[instances.attribute(i).numValues()][m_NumClasses];
      }
    }

    // collect class counts
    for (int i = 0; i < instances.numInstances(); i++) {
      Instance inst = instances.instance(i);
      m_globalCounts[(int) instances.instance(i).classValue()] += inst.weight();
      for (int j = 0; j < instances.numAttributes(); j++) {
        if (!inst.isMissing(j) && j != m_ClassIndex) {
          if (instances.attribute(j).isNumeric()) {
            double val = inst.value(j);

            // scan the boundaries from the highest down: a value strictly above boundary k is
            // counted in slot k; a value exactly on boundary k is split evenly between slot k
            // and slot k - 1
            // NOTE(review): the k - 1 access assumes a value never equals boundary 0 (which is
            // -infinity) - confirm attribute values cannot be -infinity.
            int k;
            for (k = m_intervalBounds[j].length - 1; k >= 0; k--) {
              if (val > m_intervalBounds[j][k]) {
                m_counts[j][k][(int) inst.classValue()] += inst.weight();
                break;
              } else if (val == m_intervalBounds[j][k]) {
                m_counts[j][k][(int) inst.classValue()] += (inst.weight() / 2.0);
                m_counts[j][k - 1][(int) inst.classValue()] += (inst.weight() / 2.0);
                ;
                break;
              }
            }

          } else {
            // nominal attribute
            m_counts[j][(int) inst.value(j)][(int) inst.classValue()] += inst.weight();
            ;
          }
        }
      }
    }
  }
  /**
   * Estimates the class probability distribution for one region by sampling.
   *
   * <p>For each of {@code m_numOfSamplesPerRegion} samples, a random location is drawn with
   * {@code getRandomX(j)} and {@code getRandomY(m_panelHeight - i - 1)}. The data generator is
   * asked for its weights at that location, and the indices with the largest weights are kept
   * until they hold more than 99% of the weight mass. Instances are then generated from those
   * indices {@code m_numOfSamplesPerGenerator} times, classified, and their weighted class
   * distributions are accumulated. The accumulated totals are normalized before being returned.
   *
   * @param j the first region coordinate, passed to {@code getRandomX}
   * @param i the second region coordinate, flipped against the panel height and passed to
   *     {@code getRandomY}
   * @return a fresh array with the normalized class probabilities for the region
   * @throws Exception if classification of a generated instance fails
   */
  private double[] calculateRegionProbs(int j, int i) throws Exception {
    double[] regionTotals = new double[m_trainingData.classAttribute().numValues()];

    for (int sample = 0; sample < m_numOfSamplesPerRegion; sample++) {

      double[] locationTotals = new double[m_trainingData.classAttribute().numValues()];

      // Draw a random location inside the region and hand it to the generator.
      m_weightingAttsValues[m_xAttribute] = getRandomX(j);
      m_weightingAttsValues[m_yAttribute] = getRandomY(m_panelHeight - i - 1);
      m_dataGenerator.setWeightingValues(m_weightingAttsValues);

      final double[] weights = m_dataGenerator.getWeights();
      final double totalWeight = Utils.sum(weights);
      final int[] sortedOrder = Utils.sort(weights);

      // Prune 1% of weight mass: count how many entries, taken from the end of the sort order,
      // are needed to exceed 99% of the total weight.
      final double criticalMass = 0.99 * totalWeight;
      double massSoFar = 0;
      int kept = 0;
      for (int z = weights.length - 1; z >= 0; z--) {
        massSoFar += weights[sortedOrder[z]];
        kept++;
        if (massSoFar > criticalMass) {
          break;
        }
      }
      // Keep exactly that tail of the sort order, preserving its order.
      int[] indices = new int[kept];
      System.arraycopy(sortedOrder, weights.length - kept, indices, 0, kept);

      for (int round = 0; round < m_numOfSamplesPerGenerator; round++) {

        m_dataGenerator.setWeightingValues(m_weightingAttsValues);
        double[][] generated = m_dataGenerator.generateInstances(indices);

        for (int q = 0; q < generated.length; q++) {
          if (generated[q] == null) {
            continue;
          }
          // Load the generated values, then pin the two plotted attributes to the location.
          System.arraycopy(generated[q], 0, m_vals, 0, m_vals.length);
          m_vals[m_xAttribute] = m_weightingAttsValues[m_xAttribute];
          m_vals[m_yAttribute] = m_weightingAttsValues[m_yAttribute];

          // classify the instance
          m_dist = m_classifier.distributionForInstance(m_predInst);

          for (int cls = 0; cls < locationTotals.length; cls++) {
            locationTotals[cls] += (m_dist[cls] * weights[q]);
          }
        }
      }

      for (int cls = 0; cls < regionTotals.length; cls++) {
        regionTotals[cls] += (locationTotals[cls] * totalWeight);
      }
    }

    // average
    Utils.normalize(regionTotals);

    // hand back a copy
    return regionTotals.clone();
  }