Example #1
0
  /**
   * Computes a clustering accuracy: every class is mapped to the cluster that contains most of its
   * instances, and the result is the fraction of instances whose cluster is the one mapped to
   * their class.
   *
   * <p>The class label of each instance is read from the last attribute of {@code odata} and
   * truncated to {@code int}. Labels and cluster ids are used directly as array indices, so both
   * must be non-negative. The class-by-cluster count table is printed to standard output, one row
   * per class.
   *
   * @param odata the data set whose last attribute holds the class label
   * @param clusters the cluster id of each instance, in instance order
   * @return the fraction of instances that fall into the dominant cluster of their class, or
   *     {@code Double.NaN} if there are no instances
   * @throws IllegalArgumentException if {@code clusters} does not have one entry per instance
   */
  public static double CA(Instances odata, int[] clusters) {
    double[] classValues = odata.attributeToDoubleArray(odata.numAttributes() - 1);
    int[] oclass = new int[odata.numInstances()];
    for (int i = 0; i < classValues.length; ++i) {
      oclass[i] = (int) classValues[i];
    }

    if (clusters.length != oclass.length) {
      throw new IllegalArgumentException(
          "Expected one cluster id per instance: "
              + oclass.length
              + " instances but "
              + clusters.length
              + " cluster ids");
    }
    if (oclass.length == 0) {
      return Double.NaN;
    }

    // The table is sized by the largest class label and the largest cluster id.
    int maxClass = 0;
    int maxCluster = 0;
    for (int i = 0; i < oclass.length; ++i) {
      maxClass = Math.max(maxClass, oclass[i]);
      maxCluster = Math.max(maxCluster, clusters[i]);
    }
    int[][] counts = new int[maxClass + 1][maxCluster + 1];
    for (int i = 0; i < clusters.length; ++i) {
      counts[oclass[i]][clusters[i]]++;
    }
    for (int i = 0; i < counts.length; ++i) {
      System.out.println(Arrays.toString(counts[i]));
    }

    // For each class, pick the cluster with the highest count (first one wins on ties).
    // The result is kept in its own array so the count table is not overwritten.
    int[] dominantCluster = new int[counts.length];
    for (int i = 0; i < counts.length; ++i) {
      int best = 0;
      for (int j = 1; j < counts[i].length; ++j) {
        if (counts[i][j] > counts[i][best]) {
          best = j;
        }
      }
      dominantCluster[i] = best;
    }

    int matches = 0;
    for (int i = 0; i < oclass.length; ++i) {
      if (dominantCluster[oclass[i]] == clusters[i]) {
        matches++;
      }
    }

    return (double) matches / (double) oclass.length;
  }
  /**
   * Computes the area under the ROC curve as the Wilcoxon-Mann-Whitney statistic.
   *
   * <p>The true-positive and false-positive columns are walked row by row; each row contributes
   * the drop in true positives multiplied by the negatives accumulated so far plus half of the
   * drop in false positives. The sum is normalised by the product of the first row's
   * false-positive and true-positive values.
   *
   * @param tcurve a previously extracted threshold curve Instances.
   * @return the ROC area, or Double.NaN if {@code tcurve} is empty or its relation name is not
   *     the expected ThresholdCurve relation name.
   */
  public static double getROCArea(Instances tcurve) {
    final int rows = tcurve.numInstances();
    final boolean usable = RELATION_NAME.equals(tcurve.relationName()) && (rows != 0);
    if (!usable) {
      return Double.NaN;
    }

    final double[] truePos =
        tcurve.attributeToDoubleArray(tcurve.attribute(TRUE_POS_NAME).index());
    final double[] falsePos =
        tcurve.attributeToDoubleArray(tcurve.attribute(FALSE_POS_NAME).index());

    // The first row carries the totals used for normalisation.
    final double allPositives = truePos[0];
    final double allNegatives = falsePos[0];

    final int last = rows - 1;
    double sum = 0.0;
    double negativesSoFar = 0.0;
    for (int row = 0; row <= last; row++) {
      // Every row but the last contributes the difference to its successor;
      // the last row contributes its own remaining counts.
      final double posStep = (row < last) ? truePos[row] - truePos[row + 1] : truePos[last];
      final double negStep = (row < last) ? falsePos[row] - falsePos[row + 1] : falsePos[last];

      sum += posStep * (negativesSoFar + (0.5 * negStep));
      negativesSoFar += negStep;
    }

    return sum / (allNegatives * allPositives);
  }
  /**
   * Computes the area under the precision-recall curve (AUPRC).
   *
   * <p>Rows are visited from the second-to-last to the first; each one adds its precision
   * multiplied by the change in recall relative to the previously visited row.
   *
   * @param tcurve a previously extracted threshold curve Instances.
   * @return the PRC area; Double.NaN if {@code tcurve} is empty or its relation name is not the
   *     expected ThresholdCurve relation name; the value of {@code Utils.missingValue()} if the
   *     computed area is zero.
   */
  public static double getPRCArea(Instances tcurve) {
    final int rows = tcurve.numInstances();
    final boolean usable = RELATION_NAME.equals(tcurve.relationName()) && (rows != 0);
    if (!usable) {
      return Double.NaN;
    }

    final double[] precision =
        tcurve.attributeToDoubleArray(tcurve.attribute(PRECISION_NAME).index());
    final double[] recall = tcurve.attributeToDoubleArray(tcurve.attribute(RECALL_NAME).index());

    // The last row only seeds the recall baseline (it is treated as the artificial zero point);
    // accumulation starts with the row before it.
    double previousRecall = recall[rows - 1];
    double sum = 0;
    int row = rows - 2;
    while (row >= 0) {
      sum += precision[row] * (recall[row] - previousRecall);
      previousRecall = recall[row];
      row--;
    }

    return (sum == 0) ? Utils.missingValue() : sum;
  }
  /**
   * Computes the thresholds for outliers and extreme values.
   *
   * <p>For every entry of {@code m_AttributeIndices} that is not {@code NON_NUMERIC}, the
   * attribute values are sorted and the quartiles are determined: the median is the middle value
   * (or the mean of the two middle values for an even count), and Q1 / Q3 are the medians of the
   * lower / upper half, the middle value being excluded from both halves when the count is odd.
   * From these the method stores the median, the interquartile range (Q3 - Q1) and
   *
   * <ul>
   *   <li>upper/lower outlier threshold: Q3 + / Q1 - {@code getOutlierFactor()} * IQR
   *   <li>upper/lower extreme-value threshold: Q3 + / Q1 - {@code getExtremeValuesFactor()} * IQR
   * </ul>
   *
   * <p>Entries for skipped attributes, and for attributes without any values, stay at 0. An
   * attribute with a single value gets Q1 = median = Q3 = that value.
   *
   * @param instances the data to work on
   */
  protected void computeThresholds(Instances instances) {
    final int numAttributes = m_AttributeIndices.length;

    m_UpperExtremeValue = new double[numAttributes];
    m_UpperOutlier = new double[numAttributes];
    m_LowerOutlier = new double[numAttributes];
    m_LowerExtremeValue = new double[numAttributes];
    m_Median = new double[numAttributes];
    m_IQR = new double[numAttributes];

    for (int i = 0; i < numAttributes; i++) {
      // non-numeric attribute?
      if (m_AttributeIndices[i] == NON_NUMERIC) continue;

      // sort attribute data
      double[] values = instances.attributeToDoubleArray(m_AttributeIndices[i]);
      int[] sortedIndices = Utils.sort(values);
      int count = sortedIndices.length;

      // nothing to compute without data
      if (count == 0) continue;

      // determine indices
      int half = count / 2;
      int quarter = half / 2;

      double q1;
      double q2;
      double q3;

      if (count % 2 == 1) {
        q2 = values[sortedIndices[half]];
      } else {
        // even count: the two middle elements are at half - 1 and half
        q2 = (values[sortedIndices[half - 1]] + values[sortedIndices[half]]) / 2;
      }

      if (half == 0) {
        // single value: both halves are empty, so all quartiles collapse onto it
        q1 = q2;
        q3 = q2;
      } else if (half % 2 == 1) {
        q1 = values[sortedIndices[quarter]];
        q3 = values[sortedIndices[count - quarter - 1]];
      } else {
        // even-sized halves: average the two middle elements of each half; the lower pair
        // (quarter - 1, quarter) mirrors the upper pair (count - quarter - 1, count - quarter)
        q1 = (values[sortedIndices[quarter - 1]] + values[sortedIndices[quarter]]) / 2;
        q3 =
            (values[sortedIndices[count - quarter - 1]] + values[sortedIndices[count - quarter]])
                / 2;
      }

      // determine thresholds and other values
      m_Median[i] = q2;
      m_IQR[i] = q3 - q1;
      m_UpperExtremeValue[i] = q3 + getExtremeValuesFactor() * m_IQR[i];
      m_UpperOutlier[i] = q3 + getOutlierFactor() * m_IQR[i];
      m_LowerOutlier[i] = q1 - getOutlierFactor() * m_IQR[i];
      m_LowerExtremeValue[i] = q1 - getExtremeValuesFactor() * m_IQR[i];
    }
  }
Example #5
0
  /**
   * Processes the instances using the HAAR algorithm.
   *
   * <p>If a class attribute is set, its values are saved, the attribute is removed from
   * {@code instances} (the passed-in data set is modified) and it is re-inserted, with the saved
   * values, into the result after the transformation.
   *
   * @param instances the data to process
   * @return the modified data
   * @throws Exception in case the processing goes wrong
   */
  protected Instances processHAAR(Instances instances) throws Exception {
    final int clsIdx = instances.classIndex();
    final boolean hasClass = clsIdx > -1;

    // set the class attribute aside so that only the remaining attributes are transformed
    double[] savedClassValues = null;
    Attribute savedClassAttribute = null;
    if (hasClass) {
      savedClassValues = instances.attributeToDoubleArray(clsIdx);
      savedClassAttribute = (Attribute) instances.classAttribute().copy();
      instances.setClassIndex(-1);
      instances.deleteAttributeAt(clsIdx);
    }

    final Instances result = new Instances(instances, 0);
    final int levels =
        (int) StrictMath.ceil(StrictMath.log(instances.numAttributes()) / StrictMath.log(2.0));
    final double root2 = StrictMath.sqrt(2);

    for (int row = 0; row < instances.numInstances(); row++) {
      final double[] current = instances.instance(row).toDoubleArray();
      final double[] transformed = new double[current.length];

      int pass = levels;
      while (pass > 0) {
        final int half = (int) StrictMath.pow(2, pass - 1);

        // scaled sums go to the front half, scaled differences to the back half
        for (int k = 0; k < half; k++) {
          final double left = current[k * 2];
          final double right = current[k * 2 + 1];
          transformed[k] = (left + right) / root2;
          transformed[k + half] = (left - right) / root2;
        }

        // the output of this pass is the input of the next one
        System.arraycopy(transformed, 0, current, 0, transformed.length);
        pass--;
      }

      // add new transformed instance
      result.add(new DenseInstance(1, transformed));
    }

    // add class again
    if (hasClass) {
      result.insertAttributeAt(savedClassAttribute, clsIdx);
      result.setClassIndex(clsIdx);
      for (int row = 0; row < savedClassValues.length; row++) {
        result.instance(row).setClassValue(savedClassValues[row]);
      }
    }

    return result;
  }
  /**
   * Finds the index of the instance whose threshold value is closest to the desired target.
   *
   * <p>The threshold values are read from the last attribute of {@code tcurve}, sorted, and
   * handed to {@code binarySearch} together with the target.
   *
   * @param tcurve a set of instances that have been generated by this class
   * @param threshold the target threshold
   * @return the index of the instance that has threshold closest to the target, or -1 if this could
   *     not be found (i.e. no data, or bad threshold target)
   */
  public static int getThresholdInstance(Instances tcurve, double threshold) {
    final int rows = tcurve.numInstances();
    final boolean usableCurve = RELATION_NAME.equals(tcurve.relationName()) && (rows != 0);
    // expressed as "outside [0, 1]" on purpose: a NaN target is not rejected here
    final boolean targetOutOfRange = (threshold < 0) || (threshold > 1.0);

    if (!usableCurve || targetOutOfRange) {
      return -1;
    }
    if (rows == 1) {
      // a single row is trivially the closest one
      return 0;
    }

    final int lastAttribute = tcurve.numAttributes() - 1;
    final double[] thresholds = tcurve.attributeToDoubleArray(lastAttribute);
    final int[] order = Utils.sort(thresholds);
    return binarySearch(order, thresholds, threshold);
  }
  /**
   * Calculates the n point precision result, i.e. the precision averaged over n evenly spaced
   * (w.r.t. recall) samples of the curve.
   *
   * <p>For each of the n target recall values (0, 1/(n-1), ..., 1) the matching row is located
   * with {@code binarySearch} on the sorted recall values. Unless that row is the first one in
   * sorted order, the next row with a different recall is looked up and the precision at the
   * target recall is taken from the straight line through the two points.
   *
   * @param tcurve a previously extracted threshold curve Instances.
   * @param n the number of points to average over.
   * @return the n-point precision, or Double.NaN if {@code tcurve} is empty or its relation name
   *     is not the expected ThresholdCurve relation name.
   */
  public static double getNPointPrecision(Instances tcurve, int n) {
    final boolean usable =
        RELATION_NAME.equals(tcurve.relationName()) && (tcurve.numInstances() != 0);
    if (!usable) {
      return Double.NaN;
    }

    final int recallIndex = tcurve.attribute(RECALL_NAME).index();
    final int precisionIndex = tcurve.attribute(PRECISION_NAME).index();
    final double[] recalls = tcurve.attributeToDoubleArray(recallIndex);
    final int[] order = Utils.sort(recalls);
    final double step = 1.0 / (n - 1);

    double total = 0;
    for (int point = 0; point < n; point++) {
      final double target = point * step;
      final int start = binarySearch(order, recalls, target);
      final double recall = recalls[order[start]];
      double precision = tcurve.instance(order[start]).value(precisionIndex);

      // interpolate figures for non-endpoints
      if (start != 0) {
        for (int next = start + 1; next < order.length; next++) {
          final double nextRecall = recalls[order[next]];
          if (nextRecall == recall) {
            // same recall gives no slope; keep looking
            continue;
          }
          final double nextPrecision = tcurve.instance(order[next]).value(precisionIndex);
          final double slope = (nextPrecision - precision) / (nextRecall - recall);
          final double offset = precision - recall * slope;
          precision = target * slope + offset;
          break;
        }
      }

      total += precision;
    }
    return total / n;
  }
  /**
   * Stores the inputs, derives the labeled relation and the integer class labels, runs the
   * selected multi-k-modes clustering routine and finally stores the result of
   * {@code getClusterers()} in {@code res}.
   *
   * @param data the data set; its last attribute is read as the class label (truncated to int)
   * @param dataforcluster the data handed to the clustering routine; a copy is kept in
   *     {@code datacluster}
   * @param labeledIndex passed to {@code labeled2relation} together with {@code data}
   * @param ensemblesize length of the {@code SquaredError} array and the size passed to the
   *     clustering routine
   * @param Rnd seed for the internal {@code Random}
   * @param methodind 0 selects {@code getMultiKmodesResults}, 1 selects
   *     {@code getMultiKmodesResultswithRandomSelectFeature}; any other value runs no clustering
   *     routine
   * @param alpha stored in the {@code alpha} field
   * @throws Exception declared by the constructor; presumably raised by the clustering helpers
   */
  public MethodGenerateClustering(
      Instances data,
      Instances dataforcluster,
      boolean[] labeledIndex,
      int ensemblesize,
      int Rnd,
      int methodind,
      double alpha)
      throws Exception {
    // plain state first, so the helpers called below see initialised fields
    this.data = data;
    this.datacluster = new Instances(dataforcluster);
    this.labeledIndex = labeledIndex;
    this.alpha = alpha;
    this.Rnd = new Random(Rnd);
    this.SquaredError = new double[ensemblesize];

    this.labeledRelation = labeled2relation(labeledIndex, data).clone();

    // class labels come from the last attribute, truncated to int
    final double[] classColumn = data.attributeToDoubleArray(data.numAttributes() - 1);
    this.classes = new int[data.numInstances()];
    for (int row = 0; row < classColumn.length; ++row) {
      this.classes[row] = (int) classColumn[row];
    }

    if (methodind == 0) {
      this.clustersRes = getMultiKmodesResults(data, dataforcluster, ensemblesize);
    } else if (methodind == 1) {
      this.clustersRes =
          getMultiKmodesResultswithRandomSelectFeature(data, dataforcluster, ensemblesize);
    }
    // any other methodind leaves clustersRes as it was

    this.res = getClusterers();
  }