/**
 * Computes the clustering accuracy of a cluster assignment with respect to the class labels
 * stored in the last attribute of {@code odata}.
 *
 * <p>A class-by-cluster contingency table is built and printed to standard output. Every class
 * is then mapped to the cluster that contains most of its instances (the lowest cluster index
 * wins ties), and the returned value is the fraction of instances whose assigned cluster equals
 * the cluster mapped to their class.
 *
 * @param odata the data set; the values of its last attribute are cast to {@code int} and used
 *     as class labels
 * @param clusters the cluster index of every instance, in instance order
 * @return the fraction of instances placed in the dominant cluster of their class, or
 *     {@code Double.NaN} if {@code odata} contains no instances
 * @throws IllegalArgumentException if {@code clusters} does not hold exactly one entry per
 *     instance
 */
public static double CA(Instances odata, int[] clusters) {
  final int numInstances = odata.numInstances();
  if (clusters.length != numInstances) {
    throw new IllegalArgumentException(
        "Expected " + numInstances + " cluster assignments but got " + clusters.length);
  }
  if (numInstances == 0) {
    return Double.NaN;
  }

  // Class labels are stored as doubles; truncate them to int indices.
  final double[] rawClasses = odata.attributeToDoubleArray(odata.numAttributes() - 1);
  final int[] oclass = new int[numInstances];
  int maxClass = 0;
  for (int i = 0; i < rawClasses.length; ++i) {
    oclass[i] = (int) rawClasses[i];
    maxClass = Math.max(maxClass, oclass[i]);
  }
  int maxCluster = 0;
  for (int cluster : clusters) {
    maxCluster = Math.max(maxCluster, cluster);
  }

  // counts[c][k] = number of instances of class c assigned to cluster k
  final int[][] counts = new int[maxClass + 1][maxCluster + 1];
  for (int i = 0; i < numInstances; ++i) {
    counts[oclass[i]][clusters[i]]++;
  }
  for (int[] row : counts) {
    System.out.println(Arrays.toString(row));
  }

  // Dominant cluster per class: a true argmax over the row, kept in a separate array so the
  // contingency table itself is not overwritten.
  final int[] dominantCluster = new int[counts.length];
  for (int c = 0; c < counts.length; ++c) {
    int best = 0;
    for (int k = 1; k < counts[c].length; ++k) {
      if (counts[c][k] > counts[c][best]) {
        best = k;
      }
    }
    dominantCluster[c] = best;
  }

  int hits = 0;
  for (int i = 0; i < numInstances; ++i) {
    if (dominantCluster[oclass[i]] == clusters[i]) {
      hits++;
    }
  }
  return (double) hits / (double) numInstances;
}
/**
 * Computes the area under the ROC curve, expressed as the Wilcoxon-Mann-Whitney statistic.
 *
 * @param tcurve threshold-curve data that was extracted earlier.
 * @return the ROC area, or Double.NaN when {@code tcurve} has no instances or its relation name
 *     differs from {@code RELATION_NAME}.
 */
public static double getROCArea(Instances tcurve) {
  final int numPoints = tcurve.numInstances();
  if (!RELATION_NAME.equals(tcurve.relationName()) || (numPoints == 0)) {
    return Double.NaN;
  }

  final int truePosIndex = tcurve.attribute(TRUE_POS_NAME).index();
  final int falsePosIndex = tcurve.attribute(FALSE_POS_NAME).index();
  final double[] truePos = tcurve.attributeToDoubleArray(truePosIndex);
  final double[] falsePos = tcurve.attributeToDoubleArray(falsePosIndex);

  // The first row supplies the totals used to normalise the accumulated area.
  final double totalPos = truePos[0];
  final double totalNeg = falsePos[0];

  final int last = numPoints - 1;
  double area = 0.0;
  double cumNeg = 0.0;
  for (int i = 0; i < numPoints; i++) {
    // Change in the counts between this row and the next; the last row contributes what is left.
    final boolean isLast = (i == last);
    final double posStep = isLast ? truePos[last] : truePos[i] - truePos[i + 1];
    final double negStep = isLast ? falsePos[last] : falsePos[i] - falsePos[i + 1];

    area += posStep * (cumNeg + (0.5 * negStep));
    cumNeg += negStep;
  }
  return area / (totalNeg * totalPos);
}
/** * Calculates the area under the precision-recall curve (AUPRC). * * @param tcurve a previously extracted threshold curve Instances. * @return the PRC area, or Double.NaN if you don't pass in a ThresholdCurve generated Instances. */ public static double getPRCArea(Instances tcurve) { final int n = tcurve.numInstances(); if (!RELATION_NAME.equals(tcurve.relationName()) || (n == 0)) { return Double.NaN; } final int pInd = tcurve.attribute(PRECISION_NAME).index(); final int rInd = tcurve.attribute(RECALL_NAME).index(); final double[] pVals = tcurve.attributeToDoubleArray(pInd); final double[] rVals = tcurve.attributeToDoubleArray(rInd); double area = 0; double xlast = rVals[n - 1]; // start from the first real p/r pair (not the artificial zero point) for (int i = n - 2; i >= 0; i--) { double recallDelta = rVals[i] - xlast; area += (pVals[i] * recallDelta); xlast = rVals[i]; } if (area == 0) { return Utils.missingValue(); } return area; }
/** * computes the thresholds for outliers and extreme values * * @param instances the data to work on */ protected void computeThresholds(Instances instances) { int i; double[] values; int[] sortedIndices; int half; int quarter; double q1; double q2; double q3; m_UpperExtremeValue = new double[m_AttributeIndices.length]; m_UpperOutlier = new double[m_AttributeIndices.length]; m_LowerOutlier = new double[m_AttributeIndices.length]; m_LowerExtremeValue = new double[m_AttributeIndices.length]; m_Median = new double[m_AttributeIndices.length]; m_IQR = new double[m_AttributeIndices.length]; for (i = 0; i < m_AttributeIndices.length; i++) { // non-numeric attribute? if (m_AttributeIndices[i] == NON_NUMERIC) continue; // sort attribute data values = instances.attributeToDoubleArray(m_AttributeIndices[i]); sortedIndices = Utils.sort(values); // determine indices half = sortedIndices.length / 2; quarter = half / 2; if (sortedIndices.length % 2 == 1) { q2 = values[sortedIndices[half]]; } else { q2 = (values[sortedIndices[half]] + values[sortedIndices[half + 1]]) / 2; } if (half % 2 == 1) { q1 = values[sortedIndices[quarter]]; q3 = values[sortedIndices[sortedIndices.length - quarter - 1]]; } else { q1 = (values[sortedIndices[quarter]] + values[sortedIndices[quarter + 1]]) / 2; q3 = (values[sortedIndices[sortedIndices.length - quarter - 1]] + values[sortedIndices[sortedIndices.length - quarter]]) / 2; } // determine thresholds and other values m_Median[i] = q2; m_IQR[i] = q3 - q1; m_UpperExtremeValue[i] = q3 + getExtremeValuesFactor() * m_IQR[i]; m_UpperOutlier[i] = q3 + getOutlierFactor() * m_IQR[i]; m_LowerOutlier[i] = q1 - getOutlierFactor() * m_IQR[i]; m_LowerExtremeValue[i] = q1 - getExtremeValuesFactor() * m_IQR[i]; } }
/** * processes the instances using the HAAR algorithm * * @param instances the data to process * @return the modified data * @throws Exception in case the processing goes wrong */ protected Instances processHAAR(Instances instances) throws Exception { Instances result; int i; int n; int j; int clsIdx; double[] oldVal; double[] newVal; int level; int length; double[] clsVal; Attribute clsAtt; clsIdx = instances.classIndex(); clsVal = null; clsAtt = null; if (clsIdx > -1) { clsVal = instances.attributeToDoubleArray(clsIdx); clsAtt = (Attribute) instances.classAttribute().copy(); instances.setClassIndex(-1); instances.deleteAttributeAt(clsIdx); } result = new Instances(instances, 0); level = (int) StrictMath.ceil(StrictMath.log(instances.numAttributes()) / StrictMath.log(2.0)); for (i = 0; i < instances.numInstances(); i++) { oldVal = instances.instance(i).toDoubleArray(); newVal = new double[oldVal.length]; for (n = level; n > 0; n--) { length = (int) StrictMath.pow(2, n - 1); for (j = 0; j < length; j++) { newVal[j] = (oldVal[j * 2] + oldVal[j * 2 + 1]) / StrictMath.sqrt(2); newVal[j + length] = (oldVal[j * 2] - oldVal[j * 2 + 1]) / StrictMath.sqrt(2); } System.arraycopy(newVal, 0, oldVal, 0, newVal.length); } // add new transformed instance result.add(new DenseInstance(1, newVal)); } // add class again if (clsIdx > -1) { result.insertAttributeAt(clsAtt, clsIdx); result.setClassIndex(clsIdx); for (i = 0; i < clsVal.length; i++) result.instance(i).setClassValue(clsVal[i]); } return result; }
/**
 * Gets the index of the instance with the closest threshold value to the desired target.
 *
 * <p>The threshold is read from the last attribute of {@code tcurve}.
 *
 * @param tcurve a set of instances that have been generated by this class
 * @param threshold the target threshold
 * @return the index of the instance that has threshold closest to the target, or -1 if this could
 *     not be found (i.e. no data, wrong relation name, or a target outside [0, 1])
 */
public static int getThresholdInstance(Instances tcurve, double threshold) {
  if (!RELATION_NAME.equals(tcurve.relationName())
      || (tcurve.numInstances() == 0)
      || (threshold < 0)
      || (threshold > 1.0)) {
    return -1;
  }
  if (tcurve.numInstances() == 1) {
    return 0;
  }

  final double[] tvals = tcurve.attributeToDoubleArray(tcurve.numAttributes() - 1);
  final int[] sorted = Utils.sort(tvals);

  // binarySearch yields a position within the sorted order (getNPointPrecision dereferences its
  // result as sorted[pos]), so it has to be mapped back to an instance index.
  // NOTE(review): confirm against the definition of binarySearch.
  return sorted[binarySearch(sorted, tvals, threshold)];
}
/** * Calculates the n point precision result, which is the precision averaged over n evenly spaced * (w.r.t recall) samples of the curve. * * @param tcurve a previously extracted threshold curve Instances. * @param n the number of points to average over. * @return the n-point precision. */ public static double getNPointPrecision(Instances tcurve, int n) { if (!RELATION_NAME.equals(tcurve.relationName()) || (tcurve.numInstances() == 0)) { return Double.NaN; } int recallInd = tcurve.attribute(RECALL_NAME).index(); int precisInd = tcurve.attribute(PRECISION_NAME).index(); double[] recallVals = tcurve.attributeToDoubleArray(recallInd); int[] sorted = Utils.sort(recallVals); double isize = 1.0 / (n - 1); double psum = 0; for (int i = 0; i < n; i++) { int pos = binarySearch(sorted, recallVals, i * isize); double recall = recallVals[sorted[pos]]; double precis = tcurve.instance(sorted[pos]).value(precisInd); /* System.err.println("Point " + (i + 1) + ": i=" + pos + " r=" + (i * isize) + " p'=" + precis + " r'=" + recall); */ // interpolate figures for non-endpoints while ((pos != 0) && (pos < sorted.length - 1)) { pos++; double recall2 = recallVals[sorted[pos]]; if (recall2 != recall) { double precis2 = tcurve.instance(sorted[pos]).value(precisInd); double slope = (precis2 - precis) / (recall2 - recall); double offset = precis - recall * slope; precis = isize * i * slope + offset; /* System.err.println("Point2 " + (i + 1) + ": i=" + pos + " r=" + (i * isize) + " p'=" + precis2 + " r'=" + recall2 + " p''=" + precis); */ break; } } psum += precis; } return psum / n; }
public MethodGenerateClustering( Instances data, Instances dataforcluster, boolean[] labeledIndex, int ensemblesize, int Rnd, int methodind, double alpha) throws Exception { this.alpha = alpha; this.Rnd = new Random(Rnd); SquaredError = new double[ensemblesize]; this.data = data; this.datacluster = new Instances(dataforcluster); this.labeledIndex = labeledIndex; this.labeledRelation = labeled2relation(labeledIndex, data).clone(); double[] tmpdclass = data.attributeToDoubleArray(data.numAttributes() - 1); classes = new int[data.numInstances()]; for (int i = 0; i < tmpdclass.length; ++i) classes[i] = (int) tmpdclass[i]; switch (methodind) { case 0: { this.clustersRes = getMultiKmodesResults(data, dataforcluster, ensemblesize); break; } case 1: { this.clustersRes = getMultiKmodesResultswithRandomSelectFeature(data, dataforcluster, ensemblesize); break; } default: break; } res = getClusterers(); }