/** * Select only instances with weights that contribute to the specified quantile of the weight * distribution * * @param data the input instances * @param quantile the specified quantile eg 0.9 to select 90% of the weight mass * @return the selected instances */ protected Instances selectWeightQuantile(Instances data, double quantile) { int numInstances = data.numInstances(); Instances trainData = new Instances(data, numInstances); double[] weights = new double[numInstances]; double sumOfWeights = 0; for (int i = 0; i < numInstances; i++) { weights[i] = data.instance(i).weight(); sumOfWeights += weights[i]; } double weightMassToSelect = sumOfWeights * quantile; int[] sortedIndices = Utils.sort(weights); // Select the instances sumOfWeights = 0; for (int i = numInstances - 1; i >= 0; i--) { Instance instance = (Instance) data.instance(sortedIndices[i]).copy(); trainData.add(instance); sumOfWeights += weights[sortedIndices[i]]; if ((sumOfWeights > weightMassToSelect) && (i > 0) && (weights[sortedIndices[i]] != weights[sortedIndices[i - 1]])) { break; } } if (m_Debug) { System.err.println("Selected " + trainData.numInstances() + " out of " + numInstances); } return trainData; }
/**
 * For every nominal attribute of the input format, computes the weighted mean class value of
 * each attribute value and stores in {@code m_Indices} the index order that {@code Utils.sort}
 * returns for those means.
 *
 * <p>Instances with a missing class or a missing value for the attribute are ignored. An
 * attribute value that received no weight gets the overall weighted mean for that attribute.
 * Entries of {@code m_Indices} for non-nominal attributes stay empty arrays.
 */
private void computeAverageClassValues() {
  final int numAtts = getInputFormat().numAttributes();
  final int numRows = getInputFormat().numInstances();

  double[][] means = new double[numAtts][0];
  m_Indices = new int[numAtts][0];

  for (int a = 0; a < numAtts; a++) {
    Attribute att = getInputFormat().attribute(a);
    if (!att.isNominal()) {
      continue;
    }
    final int numValues = att.numValues();
    means[a] = new double[numValues];
    double[] weightPerValue = new double[numValues];

    // Accumulate weight and weighted class value per attribute value.
    for (int r = 0; r < numRows; r++) {
      Instance row = getInputFormat().instance(r);
      if (row.classIsMissing() || row.isMissing(a)) {
        continue;
      }
      int v = (int) row.value(a);
      weightPerValue[v] += row.weight();
      means[a][v] += row.weight() * row.classValue();
    }

    double classSum = Utils.sum(means[a]);
    double weightSum = Utils.sum(weightPerValue);
    if (Utils.gr(weightSum, 0)) {
      for (int k = 0; k < numValues; k++) {
        if (Utils.gr(weightPerValue[k], 0)) {
          means[a][k] /= weightPerValue[k];
        } else {
          // Value never observed: fall back to the overall weighted mean.
          means[a][k] = classSum / weightSum;
        }
      }
    }
    m_Indices[a] = Utils.sort(means[a]);
  }
}
private void findCutOff(double[] pos, double[] neg) { int[] pOrder = Utils.sort(pos), nOrder = Utils.sort(neg); /* System.err.println("\n\n???Positive: "); for(int t=0; t<pOrder.length; t++) System.err.print(t+":"+Utils.doubleToString(pos[pOrder[t]],0,2)+" "); System.err.println("\n\n???Negative: "); for(int t=0; t<nOrder.length; t++) System.err.print(t+":"+Utils.doubleToString(neg[nOrder[t]],0,2)+" "); */ int pNum = pos.length, nNum = neg.length, count, p = 0, n = 0; double fstAccu = 0.0, sndAccu = (double) pNum, split; double maxAccu = 0, minDistTo0 = Double.MAX_VALUE; // Skip continuous negatives for (; (n < nNum) && (pos[pOrder[0]] >= neg[nOrder[n]]); n++, fstAccu++) ; if (n >= nNum) { // totally seperate m_Cutoff = (neg[nOrder[nNum - 1]] + pos[pOrder[0]]) / 2.0; // m_Cutoff = neg[nOrder[nNum-1]]; return; } count = n; while ((p < pNum) && (n < nNum)) { // Compare the next in the two lists if (pos[pOrder[p]] >= neg[nOrder[n]]) { // Neg has less log-odds fstAccu += 1.0; split = neg[nOrder[n]]; n++; } else { sndAccu -= 1.0; split = pos[pOrder[p]]; p++; } count++; if ((fstAccu + sndAccu > maxAccu) || ((fstAccu + sndAccu == maxAccu) && (Math.abs(split) < minDistTo0))) { maxAccu = fstAccu + sndAccu; m_Cutoff = split; minDistTo0 = Math.abs(split); } } }
/** * computes the thresholds for outliers and extreme values * * @param instances the data to work on */ protected void computeThresholds(Instances instances) { int i; double[] values; int[] sortedIndices; int half; int quarter; double q1; double q2; double q3; m_UpperExtremeValue = new double[m_AttributeIndices.length]; m_UpperOutlier = new double[m_AttributeIndices.length]; m_LowerOutlier = new double[m_AttributeIndices.length]; m_LowerExtremeValue = new double[m_AttributeIndices.length]; m_Median = new double[m_AttributeIndices.length]; m_IQR = new double[m_AttributeIndices.length]; for (i = 0; i < m_AttributeIndices.length; i++) { // non-numeric attribute? if (m_AttributeIndices[i] == NON_NUMERIC) continue; // sort attribute data values = instances.attributeToDoubleArray(m_AttributeIndices[i]); sortedIndices = Utils.sort(values); // determine indices half = sortedIndices.length / 2; quarter = half / 2; if (sortedIndices.length % 2 == 1) { q2 = values[sortedIndices[half]]; } else { q2 = (values[sortedIndices[half]] + values[sortedIndices[half + 1]]) / 2; } if (half % 2 == 1) { q1 = values[sortedIndices[quarter]]; q3 = values[sortedIndices[sortedIndices.length - quarter - 1]]; } else { q1 = (values[sortedIndices[quarter]] + values[sortedIndices[quarter + 1]]) / 2; q3 = (values[sortedIndices[sortedIndices.length - quarter - 1]] + values[sortedIndices[sortedIndices.length - quarter]]) / 2; } // determine thresholds and other values m_Median[i] = q2; m_IQR[i] = q3 - q1; m_UpperExtremeValue[i] = q3 + getExtremeValuesFactor() * m_IQR[i]; m_UpperOutlier[i] = q3 + getOutlierFactor() * m_IQR[i]; m_LowerOutlier[i] = q1 - getOutlierFactor() * m_IQR[i]; m_LowerExtremeValue[i] = q1 - getExtremeValuesFactor() * m_IQR[i]; } }
/**
 * Looks up the curve point whose threshold is closest to the requested one.
 *
 * <p>The threshold is read from the last attribute of {@code tcurve}; the search itself is
 * delegated to {@code binarySearch} over the index order returned by {@code Utils.sort}.
 *
 * @param tcurve a set of instances that have been generated by this class
 * @param threshold the target threshold, expected to lie in [0, 1]
 * @return the value returned by {@code binarySearch} for the target, 0 if the curve has exactly
 *     one point, or -1 if the relation name does not match, the curve is empty, or the
 *     threshold is below 0 or above 1
 */
public static int getThresholdInstance(Instances tcurve, double threshold) {
  if (!RELATION_NAME.equals(tcurve.relationName())) {
    return -1;
  }
  if (tcurve.numInstances() == 0) {
    return -1;
  }
  if ((threshold < 0) || (threshold > 1.0)) {
    return -1;
  }
  if (tcurve.numInstances() == 1) {
    // Only one candidate, no search needed.
    return 0;
  }

  int thresholdColumn = tcurve.numAttributes() - 1;
  double[] thresholds = tcurve.attributeToDoubleArray(thresholdColumn);
  int[] order = Utils.sort(thresholds);
  return binarySearch(order, thresholds, threshold);
}
/** * Calculates the n point precision result, which is the precision averaged over n evenly spaced * (w.r.t recall) samples of the curve. * * @param tcurve a previously extracted threshold curve Instances. * @param n the number of points to average over. * @return the n-point precision. */ public static double getNPointPrecision(Instances tcurve, int n) { if (!RELATION_NAME.equals(tcurve.relationName()) || (tcurve.numInstances() == 0)) { return Double.NaN; } int recallInd = tcurve.attribute(RECALL_NAME).index(); int precisInd = tcurve.attribute(PRECISION_NAME).index(); double[] recallVals = tcurve.attributeToDoubleArray(recallInd); int[] sorted = Utils.sort(recallVals); double isize = 1.0 / (n - 1); double psum = 0; for (int i = 0; i < n; i++) { int pos = binarySearch(sorted, recallVals, i * isize); double recall = recallVals[sorted[pos]]; double precis = tcurve.instance(sorted[pos]).value(precisInd); /* System.err.println("Point " + (i + 1) + ": i=" + pos + " r=" + (i * isize) + " p'=" + precis + " r'=" + recall); */ // interpolate figures for non-endpoints while ((pos != 0) && (pos < sorted.length - 1)) { pos++; double recall2 = recallVals[sorted[pos]]; if (recall2 != recall) { double precis2 = tcurve.instance(sorted[pos]).value(precisInd); double slope = (precis2 - precis) / (recall2 - recall); double offset = precis - recall * slope; precis = isize * i * slope + offset; /* System.err.println("Point2 " + (i + 1) + ": i=" + pos + " r=" + (i * isize) + " p'=" + precis2 + " r'=" + recall2 + " p''=" + precis); */ break; } } psum += precis; } return psum / n; }
/** * Sorts the evaluated attribute list * * @return an array of sorted (highest eval to lowest) attribute indexes * @throws Exception of sorting can't be done. */ public double[][] rankedAttributes() throws Exception { int i, j; if (m_attributeList == null || m_attributeMerit == null) { throw new Exception( "Search must be performed before a ranked " + "attribute list can be obtained"); } int[] ranked = Utils.sort(m_attributeMerit); // reverse the order of the ranked indexes double[][] bestToWorst = new double[ranked.length][2]; for (i = ranked.length - 1, j = 0; i >= 0; i--) { bestToWorst[j++][0] = ranked[i]; } // convert the indexes to attribute indexes for (i = 0; i < bestToWorst.length; i++) { int temp = ((int) bestToWorst[i][0]); bestToWorst[i][0] = m_attributeList[temp]; bestToWorst[i][1] = m_attributeMerit[temp]; } if (m_numToSelect > bestToWorst.length) { throw new Exception("More attributes requested than exist in the data"); } if (m_numToSelect <= 0) { if (m_threshold == -Double.MAX_VALUE) { m_calculatedNumToSelect = bestToWorst.length; } else { determineNumToSelectFromThreshold(bestToWorst); } } /* if (m_numToSelect > 0) { determineThreshFromNumToSelect(bestToWorst); } */ return bestToWorst; }
// Build a polytree on the tree protected int[][] polyTree(Instances D, Instances[] newD) throws Exception { L = (D == null) ? newD[0].classIndex() : D.classIndex(); CD = new double[L][L]; numVisited = 0; int root = 0; int[][] pa = new int[L][0]; visited = new boolean[L]; flagCB = new boolean[L]; Arrays.fill(visited, false); Arrays.fill(flagCB, false); if (depMode) { // Calculate the conditional MI matrix if (newD == null) CD = conDepMatrix(D); if (newD != null) CD = conDepMatrix(newD); } else { // Calculate the marginal normalized MI matrix CD = StatUtilsPro.NormMargDep(D); } // Build the tree skeleton int[][] paTree = skeleton(CD); // Find the causal basins int[][] paPoly = new int[L][L]; causalBasin(root, paTree, paPoly); // If causal basin can't cover all labels, build a directed tree (paTemp) int[][] paTemp = new int[L][0]; root = -1; for (int j = 0; j < L; j++) { for (int k = j; k < L; k++) { if (paPoly[j][k] == 1) { root = j; Arrays.fill(visited, false); visited[root] = true; treeify(root, paPoly, paTemp); break; } } if (root != -1) break; } // Save the parents of every node in the polytree (pa) for (int j = 0; j < L; j++) { for (int k = j; k < L; k++) { if (paPoly[j][k] == 3) pa[j] = A.append(pa[j], k); if (paPoly[j][k] == 2) pa[k] = A.append(pa[k], j); } } for (int j = 0; j < L; j++) { if (pa[j].length < 1) { for (int v : paTemp[j]) { pa[j] = A.append(pa[j], v); paPoly[j][v] = 3; paPoly[v][j] = 2; } } } // Rank the labels in the polytree (rank) root = 0; int[] rank = new int[L]; Arrays.fill(rank, 0); Arrays.fill(visited, false); rankLabel(root, paPoly, rank); chainOrder = Utils.sort(rank); // Enhance the polytree int[] temp = new int[] {}; double thCD = 0.0005; for (int j : chainOrder) { for (int k : temp) { if (paPoly[j][k] != 3) { if (j < k && CD[j][k] > thCD) pa[j] = A.append(pa[j], k); if (j > k && CD[k][j] > thCD) pa[j] = A.append(pa[j], k); } } temp = A.append(temp, j); } return pa; }
/**
 * Recomputes {@code indexes} as the result of {@code Utils.sort} applied to
 * {@code meanDistance}.
 *
 * <p>Only {@code indexes} is assigned here; nothing in this method reorders any other field.
 * NOTE(review): {@code Utils.sort} presumably returns the positions that order the array
 * increasingly and leaves {@code meanDistance} itself untouched - confirm.
 */
public void sort() {
  indexes = Utils.sort(meanDistance);
}
/**
 * Calculates the performance stats for the desired class and returns the results as a set of
 * Instances, one per distinct predicted probability.
 *
 * <p>Predictions with a missing actual class or a negative weight are skipped, both when the
 * class totals are computed and when the curve is traced. The predictions are visited in the
 * index order returned by {@code Utils.sort} for their probabilities; each time a larger
 * probability is reached, the weight accumulated since the previous point is moved from the
 * positive-prediction counts to the negative-prediction counts and a point is emitted.
 *
 * @param predictions the predictions to base the curve on
 * @param classIndex index of the class of interest.
 * @return datapoints as a set of instances, or null if there are no predictions or the class
 *     index is outside the distribution of the first prediction.
 */
public Instances getCurve(FastVector predictions, int classIndex) {
  // Nothing to evaluate. This must be checked on its own: the diagnostic below reads
  // the first prediction and would fail on an empty list.
  if (predictions.size() == 0) {
    return null;
  }
  if (((NominalPrediction) predictions.elementAt(0)).distribution().length <= classIndex) {
    System.out.println(
        "Foooobared "
            + predictions.size()
            + " "
            + ((NominalPrediction) predictions.elementAt(0)).distribution().length
            + " "
            + classIndex);
    return null;
  }

  double totPos = 0, totNeg = 0;
  double[] probs = getProbabilities(predictions, classIndex);

  // Get distribution of positive/negatives
  for (int i = 0; i < probs.length; i++) {
    NominalPrediction pred = (NominalPrediction) predictions.elementAt(i);
    if (pred.actual() == Prediction.MISSING_VALUE) {
      System.err.println(getClass().getName() + " Skipping prediction with missing class value");
      continue;
    }
    if (pred.weight() < 0) {
      System.err.println(getClass().getName() + " Skipping prediction with negative weight");
      continue;
    }
    if (pred.actual() == classIndex) {
      totPos += pred.weight();
    } else {
      totNeg += pred.weight();
    }
  }

  Instances insts = makeHeader();
  int[] sorted = Utils.sort(probs);
  // Starts from the totals; presumably the argument order is (TP, FP, TN, FN) - confirm
  // against TwoClassStats.
  TwoClassStats tc = new TwoClassStats(totPos, totNeg, 0, 0);
  double threshold = 0;
  double cumulativePos = 0;
  double cumulativeNeg = 0;

  for (int i = 0; i < sorted.length; i++) {
    if ((i == 0) || (probs[sorted[i]] > threshold)) {
      // New threshold reached: shift the weight gathered since the last point.
      tc.setTruePositive(tc.getTruePositive() - cumulativePos);
      tc.setFalseNegative(tc.getFalseNegative() + cumulativePos);
      tc.setFalsePositive(tc.getFalsePositive() - cumulativeNeg);
      tc.setTrueNegative(tc.getTrueNegative() + cumulativeNeg);
      threshold = probs[sorted[i]];
      insts.add(makeInstance(tc, threshold));
      cumulativePos = 0;
      cumulativeNeg = 0;
      if (i == sorted.length - 1) {
        break;
      }
    }

    NominalPrediction pred = (NominalPrediction) predictions.elementAt(sorted[i]);

    if (pred.actual() == Prediction.MISSING_VALUE) {
      System.err.println(getClass().getName() + " Skipping prediction with missing class value");
      continue;
    }
    if (pred.weight() < 0) {
      System.err.println(getClass().getName() + " Skipping prediction with negative weight");
      continue;
    }
    if (pred.actual() == classIndex) {
      cumulativePos += pred.weight();
    } else {
      cumulativeNeg += pred.weight();
    }
  }

  // make sure a zero point gets into the curve
  if (tc.getFalseNegative() != totPos || tc.getTrueNegative() != totNeg) {
    tc = new TwoClassStats(0, 0, totNeg, totPos);
    // Place the extra point just above the largest probability seen.
    threshold = probs[sorted[sorted.length - 1]] + 10e-6;
    insts.add(makeInstance(tc, threshold));
  }

  return insts;
}
/** * Generates the classifier. * * @param instances set of instances serving as training data * @throws Exception if the classifier has not been generated successfully */ public void buildClassifier(Instances instances) throws Exception { if (!m_weightByConfidence) { TINY = 0.0; } // can classifier handle the data? getCapabilities().testWithFail(instances); // remove instances with missing class instances = new Instances(instances); instances.deleteWithMissingClass(); m_ClassIndex = instances.classIndex(); m_NumClasses = instances.numClasses(); m_globalCounts = new double[m_NumClasses]; m_maxEntrop = Math.log(m_NumClasses) / Math.log(2); m_Instances = new Instances(instances, 0); // Copy the structure for ref m_intervalBounds = new double[instances.numAttributes()][2 + (2 * m_NumClasses)]; for (int j = 0; j < instances.numAttributes(); j++) { boolean alt = false; for (int i = 0; i < m_NumClasses * 2 + 2; i++) { if (i == 0) { m_intervalBounds[j][i] = Double.NEGATIVE_INFINITY; } else if (i == m_NumClasses * 2 + 1) { m_intervalBounds[j][i] = Double.POSITIVE_INFINITY; } else { if (alt) { m_intervalBounds[j][i] = Double.NEGATIVE_INFINITY; alt = false; } else { m_intervalBounds[j][i] = Double.POSITIVE_INFINITY; alt = true; } } } } // find upper and lower bounds for numeric attributes for (int j = 0; j < instances.numAttributes(); j++) { if (j != m_ClassIndex && instances.attribute(j).isNumeric()) { for (int i = 0; i < instances.numInstances(); i++) { Instance inst = instances.instance(i); if (!inst.isMissing(j)) { if (inst.value(j) < m_intervalBounds[j][((int) inst.classValue() * 2 + 1)]) { m_intervalBounds[j][((int) inst.classValue() * 2 + 1)] = inst.value(j); } if (inst.value(j) > m_intervalBounds[j][((int) inst.classValue() * 2 + 2)]) { m_intervalBounds[j][((int) inst.classValue() * 2 + 2)] = inst.value(j); } } } } } m_counts = new double[instances.numAttributes()][][]; // sort intervals for (int i = 0; i < instances.numAttributes(); i++) { if 
(instances.attribute(i).isNumeric()) { int[] sortedIntervals = Utils.sort(m_intervalBounds[i]); // remove any duplicate bounds int count = 1; for (int j = 1; j < sortedIntervals.length; j++) { if (m_intervalBounds[i][sortedIntervals[j]] != m_intervalBounds[i][sortedIntervals[j - 1]]) { count++; } } double[] reordered = new double[count]; count = 1; reordered[0] = m_intervalBounds[i][sortedIntervals[0]]; for (int j = 1; j < sortedIntervals.length; j++) { if (m_intervalBounds[i][sortedIntervals[j]] != m_intervalBounds[i][sortedIntervals[j - 1]]) { reordered[count] = m_intervalBounds[i][sortedIntervals[j]]; count++; } } m_intervalBounds[i] = reordered; m_counts[i] = new double[count][m_NumClasses]; } else if (i != m_ClassIndex) { // nominal attribute m_counts[i] = new double[instances.attribute(i).numValues()][m_NumClasses]; } } // collect class counts for (int i = 0; i < instances.numInstances(); i++) { Instance inst = instances.instance(i); m_globalCounts[(int) instances.instance(i).classValue()] += inst.weight(); for (int j = 0; j < instances.numAttributes(); j++) { if (!inst.isMissing(j) && j != m_ClassIndex) { if (instances.attribute(j).isNumeric()) { double val = inst.value(j); int k; for (k = m_intervalBounds[j].length - 1; k >= 0; k--) { if (val > m_intervalBounds[j][k]) { m_counts[j][k][(int) inst.classValue()] += inst.weight(); break; } else if (val == m_intervalBounds[j][k]) { m_counts[j][k][(int) inst.classValue()] += (inst.weight() / 2.0); m_counts[j][k - 1][(int) inst.classValue()] += (inst.weight() / 2.0); ; break; } } } else { // nominal attribute m_counts[j][(int) inst.value(j)][(int) inst.classValue()] += inst.weight(); ; } } } } }
private double[] calculateRegionProbs(int j, int i) throws Exception { double[] sumOfProbsForRegion = new double[m_trainingData.classAttribute().numValues()]; for (int u = 0; u < m_numOfSamplesPerRegion; u++) { double[] sumOfProbsForLocation = new double[m_trainingData.classAttribute().numValues()]; m_weightingAttsValues[m_xAttribute] = getRandomX(j); m_weightingAttsValues[m_yAttribute] = getRandomY(m_panelHeight - i - 1); m_dataGenerator.setWeightingValues(m_weightingAttsValues); double[] weights = m_dataGenerator.getWeights(); double sumOfWeights = Utils.sum(weights); int[] indices = Utils.sort(weights); // Prune 1% of weight mass int[] newIndices = new int[indices.length]; double sumSoFar = 0; double criticalMass = 0.99 * sumOfWeights; int index = weights.length - 1; int counter = 0; for (int z = weights.length - 1; z >= 0; z--) { newIndices[index--] = indices[z]; sumSoFar += weights[indices[z]]; counter++; if (sumSoFar > criticalMass) { break; } } indices = new int[counter]; System.arraycopy(newIndices, index + 1, indices, 0, counter); for (int z = 0; z < m_numOfSamplesPerGenerator; z++) { m_dataGenerator.setWeightingValues(m_weightingAttsValues); double[][] values = m_dataGenerator.generateInstances(indices); for (int q = 0; q < values.length; q++) { if (values[q] != null) { System.arraycopy(values[q], 0, m_vals, 0, m_vals.length); m_vals[m_xAttribute] = m_weightingAttsValues[m_xAttribute]; m_vals[m_yAttribute] = m_weightingAttsValues[m_yAttribute]; // classify the instance m_dist = m_classifier.distributionForInstance(m_predInst); for (int k = 0; k < sumOfProbsForLocation.length; k++) { sumOfProbsForLocation[k] += (m_dist[k] * weights[q]); } } } } for (int k = 0; k < sumOfProbsForRegion.length; k++) { sumOfProbsForRegion[k] += (sumOfProbsForLocation[k] * sumOfWeights); } } // average Utils.normalize(sumOfProbsForRegion); // cache double[] tempDist = new double[sumOfProbsForRegion.length]; System.arraycopy(sumOfProbsForRegion, 0, tempDist, 0, 
sumOfProbsForRegion.length); return tempDist; }