/**
 * Moves to a random neighbor of the current solution: exactly one medoid is
 * swapped for a data object that is not currently a medoid, and the cluster
 * assignments are refreshed in place.
 *
 * @param data the data objects being clustered.
 * @param medoids the current medoids; one randomly chosen entry is overwritten.
 * @param y the cluster label of each data object; updated in place.
 * @param d the distance of each data object to its assigned medoid; updated in place.
 * @return the sum of the updated distances in {@code d}.
 */
private double getRandomNeighbor(T[] data, T[] medoids, int[] y, double[] d) {
    final int n = data.length;

    // Slot of the medoid that will be replaced. Drawn before the candidate
    // so the sequence of random numbers consumed stays the same.
    final int slot = Math.randomInt(k);

    // Keep drawing until the candidate is not one of the current medoids.
    // Note the check is by reference (==), not by equals().
    T candidate;
    boolean alreadyMedoid;
    do {
        candidate = data[Math.randomInt(n)];
        alreadyMedoid = false;
        for (int c = 0; c < k && !alreadyMedoid; c++) {
            alreadyMedoid = (candidate == medoids[c]);
        }
    } while (alreadyMedoid);

    medoids[slot] = candidate;

    for (int i = 0; i < n; i++) {
        double toCandidate = distance.d(data[i], candidate);

        if (d[i] > toCandidate) {
            // The new medoid is strictly closer than the current one: adopt it.
            y[i] = slot;
            d[i] = toCandidate;
            continue;
        }

        if (y[i] != slot) {
            // Assigned to an untouched medoid that is still at least as close.
            continue;
        }

        // This object belonged to the replaced medoid and the new one is not
        // closer than the old one. Start from the new medoid (label is already
        // slot) and rescan all the other medoids for a strictly closer one.
        d[i] = toCandidate;
        for (int c = 0; c < k; c++) {
            if (c == slot) {
                continue;
            }
            double toOther = distance.d(data[i], medoids[c]);
            if (d[i] > toOther) {
                y[i] = c;
                d[i] = toOther;
            }
        }
    }

    return Math.sum(d);
}
/** * Finds the best split cutoff for attribute j at the current node. * * @param n the number instances in this node. * @param count the sample count in each class. * @param falseCount an array to store sample count in each class for false child node. * @param impurity the impurity of this node. * @param j the attribute to split on. */ public Node findBestSplit(int n, int[] count, int[] falseCount, double impurity, int j) { Node splitNode = new Node(); if (attributes[j].getType() == Attribute.Type.NOMINAL) { int m = ((NominalAttribute) attributes[j]).size(); int[][] trueCount = new int[m][k]; for (int i = 0; i < x.length; i++) { if (samples[i] > 0) { trueCount[(int) x[i][j]][y[i]] += samples[i]; } } for (int l = 0; l < m; l++) { int tc = Math.sum(trueCount[l]); int fc = n - tc; // If either side is empty, skip this feature. if (tc < nodeSize || fc < nodeSize) { continue; } for (int q = 0; q < k; q++) { falseCount[q] = count[q] - trueCount[l][q]; } int trueLabel = Math.whichMax(trueCount[l]); int falseLabel = Math.whichMax(falseCount); double gain = impurity - (double) tc / n * impurity(trueCount[l], tc) - (double) fc / n * impurity(falseCount, fc); if (gain > splitNode.splitScore) { // new best split splitNode.splitFeature = j; splitNode.splitValue = l; splitNode.splitScore = gain; splitNode.trueChildOutput = trueLabel; splitNode.falseChildOutput = falseLabel; } } } else if (attributes[j].getType() == Attribute.Type.NUMERIC) { int[] trueCount = new int[k]; double prevx = Double.NaN; int prevy = -1; for (int i : order[j]) { if (samples[i] > 0) { if (Double.isNaN(prevx) || x[i][j] == prevx || y[i] == prevy) { prevx = x[i][j]; prevy = y[i]; trueCount[y[i]] += samples[i]; continue; } int tc = Math.sum(trueCount); int fc = n - tc; // If either side is empty, skip this feature. 
if (tc < nodeSize || fc < nodeSize) { prevx = x[i][j]; prevy = y[i]; trueCount[y[i]] += samples[i]; continue; } for (int l = 0; l < k; l++) { falseCount[l] = count[l] - trueCount[l]; } int trueLabel = Math.whichMax(trueCount); int falseLabel = Math.whichMax(falseCount); double gain = impurity - (double) tc / n * impurity(trueCount, tc) - (double) fc / n * impurity(falseCount, fc); if (gain > splitNode.splitScore) { // new best split splitNode.splitFeature = j; splitNode.splitValue = (x[i][j] + prevx) / 2; splitNode.splitScore = gain; splitNode.trueChildOutput = trueLabel; splitNode.falseChildOutput = falseLabel; } prevx = x[i][j]; prevy = y[i]; trueCount[y[i]] += samples[i]; } } } else { throw new IllegalStateException("Unsupported attribute type: " + attributes[j].getType()); } return splitNode; }