@Override public int[] cluster( DataSet dataSet, int lowK, int highK, ExecutorService threadpool, int[] designations) { if (highK == lowK) return cluster(dataSet, lowK, threadpool, designations); else if (highK < lowK) throw new IllegalArgumentException( "low value of k (" + lowK + ") must be higher than the high value of k(" + highK + ")"); final int N = dataSet.getSampleSize(); final int D = dataSet.getNumNumericalVars(); fKs = new double[highK - 1]; // we HAVE to start from k=2 fKs[0] = 1.0; // see eq(2) int[] bestCluster = new int[N]; double minFk = lowK == 1 ? 1.0 : Double .POSITIVE_INFINITY; // If our low k is > 1, force the check later to kick in at the // first candidate k by making fK appear Inf if (designations == null || designations.length < N) designations = new int[N]; double alphaKprev = 0, S_k_prev = 0; // re used every iteration List<Vec> curMeans = new ArrayList<Vec>(highK); means = new ArrayList<Vec>(); // the best set of means // pre-compute cache instead of re-computing every time List<Double> accelCache = dm.getAccelerationCache(dataSet.getDataVectors(), threadpool); for (int k = 2; k < highK; k++) { curMeans.clear(); // kmeans objective function result is the same as S_k double S_k = cluster( dataSet, accelCache, k, curMeans, designations, true, threadpool, true); // TODO could add a flag to make approximate S_k an option. Though it dosn't // seem to work great on toy problems, might be fine on more realistic data double alpha_k; if (k == 2) alpha_k = 1 - 3.0 / (4 * D); // eq(3a) else alpha_k = alphaKprev + (1 - alphaKprev) / 6; // eq(3b) double fK; // eq(2) if (S_k_prev == 0) fKs[k - 1] = fK = 1; else fKs[k - 1] = fK = S_k / (alpha_k * S_k_prev); alphaKprev = alpha_k; S_k_prev = S_k; if (k >= lowK && minFk > fK) { System.arraycopy(designations, 0, bestCluster, 0, N); minFk = fK; means.clear(); for (Vec mean : curMeans) means.add(mean.clone()); } } // contract is we return designations with the data in it if we can, so copy the values back System.arraycopy(bestCluster, 0, designations, 0, N); return designations; }