Exemple #1
0
  /**
   * Clusters the data set while choosing the number of clusters automatically: every candidate
   * {@code k} is clustered, an evaluation value f(k) is computed from the k-means objective of the
   * current and previous run (the "eq(2)", "eq(3a)" and "eq(3b)" mentioned in the comments below),
   * and the clustering with the smallest f(k) among the eligible values of k is kept.
   *
   * <p>Side effects: the fields {@code fKs} (the f(k) values, stored at index {@code k - 1}) and
   * {@code means} (copies of the means of the selected clustering) are replaced on every call.
   *
   * <p>NOTE(review): the search loop runs {@code k = 2 .. highK - 1}, so {@code highK} itself is
   * never evaluated and {@code fKs} has no slot for it. Confirm whether {@code highK} is meant to
   * be exclusive here.
   *
   * @param dataSet the data to cluster; its sample size and number of numerical variables are read
   * @param lowK the smallest number of clusters that may be selected
   * @param highK the upper limit of the search (see the review note above)
   * @param threadpool forwarded to the acceleration-cache computation and to each clustering run
   * @param designations array to receive the cluster assignment of each sample; if {@code null} or
   *     shorter than the sample size, a new array is allocated and returned instead
   * @return the array holding the assignments of the selected clustering
   * @throws IllegalArgumentException if {@code highK < lowK}
   */
  @Override
  public int[] cluster(
      DataSet dataSet, int lowK, int highK, ExecutorService threadpool, int[] designations) {
    if (highK == lowK) {
      // Nothing to search over: delegate to the fixed-k overload.
      return cluster(dataSet, lowK, threadpool, designations);
    } else if (highK < lowK) {
      throw new IllegalArgumentException(
          "low value of k ("
              + lowK
              + ") must not be higher than the high value of k ("
              + highK
              + ")");
    }
    final int N = dataSet.getSampleSize();
    final int D = dataSet.getNumNumericalVars();
    fKs = new double[highK - 1]; // we HAVE to start from k=2
    fKs[0] = 1.0; // see eq(2)

    int[] bestCluster = new int[N];
    // If our low k is > 1, force the check later to kick in at the first candidate k by making fK
    // appear Inf. With lowK == 1 the baseline is f(1) = 1, so a k >= 2 is only selected when its
    // f(k) drops below 1; otherwise bestCluster keeps its all-zero default and means stays empty.
    double minFk = lowK == 1 ? 1.0 : Double.POSITIVE_INFINITY;

    if (designations == null || designations.length < N) {
      designations = new int[N];
    }

    // NOTE(review): S_k_prev starts at 0 because no k=1 run is performed, so the first iteration
    // (k=2) always takes the "fK = 1" branch below.
    double alphaKprev = 0, S_k_prev = 0;

    // re-used every iteration
    List<Vec> curMeans = new ArrayList<Vec>(highK);
    means = new ArrayList<Vec>(); // the best set of means
    // pre-compute cache instead of re-computing every time
    List<Double> accelCache = dm.getAccelerationCache(dataSet.getDataVectors(), threadpool);

    for (int k = 2; k < highK; k++) {
      curMeans.clear();
      // kmeans objective function result is the same as S_k
      // TODO could add a flag to make approximate S_k an option. Though it doesn't seem to work
      // great on toy problems, might be fine on more realistic data
      double S_k =
          cluster(dataSet, accelCache, k, curMeans, designations, true, threadpool, true);

      double alpha_k;
      if (k == 2) {
        alpha_k = 1 - 3.0 / (4 * D); // eq(3a)
      } else {
        alpha_k = alphaKprev + (1 - alphaKprev) / 6; // eq(3b)
      }

      double fK; // eq(2)
      if (S_k_prev == 0) {
        fK = 1;
      } else {
        fK = S_k / (alpha_k * S_k_prev);
      }
      fKs[k - 1] = fK;

      alphaKprev = alpha_k;
      S_k_prev = S_k;

      // Only values of k at or above lowK are eligible; keep the strictly smallest f(k) seen.
      if (k >= lowK && minFk > fK) {
        System.arraycopy(designations, 0, bestCluster, 0, N);
        minFk = fK;
        means.clear();
        // Clone because curMeans is cleared and refilled on the next iteration.
        for (Vec mean : curMeans) {
          means.add(mean.clone());
        }
      }
    }

    // contract is we return designations with the data in it if we can, so copy the values back
    System.arraycopy(bestCluster, 0, designations, 0, N);
    return designations;
  }