@Override public Clustering<KMeansModel> run(Database database, Relation<V> relation) { if (relation.size() <= 0) { return new Clustering<>("k-Means Clustering", "kmeans-clustering"); } // Choose initial means if (LOG.isStatistics()) { LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString())); } double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction()); // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<>(); for (int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k))); } WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage( relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1); double[] varsum = new double[k]; IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null; DoubleStatistic varstat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null; int iteration = 0; for (; maxiter <= 0 || iteration < maxiter; iteration++) { LOG.incrementProcessed(prog); boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum); logVarstat(varstat, varsum); // Stop if no cluster assignment changed. if (!changed) { break; } // Recompute means. means = means(clusters, means, relation); } LOG.setCompleted(prog); if (LOG.isStatistics()) { LOG.statistics(new LongStatistic(KEY + ".iterations", iteration)); } // Wrap result Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); for (int i = 0; i < clusters.size(); i++) { DBIDs ids = clusters.get(i); if (ids.size() == 0) { continue; } KMeansModel model = new KMeansModel(means[i], varsum[i]); result.addToplevelCluster(new Cluster<>(ids, model)); } return result; }
/** * Evaluate a single clustering. * * @param db Database * @param rel Data relation * @param c Clustering * @return Mean simplified silhouette */ public double evaluateClustering( Database db, Relation<? extends NumberVector> rel, Clustering<?> c) { List<? extends Cluster<?>> clusters = c.getAllClusters(); NumberVector[] centroids = new NumberVector[clusters.size()]; int ignorednoise = centroids(rel, clusters, centroids, noiseOption); MeanVariance mssil = new MeanVariance(); Iterator<? extends Cluster<?>> ci = clusters.iterator(); for (int i = 0; ci.hasNext(); i++) { Cluster<?> cluster = ci.next(); if (cluster.size() <= 1) { // As suggested in Rousseeuw, we use 0 for singletons. mssil.put(0., cluster.size()); continue; } if (cluster.isNoise()) { switch (noiseOption) { case IGNORE_NOISE: continue; // Ignore elements case TREAT_NOISE_AS_SINGLETONS: // As suggested in Rousseeuw, we use 0 for singletons. mssil.put(0., cluster.size()); continue; case MERGE_NOISE: break; // Treat as cluster below } } // Cluster center: final NumberVector center = centroids[i]; assert (center != null); for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) { NumberVector obj = rel.get(it); // a: Distance to own centroid double a = distance.distance(center, obj); // b: Distance to other clusters centroids: double min = Double.POSITIVE_INFINITY; Iterator<? extends Cluster<?>> cj = clusters.iterator(); for (int j = 0; cj.hasNext(); j++) { Cluster<?> ocluster = cj.next(); if (i == j) { continue; } NumberVector other = centroids[j]; if (other == null) { // Noise! switch (noiseOption) { case IGNORE_NOISE: continue; case TREAT_NOISE_AS_SINGLETONS: // Treat each object like a centroid! for (DBIDIter it2 = ocluster.getIDs().iter(); it2.valid(); it2.advance()) { double dist = distance.distance(rel.get(it2), obj); min = dist < min ? dist : min; } continue; case MERGE_NOISE: break; // Treat as cluster below, but should not be reachable. } } // Clusters: use centroid. double dist = distance.distance(other, obj); min = dist < min ? dist : min; } // One 'real' cluster only? min = min < Double.POSITIVE_INFINITY ? min : a; mssil.put((min - a) / (min > a ? min : a)); } } double penalty = 1.; // Only if {@link NoiseHandling#IGNORE_NOISE}: if (penalize && ignorednoise > 0) { penalty = (rel.size() - ignorednoise) / (double) rel.size(); } final double meanssil = penalty * mssil.getMean(); final double stdssil = penalty * mssil.getSampleStddev(); if (LOG.isStatistics()) { LOG.statistics( new StringStatistic( key + ".simplified-silhouette.noise-handling", noiseOption.toString())); if (ignorednoise > 0) { LOG.statistics(new LongStatistic(key + ".simplified-silhouette.ignored", ignorednoise)); } LOG.statistics(new DoubleStatistic(key + ".simplified-silhouette.mean", meanssil)); LOG.statistics(new DoubleStatistic(key + ".simplified-silhouette.stddev", stdssil)); } EvaluationResult ev = EvaluationResult.findOrCreate( db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation"); MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation"); g.addMeasure( "Simp. Silhouette +-" + FormatUtil.NF2.format(stdssil), meanssil, -1., 1., 0., false); db.getHierarchy().resultChanged(ev); return meanssil; }
/** * Evaluate a single clustering. * * @param db Database * @param rel Data relation * @param c Clustering * @return Gamma index */ public double evaluateClustering( Database db, Relation<? extends NumberVector> rel, Clustering<?> c) { List<? extends Cluster<?>> clusters = c.getAllClusters(); int ignorednoise = 0, withinPairs = 0; for (Cluster<?> cluster : clusters) { if ((cluster.size() <= 1 || cluster.isNoise())) { switch (noiseHandling) { case IGNORE_NOISE: ignorednoise += cluster.size(); continue; case TREAT_NOISE_AS_SINGLETONS: continue; // No concordant distances. case MERGE_NOISE: break; // Treat like a cluster below. } } withinPairs += (cluster.size() * (cluster.size() - 1)) >>> 1; if (withinPairs < 0) { throw new AbortException( "Integer overflow - clusters too large to compute pairwise distances."); } } // Materialize within-cluster distances (sorted): double[] withinDistances = computeWithinDistances(rel, clusters, withinPairs); int[] withinTies = new int[withinDistances.length]; // Count ties within countTies(withinDistances, withinTies); long concordantPairs = 0, discordantPairs = 0, betweenPairs = 0; // Step two, compute discordant distances: for (int i = 0; i < clusters.size(); i++) { Cluster<?> ocluster1 = clusters.get(i); if ((ocluster1.size() <= 1 || ocluster1.isNoise()) // && noiseHandling.equals(NoiseHandling.IGNORE_NOISE)) { continue; } for (int j = i + 1; j < clusters.size(); j++) { Cluster<?> ocluster2 = clusters.get(j); if ((ocluster2.size() <= 1 || ocluster2.isNoise()) // && noiseHandling.equals(NoiseHandling.IGNORE_NOISE)) { continue; } betweenPairs += ocluster1.size() * ocluster2.size(); for (DBIDIter oit1 = ocluster1.getIDs().iter(); oit1.valid(); oit1.advance()) { NumberVector obj = rel.get(oit1); for (DBIDIter oit2 = ocluster2.getIDs().iter(); oit2.valid(); oit2.advance()) { double dist = distanceFunction.distance(obj, rel.get(oit2)); int p = Arrays.binarySearch(withinDistances, dist); if (p >= 0) { // Tied distances: while (p > 0 && withinDistances[p - 1] >= dist) { --p; } concordantPairs += p; discordantPairs += withinDistances.length - p - withinTies[p]; continue; } p = -p - 1; concordantPairs += p; discordantPairs += withinDistances.length - p; } } } } // Total number of pairs possible: final long t = ((rel.size() - ignorednoise) * (long) (rel.size() - ignorednoise - 1)) >>> 1; final long tt = (t * (t - 1)) >>> 1; final double gamma = (concordantPairs - discordantPairs) / (double) (concordantPairs + discordantPairs); final double tau = computeTau(concordantPairs, discordantPairs, tt, withinDistances.length, betweenPairs); if (LOG.isStatistics()) { LOG.statistics(new StringStatistic(key + ".pbm.noise-handling", noiseHandling.toString())); if (ignorednoise > 0) { LOG.statistics(new LongStatistic(key + ".pbm.ignored", ignorednoise)); } LOG.statistics(new DoubleStatistic(key + ".gamma", gamma)); LOG.statistics(new DoubleStatistic(key + ".tau", tau)); } EvaluationResult ev = EvaluationResult.findOrCreate( db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation"); MeasurementGroup g = ev.findOrCreateGroup("Concordance-based Evaluation"); g.addMeasure("Gamma", gamma, -1., 1., 0., false); g.addMeasure("Tau", tau, -1., +1., 0., false); db.getHierarchy().resultChanged(ev); return gamma; }
/** * Evaluate a single clustering. * * @param db Database * @param rel Data relation * @param c Clustering * @return C-Index */ public double evaluateClustering( Database db, Relation<? extends O> rel, DistanceQuery<O> dq, Clustering<?> c) { List<? extends Cluster<?>> clusters = c.getAllClusters(); // theta is the sum, w the number of within group distances double theta = 0; int w = 0; int ignorednoise = 0; int isize = clusters.size() <= 1 ? rel.size() : rel.size() / (clusters.size() - 1); DoubleArray pairDists = new DoubleArray(isize); for (int i = 0; i < clusters.size(); i++) { Cluster<?> cluster = clusters.get(i); if (cluster.size() <= 1 || cluster.isNoise()) { switch (noiseOption) { case IGNORE_NOISE: ignorednoise += cluster.size(); continue; // Ignore case TREAT_NOISE_AS_SINGLETONS: continue; // No within-cluster distances! case MERGE_NOISE: break; // Treat like a cluster } } for (DBIDIter it1 = cluster.getIDs().iter(); it1.valid(); it1.advance()) { O obj = rel.get(it1); // Compare object to every cluster, but only once for (int j = i; j < clusters.size(); j++) { Cluster<?> ocluster = clusters.get(j); if (ocluster.size() <= 1 || ocluster.isNoise()) { switch (noiseOption) { case IGNORE_NOISE: continue; // Ignore this cluster. case TREAT_NOISE_AS_SINGLETONS: case MERGE_NOISE: break; // Treat like a cluster } } for (DBIDIter it2 = ocluster.getIDs().iter(); it2.valid(); it2.advance()) { if (DBIDUtil.compare(it1, it2) <= 0) { // Only once. continue; } double dist = dq.distance(obj, rel.get(it2)); pairDists.add(dist); if (ocluster == cluster) { // Within-cluster distances. theta += dist; w++; } } } } } // Simulate best and worst cases: pairDists.sort(); double min = 0, max = 0; for (int i = 0, j = pairDists.size() - 1; i < w; i++, j--) { min += pairDists.get(i); max += pairDists.get(j); } double cIndex = (max > min) ? (theta - min) / (max - min) : 0.; if (LOG.isStatistics()) { LOG.statistics(new StringStatistic(key + ".c-index.noise-handling", noiseOption.toString())); if (ignorednoise > 0) { LOG.statistics(new LongStatistic(key + ".c-index.ignored", ignorednoise)); } LOG.statistics(new DoubleStatistic(key + ".c-index", cIndex)); } EvaluationResult ev = EvaluationResult.findOrCreate( db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation"); MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation"); g.addMeasure("C-Index", cIndex, 0., 1., 0., true); db.getHierarchy().resultChanged(ev); return cIndex; }