@Override public Clustering<KMeansModel> run(Database database, Relation<V> relation) { if (relation.size() <= 0) { return new Clustering<>("k-Means Clustering", "kmeans-clustering"); } // Choose initial means if (LOG.isStatistics()) { LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString())); } double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction()); // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<>(); for (int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k))); } WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage( relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1); double[] varsum = new double[k]; IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null; DoubleStatistic varstat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null; int iteration = 0; for (; maxiter <= 0 || iteration < maxiter; iteration++) { LOG.incrementProcessed(prog); boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum); logVarstat(varstat, varsum); // Stop if no cluster assignment changed. if (!changed) { break; } // Recompute means. means = means(clusters, means, relation); } LOG.setCompleted(prog); if (LOG.isStatistics()) { LOG.statistics(new LongStatistic(KEY + ".iterations", iteration)); } // Wrap result Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); for (int i = 0; i < clusters.size(); i++) { DBIDs ids = clusters.get(i); if (ids.size() == 0) { continue; } KMeansModel model = new KMeansModel(means[i], varsum[i]); result.addToplevelCluster(new Cluster<>(ids, model)); } return result; }
/** * Run the algorithm * * @param db Database * @param relation Relation * @return Clustering hierarchy */ public PointerHierarchyRepresentationResult run(Database db, Relation<O> relation) { DistanceQuery<O> dq = db.getDistanceQuery(relation, getDistanceFunction()); ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs()); final int size = ids.size(); if (size > 0x10000) { throw new AbortException( "This implementation does not scale to data sets larger than " + 0x10000 + " instances (~17 GB RAM), which results in an integer overflow."); } if (Linkage.SINGLE.equals(linkage)) { LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!"); } // Compute the initial (lower triangular) distance matrix. double[] scratch = new double[triangleSize(size)]; DBIDArrayIter ix = ids.iter(), iy = ids.iter(), ij = ids.iter(); // Position counter - must agree with computeOffset! int pos = 0; boolean square = Linkage.WARD.equals(linkage) && !(SquaredEuclideanDistanceFunction.class.isInstance(getDistanceFunction())); for (int x = 0; ix.valid(); x++, ix.advance()) { iy.seek(0); for (int y = 0; y < x; y++, iy.advance()) { scratch[pos] = dq.distance(ix, iy); // Ward uses variances -- i.e. squared values if (square) { scratch[pos] *= scratch[pos]; } pos++; } } // Initialize space for result: WritableDBIDDataStore parent = DataStoreUtil.makeDBIDStorage( ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC); WritableDoubleDataStore height = DataStoreUtil.makeDoubleStorage( ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC); WritableIntegerDataStore csize = DataStoreUtil.makeIntegerStorage( ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); for (DBIDIter it = ids.iter(); it.valid(); it.advance()) { parent.put(it, it); height.put(it, Double.POSITIVE_INFINITY); csize.put(it, 1); } // Repeat until everything merged, except the desired number of clusters: FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", size - 1, LOG) : null; for (int i = 1; i < size; i++) { double min = Double.POSITIVE_INFINITY; int minx = -1, miny = -1; for (ix.seek(0); ix.valid(); ix.advance()) { if (height.doubleValue(ix) < Double.POSITIVE_INFINITY) { continue; } final int xbase = triangleSize(ix.getOffset()); for (iy.seek(0); iy.getOffset() < ix.getOffset(); iy.advance()) { if (height.doubleValue(iy) < Double.POSITIVE_INFINITY) { continue; } final int idx = xbase + iy.getOffset(); if (scratch[idx] <= min) { min = scratch[idx]; minx = ix.getOffset(); miny = iy.getOffset(); } } } assert (minx >= 0 && miny >= 0); // Avoid allocating memory, by reusing existing iterators: ix.seek(minx); iy.seek(miny); // Perform merge in data structure: x -> y // Since y < x, prefer keeping y, dropping x. int sizex = csize.intValue(ix), sizey = csize.intValue(iy); height.put(ix, min); parent.put(ix, iy); csize.put(iy, sizex + sizey); // Update distance matrix. Note: miny < minx final int xbase = triangleSize(minx), ybase = triangleSize(miny); // Write to (y, j), with j < y for (ij.seek(0); ij.getOffset() < miny; ij.advance()) { if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) { continue; } final int sizej = csize.intValue(ij); scratch[ybase + ij.getOffset()] = linkage.combine( sizex, scratch[xbase + ij.getOffset()], sizey, scratch[ybase + ij.getOffset()], sizej, min); } // Write to (j, y), with y < j < x for (ij.seek(miny + 1); ij.getOffset() < minx; ij.advance()) { if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) { continue; } final int jbase = triangleSize(ij.getOffset()); final int sizej = csize.intValue(ij); scratch[jbase + miny] = linkage.combine( sizex, scratch[xbase + ij.getOffset()], sizey, scratch[jbase + miny], sizej, min); } // Write to (j, y), with y < x < j for (ij.seek(minx + 1); ij.valid(); ij.advance()) { if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) { continue; } final int jbase = triangleSize(ij.getOffset()); final int sizej = csize.intValue(ij); scratch[jbase + miny] = linkage.combine(sizex, scratch[jbase + minx], sizey, scratch[jbase + miny], sizej, min); } LOG.incrementProcessed(prog); } LOG.ensureCompleted(prog); return new PointerHierarchyRepresentationResult(ids, parent, height); }