/**
 * Performs the DBSCAN algorithm on the given database.
 *
 * @param relation Relation of objects to cluster
 * @return Clustering containing the density-based clusters plus one noise cluster
 */
public Clustering<Model> run(Relation<O> relation) {
  final int n = relation.size();
  if (n < minpts) {
    // Degenerate case: fewer objects than minpts — no core point can exist,
    // so everything goes into a single noise cluster.
    Clustering<Model> degenerate = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
    degenerate.addToplevelCluster(new Cluster<Model>(relation.getDBIDs(), true, ClusterModel.CLUSTER));
    return degenerate;
  }
  RangeQuery<O> rangeQuery = QueryUtil.getRangeQuery(relation, getDistanceFunction());
  resultList = new ArrayList<>();
  noise = DBIDUtil.newHashSet();
  runDBSCAN(relation, rangeQuery);

  // Diagnostics: average neighborhood size, used to sanity-check epsilon.
  final double avgNeighbors = ncounter / (double) n;
  LOG.statistics(new DoubleStatistic(DBSCAN.class.getName() + ".average-neighbors", avgNeighbors));
  if (avgNeighbors < 1 + 0.1 * (minpts - 1)) {
    LOG.warning("There are very few neighbors found. Epsilon may be too small.");
  }
  if (avgNeighbors > 100 * minpts) {
    LOG.warning("There are very many neighbors found. Epsilon may be too large.");
  }

  // Wrap the collected cluster id-sets; noise goes in as a flagged cluster.
  Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
  for (ModifiableDBIDs clusterIds : resultList) {
    result.addToplevelCluster(new Cluster<Model>(clusterIds, ClusterModel.CLUSTER));
  }
  result.addToplevelCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER));
  return result;
}
/**
 * Runs Cheng-and-Church biclustering: extracts {@code n} biclusters one at a
 * time (deletion phases, then addition), masks each found bicluster in the
 * data matrix, and finally emits the never-clustered rows as a noise cluster.
 *
 * @return clustering with up to {@code n} biclusters plus an optional noise cluster
 */
@Override
public Clustering<BiclusterWithInversionsModel> biclustering() {
  // Materialize the relation as a dense row-major matrix over rowIDs.
  double[][] mat = RelationUtil.relationAsMatrix(relation, rowIDs);
  // Working candidate; reused (reset) for every extracted bicluster.
  BiclusterCandidate cand = new BiclusterCandidate(getRowDim(), getColDim());
  Clustering<BiclusterWithInversionsModel> result =
      new Clustering<>("Cheng-and-Church", "Cheng and Church Biclustering");
  // Start with every row as noise; rows claimed by a bicluster are removed below.
  ModifiableDBIDs noise = DBIDUtil.newHashSet(relation.getDBIDs());
  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Extracting Cluster", n, LOG) : null;
  for (int i = 0; i < n; i++) {
    cand.reset();
    // Phase order matters: Alg. 2 (multiple deletion), Alg. 1 (single
    // deletion), Alg. 3 (addition) — each refines cand in place.
    multipleNodeDeletion(mat, cand);
    if (LOG.isVeryVerbose()) {
      LOG.veryverbose(
          "Residue after Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
    }
    singleNodeDeletion(mat, cand);
    if (LOG.isVeryVerbose()) {
      LOG.veryverbose(
          "Residue after Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
    }
    nodeAddition(mat, cand);
    if (LOG.isVeryVerbose()) {
      LOG.veryverbose(
          "Residue after Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
    }
    // Mask the found bicluster's cells in mat so the next iteration does not
    // rediscover it (presumably with values drawn via dist — confirm).
    cand.maskMatrix(mat, dist);
    // cand.irow holds the inverted rows of the bicluster model.
    BiclusterWithInversionsModel model =
        new BiclusterWithInversionsModel(colsBitsetToIDs(cand.cols), rowsBitsetToIDs(cand.irow));
    final ArrayDBIDs cids = rowsBitsetToIDs(cand.rows);
    noise.removeDBIDs(cids);
    result.addToplevelCluster(new Cluster<>(cids, model));
    if (LOG.isVerbose()) {
      LOG.verbose("Score of bicluster " + (i + 1) + ": " + cand.residue + "\n");
      LOG.verbose("Number of rows: " + cand.rowcard + "\n");
      LOG.verbose("Number of columns: " + cand.colcard + "\n");
      // LOG.verbose("Total number of masked values: " + maskedVals.size() +
      // "\n");
    }
    LOG.incrementProcessed(prog);
  }
  // Add a noise cluster, full-dimensional.
  if (!noise.isEmpty()) {
    long[] allcols = BitsUtil.ones(getColDim());
    BiclusterWithInversionsModel model =
        new BiclusterWithInversionsModel(colsBitsetToIDs(allcols), DBIDUtil.EMPTYDBIDS);
    result.addToplevelCluster(new Cluster<>(noise, true, model));
  }
  LOG.ensureCompleted(prog);
  return result;
}
@Override public Clustering<KMeansModel> run(Database database, Relation<V> relation) { if (relation.size() <= 0) { return new Clustering<>("k-Means Clustering", "kmeans-clustering"); } // Choose initial means if (LOG.isStatistics()) { LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString())); } double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction()); // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<>(); for (int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k))); } WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage( relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1); double[] varsum = new double[k]; IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null; DoubleStatistic varstat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null; int iteration = 0; for (; maxiter <= 0 || iteration < maxiter; iteration++) { LOG.incrementProcessed(prog); boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum); logVarstat(varstat, varsum); // Stop if no cluster assignment changed. if (!changed) { break; } // Recompute means. means = means(clusters, means, relation); } LOG.setCompleted(prog); if (LOG.isStatistics()) { LOG.statistics(new LongStatistic(KEY + ".iterations", iteration)); } // Wrap result Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); for (int i = 0; i < clusters.size(); i++) { DBIDs ids = clusters.get(i); if (ids.size() == 0) { continue; } KMeansModel model = new KMeansModel(means[i], varsum[i]); result.addToplevelCluster(new Cluster<>(ids, model)); } return result; }
/** * Performs the SUBCLU algorithm on the given database. * * @param relation Relation to process * @return Clustering result */ public Clustering<SubspaceModel> run(Relation<V> relation) { final int dimensionality = RelationUtil.dimensionality(relation); StepProgress stepprog = LOG.isVerbose() ? new StepProgress(dimensionality) : null; // Generate all 1-dimensional clusters LOG.beginStep(stepprog, 1, "Generate all 1-dimensional clusters."); // mapping of dimensionality to set of subspaces HashMap<Integer, List<Subspace>> subspaceMap = new HashMap<>(); // list of 1-dimensional subspaces containing clusters List<Subspace> s_1 = new ArrayList<>(); subspaceMap.put(0, s_1); // mapping of subspaces to list of clusters TreeMap<Subspace, List<Cluster<Model>>> clusterMap = new TreeMap<>(new Subspace.DimensionComparator()); for (int d = 0; d < dimensionality; d++) { Subspace currentSubspace = new Subspace(d); List<Cluster<Model>> clusters = runDBSCAN(relation, null, currentSubspace); if (LOG.isDebuggingFiner()) { StringBuilder msg = new StringBuilder(); msg.append('\n') .append(clusters.size()) .append(" clusters in subspace ") .append(currentSubspace.dimensonsToString()) .append(": \n"); for (Cluster<Model> cluster : clusters) { msg.append(" " + cluster.getIDs() + "\n"); } LOG.debugFiner(msg.toString()); } if (!clusters.isEmpty()) { s_1.add(currentSubspace); clusterMap.put(currentSubspace, clusters); } } // Generate (d+1)-dimensional clusters from d-dimensional clusters for (int d = 0; d < dimensionality - 1; d++) { if (stepprog != null) { stepprog.beginStep( d + 2, "Generate " + (d + 2) + "-dimensional clusters from " + (d + 1) + "-dimensional clusters.", LOG); } List<Subspace> subspaces = subspaceMap.get(d); if (subspaces == null || subspaces.isEmpty()) { if (stepprog != null) { for (int dim = d + 1; dim < dimensionality - 1; dim++) { stepprog.beginStep( dim + 2, "Generation of" + (dim + 2) + "-dimensional clusters not applicable, because no more " + (d + 2) + 
"-dimensional subspaces found.", LOG); } } break; } List<Subspace> candidates = generateSubspaceCandidates(subspaces); List<Subspace> s_d = new ArrayList<>(); for (Subspace candidate : candidates) { Subspace bestSubspace = bestSubspace(subspaces, candidate, clusterMap); if (LOG.isDebuggingFine()) { LOG.debugFine( "best subspace of " + candidate.dimensonsToString() + ": " + bestSubspace.dimensonsToString()); } List<Cluster<Model>> bestSubspaceClusters = clusterMap.get(bestSubspace); List<Cluster<Model>> clusters = new ArrayList<>(); for (Cluster<Model> cluster : bestSubspaceClusters) { List<Cluster<Model>> candidateClusters = runDBSCAN(relation, cluster.getIDs(), candidate); if (!candidateClusters.isEmpty()) { clusters.addAll(candidateClusters); } } if (LOG.isDebuggingFine()) { StringBuilder msg = new StringBuilder(); msg.append(clusters.size() + " cluster(s) in subspace " + candidate + ": \n"); for (Cluster<Model> c : clusters) { msg.append(" " + c.getIDs() + "\n"); } LOG.debugFine(msg.toString()); } if (!clusters.isEmpty()) { s_d.add(candidate); clusterMap.put(candidate, clusters); } } if (!s_d.isEmpty()) { subspaceMap.put(d + 1, s_d); } } // build result int numClusters = 1; result = new Clustering<>("SUBCLU clustering", "subclu-clustering"); for (Subspace subspace : clusterMap.descendingKeySet()) { List<Cluster<Model>> clusters = clusterMap.get(subspace); for (Cluster<Model> cluster : clusters) { Cluster<SubspaceModel> newCluster = new Cluster<>(cluster.getIDs()); newCluster.setModel(new SubspaceModel(subspace, Centroid.make(relation, cluster.getIDs()))); newCluster.setName("cluster_" + numClusters++); result.addToplevelCluster(newCluster); } } LOG.setCompleted(stepprog); return result; }
/**
 * Performs the DOC or FastDOC (as configured) algorithm on the given Database.
 *
 * <p>Runs exhaustively: clusters are extracted and their points removed until
 * no further cluster is found, or too few points remain to form one. Leftover
 * points become a full-dimensional noise cluster.
 *
 * @param database Database
 * @param relation Data relation
 */
public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
  // Dimensionality of the data set.
  final int dim = RelationUtil.dimensionality(relation);
  // Mutable working set of all object ids; shrinks as clusters are removed.
  ArrayModifiableDBIDs remaining = DBIDUtil.newArray(relation.getDBIDs());

  // Precompute the trial counts as described in Figure 2 of the algorithm.
  double r = Math.abs(Math.log(dim + dim) / Math.log(beta * .5));
  // Number of outer trials.
  int outer = (int) (2. / alpha);
  // Number of inner trials per outer trial.
  int inner = (int) (Math.pow(2. / alpha, r) * Math.log(4));
  if (heuristics) {
    // Cap the inner trial count for the heuristic (FastDOC) variant.
    inner = Math.min(inner, Math.min(1000000, dim * dim));
  }
  // Smallest point count for an acceptable cluster.
  int minClusterSize = (int) (alpha * remaining.size());

  Clustering<SubspaceModel> result = new Clustering<>("DOC Clusters", "DOC");
  // Reports how many clusters have been found so far.
  IndefiniteProgress cprogress =
      LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null;

  // Keep extracting clusters while enough points remain to form one.
  while (remaining.size() > minClusterSize) {
    Cluster<SubspaceModel> found =
        heuristics
            ? runFastDOC(database, relation, remaining, dim, outer, inner, (int) r)
            : runDOC(database, relation, remaining, dim, outer, inner, (int) r, minClusterSize);
    if (found == null) {
      break; // No further cluster could be found.
    }
    result.addToplevelCluster(found);
    // Remove the cluster's points from the working set and continue.
    remaining.removeDBIDs(found.getIDs());
    if (cprogress != null) {
      cprogress.setProcessed(result.getAllClusters().size(), LOG);
    }
  }

  // Emit the remainder as a full-dimensional noise cluster.
  if (remaining.size() > 0) {
    long[] alldims = BitsUtil.ones(dim);
    result.addToplevelCluster(
        new Cluster<>(
            remaining,
            true,
            new SubspaceModel(
                new Subspace(alldims), Centroid.make(relation, remaining).getArrayRef())));
  }
  LOG.setCompleted(cprogress);
  return result;
}