/** * Generates {@code d+1}-dimensional subspace candidates from the specified {@code d}-dimensional * subspaces. * * @param subspaces the {@code d}-dimensional subspaces * @return the {@code d+1}-dimensional subspace candidates */ private List<Subspace> generateSubspaceCandidates(List<Subspace> subspaces) { List<Subspace> candidates = new ArrayList<>(); if (subspaces.isEmpty()) { return candidates; } // Generate (d+1)-dimensional candidate subspaces int d = subspaces.get(0).dimensionality(); StringBuilder msgFine = new StringBuilder("\n"); if (LOG.isDebuggingFiner()) { msgFine.append("subspaces ").append(subspaces).append('\n'); } for (int i = 0; i < subspaces.size(); i++) { Subspace s1 = subspaces.get(i); for (int j = i + 1; j < subspaces.size(); j++) { Subspace s2 = subspaces.get(j); Subspace candidate = s1.join(s2); if (candidate != null) { if (LOG.isDebuggingFiner()) { msgFine.append("candidate: ").append(candidate.dimensonsToString()).append('\n'); } // prune irrelevant candidate subspaces List<Subspace> lowerSubspaces = lowerSubspaces(candidate); if (LOG.isDebuggingFiner()) { msgFine.append("lowerSubspaces: ").append(lowerSubspaces).append('\n'); } boolean irrelevantCandidate = false; for (Subspace s : lowerSubspaces) { if (!subspaces.contains(s)) { irrelevantCandidate = true; break; } } if (!irrelevantCandidate) { candidates.add(candidate); } } } } if (LOG.isDebuggingFiner()) { LOG.debugFiner(msgFine.toString()); } if (LOG.isDebugging()) { StringBuilder msg = new StringBuilder(); msg.append(d + 1).append("-dimensional candidate subspaces: "); for (Subspace candidate : candidates) { msg.append(candidate.dimensonsToString()).append(' '); } LOG.debug(msg.toString()); } return candidates; }
/** * Performs the SUBCLU algorithm on the given database. * * @param relation Relation to process * @return Clustering result */ public Clustering<SubspaceModel> run(Relation<V> relation) { final int dimensionality = RelationUtil.dimensionality(relation); StepProgress stepprog = LOG.isVerbose() ? new StepProgress(dimensionality) : null; // Generate all 1-dimensional clusters LOG.beginStep(stepprog, 1, "Generate all 1-dimensional clusters."); // mapping of dimensionality to set of subspaces HashMap<Integer, List<Subspace>> subspaceMap = new HashMap<>(); // list of 1-dimensional subspaces containing clusters List<Subspace> s_1 = new ArrayList<>(); subspaceMap.put(0, s_1); // mapping of subspaces to list of clusters TreeMap<Subspace, List<Cluster<Model>>> clusterMap = new TreeMap<>(new Subspace.DimensionComparator()); for (int d = 0; d < dimensionality; d++) { Subspace currentSubspace = new Subspace(d); List<Cluster<Model>> clusters = runDBSCAN(relation, null, currentSubspace); if (LOG.isDebuggingFiner()) { StringBuilder msg = new StringBuilder(); msg.append('\n') .append(clusters.size()) .append(" clusters in subspace ") .append(currentSubspace.dimensonsToString()) .append(": \n"); for (Cluster<Model> cluster : clusters) { msg.append(" " + cluster.getIDs() + "\n"); } LOG.debugFiner(msg.toString()); } if (!clusters.isEmpty()) { s_1.add(currentSubspace); clusterMap.put(currentSubspace, clusters); } } // Generate (d+1)-dimensional clusters from d-dimensional clusters for (int d = 0; d < dimensionality - 1; d++) { if (stepprog != null) { stepprog.beginStep( d + 2, "Generate " + (d + 2) + "-dimensional clusters from " + (d + 1) + "-dimensional clusters.", LOG); } List<Subspace> subspaces = subspaceMap.get(d); if (subspaces == null || subspaces.isEmpty()) { if (stepprog != null) { for (int dim = d + 1; dim < dimensionality - 1; dim++) { stepprog.beginStep( dim + 2, "Generation of" + (dim + 2) + "-dimensional clusters not applicable, because no more " + (d + 2) + "-dimensional subspaces found.", LOG); } } break; } List<Subspace> candidates = generateSubspaceCandidates(subspaces); List<Subspace> s_d = new ArrayList<>(); for (Subspace candidate : candidates) { Subspace bestSubspace = bestSubspace(subspaces, candidate, clusterMap); if (LOG.isDebuggingFine()) { LOG.debugFine( "best subspace of " + candidate.dimensonsToString() + ": " + bestSubspace.dimensonsToString()); } List<Cluster<Model>> bestSubspaceClusters = clusterMap.get(bestSubspace); List<Cluster<Model>> clusters = new ArrayList<>(); for (Cluster<Model> cluster : bestSubspaceClusters) { List<Cluster<Model>> candidateClusters = runDBSCAN(relation, cluster.getIDs(), candidate); if (!candidateClusters.isEmpty()) { clusters.addAll(candidateClusters); } } if (LOG.isDebuggingFine()) { StringBuilder msg = new StringBuilder(); msg.append(clusters.size() + " cluster(s) in subspace " + candidate + ": \n"); for (Cluster<Model> c : clusters) { msg.append(" " + c.getIDs() + "\n"); } LOG.debugFine(msg.toString()); } if (!clusters.isEmpty()) { s_d.add(candidate); clusterMap.put(candidate, clusters); } } if (!s_d.isEmpty()) { subspaceMap.put(d + 1, s_d); } } // build result int numClusters = 1; result = new Clustering<>("SUBCLU clustering", "subclu-clustering"); for (Subspace subspace : clusterMap.descendingKeySet()) { List<Cluster<Model>> clusters = clusterMap.get(subspace); for (Cluster<Model> cluster : clusters) { Cluster<SubspaceModel> newCluster = new Cluster<>(cluster.getIDs()); newCluster.setModel(new SubspaceModel(subspace, Centroid.make(relation, cluster.getIDs()))); newCluster.setName("cluster_" + numClusters++); result.addToplevelCluster(newCluster); } } LOG.setCompleted(stepprog); return result; }
/** * Performs a single run of DOC, finding a single cluster. * * @param database Database context * @param relation used to get actual values for DBIDs. * @param S The set of points we're working on. * @param d Dimensionality of the data set we're currently working on. * @param r Size of random samples. * @param m Number of inner iterations (per seed point). * @param n Number of outer iterations (seed points). * @param minClusterSize Minimum size a cluster must have to be accepted. * @return a cluster, if one is found, else <code>null</code>. */ private Cluster<SubspaceModel> runDOC( Database database, Relation<V> relation, ArrayModifiableDBIDs S, final int d, int n, int m, int r, int minClusterSize) { // Best cluster for the current run. DBIDs C = null; // Relevant attributes for the best cluster. long[] D = null; // Quality of the best cluster. double quality = Double.NEGATIVE_INFINITY; // Bounds for our cluster. // ModifiableHyperBoundingBox bounds = new ModifiableHyperBoundingBox(new // double[d], new double[d]); // Weights for distance (= rectangle query) SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(BitsUtil.zero(d)); DistanceQuery<V> dq = database.getDistanceQuery(relation, df); RangeQuery<V> rq = database.getRangeQuery(dq); // Inform the user about the progress in the current iteration. FiniteProgress iprogress = LOG.isVerbose() ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG) : null; Random random = rnd.getSingleThreadedRandom(); DBIDArrayIter iter = S.iter(); for (int i = 0; i < n; ++i) { // Pick a random seed point. iter.seek(random.nextInt(S.size())); for (int j = 0; j < m; ++j) { // Choose a set of random points. DBIDs randomSet = DBIDUtil.randomSample(S, r, random); // Initialize cluster info. long[] nD = BitsUtil.zero(d); // Test each dimension and build bounding box. for (int k = 0; k < d; ++k) { if (dimensionIsRelevant(k, relation, randomSet)) { BitsUtil.setI(nD, k); } } if (BitsUtil.cardinality(nD) > 0) { // Get all points in the box. df.setSelectedDimensions(nD); // TODO: add filtering capabilities into query API! DBIDs nC = DBIDUtil.intersection(S, rq.getRangeForDBID(iter, w)); if (LOG.isDebuggingFiner()) { LOG.finer( "Testing a cluster candidate, |C| = " + nC.size() + ", |D| = " + BitsUtil.cardinality(nD)); } // Is the cluster large enough? if (nC.size() < minClusterSize) { // Too small. if (LOG.isDebuggingFiner()) { LOG.finer("... but it's too small."); } } else { // Better cluster than before? double nQuality = computeClusterQuality(nC.size(), BitsUtil.cardinality(nD)); if (nQuality > quality) { if (LOG.isDebuggingFiner()) { LOG.finer("... and it's the best so far: " + nQuality + " vs. " + quality); } C = nC; D = nD; quality = nQuality; } else { if (LOG.isDebuggingFiner()) { LOG.finer("... but we already have a better one."); } } } } LOG.incrementProcessed(iprogress); } } LOG.ensureCompleted(iprogress); return (C != null) ? makeCluster(relation, C, D) : null; }