@Override
protected void prepareComplete() {
  // Collect a debug summary only when fine debugging is enabled.
  StringBuilder buf = LOG.isDebuggingFine() ? new StringBuilder() : null;
  scalingreferencevalues = new double[dimensionality];
  randomPerAttribute = new Random[dimensionality];
  if (scalingreference == ScalingReference.STDDEV) {
    if (buf != null) {
      buf.append("Standard deviation per attribute: ");
    }
    for (int d = 0; d < dimensionality; d++) {
      setupAttribute(d, mvs[d].getSampleStddev() * percentage, buf);
    }
  } else if (scalingreference == ScalingReference.MINMAX && minima.length == 0 && maxima.length == 0) {
    if (buf != null) {
      buf.append("extension per attribute: ");
    }
    for (int d = 0; d < dimensionality; d++) {
      setupAttribute(d, (mvs[d].getMax() - mvs[d].getMin()) * percentage, buf);
    }
  }
  // Release the statistics; they are no longer needed after preparation.
  mvs = null;
  if (buf != null) {
    LOG.debugFine(buf.toString());
  }
}

/**
 * Initialize the scaling reference value and per-attribute random generator for one dimension.
 *
 * <p>Shared by the STDDEV and MINMAX branches of {@link #prepareComplete()}.
 *
 * @param d Attribute index
 * @param scaled Reference value already scaled by {@code percentage}
 * @param buf Debug output buffer, or {@code null} when fine debugging is disabled
 */
private void setupAttribute(int d, double scaled, StringBuilder buf) {
  // Degenerate attributes (constant, or no data: reference 0 or NaN) fall back to
  // the raw percentage, so the jitter never collapses to zero.
  scalingreferencevalues[d] = (scaled == 0 || Double.isNaN(scaled)) ? percentage : scaled;
  // Seed each attribute's generator from the shared master RNG for reproducibility.
  randomPerAttribute[d] = new Random(RANDOM.nextLong());
  if (buf != null) {
    buf.append(" ").append(d).append(": ").append(scalingreferencevalues[d] / percentage);
  }
}
/** * Algorithm 3 of Cheng and Church. * * <p>Try to re-add rows or columns that decrease the overall score. * * <p>Also try adding inverted rows. * * @param mat Data matrix * @param cand Bicluster candidate */ private void nodeAddition(final double[][] mat, final BiclusterCandidate cand) { cand.updateRowAndColumnMeans(mat, true); cand.computeMeanSquaredDeviation(mat); while (true) { // We need this to be final + mutable final boolean[] added = new boolean[] {false, false}; // Step 2: add columns cand.visitRow( mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { assert (!selcol); if (cand.computeColResidue(mat, col) <= cand.residue) { cand.selectColumn(col, true); added[0] = true; } return false; } }); // Step 3: recompute values if (added[0]) { cand.updateRowAndColumnMeans(mat, true); cand.computeMeanSquaredDeviation(mat); } // Step 4: try adding rows. cand.visitColumn( mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { assert (!selrow); if (cand.computeRowResidue(mat, row, false) <= cand.residue) { cand.selectRow(row, true); added[1] = true; } return false; } }); // Step 5: try adding inverted rows. if (useinverted) { cand.visitColumn( mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { assert (!selrow); if (cand.computeRowResidue(mat, row, true) <= cand.residue) { cand.selectRow(row, true); cand.invertRow(row, true); added[1] = true; } return false; } }); } if (added[1]) { cand.updateRowAndColumnMeans(mat, true); cand.computeMeanSquaredDeviation(mat); if (LOG.isDebuggingFine()) { LOG.debugFine( "Residue in Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); } } if (!added[0] && !added[1]) { break; } } }
/** * Algorithm 1 of Cheng and Church: * * <p>Remove single rows or columns. * * <p>Inverted rows are not supported in this method. * * @param mat Data matrix * @param cand Bicluster candidate */ private void singleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) { // Assume that cand.residue is up to date! while (cand.residue > delta && (cand.colcard > 2 || cand.rowcard > 2)) { // Store current maximum. Need final mutable, so use arrays. final double[] max = {Double.NEGATIVE_INFINITY}; final int[] best = {-1, -1}; // Test rows if (cand.rowcard > 2) { cand.visitColumn( mat, 0, CellVisitor.SELECTED, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { assert (selrow); double rowResidue = cand.computeRowResidue(mat, row, false); if (max[0] < rowResidue) { max[0] = rowResidue; best[0] = row; } return false; } }); } // Test columns: if (cand.colcard > 2) { cand.visitRow( mat, 0, CellVisitor.SELECTED, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { assert (selcol); double colResidue = cand.computeColResidue(mat, col); if (max[0] < colResidue) { max[0] = colResidue; best[1] = col; } return false; } }); } if (best[1] >= 0) { // then override bestrow! cand.selectColumn(best[1], false); } else { assert (best[0] >= 0); cand.selectRow(best[0], false); } // TODO: incremental update could be much faster? cand.updateRowAndColumnMeans(mat, false); cand.computeMeanSquaredDeviation(mat); if (LOG.isDebuggingFine()) { LOG.debugFine( "Residue in Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); } } }
/** * Algorithm 2 of Cheng and Church. * * <p>Remove all rows and columns that reduce the residue by alpha. * * <p>Inverted rows are not supported in this method. * * @param mat Data matrix * @param cand Bicluster candidate */ private void multipleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) { cand.updateRowAndColumnMeans(mat, false); cand.computeMeanSquaredDeviation(mat); // Note: assumes that cand.residue = H(I,J) while (cand.residue > delta) { final boolean[] modified = {false, false}; // Step 2: remove rows above threshold if (cand.rowcard > MIN_ROW_REMOVE_THRESHOLD) { final double alphaResidue = alpha * cand.residue; cand.visitColumn( mat, 0, CellVisitor.SELECTED, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { assert (selrow); if (cand.computeRowResidue(mat, row, false) > alphaResidue) { cand.selectRow(row, false); modified[0] = true; } return (cand.rowcard > MIN_ROW_REMOVE_THRESHOLD); } }); // Step 3: update residue if (modified[0]) { cand.updateRowAndColumnMeans(mat, false); cand.computeMeanSquaredDeviation(mat); } } // Step 4: remove columns above threshold if (cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD) { final double alphaResidue = alpha * cand.residue; cand.visitRow( mat, 0, CellVisitor.SELECTED, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { assert (selcol); if (cand.computeColResidue(mat, col) > alphaResidue) { cand.selectColumn(col, false); modified[1] = true; } return (cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD); } }); if (modified[1]) { cand.updateRowAndColumnMeans(mat, false); cand.computeMeanSquaredDeviation(mat); } } if (LOG.isDebuggingFine()) { LOG.debugFine( "Residue in Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); } // Step 5: if nothing has been removed, try removing single nodes. 
if (!modified[0] && !modified[1]) { break; // Will be executed next in main loop, as per algorithm 4. // singleNodeDeletion(); } } }
/** * Performs the SUBCLU algorithm on the given database. * * @param relation Relation to process * @return Clustering result */ public Clustering<SubspaceModel> run(Relation<V> relation) { final int dimensionality = RelationUtil.dimensionality(relation); StepProgress stepprog = LOG.isVerbose() ? new StepProgress(dimensionality) : null; // Generate all 1-dimensional clusters LOG.beginStep(stepprog, 1, "Generate all 1-dimensional clusters."); // mapping of dimensionality to set of subspaces HashMap<Integer, List<Subspace>> subspaceMap = new HashMap<>(); // list of 1-dimensional subspaces containing clusters List<Subspace> s_1 = new ArrayList<>(); subspaceMap.put(0, s_1); // mapping of subspaces to list of clusters TreeMap<Subspace, List<Cluster<Model>>> clusterMap = new TreeMap<>(new Subspace.DimensionComparator()); for (int d = 0; d < dimensionality; d++) { Subspace currentSubspace = new Subspace(d); List<Cluster<Model>> clusters = runDBSCAN(relation, null, currentSubspace); if (LOG.isDebuggingFiner()) { StringBuilder msg = new StringBuilder(); msg.append('\n') .append(clusters.size()) .append(" clusters in subspace ") .append(currentSubspace.dimensonsToString()) .append(": \n"); for (Cluster<Model> cluster : clusters) { msg.append(" " + cluster.getIDs() + "\n"); } LOG.debugFiner(msg.toString()); } if (!clusters.isEmpty()) { s_1.add(currentSubspace); clusterMap.put(currentSubspace, clusters); } } // Generate (d+1)-dimensional clusters from d-dimensional clusters for (int d = 0; d < dimensionality - 1; d++) { if (stepprog != null) { stepprog.beginStep( d + 2, "Generate " + (d + 2) + "-dimensional clusters from " + (d + 1) + "-dimensional clusters.", LOG); } List<Subspace> subspaces = subspaceMap.get(d); if (subspaces == null || subspaces.isEmpty()) { if (stepprog != null) { for (int dim = d + 1; dim < dimensionality - 1; dim++) { stepprog.beginStep( dim + 2, "Generation of" + (dim + 2) + "-dimensional clusters not applicable, because no more " + (d + 2) + 
"-dimensional subspaces found.", LOG); } } break; } List<Subspace> candidates = generateSubspaceCandidates(subspaces); List<Subspace> s_d = new ArrayList<>(); for (Subspace candidate : candidates) { Subspace bestSubspace = bestSubspace(subspaces, candidate, clusterMap); if (LOG.isDebuggingFine()) { LOG.debugFine( "best subspace of " + candidate.dimensonsToString() + ": " + bestSubspace.dimensonsToString()); } List<Cluster<Model>> bestSubspaceClusters = clusterMap.get(bestSubspace); List<Cluster<Model>> clusters = new ArrayList<>(); for (Cluster<Model> cluster : bestSubspaceClusters) { List<Cluster<Model>> candidateClusters = runDBSCAN(relation, cluster.getIDs(), candidate); if (!candidateClusters.isEmpty()) { clusters.addAll(candidateClusters); } } if (LOG.isDebuggingFine()) { StringBuilder msg = new StringBuilder(); msg.append(clusters.size() + " cluster(s) in subspace " + candidate + ": \n"); for (Cluster<Model> c : clusters) { msg.append(" " + c.getIDs() + "\n"); } LOG.debugFine(msg.toString()); } if (!clusters.isEmpty()) { s_d.add(candidate); clusterMap.put(candidate, clusters); } } if (!s_d.isEmpty()) { subspaceMap.put(d + 1, s_d); } } // build result int numClusters = 1; result = new Clustering<>("SUBCLU clustering", "subclu-clustering"); for (Subspace subspace : clusterMap.descendingKeySet()) { List<Cluster<Model>> clusters = clusterMap.get(subspace); for (Cluster<Model> cluster : clusters) { Cluster<SubspaceModel> newCluster = new Cluster<>(cluster.getIDs()); newCluster.setModel(new SubspaceModel(subspace, Centroid.make(relation, cluster.getIDs()))); newCluster.setName("cluster_" + numClusters++); result.addToplevelCluster(newCluster); } } LOG.setCompleted(stepprog); return result; }