/**
 * Performs the DBSCAN algorithm on the given relation.
 *
 * @param relation the relation to cluster
 * @return the resulting clustering, with one top-level noise cluster
 */
public Clustering<Model> run(Relation<O> relation) {
  final int size = relation.size();
  if (size < minpts) {
    // Degenerate case: fewer objects than minpts, so everything is noise.
    Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
    result.addToplevelCluster(
        new Cluster<Model>(relation.getDBIDs(), true, ClusterModel.CLUSTER));
    return result;
  }

  RangeQuery<O> rangeQuery = QueryUtil.getRangeQuery(relation, getDistanceFunction());
  resultList = new ArrayList<>();
  noise = DBIDUtil.newHashSet();
  runDBSCAN(relation, rangeQuery);

  // Report the average neighborhood size, and warn if epsilon looks badly chosen.
  double averagen = ncounter / (double) relation.size();
  LOG.statistics(new DoubleStatistic(DBSCAN.class.getName() + ".average-neighbors", averagen));
  if (averagen < 1 + 0.1 * (minpts - 1)) {
    LOG.warning("There are very few neighbors found. Epsilon may be too small.");
  }
  if (averagen > 100 * minpts) {
    LOG.warning("There are very many neighbors found. Epsilon may be too large.");
  }

  Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
  for (ModifiableDBIDs res : resultList) {
    result.addToplevelCluster(new Cluster<Model>(res, ClusterModel.CLUSTER));
  }
  result.addToplevelCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER));
  return result;
}
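/*
 * Usage sketch for run() above (illustrative, not from this file): the DBSCAN
 * constructor signature and EuclideanDistanceFunction.STATIC are assumptions
 * in the style of the surrounding ELKI-like API.
 */
public static void exampleRunDBSCAN(Relation<DoubleVector> relation) {
  // Hypothetical parameterization: epsilon = 0.5, minpts = 5.
  DBSCAN<DoubleVector> dbscan = new DBSCAN<>(EuclideanDistanceFunction.STATIC, 0.5, 5);
  Clustering<Model> clustering = dbscan.run(relation);
  for (Cluster<Model> c : clustering.getAllClusters()) {
    System.out.println((c.isNoise() ? "Noise: " : "Cluster: ") + c.size() + " objects");
  }
}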
/**
 * Handles a DataStoreEvent with the specified type. If the current event type is not equal to the
 * specified type, the events accumulated up to now will be fired first.
 *
 * <p>The new event will be aggregated and fired on demand if {@link #accumulateDataStoreEvents}
 * is set, otherwise all registered <code>DataStoreListener</code>s will be notified immediately
 * that the content of the database has changed.
 *
 * @param objects the objects that have been changed, i.e. inserted, deleted or updated
 * @param type the type of change (insertion, removal, or update)
 */
private void fireObjectsChanged(DBIDs objects, Type type) {
  // Flush accumulated events of a different type first.
  if (currentDataStoreEventType != null && !currentDataStoreEventType.equals(type)) {
    flushDataStoreEvents();
  }
  if (accumulateDataStoreEvents) {
    if (this.dataStoreObjects == null) {
      this.dataStoreObjects = DBIDUtil.newHashSet();
    }
    this.dataStoreObjects.addDBIDs(objects);
    currentDataStoreEventType = type;
    return;
  }
  // Execute immediately:
  DataStoreEvent e;
  switch (type) {
    case INSERT:
      e = DataStoreEvent.insertionEvent(objects);
      break;
    case REMOVE:
      e = DataStoreEvent.removalEvent(objects);
      break;
    case UPDATE:
      e = DataStoreEvent.updateEvent(objects);
      break;
    default:
      return;
  }
  for (int i = dataListenerList.size(); --i >= 0; ) {
    dataListenerList.get(i).contentChanged(e);
  }
}
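/*
 * Illustrative call sequence for fireObjectsChanged() (listener wiring and the
 * ID sets are hypothetical): a type change forces a flush, so listeners always
 * see batches of a single event type, in order.
 */
private void exampleEventBatching(DBIDs inserted, DBIDs removed) {
  accumulateDataStoreEvents = true;
  fireObjectsChanged(inserted, Type.INSERT); // buffered, no listener call yet
  fireObjectsChanged(removed, Type.REMOVE);  // type changed: INSERT batch fires, REMOVE buffered
  flushDataStoreEvents();                    // delivers the pending REMOVE batch
}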
/**
 * Utility method to create a subspace cluster from a list of DBIDs and the relevant attributes.
 *
 * @param relation the relation, used to compute the cluster centroid
 * @param C the cluster points
 * @param D the relevant dimensions
 * @return an object representing the subspace cluster
 */
private Cluster<SubspaceModel> makeCluster(Relation<V> relation, DBIDs C, long[] D) {
  DBIDs ids = DBIDUtil.newHashSet(C); // copy, also to lose distance values!
  Cluster<SubspaceModel> cluster = new Cluster<>(ids);
  cluster.setModel(
      new SubspaceModel(new Subspace(D), Centroid.make(relation, ids).getArrayRef()));
  return cluster;
}
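/*
 * Sketch of building the dimension bitmask D for makeCluster(); BitsUtil is
 * used elsewhere in this codebase, but the attributes selected here are purely
 * illustrative.
 */
private Cluster<SubspaceModel> exampleMakeCluster(Relation<V> relation, DBIDs members) {
  long[] dims = BitsUtil.zero(RelationUtil.dimensionality(relation));
  BitsUtil.setI(dims, 0); // attribute 0 is relevant for this subspace
  BitsUtil.setI(dims, 2); // attribute 2 is relevant for this subspace
  return makeCluster(relation, members, dims);
}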
/**
 * Run k-means: alternately assign each point to its nearest mean and recompute the means, until
 * no assignment changes or the iteration limit is reached.
 *
 * @param database the database
 * @param relation the data relation to cluster
 * @return the k-means clustering
 */
@Override
public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
  if (relation.size() <= 0) {
    return new Clustering<>("k-Means Clustering", "kmeans-clustering");
  }
  // Choose initial means
  if (LOG.isStatistics()) {
    LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
  }
  double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());

  // Setup cluster assignment store
  List<ModifiableDBIDs> clusters = new ArrayList<>();
  for (int i = 0; i < k; i++) {
    clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
  }
  WritableIntegerDataStore assignment =
      DataStoreUtil.makeIntegerStorage(
          relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
  double[] varsum = new double[k];

  IndefiniteProgress prog =
      LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
  DoubleStatistic varstat =
      LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null;

  int iteration = 0;
  // maxiter <= 0 means no iteration limit.
  for (; maxiter <= 0 || iteration < maxiter; iteration++) {
    LOG.incrementProcessed(prog);
    boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum);
    logVarstat(varstat, varsum);
    // Stop if no cluster assignment changed.
    if (!changed) {
      break;
    }
    // Recompute means.
    means = means(clusters, means, relation);
  }
  LOG.setCompleted(prog);
  if (LOG.isStatistics()) {
    LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
  }

  // Wrap result
  Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
  for (int i = 0; i < clusters.size(); i++) {
    DBIDs ids = clusters.get(i);
    if (ids.size() == 0) {
      continue; // Skip empty clusters.
    }
    KMeansModel model = new KMeansModel(means[i], varsum[i]);
    result.addToplevelCluster(new Cluster<>(ids, model));
  }
  return result;
}
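/*
 * Reading the k-means result produced above (sketch): each non-empty cluster
 * carries its mean and its contribution to the variance sum. The accessor name
 * getVarianceContribution() is an assumption about KMeansModel.
 */
private void exampleInspectKMeans(Clustering<KMeansModel> result) {
  for (Cluster<KMeansModel> c : result.getAllClusters()) {
    KMeansModel model = c.getModel();
    System.out.println(c.size() + " objects, varsum " + model.getVarianceContribution());
  }
}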
/**
 * Run Cheng-and-Church biclustering: extract n biclusters one at a time via node deletion and
 * node addition, masking each found bicluster before searching for the next.
 */
@Override
public Clustering<BiclusterWithInversionsModel> biclustering() {
  double[][] mat = RelationUtil.relationAsMatrix(relation, rowIDs);

  BiclusterCandidate cand = new BiclusterCandidate(getRowDim(), getColDim());

  Clustering<BiclusterWithInversionsModel> result =
      new Clustering<>("Cheng-and-Church", "Cheng and Church Biclustering");
  ModifiableDBIDs noise = DBIDUtil.newHashSet(relation.getDBIDs());

  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Extracting Cluster", n, LOG) : null;
  for (int i = 0; i < n; i++) {
    cand.reset();
    multipleNodeDeletion(mat, cand);
    if (LOG.isVeryVerbose()) {
      LOG.veryverbose(
          "Residue after Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
    }
    singleNodeDeletion(mat, cand);
    if (LOG.isVeryVerbose()) {
      LOG.veryverbose(
          "Residue after Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
    }
    nodeAddition(mat, cand);
    if (LOG.isVeryVerbose()) {
      LOG.veryverbose(
          "Residue after Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
    }
    cand.maskMatrix(mat, dist);
    BiclusterWithInversionsModel model =
        new BiclusterWithInversionsModel(colsBitsetToIDs(cand.cols), rowsBitsetToIDs(cand.irow));
    final ArrayDBIDs cids = rowsBitsetToIDs(cand.rows);
    noise.removeDBIDs(cids);
    result.addToplevelCluster(new Cluster<>(cids, model));

    if (LOG.isVerbose()) {
      LOG.verbose("Score of bicluster " + (i + 1) + ": " + cand.residue + "\n");
      LOG.verbose("Number of rows: " + cand.rowcard + "\n");
      LOG.verbose("Number of columns: " + cand.colcard + "\n");
      // LOG.verbose("Total number of masked values: " + maskedVals.size() + "\n");
    }
    LOG.incrementProcessed(prog);
  }
  // Add a noise cluster, full-dimensional.
  if (!noise.isEmpty()) {
    long[] allcols = BitsUtil.ones(getColDim());
    BiclusterWithInversionsModel model =
        new BiclusterWithInversionsModel(colsBitsetToIDs(allcols), DBIDUtil.EMPTYDBIDS);
    result.addToplevelCluster(new Cluster<>(noise, true, model));
  }
  LOG.ensureCompleted(prog);
  return result;
}
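/*
 * For reference, the residue that the node deletion/addition phases above
 * drive down is the Cheng-and-Church mean squared residue
 * H(I,J) = (1/(|I||J|)) * sum_{i in I, j in J} (a_ij - a_iJ - a_Ij + a_IJ)^2.
 * A standalone sketch over a plain matrix; the algorithm itself evaluates this
 * only on the candidate rows and columns.
 */
private static double meanSquaredResidue(double[][] m) {
  int rows = m.length, cols = m[0].length;
  double[] rowMean = new double[rows], colMean = new double[cols];
  double allMean = 0.;
  for (int i = 0; i < rows; i++) {
    for (int j = 0; j < cols; j++) {
      rowMean[i] += m[i][j];
      colMean[j] += m[i][j];
      allMean += m[i][j];
    }
  }
  for (int i = 0; i < rows; i++) {
    rowMean[i] /= cols;
  }
  for (int j = 0; j < cols; j++) {
    colMean[j] /= rows;
  }
  allMean /= rows * (double) cols;
  double h = 0.;
  for (int i = 0; i < rows; i++) {
    for (int j = 0; j < cols; j++) {
      double r = m[i][j] - rowMean[i] - colMean[j] + allMean;
      h += r * r;
    }
  }
  return h / (rows * (double) cols);
}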
/**
 * Handles a DataStoreEvent with the specified type. If the current event type is not equal to the
 * specified type, the events accumulated up to now will be fired first.
 *
 * <p>The new event will be aggregated and fired on demand if {@link #accumulateDataStoreEvents}
 * is set, otherwise all registered <code>DataStoreListener</code>s will be notified immediately
 * that the content of the database has changed.
 *
 * @param object the object that has been changed, i.e. inserted, deleted or updated
 * @param type the type of change (insertion, removal, or update)
 */
private void fireObjectChanged(DBIDRef object, Type type) {
  // Flush accumulated events of a different type first.
  if (currentDataStoreEventType != null && !currentDataStoreEventType.equals(type)) {
    flushDataStoreEvents();
  }
  if (this.dataStoreObjects == null) {
    this.dataStoreObjects = DBIDUtil.newHashSet();
  }
  this.dataStoreObjects.add(object);
  currentDataStoreEventType = type;

  if (!accumulateDataStoreEvents) {
    flushDataStoreEvents();
  }
}
/**
 * DBSCAN-function expandCluster.
 *
 * <p>Border objects become members of the first possible cluster.
 *
 * @param relation Database relation to run on
 * @param rangeQuery Range query to use
 * @param startObjectID potential seed of a new cluster
 * @param objprog the progress object for logging the current status
 * @param clusprog the progress object for logging the number of clusters
 */
protected void expandCluster(
    Relation<O> relation,
    RangeQuery<O> rangeQuery,
    DBIDRef startObjectID,
    FiniteProgress objprog,
    IndefiniteProgress clusprog) {
  DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(startObjectID, epsilon);
  ncounter += neighbors.size();

  // startObject is no core-object
  if (neighbors.size() < minpts) {
    noise.add(startObjectID);
    processedIDs.add(startObjectID);
    if (objprog != null) {
      objprog.incrementProcessed(LOG);
    }
    return;
  }

  ModifiableDBIDs currentCluster = DBIDUtil.newArray();
  currentCluster.add(startObjectID);
  processedIDs.add(startObjectID);

  // try to expand the cluster
  HashSetModifiableDBIDs seeds = DBIDUtil.newHashSet();
  processNeighbors(neighbors.iter(), currentCluster, seeds);

  DBIDVar o = DBIDUtil.newVar();
  while (!seeds.isEmpty()) {
    seeds.pop(o);
    neighbors = rangeQuery.getRangeForDBID(o, epsilon);
    ncounter += neighbors.size();

    if (neighbors.size() >= minpts) {
      processNeighbors(neighbors.iter(), currentCluster, seeds);
    }

    if (objprog != null) {
      objprog.incrementProcessed(LOG);
    }
  }
  resultList.add(currentCluster);
  if (clusprog != null) {
    clusprog.setProcessed(resultList.size(), LOG);
  }
}
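/*
 * processNeighbors() is not part of this excerpt; the following is only a
 * sketch consistent with how it is called above: unseen neighbors become new
 * seeds, and neighbors previously classified as noise are reclaimed into the
 * current cluster.
 */
private void processNeighborsSketch(
    DBIDIter neighbor, ModifiableDBIDs currentCluster, HashSetModifiableDBIDs seeds) {
  for (; neighbor.valid(); neighbor.advance()) {
    if (processedIDs.add(neighbor)) {
      seeds.add(neighbor); // newly discovered: expand from it later
    } else if (!noise.remove(neighbor)) {
      continue; // already assigned to a cluster
    }
    currentCluster.add(neighbor);
  }
}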
/**
 * Run the DBSCAN algorithm
 *
 * @param relation Data relation
 * @param rangeQuery Range query class
 */
protected void runDBSCAN(Relation<O> relation, RangeQuery<O> rangeQuery) {
  final int size = relation.size();
  FiniteProgress objprog =
      LOG.isVerbose() ? new FiniteProgress("Processing objects", size, LOG) : null;
  IndefiniteProgress clusprog =
      LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null;

  processedIDs = DBIDUtil.newHashSet(size);
  for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
    if (!processedIDs.contains(iditer)) {
      expandCluster(relation, rangeQuery, iditer, objprog, clusprog);
    }
    if (objprog != null && clusprog != null) {
      objprog.setProcessed(processedIDs.size(), LOG);
      clusprog.setProcessed(resultList.size(), LOG);
    }
    if (processedIDs.size() == size) {
      break;
    }
  }
  // Finish progress logging
  LOG.ensureCompleted(objprog);
  LOG.setCompleted(clusprog);
}
/**
 * Evaluate a single outlier result as a histogram.
 *
 * @param database Database to process
 * @param or Outlier result
 * @return Result
 */
public HistogramResult<DoubleVector> evaluateOutlierResult(Database database, OutlierResult or) {
  if (scaling instanceof OutlierScalingFunction) {
    OutlierScalingFunction oscaling = (OutlierScalingFunction) scaling;
    oscaling.prepare(or);
  }

  ModifiableDBIDs ids = DBIDUtil.newHashSet(or.getScores().getDBIDs());
  DBIDs outlierIds = DatabaseUtil.getObjectsByLabelMatch(database, positiveClassName);

  // Pair layout: the first component accumulates inlier (negative) mass,
  // the second component outlier (positive) mass.
  // If we have useful (finite) min/max, use these for binning.
  double min = scaling.getMin();
  double max = scaling.getMax();
  final ObjHistogram<DoubleDoublePair> hist;
  if (Double.isInfinite(min) || Double.isNaN(min) || Double.isInfinite(max) || Double.isNaN(max)) {
    hist =
        new AbstractObjDynamicHistogram<DoubleDoublePair>(bins) {
          @Override
          public DoubleDoublePair aggregate(DoubleDoublePair first, DoubleDoublePair second) {
            first.first += second.first;
            first.second += second.second;
            return first;
          }

          @Override
          protected DoubleDoublePair makeObject() {
            return new DoubleDoublePair(0., 0.);
          }

          @Override
          protected DoubleDoublePair cloneForCache(DoubleDoublePair data) {
            return new DoubleDoublePair(data.first, data.second);
          }

          @Override
          protected DoubleDoublePair downsample(Object[] data, int start, int end, int size) {
            DoubleDoublePair sum = new DoubleDoublePair(0, 0);
            for (int i = start; i < end; i++) {
              DoubleDoublePair p = (DoubleDoublePair) data[i];
              if (p != null) {
                sum.first += p.first;
                sum.second += p.second;
              }
            }
            return sum;
          }
        };
  } else {
    hist =
        new AbstractObjStaticHistogram<DoubleDoublePair>(bins, min, max) {
          @Override
          protected DoubleDoublePair makeObject() {
            return new DoubleDoublePair(0., 0.);
          }

          @Override
          public void putData(double coord, DoubleDoublePair data) {
            DoubleDoublePair exist = get(coord);
            exist.first += data.first;
            exist.second += data.second;
          }
        };
  }

  // Per-object frequency weights for the negative (inlier) and positive (outlier) class.
  DoubleDoublePair negative, positive;
  if (!splitfreq) {
    negative = new DoubleDoublePair(1. / ids.size(), 0);
    positive = new DoubleDoublePair(0, 1. / ids.size());
  } else {
    negative = new DoubleDoublePair(1. / (ids.size() - outlierIds.size()), 0);
    positive = new DoubleDoublePair(0, 1. / outlierIds.size());
  }
  ids.removeDBIDs(outlierIds);

  // fill histogram with values of each object
  for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
    double result = or.getScores().doubleValue(iter);
    result = scaling.getScaled(result);
    if (result > Double.NEGATIVE_INFINITY && result < Double.POSITIVE_INFINITY) {
      hist.putData(result, negative);
    }
  }
  for (DBIDIter iter = outlierIds.iter(); iter.valid(); iter.advance()) {
    double result = or.getScores().doubleValue(iter);
    result = scaling.getScaled(result);
    if (result > Double.NEGATIVE_INFINITY && result < Double.POSITIVE_INFINITY) {
      hist.putData(result, positive);
    }
  }

  // Transform the histogram into the output format.
  Collection<DoubleVector> collHist = new ArrayList<>(hist.getNumBins());
  for (ObjHistogram.Iter<DoubleDoublePair> iter = hist.iter(); iter.valid(); iter.advance()) {
    DoubleDoublePair data = iter.getValue();
    DoubleVector row =
        new DoubleVector(new double[] {iter.getCenter(), data.first, data.second});
    collHist.add(row);
  }
  return new HistogramResult<>("Outlier Score Histogram", "outlier-histogram", collHist);
}
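/*
 * Weight bookkeeping for the histogram above, as a standalone sketch with
 * illustrative counts: without splitfreq, every object contributes 1/N to its
 * class column, so the two columns together sum to 1; with splitfreq, inliers
 * and outliers are normalized separately, so each column sums to 1 on its own.
 */
private static void exampleWeights(int total, int outliers, boolean splitfreq) {
  double positive = splitfreq ? 1. / outliers : 1. / total;
  double negative = splitfreq ? 1. / (total - outliers) : 1. / total;
  // e.g. total = 1000, outliers = 50:
  // splitfreq: positive = 0.02, negative ~ 0.00105 (each column sums to 1)
  // otherwise: positive = negative = 0.001 (columns sum to 0.05 and 0.95)
  System.out.println("positive=" + positive + " negative=" + negative);
}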