/** Performs the DBSCAN algorithm on the given database. */ public Clustering<Model> run(Relation<O> relation) { final int size = relation.size(); if (size < minpts) { Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering"); result.addToplevelCluster( new Cluster<Model>(relation.getDBIDs(), true, ClusterModel.CLUSTER)); return result; } RangeQuery<O> rangeQuery = QueryUtil.getRangeQuery(relation, getDistanceFunction()); resultList = new ArrayList<>(); noise = DBIDUtil.newHashSet(); runDBSCAN(relation, rangeQuery); double averagen = ncounter / (double) relation.size(); LOG.statistics(new DoubleStatistic(DBSCAN.class.getName() + ".average-neighbors", averagen)); if (averagen < 1 + 0.1 * (minpts - 1)) { LOG.warning("There are very few neighbors found. Epsilon may be too small."); } if (averagen > 100 * minpts) { LOG.warning("There are very many neighbors found. Epsilon may be too large."); } Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering"); for (ModifiableDBIDs res : resultList) { result.addToplevelCluster(new Cluster<Model>(res, ClusterModel.CLUSTER)); } result.addToplevelCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER)); return result; }
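The epsilon sanity check above is only a heuristic on the average number of neighbors per object. A minimal, self-contained sketch of that heuristic in plain Java (the class and method names are illustrative, not part of the library):

public final class EpsilonSanityCheck {
  /** @return a human-readable hint, or null if the neighbor count looks plausible. */
  static String check(long neighborSum, int numObjects, int minpts) {
    double averagen = neighborSum / (double) numObjects;
    if (averagen < 1 + 0.1 * (minpts - 1)) {
      return "Very few neighbors found. Epsilon may be too small.";
    }
    if (averagen > 100 * minpts) {
      return "Very many neighbors found. Epsilon may be too large.";
    }
    return null;
  }

  public static void main(String[] args) {
    System.out.println(check(120, 100, 5)); // ~1.2 neighbors on average: "too small" hint
    System.out.println(check(60000, 100, 5)); // 600 neighbors on average: "too large" hint
  }
}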
/** * Main loop for OUTRES * * @param relation Relation to process * @return Outlier detection result */ public OutlierResult run(Relation<V> relation) { WritableDoubleDataStore ranks = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); DoubleMinMax minmax = new DoubleMinMax(); KernelDensityEstimator kernel = new KernelDensityEstimator(relation); long[] subspace = BitsUtil.zero(kernel.dim); FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("OUTRES scores", relation.size(), LOG) : null; for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { BitsUtil.zeroI(subspace); double score = outresScore(0, subspace, iditer, kernel); ranks.putDouble(iditer, score); minmax.put(score); LOG.incrementProcessed(progress); } LOG.ensureCompleted(progress); OutlierScoreMeta meta = new InvertedOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0., 1., 1.); OutlierResult outresResult = new OutlierResult( meta, new MaterializedDoubleRelation("OUTRES", "outres-score", ranks, relation.getDBIDs())); return outresResult; }
/** * Runs the DBSCAN algorithm on the specified partition of the database in the given subspace. If * parameter {@code ids} is null DBSCAN will be applied to the whole database. * * @param relation the database holding the objects to run DBSCAN on * @param ids the IDs of the database defining the partition to run DBSCAN on - if this parameter * is null DBSCAN will be applied to the whole database * @param subspace the subspace to run DBSCAN on * @return the clustering result of the DBSCAN run */ private List<Cluster<Model>> runDBSCAN(Relation<V> relation, DBIDs ids, Subspace subspace) { // distance function distanceFunction.setSelectedDimensions(subspace.getDimensions()); ProxyDatabase proxy; if (ids == null) { // TODO: in this case, we might want to use an index - the proxy below // will prevent this! ids = relation.getDBIDs(); } proxy = new ProxyDatabase(ids, relation); DBSCAN<V> dbscan = new DBSCAN<>(distanceFunction, epsilon, minpts); // run DBSCAN if (LOG.isVerbose()) { LOG.verbose("\nRun DBSCAN on subspace " + subspace.dimensonsToString()); } Clustering<Model> dbsres = dbscan.run(proxy); // separate cluster and noise List<Cluster<Model>> clusterAndNoise = dbsres.getAllClusters(); List<Cluster<Model>> clusters = new ArrayList<>(); for (Cluster<Model> c : clusterAndNoise) { if (!c.isNoise()) { clusters.add(c); } } return clusters; }
protected void autoEvaluateClusterings(ResultHierarchy hier, Result newResult) { Collection<Clustering<?>> clusterings = ResultUtil.filterResults(hier, newResult, Clustering.class); if (LOG.isDebugging()) { LOG.debug("Number of new clustering results: " + clusterings.size()); } for (Iterator<Clustering<?>> c = clusterings.iterator(); c.hasNext(); ) { Clustering<?> test = c.next(); if ("allinone-clustering".equals(test.getShortName())) { c.remove(); } else if ("allinnoise-clustering".equals(test.getShortName())) { c.remove(); } else if ("bylabel-clustering".equals(test.getShortName())) { c.remove(); } else if ("bymodel-clustering".equals(test.getShortName())) { c.remove(); } } if (clusterings.size() > 0) { try { new EvaluateClustering(new ByLabelClustering(), false, true) .processNewResult(hier, newResult); } catch (NoSupportedDataTypeException e) { // Pass - the data probably did not have labels. } } }
@Override public void run() { Database database = input.getDatabase(); Relation<O> relation = database.getRelation(distance.getInputTypeRestriction()); DistanceQuery<O> distanceQuery = database.getDistanceQuery(relation, distance); KNNQuery<O> knnQ = database.getKNNQuery(distanceQuery, DatabaseQuery.HINT_HEAVY_USE); // open file. try (RandomAccessFile file = new RandomAccessFile(out, "rw"); FileChannel channel = file.getChannel(); // and acquire a file write lock FileLock lock = channel.lock()) { // write magic header file.writeInt(KNN_CACHE_MAGIC); int bufsize = k * 12 * 2 + 10; // Initial size, enough for 2 kNN. ByteBuffer buffer = ByteBuffer.allocateDirect(bufsize); FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Computing kNN", relation.size(), LOG) : null; for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) { final KNNList nn = knnQ.getKNNForDBID(it, k); final int nnsize = nn.size(); // Grow the buffer when needed: if (nnsize * 12 + 10 > bufsize) { while (nnsize * 12 + 10 > bufsize) { bufsize <<= 1; } buffer = ByteBuffer.allocateDirect(bufsize); } buffer.clear(); ByteArrayUtil.writeUnsignedVarint(buffer, it.internalGetIndex()); ByteArrayUtil.writeUnsignedVarint(buffer, nnsize); int c = 0; for (DoubleDBIDListIter ni = nn.iter(); ni.valid(); ni.advance(), c++) { ByteArrayUtil.writeUnsignedVarint(buffer, ni.internalGetIndex()); buffer.putDouble(ni.doubleValue()); } if (c != nn.size()) { throw new AbortException("Sizes did not agree. Cache is invalid."); } buffer.flip(); channel.write(buffer); LOG.incrementProcessed(prog); } LOG.ensureCompleted(prog); lock.release(); } catch (IOException e) { LOG.exception(e); } // FIXME: close! }
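For reference, a hypothetical reader for the cache layout produced above: a 4-byte magic header, then per object a varint object id, a varint neighbor count, and per neighbor a varint neighbor id followed by an 8-byte double distance. The 7-bits-per-byte varint encoding (high bit = continuation) is an assumption about ByteArrayUtil.writeUnsignedVarint; the class below is illustrative only.

import java.io.DataInputStream;
import java.io.EOFException;
import java.io.FileInputStream;
import java.io.IOException;

public final class KnnCacheReader {
  static int readUnsignedVarint(DataInputStream in) throws IOException {
    int value = 0, shift = 0, b;
    do {
      b = in.readUnsignedByte();
      value |= (b & 0x7F) << shift;
      shift += 7;
    } while ((b & 0x80) != 0);
    return value;
  }

  public static void main(String[] args) throws IOException {
    try (DataInputStream in = new DataInputStream(new FileInputStream(args[0]))) {
      int magic = in.readInt(); // written with RandomAccessFile.writeInt above
      System.out.println("magic=" + Integer.toHexString(magic));
      while (true) {
        int oid;
        try {
          oid = readUnsignedVarint(in);
        } catch (EOFException e) {
          break; // end of cache
        }
        int nnsize = readUnsignedVarint(in);
        for (int i = 0; i < nnsize; i++) {
          int nid = readUnsignedVarint(in);
          double dist = in.readDouble(); // ByteBuffer.putDouble is big-endian by default
          System.out.println(oid + " -> " + nid + " @ " + dist);
        }
      }
    }
  }
}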
@Override public void processNewResult(ResultHierarchy hier, Result newResult) { // We may just have added this result. if (newResult instanceof Clustering && isReferenceResult((Clustering<?>) newResult)) { return; } Database db = ResultUtil.findDatabase(hier); List<Clustering<?>> crs = ResultUtil.getClusteringResults(newResult); if (crs == null || crs.size() < 1) { return; } // Compute the reference clustering Clustering<?> refc = null; // Try to find an existing reference clustering (globally) { Collection<Clustering<?>> cs = ResultUtil.filterResults(hier, db, Clustering.class); for (Clustering<?> test : cs) { if (isReferenceResult(test)) { refc = test; break; } } } // Try to find an existing reference clustering (locally) if (refc == null) { Collection<Clustering<?>> cs = ResultUtil.filterResults(hier, newResult, Clustering.class); for (Clustering<?> test : cs) { if (isReferenceResult(test)) { refc = test; break; } } } if (refc == null) { LOG.debug("Generating a new reference clustering."); Result refres = referencealg.run(db); List<Clustering<?>> refcrs = ResultUtil.getClusteringResults(refres); if (refcrs.size() == 0) { LOG.warning("Reference algorithm did not return a clustering result!"); return; } if (refcrs.size() > 1) { LOG.warning("Reference algorithm returned more than one result!"); } refc = refcrs.get(0); } else { LOG.debug("Using existing clustering: " + refc.getLongName() + " " + refc.getShortName()); } for (Clustering<?> c : crs) { if (c == refc) { continue; } evaluteResult(db, c, refc); } }
/** * Run the Eclat algorithm * * @param db Database to process * @param relation Bit vector relation * @return Frequent patterns found */ public FrequentItemsetsResult run(Database db, final Relation<BitVector> relation) { // TODO: implement with resizable arrays, to not need dim. final int dim = RelationUtil.dimensionality(relation); final VectorFieldTypeInformation<BitVector> meta = RelationUtil.assumeVectorField(relation); // Compute absolute minsupport final int minsupp = getMinimumSupport(relation.size()); LOG.verbose("Build 1-dimensional transaction lists."); Duration ctime = LOG.newDuration(STAT + "eclat.transposition.time").begin(); DBIDs[] idx = buildIndex(relation, dim, minsupp); LOG.statistics(ctime.end()); FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Building frequent itemsets", idx.length, LOG) : null; Duration etime = LOG.newDuration(STAT + "eclat.extraction.time").begin(); final List<Itemset> solution = new ArrayList<>(); for (int i = 0; i < idx.length; i++) { LOG.incrementProcessed(prog); extractItemsets(idx, i, minsupp, solution); } LOG.ensureCompleted(prog); Collections.sort(solution); LOG.statistics(etime.end()); LOG.statistics(new LongStatistic(STAT + "frequent-itemsets", solution.size())); return new FrequentItemsetsResult("Eclat", "eclat", solution, meta); }
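The core Eclat step assumed by extractItemsets above is intersecting per-item transaction-ID lists: the support of an itemset is the size of the intersection of its items' lists. A minimal sketch with sorted int arrays instead of DBIDs (names are illustrative):

import java.util.Arrays;

public final class EclatIntersect {
  // Intersect two sorted transaction-ID lists by a simple merge walk.
  static int[] intersect(int[] a, int[] b) {
    int[] out = new int[Math.min(a.length, b.length)];
    int i = 0, j = 0, n = 0;
    while (i < a.length && j < b.length) {
      if (a[i] < b[j]) { i++; }
      else if (a[i] > b[j]) { j++; }
      else { out[n++] = a[i]; i++; j++; }
    }
    return Arrays.copyOf(out, n);
  }

  public static void main(String[] args) {
    int[] tidsA = {1, 3, 4, 7, 9}, tidsB = {1, 2, 4, 9};
    int minsupp = 3;
    // The transaction list of {a, b} is the intersection of the lists of {a} and {b}.
    int[] tidsAB = intersect(tidsA, tidsB);
    System.out.println("support({a,b}) = " + tidsAB.length
        + (tidsAB.length >= minsupp ? " (frequent)" : " (pruned)"));
  }
}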
/** * Preprocessing step: determine the radii of interest for each point. * * @param ids IDs to process * @param rangeQuery Range query * @param interestingDistances Distances of interest */ protected void precomputeInterestingRadii( DBIDs ids, RangeQuery<O> rangeQuery, WritableDataStore<DoubleIntArrayList> interestingDistances) { FiniteProgress progressPreproc = LOG.isVerbose() ? new FiniteProgress("LOCI preprocessing", ids.size(), LOG) : null; for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) { DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(iditer, rmax); // build list of critical distances DoubleIntArrayList cdist = new DoubleIntArrayList(neighbors.size() << 1); { int i = 0; DoubleDBIDListIter ni = neighbors.iter(); while (ni.valid()) { final double curdist = ni.doubleValue(); ++i; ni.advance(); // Skip, if tied to the next object: if (ni.valid() && curdist == ni.doubleValue()) { continue; } cdist.append(curdist, i); // Scale radius, and reinsert if (alpha != 1.) { final double ri = curdist / alpha; if (ri <= rmax) { cdist.append(ri, Integer.MIN_VALUE); } } } } cdist.sort(); // fill the gaps to have fast lookups of number of neighbors at a given // distance. int lastk = 0; for (int i = 0, size = cdist.size(); i < size; i++) { final int k = cdist.getInt(i); if (k == Integer.MIN_VALUE) { cdist.setValue(i, lastk); } else { lastk = k; } } // TODO: shrink the list, removing duplicate radii? interestingDistances.put(iditer, cdist); LOG.incrementProcessed(progressPreproc); } LOG.ensureCompleted(progressPreproc); }
@Override public void checkRange(DBIDRange range) { final int size = max + 1 - min; if (size < range.size()) { LOG.warning("Distance matrix has size " + size + ", but the range has size " + range.size() + "."); } }
@Override public Clustering<KMeansModel> run(Database database, Relation<V> relation) { if (relation.size() <= 0) { return new Clustering<>("k-Means Clustering", "kmeans-clustering"); } // Choose initial means if (LOG.isStatistics()) { LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString())); } double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction()); // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<>(); for (int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k))); } WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage( relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1); double[] varsum = new double[k]; IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null; DoubleStatistic varstat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null; int iteration = 0; for (; maxiter <= 0 || iteration < maxiter; iteration++) { LOG.incrementProcessed(prog); boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum); logVarstat(varstat, varsum); // Stop if no cluster assignment changed. if (!changed) { break; } // Recompute means. means = means(clusters, means, relation); } LOG.setCompleted(prog); if (LOG.isStatistics()) { LOG.statistics(new LongStatistic(KEY + ".iterations", iteration)); } // Wrap result Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); for (int i = 0; i < clusters.size(); i++) { DBIDs ids = clusters.get(i); if (ids.size() == 0) { continue; } KMeansModel model = new KMeansModel(means[i], varsum[i]); result.addToplevelCluster(new Cluster<>(ids, model)); } return result; }
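The assignToNearestCluster step referenced above is the standard Lloyd assignment: each point moves to the closest mean, and the per-cluster variance sums are accumulated as a side effect. A self-contained sketch on plain arrays (not the ELKI data stores):

public final class KMeansAssignStep {
  // Returns true if any assignment changed; varsum collects squared distances per cluster.
  static boolean assign(double[][] points, double[][] means, int[] assignment, double[] varsum) {
    boolean changed = false;
    java.util.Arrays.fill(varsum, 0.);
    for (int i = 0; i < points.length; i++) {
      int best = -1;
      double bestDist = Double.POSITIVE_INFINITY;
      for (int c = 0; c < means.length; c++) {
        double d = 0;
        for (int j = 0; j < points[i].length; j++) {
          double diff = points[i][j] - means[c][j];
          d += diff * diff;
        }
        if (d < bestDist) { bestDist = d; best = c; }
      }
      varsum[best] += bestDist;
      if (assignment[i] != best) { assignment[i] = best; changed = true; }
    }
    return changed;
  }

  public static void main(String[] args) {
    double[][] pts = {{0, 0}, {0, 1}, {9, 9}, {10, 10}};
    double[][] means = {{0, 0}, {10, 10}};
    int[] assignment = {-1, -1, -1, -1};
    double[] varsum = new double[2];
    System.out.println("changed=" + assign(pts, means, assignment, varsum)
        + " assignment=" + java.util.Arrays.toString(assignment));
  }
}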
/** * Generates {@code d+1}-dimensional subspace candidates from the specified {@code d}-dimensional * subspaces. * * @param subspaces the {@code d}-dimensional subspaces * @return the {@code d+1}-dimensional subspace candidates */ private List<Subspace> generateSubspaceCandidates(List<Subspace> subspaces) { List<Subspace> candidates = new ArrayList<>(); if (subspaces.isEmpty()) { return candidates; } // Generate (d+1)-dimensional candidate subspaces int d = subspaces.get(0).dimensionality(); StringBuilder msgFine = new StringBuilder("\n"); if (LOG.isDebuggingFiner()) { msgFine.append("subspaces ").append(subspaces).append('\n'); } for (int i = 0; i < subspaces.size(); i++) { Subspace s1 = subspaces.get(i); for (int j = i + 1; j < subspaces.size(); j++) { Subspace s2 = subspaces.get(j); Subspace candidate = s1.join(s2); if (candidate != null) { if (LOG.isDebuggingFiner()) { msgFine.append("candidate: ").append(candidate.dimensonsToString()).append('\n'); } // prune irrelevant candidate subspaces List<Subspace> lowerSubspaces = lowerSubspaces(candidate); if (LOG.isDebuggingFiner()) { msgFine.append("lowerSubspaces: ").append(lowerSubspaces).append('\n'); } boolean irrelevantCandidate = false; for (Subspace s : lowerSubspaces) { if (!subspaces.contains(s)) { irrelevantCandidate = true; break; } } if (!irrelevantCandidate) { candidates.add(candidate); } } } } if (LOG.isDebuggingFiner()) { LOG.debugFiner(msgFine.toString()); } if (LOG.isDebugging()) { StringBuilder msg = new StringBuilder(); msg.append(d + 1).append("-dimensional candidate subspaces: "); for (Subspace candidate : candidates) { msg.append(candidate.dimensonsToString()).append(' '); } LOG.debug(msg.toString()); } return candidates; }
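The join performed by Subspace.join above follows the usual apriori-style prefix rule. A small sketch with sorted dimension arrays, assuming two d-dimensional subspaces combine only if they share their first d-1 dimensions (illustrative helper, not the library method):

import java.util.Arrays;

public final class SubspaceJoin {
  // Join two sorted d-dimensional subspaces into a (d+1)-dimensional candidate, or return null.
  static int[] join(int[] s1, int[] s2) {
    if (s1.length != s2.length) { return null; }
    int d = s1.length;
    for (int i = 0; i < d - 1; i++) {
      if (s1[i] != s2[i]) { return null; } // prefixes must match
    }
    if (s1[d - 1] >= s2[d - 1]) { return null; } // enforce ordering, avoid duplicates
    int[] cand = Arrays.copyOf(s1, d + 1);
    cand[d] = s2[d - 1];
    return cand;
  }

  public static void main(String[] args) {
    System.out.println(Arrays.toString(join(new int[] {0, 2}, new int[] {0, 5}))); // [0, 2, 5]
    System.out.println(Arrays.toString(join(new int[] {0, 2}, new int[] {1, 5}))); // null
  }
}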
@Override protected void prepareComplete() { StringBuilder buf = LOG.isDebuggingFine() ? new StringBuilder() : null; scalingreferencevalues = new double[dimensionality]; randomPerAttribute = new Random[dimensionality]; if (scalingreference == ScalingReference.STDDEV) { if (buf != null) { buf.append("Standard deviation per attribute: "); } for (int d = 0; d < dimensionality; d++) { scalingreferencevalues[d] = mvs[d].getSampleStddev() * percentage; if (scalingreferencevalues[d] == 0 || Double.isNaN(scalingreferencevalues[d])) { scalingreferencevalues[d] = percentage; } randomPerAttribute[d] = new Random(RANDOM.nextLong()); if (buf != null) { buf.append(" ").append(d).append(": ").append(scalingreferencevalues[d] / percentage); } } } else if (scalingreference == ScalingReference.MINMAX && minima.length == 0 && maxima.length == 0) { if (buf != null) { buf.append("extension per attribute: "); } for (int d = 0; d < dimensionality; d++) { scalingreferencevalues[d] = (mvs[d].getMax() - mvs[d].getMin()) * percentage; if (scalingreferencevalues[d] == 0 || Double.isNaN(scalingreferencevalues[d])) { scalingreferencevalues[d] = percentage; } randomPerAttribute[d] = new Random(RANDOM.nextLong()); if (buf != null) { buf.append(" ").append(d).append(": ").append(scalingreferencevalues[d] / percentage); } } } mvs = null; if (buf != null) { LOG.debugFine(buf.toString()); } }
/** Runs the wrapper with the specified arguments. */ @Override public void run() throws UnableToComplyException { MultipleObjectsBundle data = generator.loadData(); if (LOG.isVerbose()) { LOG.verbose("Writing output ..."); } try { if (outputFile.exists()) { if (LOG.isVerbose()) { LOG.verbose( "The file " + outputFile + " already exists, " + "the generator result will be APPENDED."); } } try (OutputStreamWriter outStream = new FileWriter(outputFile, true)) { writeClusters(outStream, data); } } catch (FileNotFoundException e) { throw new UnableToComplyException(e); } catch (IOException e) { throw new UnableToComplyException(e); } if (LOG.isVerbose()) { LOG.verbose("Done."); } }
/** * Run the DBSCAN algorithm * * @param relation Data relation * @param rangeQuery Range query class */ protected void runDBSCAN(Relation<O> relation, RangeQuery<O> rangeQuery) { final int size = relation.size(); FiniteProgress objprog = LOG.isVerbose() ? new FiniteProgress("Processing objects", size, LOG) : null; IndefiniteProgress clusprog = LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null; processedIDs = DBIDUtil.newHashSet(size); for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { if (!processedIDs.contains(iditer)) { expandCluster(relation, rangeQuery, iditer, objprog, clusprog); } if (objprog != null && clusprog != null) { objprog.setProcessed(processedIDs.size(), LOG); clusprog.setProcessed(resultList.size(), LOG); } if (processedIDs.size() == size) { break; } } // Finish progress logging LOG.ensureCompleted(objprog); LOG.setCompleted(clusprog); }
/** * Process a database * * @param database Database to process * @param relation Relation to process * @return Histogram of ranking qualities */ public HistogramResult<DoubleVector> run(Database database, Relation<O> relation) { final DistanceQuery<O> distanceQuery = database.getDistanceQuery(relation, getDistanceFunction()); final KNNQuery<O> knnQuery = database.getKNNQuery(distanceQuery, relation.size()); if (LOG.isVerbose()) { LOG.verbose("Preprocessing clusters..."); } // Cluster by labels Collection<Cluster<Model>> split = (new ByLabelOrAllInOneClustering()).run(database).getAllClusters(); DoubleStaticHistogram hist = new DoubleStaticHistogram(numbins, 0.0, 1.0); if (LOG.isVerbose()) { LOG.verbose("Processing points..."); } FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Computing ROC AUC values", relation.size(), LOG) : null; MeanVariance mv = new MeanVariance(); // sort neighbors for (Cluster<?> clus : split) { for (DBIDIter iter = clus.getIDs().iter(); iter.valid(); iter.advance()) { KNNList knn = knnQuery.getKNNForDBID(iter, relation.size()); double result = new ROCEvaluation().evaluate(clus, knn); mv.put(result); hist.increment(result, 1. / relation.size()); LOG.incrementProcessed(progress); } } LOG.ensureCompleted(progress); // Transform Histogram into a Double Vector array. Collection<DoubleVector> res = new ArrayList<>(relation.size()); for (DoubleStaticHistogram.Iter iter = hist.iter(); iter.valid(); iter.advance()) { DoubleVector row = new DoubleVector(new double[] {iter.getCenter(), iter.getValue()}); res.add(row); } HistogramResult<DoubleVector> result = new HistogramResult<>("Ranking Quality Histogram", "ranking-histogram", res); result.addHeader("Mean: " + mv.getMean() + " Variance: " + mv.getSampleVariance()); return result; }
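The per-object score above is a ROC AUC of the kNN ranking, with the object's own cluster as the positive class. A minimal sketch of how such an AUC can be computed from a ranked list of positive/negative flags (ties ignored for simplicity; not the ROCEvaluation implementation):

public final class RocAucSketch {
  static double auc(boolean[] rankedIsPositive) {
    long pos = 0, neg = 0, swapped = 0;
    for (boolean p : rankedIsPositive) {
      if (p) { pos++; swapped += neg; } // negatives already seen outrank this positive
      else { neg++; }
    }
    // AUC = 1 - (fraction of positive/negative pairs ranked in the wrong order)
    return pos == 0 || neg == 0 ? 0.5 : 1. - swapped / (double) (pos * neg);
  }

  public static void main(String[] args) {
    // Positives ranked mostly first: AUC = 8/9.
    System.out.println(auc(new boolean[] {true, true, false, true, false, false}));
  }
}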
@Override protected void makeOptions(Parameterization config) { super.makeOptions(config); DoubleParameter epsilonP = new DoubleParameter(EPSILON_ID) // .addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); if (config.grab(epsilonP)) { epsilon = epsilonP.getValue(); } IntParameter minptsP = new IntParameter(MINPTS_ID) // .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if (config.grab(minptsP)) { minpts = minptsP.getValue(); if (minpts <= 2) { LOG.warning( "DBSCAN with minPts <= 2 is equivalent to single-link clustering at a single height. Consider using larger values of minPts."); } } }
private void loadCache(DistanceParser parser, File matrixfile) throws IOException { InputStream in = new BufferedInputStream(FileUtil.tryGzipInput(new FileInputStream(matrixfile))); cache = new TLongFloatHashMap( Constants.DEFAULT_CAPACITY, Constants.DEFAULT_LOAD_FACTOR, -1L, Float.POSITIVE_INFINITY); min = Integer.MAX_VALUE; max = Integer.MIN_VALUE; parser.parse( in, new DistanceCacheWriter() { @Override public void put(int id1, int id2, double distance) { if (id1 < id2) { min = id1 < min ? id1 : min; max = id2 > max ? id2 : max; } else { min = id2 < min ? id2 : min; max = id1 > max ? id1 : max; } cache.put(makeKey(id1, id2), (float) distance); } @Override public boolean containsKey(int id1, int id2) { return cache.containsKey(makeKey(id1, id2)); } }); if (min != 0) { LOG.verbose( "Distance matrix is supposed to be 0-indexed. Choosing offset " + min + " to compensate."); } }
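The makeKey helper used above is not shown; a plausible sketch packs the symmetric pair into a single long by putting the smaller id into the high 32 bits, so that (i, j) and (j, i) map to the same cache key (assumed layout, for illustration only):

public final class PairKey {
  // Pack an unordered id pair into one long: smaller id in the high bits.
  static long makeKey(int id1, int id2) {
    int lo = Math.min(id1, id2), hi = Math.max(id1, id2);
    return (((long) lo) << 32) | (hi & 0xFFFFFFFFL);
  }

  public static void main(String[] args) {
    System.out.println(makeKey(7, 3) == makeKey(3, 7)); // true: symmetric lookups
  }
}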
public Result run(Database database, Relation<O> rel) { DistanceQuery<O> dq = rel.getDistanceQuery(getDistanceFunction()); int size = rel.size(); long pairs = (size * (long) size) >> 1; final long ssize = sampling <= 1 ? (long) Math.ceil(sampling * pairs) : (long) sampling; if (ssize > Integer.MAX_VALUE) { throw new AbortException("Sampling size too large."); } final int qsize = quantile <= 0 ? 1 : (int) Math.ceil(quantile * ssize); DoubleMaxHeap heap = new DoubleMaxHeap(qsize); ArrayDBIDs ids = DBIDUtil.ensureArray(rel.getDBIDs()); DBIDArrayIter i1 = ids.iter(), i2 = ids.iter(); Random r = rand.getSingleThreadedRandom(); FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Sampling", (int) ssize, LOG) : null; for (long i = 0; i < ssize; i++) { int x = r.nextInt(size - 1) + 1, y = r.nextInt(x); double dist = dq.distance(i1.seek(x), i2.seek(y)); // Skip NaN, and/or zeros. if (dist != dist || (nozeros && dist < Double.MIN_NORMAL)) { continue; } heap.add(dist, qsize); LOG.incrementProcessed(prog); } LOG.statistics(new DoubleStatistic(PREFIX + ".quantile", quantile)); LOG.statistics(new LongStatistic(PREFIX + ".samplesize", ssize)); LOG.statistics(new DoubleStatistic(PREFIX + ".distance", heap.peek())); LOG.ensureCompleted(prog); Collection<String> header = Arrays.asList(new String[] {"Distance"}); Collection<Vector> data = Arrays.asList(new Vector[] {new Vector(heap.peek())}); return new CollectionResult<Vector>("Distances sample", "distance-sample", data, header); }
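The DoubleMaxHeap above implements a bounded-heap quantile estimate: only the qsize smallest sampled distances are kept, so the heap's top is the requested quantile of the sample. A self-contained sketch of the same trick using java.util.PriorityQueue:

import java.util.Collections;
import java.util.PriorityQueue;
import java.util.Random;

public final class QuantileByBoundedHeap {
  public static void main(String[] args) {
    double quantile = 0.1;
    int samples = 10000;
    int qsize = (int) Math.ceil(quantile * samples);
    PriorityQueue<Double> heap = new PriorityQueue<>(Collections.reverseOrder()); // max-heap
    Random r = new Random(0L);
    for (int i = 0; i < samples; i++) {
      double dist = r.nextDouble(); // stand-in for a sampled pairwise distance
      if (heap.size() < qsize) {
        heap.add(dist);
      } else if (dist < heap.peek()) {
        heap.poll(); // drop the largest of the retained values
        heap.add(dist);
      }
    }
    System.out.println("~10% quantile: " + heap.peek()); // close to 0.1 for uniform data
  }
}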
/** * A filter to perturb the values by adding micro-noise. * * <p>The added noise is generated, attribute-wise, by a Gaussian with mean=0 and a specified * standard deviation or by a uniform distribution with a specified range. The standard deviation or * the range can be scaled, attribute-wise, to a given percentage of the original standard deviation * in the data distribution (assuming a Gaussian distribution there), or to a percentage of the * extension in each attribute ({@code maximumValue - minimumValue}). * * <p>This filter is potentially of wide use, but has been implemented for the following publication: * * <p>Reference: * * <p>A. Zimek, R. J. G. B. Campello, J. Sander:<br> Data Perturbation for Outlier Detection * Ensembles.<br> In: Proc. 26th International Conference on Scientific and Statistical Database * Management (SSDBM), Aalborg, Denmark, 2014. * * @author Arthur Zimek */ @Title("Data Perturbation for Outlier Detection Ensembles") @Description( "A filter to perturb a dataset on read by an additive noise component, implemented for use in an outlier ensemble (this reference).") @Reference( authors = "A. Zimek, R. J. G. B. Campello, J. Sander", // title = "Data Perturbation for Outlier Detection Ensembles", // booktitle = "Proc. 26th International Conference on Scientific and Statistical Database Management (SSDBM), Aalborg, Denmark, 2014", // url = "http://dx.doi.org/10.1145/2618243.2618257") public class PerturbationFilter<V extends NumberVector> extends AbstractVectorConversionFilter<V, V> { /** Class logger */ private static final Logging LOG = Logging.getLogger(PerturbationFilter.class); /** * Scaling reference options. * * @author Arthur Zimek * @apiviz.exclude */ public static enum ScalingReference { UNITCUBE, STDDEV, MINMAX } /** * Nature of the noise distribution. * * @author Arthur Zimek * @apiviz.exclude */ public static enum NoiseDistribution { GAUSSIAN, UNIFORM } /** Which reference to use for scaling the noise. */ private ScalingReference scalingreference; /** Nature of the noise distribution. */ private NoiseDistribution noisedistribution; /** Random object to generate the attribute-wise seeds for the noise. */ private final Random RANDOM; /** * Percentage of the variance of the random noise generation, given the variance of the * corresponding attribute in the data. */ private double percentage; /** Temporary storage used during initialization. */ private MeanVarianceMinMax[] mvs = null; /** Stores the scaling reference in each dimension. */ private double[] scalingreferencevalues = new double[0]; /** The random objects to generate noise distributions independently for each attribute. */ private Random[] randomPerAttribute = null; /** Stores the maximum in each dimension. */ private double[] maxima; /** Stores the minimum in each dimension. */ private double[] minima; /** Stores the dimensionality from the preprocessing. */ private int dimensionality = 0; /** * Constructor. * * @param seed Seed value, may be {@code null} for a random seed. * @param percentage Relative amount of jitter to add * @param scalingreference Scaling reference * @param minima Preset minimum values. May be {@code null}. * @param maxima Preset maximum values. May be {@code null}. * @param noisedistribution Nature of the noise distribution. 
*/ public PerturbationFilter( Long seed, double percentage, ScalingReference scalingreference, double[] minima, double[] maxima, NoiseDistribution noisedistribution) { super(); this.percentage = percentage; this.scalingreference = scalingreference; this.minima = minima; this.maxima = maxima; this.noisedistribution = noisedistribution; this.RANDOM = (seed == null) ? new Random() : new Random(seed); } @Override protected boolean prepareStart(SimpleTypeInformation<V> in) { if (scalingreference == ScalingReference.MINMAX && minima.length != 0 && maxima.length != 0) { dimensionality = minima.length; scalingreferencevalues = new double[dimensionality]; randomPerAttribute = new Random[dimensionality]; for (int d = 0; d < dimensionality; d++) { scalingreferencevalues[d] = (maxima[d] - minima[d]) * percentage; if (scalingreferencevalues[d] == 0 || Double.isNaN(scalingreferencevalues[d])) { scalingreferencevalues[d] = percentage; } randomPerAttribute[d] = new Random(RANDOM.nextLong()); } return false; } if (scalingreference == ScalingReference.UNITCUBE) { return false; } return (scalingreferencevalues.length == 0); } @Override protected void prepareProcessInstance(V featureVector) { // First object? Then init. (We didn't have a dimensionality before!) if (mvs == null) { dimensionality = featureVector.getDimensionality(); mvs = MeanVarianceMinMax.newArray(dimensionality); } for (int d = 0; d < featureVector.getDimensionality(); d++) { mvs[d].put(featureVector.doubleValue(d)); } } @Override protected void prepareComplete() { StringBuilder buf = LOG.isDebuggingFine() ? new StringBuilder() : null; scalingreferencevalues = new double[dimensionality]; randomPerAttribute = new Random[dimensionality]; if (scalingreference == ScalingReference.STDDEV) { if (buf != null) { buf.append("Standard deviation per attribute: "); } for (int d = 0; d < dimensionality; d++) { scalingreferencevalues[d] = mvs[d].getSampleStddev() * percentage; if (scalingreferencevalues[d] == 0 || Double.isNaN(scalingreferencevalues[d])) { scalingreferencevalues[d] = percentage; } randomPerAttribute[d] = new Random(RANDOM.nextLong()); if (buf != null) { buf.append(" ").append(d).append(": ").append(scalingreferencevalues[d] / percentage); } } } else if (scalingreference == ScalingReference.MINMAX && minima.length == 0 && maxima.length == 0) { if (buf != null) { buf.append("extension per attribute: "); } for (int d = 0; d < dimensionality; d++) { scalingreferencevalues[d] = (mvs[d].getMax() - mvs[d].getMin()) * percentage; if (scalingreferencevalues[d] == 0 || Double.isNaN(scalingreferencevalues[d])) { scalingreferencevalues[d] = percentage; } randomPerAttribute[d] = new Random(RANDOM.nextLong()); if (buf != null) { buf.append(" ").append(d).append(": ").append(scalingreferencevalues[d] / percentage); } } } mvs = null; if (buf != null) { LOG.debugFine(buf.toString()); } } @Override protected SimpleTypeInformation<? 
super V> getInputTypeRestriction() { return TypeUtil.NUMBER_VECTOR_FIELD; } @Override protected V filterSingleObject(V featureVector) { if (scalingreference == ScalingReference.UNITCUBE && dimensionality == 0) { dimensionality = featureVector.getDimensionality(); scalingreferencevalues = new double[dimensionality]; randomPerAttribute = new Random[dimensionality]; for (int d = 0; d < dimensionality; d++) { scalingreferencevalues[d] = percentage; randomPerAttribute[d] = new Random(RANDOM.nextLong()); } } if (scalingreferencevalues.length != featureVector.getDimensionality()) { throw new IllegalArgumentException( "FeatureVectors and given Minima/Maxima differ in length."); } double[] values = new double[featureVector.getDimensionality()]; for (int d = 0; d < featureVector.getDimensionality(); d++) { if (this.noisedistribution.equals(NoiseDistribution.GAUSSIAN)) { values[d] = featureVector.doubleValue(d) + randomPerAttribute[d].nextGaussian() * scalingreferencevalues[d]; } else if (this.noisedistribution.equals(NoiseDistribution.UNIFORM)) { values[d] = featureVector.doubleValue(d) + randomPerAttribute[d].nextDouble() * scalingreferencevalues[d]; } } return factory.newNumberVector(values); } @Override protected SimpleTypeInformation<? super V> convertedType(SimpleTypeInformation<V> in) { initializeOutputType(in); return in; } @Override protected Logging getLogger() { return LOG; } /** * Parameterization class. * * @author Arthur Zimek * @apiviz.exclude */ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer { /** Parameter for minimum. */ public static final OptionID MINIMA_ID = new OptionID( "perturbationfilter.min", "Only used, if " + ScalingReference.MINMAX + " is set as scaling reference: a comma separated concatenation of the minimum values in each dimension assumed as a reference. If no value is specified, the minimum value of the attribute range in this dimension will be taken."); /** Parameter for maximum. */ public static final OptionID MAXIMA_ID = new OptionID( "perturbationfilter.max", "Only used, if " + ScalingReference.MINMAX + " is set as scaling reference: a comma separated concatenation of the maximum values in each dimension assumed as a reference. If no value is specified, the maximum value of the attribute range in this dimension will be taken."); /** Stores the maximum in each dimension. */ private double[] maxima = new double[0]; /** Stores the minimum in each dimension. */ private double[] minima = new double[0]; /** * Optional parameter to specify a seed for random Gaussian noise generation. If unused, system * time is used as seed. * * <p>Key: {@code -perturbationfilter.seed} */ public static final OptionID SEED_ID = new OptionID("perturbationfilter.seed", "Seed for random noise generation."); /** * Seed for randomly shuffling the rows of the database. If null, system time is used as seed. */ protected Long seed = null; /** * Optional parameter to specify a percentage of the standard deviation of the random Gaussian * noise generation, given the standard deviation of the corresponding attribute in the original * data distribution (assuming a Gaussian there). 
* * <p>Key: {@code -perturbationfilter.percentage} * * <p>Default: <code>0.01</code> * * <p>Constraint: 0 < percentage ≤ 1 */ public static final OptionID PERCENTAGE_ID = new OptionID( "perturbationfilter.percentage", "Percentage of the standard deviation of the random Gaussian noise generation per attribute, given the standard deviation of the corresponding attribute in the original data distribution (assuming a Gaussian distribution there)."); /** * Parameter for selecting scaling reference. * * <p>Key: {@code -perturbationfilter.scalingreference} * * <p>Default: <code>ScalingReference.UNITCUBE</code> */ public static final OptionID SCALINGREFERENCE_ID = new OptionID( "perturbationfilter.scalingreference", "The reference for scaling the Gaussian noise. Default is " + ScalingReference.UNITCUBE + ", parameter " + PERCENTAGE_ID.getName() + " will then directly define the standard deviation of all noise Gaussians. For options " + ScalingReference.STDDEV + " and " + ScalingReference.MINMAX + ", the percentage of the attributewise standard deviation or extension, respectively, will define the attributewise standard deviation of the noise Gaussians."); /** * Parameter for selecting the noise distribution. * * <p>Key: {@code -perturbationfilter.noisedistribution} * * <p>Default: <code>NoiseDistribution.UNIFORM</code> */ public static final OptionID NOISEDISTRIBUTION_ID = new OptionID( "perturbationfilter.noisedistribution", "The nature of the noise distribution, default is " + NoiseDistribution.UNIFORM); /** * Percentage of the variance of the random Gaussian noise generation or of the range of the * uniform distribution, given the variance of the corresponding attribute in the data. */ protected double percentage; /** Which reference to use for scaling the noise. */ protected ScalingReference scalingreference; /** Which noise distribution to use. 
*/ protected NoiseDistribution noisedistribution; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); EnumParameter<ScalingReference> scalingReferenceP = new EnumParameter<>( SCALINGREFERENCE_ID, ScalingReference.class, ScalingReference.UNITCUBE); if (config.grab(scalingReferenceP)) { scalingreference = scalingReferenceP.getValue(); } EnumParameter<NoiseDistribution> noisedistributionP = new EnumParameter<>( NOISEDISTRIBUTION_ID, NoiseDistribution.class, NoiseDistribution.UNIFORM); if (config.grab(noisedistributionP)) { noisedistribution = noisedistributionP.getValue(); } DoubleParameter percentageP = new DoubleParameter(PERCENTAGE_ID, .01); percentageP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); percentageP.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE); if (config.grab(percentageP)) { percentage = percentageP.getValue(); } LongParameter seedP = new LongParameter(SEED_ID); seedP.setOptional(true); if (config.grab(seedP)) { seed = seedP.getValue(); } DoubleListParameter minimaP = new DoubleListParameter(MINIMA_ID); minimaP.setOptional(true); if (config.grab(minimaP)) { minima = minimaP.getValue().clone(); } DoubleListParameter maximaP = new DoubleListParameter(MAXIMA_ID); maximaP.setOptional(true); if (config.grab(maximaP)) { maxima = maximaP.getValue().clone(); } config.checkConstraint(new AllOrNoneMustBeSetGlobalConstraint(minimaP, maximaP)); config.checkConstraint(new EqualSizeGlobalConstraint(minimaP, maximaP)); } @Override protected PerturbationFilter<V> makeInstance() { return new PerturbationFilter<>( seed, percentage, scalingreference, minima, maxima, noisedistribution); } } }
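Stripped of the ELKI plumbing, the jitter applied by filterSingleObject amounts to adding per-attribute noise scaled by that attribute's reference value. A plain-Java sketch (illustrative names; one RNG per dimension, as in the filter):

import java.util.Random;

public final class JitterSketch {
  // Add Gaussian or uniform noise per dimension, scaled by that dimension's reference value.
  static double[] jitter(double[] v, double[] scale, Random[] rngs, boolean gaussian) {
    double[] out = new double[v.length];
    for (int d = 0; d < v.length; d++) {
      double noise = gaussian ? rngs[d].nextGaussian() : rngs[d].nextDouble();
      out[d] = v[d] + noise * scale[d];
    }
    return out;
  }

  public static void main(String[] args) {
    Random seed = new Random(42L);
    Random[] rngs = {new Random(seed.nextLong()), new Random(seed.nextLong())};
    double[] point = {1.0, 10.0};
    double[] scale = {0.01, 0.5}; // per-attribute noise scale, e.g. a percentage of the stddev
    System.out.println(java.util.Arrays.toString(jitter(point, scale, rngs, true)));
  }
}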
/** * The Spatial Outlier Factor (SOF) is a spatial {@link * de.lmu.ifi.dbs.elki.algorithm.outlier.lof.LOF LOF} variation. * * <p>Since the "reachability distance" of LOF cannot be used canonically in the bichromatic case, * this part of LOF is dropped and the exact distance is used instead. * * <p>Huang, T., Qin, X.<br> * Detecting outliers in spatial database.<br> * In: Proc. 3rd International Conference on Image and Graphics, Hong Kong, China. A LOF variation * simplified with reachDist(o,p) == dist(o,p). * * @author Ahmed Hettab * @since 0.4.0 * @param <N> Neighborhood object type * @param <O> Attribute object type */ @Title("Spatial Outlier Factor") @Reference( authors = "Huang, T., Qin, X.", title = "Detecting outliers in spatial database", booktitle = "Proc. 3rd International Conference on Image and Graphics", url = "http://dx.doi.org/10.1109/ICIG.2004.53") public class SOF<N, O> extends AbstractDistanceBasedSpatialOutlier<N, O> { /** The logger for this class. */ private static final Logging LOG = Logging.getLogger(SOF.class); /** * Constructor. * * @param npred Neighborhood predicate * @param nonSpatialDistanceFunction Distance function on non-spatial attributes */ public SOF( NeighborSetPredicate.Factory<N> npred, PrimitiveDistanceFunction<O> nonSpatialDistanceFunction) { super(npred, nonSpatialDistanceFunction); } @Override protected Logging getLogger() { return LOG; } /** * The main run method * * @param database Database to use (actually unused) * @param spatial Relation for neighborhood * @param relation Attributes to evaluate * @return Outlier result */ public OutlierResult run(Database database, Relation<N> spatial, Relation<O> relation) { final NeighborSetPredicate npred = getNeighborSetPredicateFactory().instantiate(database, spatial); DistanceQuery<O> distFunc = getNonSpatialDistanceFunction().instantiate(relation); WritableDoubleDataStore lrds = DataStoreUtil.makeDoubleStorage( relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT); WritableDoubleDataStore lofs = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); DoubleMinMax lofminmax = new DoubleMinMax(); // Compute densities for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { DBIDs neighbors = npred.getNeighborDBIDs(iditer); double avg = 0; for (DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) { avg += distFunc.distance(iditer, iter); } double lrd = 1 / (avg / neighbors.size()); if (Double.isNaN(lrd)) { lrd = 0; } lrds.putDouble(iditer, lrd); } // Compute density quotients for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { DBIDs neighbors = npred.getNeighborDBIDs(iditer); double avg = 0; for (DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) { avg += lrds.doubleValue(iter); } final double lrd = (avg / neighbors.size()) / lrds.doubleValue(iditer); if (!Double.isNaN(lrd)) { lofs.putDouble(iditer, lrd); lofminmax.put(lrd); } else { lofs.putDouble(iditer, 0.0); } } // Build result representation. 
DoubleRelation scoreResult = new MaterializedDoubleRelation( "Spatial Outlier Factor", "sof-outlier", lofs, relation.getDBIDs()); OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta( lofminmax.getMin(), lofminmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 1.0); OutlierResult or = new OutlierResult(scoreMeta, scoreResult); or.addChildResult(npred); return or; } @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array( getNeighborSetPredicateFactory().getInputTypeRestriction(), TypeUtil.NUMBER_VECTOR_FIELD); } /** * Parameterization class * * @author Ahmed Hettab * @apiviz.exclude * @param <N> Neighborhood type * @param <O> Attribute object type */ public static class Parameterizer<N, O> extends AbstractDistanceBasedSpatialOutlier.Parameterizer<N, O> { @Override protected SOF<N, O> makeInstance() { return new SOF<>(npredf, distanceFunction); } } }
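The two passes above reduce to a simple density quotient once the ELKI data stores are stripped away: a point's density is the inverse of its mean distance to its spatial neighbors, and its score is the mean neighbor density divided by its own. A self-contained sketch on a small distance matrix (illustrative; omits the NaN guards):

public final class SofSketch {
  static double[] scores(double[][] dist, int[][] neighbors) {
    int n = dist.length;
    double[] lrd = new double[n], sof = new double[n];
    // Pass 1: density = 1 / mean distance to the neighbors.
    for (int i = 0; i < n; i++) {
      double avg = 0;
      for (int j : neighbors[i]) { avg += dist[i][j]; }
      lrd[i] = neighbors[i].length / avg;
    }
    // Pass 2: score = mean neighbor density / own density.
    for (int i = 0; i < n; i++) {
      double avg = 0;
      for (int j : neighbors[i]) { avg += lrd[j]; }
      sof[i] = (avg / neighbors[i].length) / lrd[i];
    }
    return sof;
  }

  public static void main(String[] args) {
    double[][] d = {{0, 1, 9}, {1, 0, 9}, {9, 9, 0}};
    int[][] nn = {{1, 2}, {0, 2}, {0, 1}};
    System.out.println(java.util.Arrays.toString(scores(d, nn))); // point 2 scores highest
  }
}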
/** * Algorithm 3 of Cheng and Church. * * <p>Try to re-add rows or columns that decrease the overall score. * * <p>Also try adding inverted rows. * * @param mat Data matrix * @param cand Bicluster candidate */ private void nodeAddition(final double[][] mat, final BiclusterCandidate cand) { cand.updateRowAndColumnMeans(mat, true); cand.computeMeanSquaredDeviation(mat); while (true) { // We need this to be final + mutable final boolean[] added = new boolean[] {false, false}; // Step 2: add columns cand.visitRow( mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { assert (!selcol); if (cand.computeColResidue(mat, col) <= cand.residue) { cand.selectColumn(col, true); added[0] = true; } return false; } }); // Step 3: recompute values if (added[0]) { cand.updateRowAndColumnMeans(mat, true); cand.computeMeanSquaredDeviation(mat); } // Step 4: try adding rows. cand.visitColumn( mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { assert (!selrow); if (cand.computeRowResidue(mat, row, false) <= cand.residue) { cand.selectRow(row, true); added[1] = true; } return false; } }); // Step 5: try adding inverted rows. if (useinverted) { cand.visitColumn( mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { assert (!selrow); if (cand.computeRowResidue(mat, row, true) <= cand.residue) { cand.selectRow(row, true); cand.invertRow(row, true); added[1] = true; } return false; } }); } if (added[1]) { cand.updateRowAndColumnMeans(mat, true); cand.computeMeanSquaredDeviation(mat); if (LOG.isDebuggingFine()) { LOG.debugFine( "Residue in Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); } } if (!added[0] && !added[1]) { break; } } }
/** * Algorithm 2 of Cheng and Church. * * <p>Remove all rows and columns that reduce the residue by alpha. * * <p>Inverted rows are not supported in this method. * * @param mat Data matrix * @param cand Bicluster candidate */ private void multipleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) { cand.updateRowAndColumnMeans(mat, false); cand.computeMeanSquaredDeviation(mat); // Note: assumes that cand.residue = H(I,J) while (cand.residue > delta) { final boolean[] modified = {false, false}; // Step 2: remove rows above threshold if (cand.rowcard > MIN_ROW_REMOVE_THRESHOLD) { final double alphaResidue = alpha * cand.residue; cand.visitColumn( mat, 0, CellVisitor.SELECTED, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { assert (selrow); if (cand.computeRowResidue(mat, row, false) > alphaResidue) { cand.selectRow(row, false); modified[0] = true; } return (cand.rowcard > MIN_ROW_REMOVE_THRESHOLD); } }); // Step 3: update residue if (modified[0]) { cand.updateRowAndColumnMeans(mat, false); cand.computeMeanSquaredDeviation(mat); } } // Step 4: remove columns above threshold if (cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD) { final double alphaResidue = alpha * cand.residue; cand.visitRow( mat, 0, CellVisitor.SELECTED, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { assert (selcol); if (cand.computeColResidue(mat, col) > alphaResidue) { cand.selectColumn(col, false); modified[1] = true; } return (cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD); } }); if (modified[1]) { cand.updateRowAndColumnMeans(mat, false); cand.computeMeanSquaredDeviation(mat); } } if (LOG.isDebuggingFine()) { LOG.debugFine( "Residue in Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); } // Step 5: if nothing has been removed, try removing single nodes. if (!modified[0] && !modified[1]) { break; // Will be executed next in main loop, as per algorithm 4. // singleNodeDeletion(); } } }
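Both deletion phases compare against the mean squared residue H(I, J) maintained by computeMeanSquaredDeviation: the residue of a cell is value - rowMean - colMean + overallMean, and H is the mean of its square over the selected submatrix. A worked sketch on a full matrix (no row/column selection, for illustration):

public final class MeanSquaredResidue {
  static double residue(double[][] mat) {
    int rows = mat.length, cols = mat[0].length;
    double[] rowM = new double[rows], colM = new double[cols];
    double allM = 0;
    // Row, column and overall means.
    for (int i = 0; i < rows; i++) {
      for (int j = 0; j < cols; j++) {
        rowM[i] += mat[i][j] / cols;
        colM[j] += mat[i][j] / rows;
        allM += mat[i][j] / (rows * cols);
      }
    }
    // Mean squared residue H.
    double h = 0;
    for (int i = 0; i < rows; i++) {
      for (int j = 0; j < cols; j++) {
        double v = mat[i][j] - rowM[i] - colM[j] + allM;
        h += v * v / (rows * cols);
      }
    }
    return h;
  }

  public static void main(String[] args) {
    // A perfect additive bicluster has residue 0; noise raises it.
    System.out.println(residue(new double[][] {{1, 2}, {3, 4}})); // 0.0
    System.out.println(residue(new double[][] {{1, 2}, {3, 10}})); // 2.25
  }
}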
/** * Perform Cheng and Church biclustering. * * <p>Reference: <br> * Y. Cheng and G. M. Church. Biclustering of expression data. In Proceedings of the 8th * International Conference on Intelligent Systems for Molecular Biology (ISMB), San Diego, CA, * 2000. * * @author Erich Schubert * @param <V> Vector type. */ @Reference( authors = "Y. Cheng, G. M. Church", title = "Biclustering of expression data", booktitle = "Proc. 8th International Conference on Intelligent Systems for Molecular Biology (ISMB)") public class ChengAndChurch<V extends NumberVector> extends AbstractBiclustering<V, BiclusterWithInversionsModel> { /** The logger for this class. */ private static final Logging LOG = Logging.getLogger(ChengAndChurch.class); /** * The minimum number of columns that the database must have so that a removal of columns is * performed in {@link #multipleNodeDeletion}. * * <p>Just start deleting multiple columns when more than 100 columns are in the data matrix. */ private static final int MIN_COLUMN_REMOVE_THRESHOLD = 100; /** * The minimum number of rows that the database must have so that a removal of rows is performed * in {@link #multipleNodeDeletion}. * * <p>Just start deleting multiple rows when more than 100 rows are in the data matrix. * <!-- * <p> * The value is set to 100 as this is not really described in the paper. * </p> * --> */ private static final int MIN_ROW_REMOVE_THRESHOLD = 100; /** Threshold for the score. */ private double delta; /** * The parameter for multiple node deletion. * * <p>It is used to magnify the {@link #delta} value in the {@link #multipleNodeDeletion} method. */ private double alpha; /** Number of biclusters to be found. */ private int n; /** Allow inversion of rows in the last phase. */ private boolean useinverted = true; /** Distribution to sample random replacement values from. */ private Distribution dist; /** * Constructor. * * @param delta Delta parameter: desired quality * @param alpha Alpha parameter: controls switching to single node deletion approach * @param n Number of clusters to detect * @param dist Distribution of random values to insert */ public ChengAndChurch(double delta, double alpha, int n, Distribution dist) { super(); this.delta = delta; this.alpha = alpha; this.n = n; this.dist = dist; } /** * Visitor pattern for processing cells. * * @author Erich Schubert * @apiviz.exclude */ public static interface CellVisitor { /** Different modes of operation. */ int ALL = 0, SELECTED = 1, NOT_SELECTED = 2; /** * Visit a cell. * * @param val Value * @param row Row Number * @param col Column number * @param selrow Boolean, whether row is selected * @param selcol Boolean, whether column is selected * @return Stop flag, return {@code true} to stop visiting */ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol); } /** * Bicluster candidate. * * @author Erich Schubert * @apiviz.exclude */ protected static class BiclusterCandidate { /** Cardinalities. */ int rowcard, colcard; /** Means. */ double[] rowM, colM; /** Row and column bitmasks. */ long[] rows, irow, cols; /** Mean of the current bicluster. */ double allM; /** The current bicluster score (mean squared residue). */ double residue; /** * Constructor. * * @param rows Row dimensionality. * @param cols Column dimensionality. 
*/ protected BiclusterCandidate(int rows, int cols) { super(); this.rows = BitsUtil.ones(rows); this.irow = BitsUtil.zero(rows); this.rowcard = rows; this.rowM = new double[rows]; this.cols = BitsUtil.ones(cols); this.colcard = cols; this.colM = new double[cols]; } /** Resets the values for the next cluster search. */ protected void reset() { rows = BitsUtil.ones(rowM.length); rowcard = rowM.length; cols = BitsUtil.ones(colM.length); colcard = colM.length; BitsUtil.zeroI(irow); } /** * Visit all selected cells in the data matrix. * * @param mat Data matrix * @param mode Operation mode * @param visitor Visitor function */ protected void visitAll(double[][] mat, int mode, CellVisitor visitor) { // For efficiency, we manually iterate over the rows and column bitmasks. // This saves repeated shifting needed by the manual bit access. for (int rpos = 0, rlpos = 0; rlpos < rows.length; ++rlpos) { long rlong = rows[rlpos]; // Fast skip blocks of 64 masked values. if ((mode == CellVisitor.SELECTED && rlong == 0L) || (mode == CellVisitor.NOT_SELECTED && rlong == -1L)) { rpos += Long.SIZE; continue; } for (int i = 0; i < Long.SIZE && rpos < rowM.length; ++i, ++rpos, rlong >>>= 1) { boolean rselected = ((rlong & 1L) == 1L); if ((mode == CellVisitor.SELECTED && !rselected) || (mode == CellVisitor.NOT_SELECTED && rselected)) { continue; } for (int cpos = 0, clpos = 0; clpos < cols.length; ++clpos) { long clong = cols[clpos]; if ((mode == CellVisitor.SELECTED && clong == 0L) || (mode == CellVisitor.NOT_SELECTED && clong == -1L)) { cpos += Long.SIZE; continue; } for (int j = 0; j < Long.SIZE && cpos < colM.length; ++j, ++cpos, clong >>>= 1) { boolean cselected = ((clong & 1L) == 1L); if ((mode == CellVisitor.SELECTED && !cselected) || (mode == CellVisitor.NOT_SELECTED && cselected)) { continue; } boolean stop = visitor.visit(mat[rpos][cpos], rpos, cpos, rselected, cselected); if (stop) { return; } } } } } } /** * Visit a column of the matrix. * * @param mat Data matrix * @param col Column to visit * @param mode Operation mode * @param visitor Visitor function */ protected void visitColumn(double[][] mat, int col, int mode, CellVisitor visitor) { boolean cselected = BitsUtil.get(cols, col); // For efficiency, we manually iterate over the rows and column bitmasks. // This saves repeated shifting needed by the manual bit access. for (int rpos = 0, rlpos = 0; rlpos < rows.length; ++rlpos) { long rlong = rows[rlpos]; // Fast skip blocks of 64 masked values. if (mode == CellVisitor.SELECTED && rlong == 0L) { rpos += Long.SIZE; continue; } if (mode == CellVisitor.NOT_SELECTED && rlong == -1L) { rpos += Long.SIZE; continue; } for (int i = 0; i < Long.SIZE && rpos < rowM.length; ++i, ++rpos, rlong >>>= 1) { boolean rselected = ((rlong & 1L) == 1L); if (mode == CellVisitor.SELECTED && !rselected) { continue; } if (mode == CellVisitor.NOT_SELECTED && rselected) { continue; } boolean stop = visitor.visit(mat[rpos][col], rpos, col, rselected, cselected); if (stop) { return; } } } } /** * Visit a row of the data matrix. * * @param mat Data matrix * @param row Row to visit * @param visitor Visitor function */ protected void visitRow(double[][] mat, int row, int mode, CellVisitor visitor) { boolean rselected = BitsUtil.get(rows, row); final double[] rowdata = mat[row]; for (int cpos = 0, clpos = 0; clpos < cols.length; ++clpos) { long clong = cols[clpos]; // Fast skip blocks of 64 masked values. 
if (mode == CellVisitor.SELECTED && clong == 0L) { cpos += Long.SIZE; continue; } if (mode == CellVisitor.NOT_SELECTED && clong == -1L) { cpos += Long.SIZE; continue; } for (int j = 0; j < Long.SIZE && cpos < colM.length; ++j, ++cpos, clong >>>= 1) { boolean cselected = ((clong & 1L) == 1L); if (mode == CellVisitor.SELECTED && !cselected) { continue; } if (mode == CellVisitor.NOT_SELECTED && cselected) { continue; } boolean stop = visitor.visit(rowdata[cpos], row, cpos, rselected, cselected); if (stop) { return; } } } } /** Visitor for updating the means. */ private final CellVisitor MEANVISITOR = new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { if (selcol) { rowM[row] += val; } if (selrow) { colM[col] += val; } if (selcol && selrow) { allM += val; } return false; } }; /** * Update the row means and column means. * * @param mat Data matrix * @param all Flag, to update all * @return overall mean */ protected double updateRowAndColumnMeans(final double[][] mat, boolean all) { final int mode = all ? CellVisitor.ALL : CellVisitor.SELECTED; Arrays.fill(rowM, 0.); Arrays.fill(colM, 0.); allM = 0.; visitAll(mat, mode, MEANVISITOR); visitColumn( mat, 0, mode, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { rowM[row] /= colcard; return false; } }); visitRow( mat, 0, mode, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { colM[col] /= rowcard; return false; } }); allM /= colcard * rowcard; return allM; } /** * Compute the mean square residue. * * @param mat Data matrix * @return mean squared residue */ protected double computeMeanSquaredDeviation(final double[][] mat) { final Mean msr = new Mean(); visitAll( mat, CellVisitor.SELECTED, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { assert (selrow && selcol); double v = val - rowM[row] - colM[col] + allM; msr.put(v * v); return false; } }); residue = msr.getMean(); return residue; } /** * Computes the <b>mean row residue</b> of the given <code>row</code>. * * @param mat Data matrix * @param row The row who's residue should be computed. * @param rowinverted Indicates if the row should be considered inverted. * @return The row residue of the given <code>row</code>. */ protected double computeRowResidue(final double[][] mat, int row, final boolean rowinverted) { final Mean rowResidue = new Mean(); visitRow( mat, row, CellVisitor.SELECTED, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { assert (selcol); final double rowMean = rowM[row]; final double colMean = colM[col]; double v = ((!rowinverted) ? (val - rowMean) : (rowMean - val)) - colMean + allM; rowResidue.put(v * v); return false; } }); return rowResidue.getMean(); } /** * Computes the <b>mean column residue</b> of the given <code>col</code>. * * @param col The column who's residue should be computed. * @return The row residue of the given <code>col</code>um. 
*/ protected double computeColResidue(final double[][] mat, final int col) { final double bias = colM[col] - allM; final Mean colResidue = new Mean(); visitColumn( mat, col, CellVisitor.SELECTED, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { assert (selrow); final double rowMean = rowM[row]; double v = val - rowMean - bias; colResidue.put(v * v); return false; } }); return colResidue.getMean(); } /** * Updates the mask with replacement values for all data in the given rows and columns. * * @param mat Mask to update. * @param replacement Distribution to sample replacement values from. */ protected void maskMatrix(final double[][] mat, final Distribution replacement) { visitAll( mat, CellVisitor.SELECTED, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { assert (selrow && selcol); mat[row][col] = replacement.nextRandom(); return false; } }); } /** * Select or deselect a column. * * @param cnum Column to select * @param set Value to set */ protected void selectColumn(int cnum, boolean set) { if (set) { BitsUtil.setI(cols, cnum); colcard++; } else { BitsUtil.clearI(cols, cnum); colcard--; } } /** * Select or deselect a row. * * @param rnum Row to select * @param set Value to set */ protected void selectRow(int rnum, boolean set) { if (set) { BitsUtil.setI(rows, rnum); rowcard++; } else { BitsUtil.clearI(rows, rnum); rowcard--; } } protected void invertRow(int rnum, boolean b) { BitsUtil.setI(irow, rnum); } } @Override public Clustering<BiclusterWithInversionsModel> biclustering() { double[][] mat = RelationUtil.relationAsMatrix(relation, rowIDs); BiclusterCandidate cand = new BiclusterCandidate(getRowDim(), getColDim()); Clustering<BiclusterWithInversionsModel> result = new Clustering<>("Cheng-and-Church", "Cheng and Church Biclustering"); ModifiableDBIDs noise = DBIDUtil.newHashSet(relation.getDBIDs()); FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Extracting Cluster", n, LOG) : null; for (int i = 0; i < n; i++) { cand.reset(); multipleNodeDeletion(mat, cand); if (LOG.isVeryVerbose()) { LOG.veryverbose( "Residue after Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); } singleNodeDeletion(mat, cand); if (LOG.isVeryVerbose()) { LOG.veryverbose( "Residue after Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); } nodeAddition(mat, cand); if (LOG.isVeryVerbose()) { LOG.veryverbose( "Residue after Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); } cand.maskMatrix(mat, dist); BiclusterWithInversionsModel model = new BiclusterWithInversionsModel(colsBitsetToIDs(cand.cols), rowsBitsetToIDs(cand.irow)); final ArrayDBIDs cids = rowsBitsetToIDs(cand.rows); noise.removeDBIDs(cids); result.addToplevelCluster(new Cluster<>(cids, model)); if (LOG.isVerbose()) { LOG.verbose("Score of bicluster " + (i + 1) + ": " + cand.residue + "\n"); LOG.verbose("Number of rows: " + cand.rowcard + "\n"); LOG.verbose("Number of columns: " + cand.colcard + "\n"); // LOG.verbose("Total number of masked values: " + maskedVals.size() + // "\n"); } LOG.incrementProcessed(prog); } // Add a noise cluster, full-dimensional. 
if (!noise.isEmpty()) { long[] allcols = BitsUtil.ones(getColDim()); BiclusterWithInversionsModel model = new BiclusterWithInversionsModel(colsBitsetToIDs(allcols), DBIDUtil.EMPTYDBIDS); result.addToplevelCluster(new Cluster<>(noise, true, model)); } LOG.ensureCompleted(prog); return result; } /** * Algorithm 1 of Cheng and Church: * * <p>Remove single rows or columns. * * <p>Inverted rows are not supported in this method. * * @param mat Data matrix * @param cand Bicluster candidate */ private void singleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) { // Assume that cand.residue is up to date! while (cand.residue > delta && (cand.colcard > 2 || cand.rowcard > 2)) { // Store current maximum. Need final mutable, so use arrays. final double[] max = {Double.NEGATIVE_INFINITY}; final int[] best = {-1, -1}; // Test rows if (cand.rowcard > 2) { cand.visitColumn( mat, 0, CellVisitor.SELECTED, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { assert (selrow); double rowResidue = cand.computeRowResidue(mat, row, false); if (max[0] < rowResidue) { max[0] = rowResidue; best[0] = row; } return false; } }); } // Test columns: if (cand.colcard > 2) { cand.visitRow( mat, 0, CellVisitor.SELECTED, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { assert (selcol); double colResidue = cand.computeColResidue(mat, col); if (max[0] < colResidue) { max[0] = colResidue; best[1] = col; } return false; } }); } if (best[1] >= 0) { // then override bestrow! cand.selectColumn(best[1], false); } else { assert (best[0] >= 0); cand.selectRow(best[0], false); } // TODO: incremental update could be much faster? cand.updateRowAndColumnMeans(mat, false); cand.computeMeanSquaredDeviation(mat); if (LOG.isDebuggingFine()) { LOG.debugFine( "Residue in Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); } } } // /** * Algorithm 2 of Cheng and Church. * * <p>Remove all rows and columns that reduce the residue by alpha. * * <p>Inverted rows are not supported in this method. 
* * @param mat Data matrix * @param cand Bicluster candidate */ private void multipleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) { cand.updateRowAndColumnMeans(mat, false); cand.computeMeanSquaredDeviation(mat); // Note: assumes that cand.residue = H(I,J) while (cand.residue > delta) { final boolean[] modified = {false, false}; // Step 2: remove rows above threshold if (cand.rowcard > MIN_ROW_REMOVE_THRESHOLD) { final double alphaResidue = alpha * cand.residue; cand.visitColumn( mat, 0, CellVisitor.SELECTED, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { assert (selrow); if (cand.computeRowResidue(mat, row, false) > alphaResidue) { cand.selectRow(row, false); modified[0] = true; } return (cand.rowcard > MIN_ROW_REMOVE_THRESHOLD); } }); // Step 3: update residue if (modified[0]) { cand.updateRowAndColumnMeans(mat, false); cand.computeMeanSquaredDeviation(mat); } } // Step 4: remove columns above threshold if (cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD) { final double alphaResidue = alpha * cand.residue; cand.visitRow( mat, 0, CellVisitor.SELECTED, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { assert (selcol); if (cand.computeColResidue(mat, col) > alphaResidue) { cand.selectColumn(col, false); modified[1] = true; } return (cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD); } }); if (modified[1]) { cand.updateRowAndColumnMeans(mat, false); cand.computeMeanSquaredDeviation(mat); } } if (LOG.isDebuggingFine()) { LOG.debugFine( "Residue in Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); } // Step 5: if nothing has been removed, try removing single nodes. if (!modified[0] && !modified[1]) { break; // Will be executed next in main loop, as per algorithm 4. // singleNodeDeletion(); } } } /** * Algorithm 3 of Cheng and Church. * * <p>Try to re-add rows or columns that decrease the overall score. * * <p>Also try adding inverted rows. * * @param mat Data matrix * @param cand Bicluster candidate */ private void nodeAddition(final double[][] mat, final BiclusterCandidate cand) { cand.updateRowAndColumnMeans(mat, true); cand.computeMeanSquaredDeviation(mat); while (true) { // We need this to be final + mutable final boolean[] added = new boolean[] {false, false}; // Step 2: add columns cand.visitRow( mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { assert (!selcol); if (cand.computeColResidue(mat, col) <= cand.residue) { cand.selectColumn(col, true); added[0] = true; } return false; } }); // Step 3: recompute values if (added[0]) { cand.updateRowAndColumnMeans(mat, true); cand.computeMeanSquaredDeviation(mat); } // Step 4: try adding rows. cand.visitColumn( mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { assert (!selrow); if (cand.computeRowResidue(mat, row, false) <= cand.residue) { cand.selectRow(row, true); added[1] = true; } return false; } }); // Step 5: try adding inverted rows. 
if (useinverted) { cand.visitColumn( mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() { @Override public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { assert (!selrow); if (cand.computeRowResidue(mat, row, true) <= cand.residue) { cand.selectRow(row, true); cand.invertRow(row, true); added[1] = true; } return false; } }); } if (added[1]) { cand.updateRowAndColumnMeans(mat, true); cand.computeMeanSquaredDeviation(mat); if (LOG.isDebuggingFine()) { LOG.debugFine( "Residue in Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); } } if (!added[0] && !added[1]) { break; } } } @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); } @Override protected Logging getLogger() { return LOG; } /** * Parameterization class. * * @author Erich Schubert * @apiviz.exclude * @param <V> Vector type */ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer { /** Parameter to specify the distribution of replacement values when masking a cluster. */ public static final OptionID DIST_ID = new OptionID( "chengandchurch.replacement", "Distribution of replacement values when masking found clusters."); /** * Threshold value to determine the maximal acceptable score (mean squared residue) of a * bicluster. * * <p>Key: {@code -chengandchurch.delta} */ public static final OptionID DELTA_ID = new OptionID( "chengandchurch.delta", "Threshold value to determine the maximal acceptable score (mean squared residue) of a bicluster."); /** * Parameter for multiple node deletion to accelerate the algorithm. (>= 1) * * <p>Key: {@code -chengandchurch.alpha} */ public static final OptionID ALPHA_ID = new OptionID( "chengandchurch.alpha", "Parameter for multiple node deletion to accelerate the algorithm."); /** * Number of biclusters to be found. * * <p>Default value: 1 * * <p>Key: {@code -chengandchurch.n} */ public static final OptionID N_ID = new OptionID("chengandchurch.n", "The number of biclusters to be found."); /** Threshold for the score ({@link #DELTA_ID}). */ private double delta; /** * The parameter for multiple node deletion. * * <p>It is used to magnify the {@link #delta} value in the {@link * ChengAndChurch#multipleNodeDeletion} method. */ private double alpha; /** Number of biclusters to be found. */ private int n; /** Distribution of replacement values. */ private Distribution dist; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); DoubleParameter deltaP = new DoubleParameter(DELTA_ID); if (config.grab(deltaP)) { delta = deltaP.doubleValue(); } deltaP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE); IntParameter nP = new IntParameter(N_ID, 1); nP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if (config.grab(nP)) { n = nP.intValue(); } DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 1.); alphaP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_DOUBLE); if (config.grab(alphaP)) { alpha = alphaP.doubleValue(); } ObjectParameter<Distribution> distP = new ObjectParameter<>(DIST_ID, Distribution.class, UniformDistribution.class); if (config.grab(distP)) { dist = distP.instantiateClass(config); } } @Override protected ChengAndChurch<V> makeInstance() { return new ChengAndChurch<>(delta, alpha, n, dist); } } }
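For readers who want to try the biclusterer, here is a minimal usage sketch. It is not part of the class above: the data-loading helpers (ArrayAdapterDatabaseConnection, StaticArrayDatabase), the package layout, and the parameter values are assumptions in the style of an ELKI 0.7.x distribution, and the run(Database) entry point is the one inherited from the abstract algorithm base class.

import de.lmu.ifi.dbs.elki.algorithm.clustering.biclustering.ChengAndChurch;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.BiclusterWithInversionsModel;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.StaticArrayDatabase;
import de.lmu.ifi.dbs.elki.datasource.ArrayAdapterDatabaseConnection;
import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;

public class ChengAndChurchExample {
  public static void main(String[] args) {
    // Toy gene-expression-like matrix: rows are objects, columns are attributes.
    double[][] data = { {1., 2., 3.}, {2., 3., 4.}, {9., 1., 7.}, {1., 2., 3.5} };
    Database db = new StaticArrayDatabase(new ArrayAdapterDatabaseConnection(data), null);
    db.initialize();

    // Configure the algorithm via the option IDs defined in the Parameterizer above.
    // delta, alpha and n are placeholder values; the replacement distribution keeps its default.
    ListParameterization params = new ListParameterization();
    params.addParameter(ChengAndChurch.Parameterizer.DELTA_ID, 0.3);
    params.addParameter(ChengAndChurch.Parameterizer.ALPHA_ID, 1.2);
    params.addParameter(ChengAndChurch.Parameterizer.N_ID, 2);
    ChengAndChurch<NumberVector> cc =
        ClassGenericsUtil.parameterizeOrAbort(ChengAndChurch.class, params);

    // run(Database) locates the number-vector relation and invokes biclustering().
    Clustering<BiclusterWithInversionsModel> result = cc.run(db);
    System.out.println("Found " + result.getAllClusters().size() + " biclusters (incl. noise).");
  }
}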
/** * The standard k-means algorithm, using Lloyd-style bulk iterations. * * <p>Reference:<br> * S. Lloyd<br> * Least squares quantization in PCM<br> * IEEE Transactions on Information Theory 28 (2)<br> * previously published as Bell Telephone Laboratories Paper * * @author Arthur Zimek * @apiviz.landmark * @apiviz.has KMeansModel * @param <V> vector datatype */ @Title("K-Means") @Description("Finds a least-squared partitioning into k clusters.") @Reference( authors = "S. Lloyd", // title = "Least squares quantization in PCM", // booktitle = "IEEE Transactions on Information Theory 28 (2): 129–137.", // url = "http://dx.doi.org/10.1109/TIT.1982.1056489") public class KMeansLloyd<V extends NumberVector> extends AbstractKMeans<V, KMeansModel> { /** The logger for this class. */ private static final Logging LOG = Logging.getLogger(KMeansLloyd.class); /** Key for statistics logging. */ private static final String KEY = KMeansLloyd.class.getName(); /** * Constructor. * * @param distanceFunction distance function * @param k k parameter * @param maxiter Maxiter parameter * @param initializer Initialization method */ public KMeansLloyd( NumberVectorDistanceFunction<? super V> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer) { super(distanceFunction, k, maxiter, initializer); } @Override public Clustering<KMeansModel> run(Database database, Relation<V> relation) { if (relation.size() <= 0) { return new Clustering<>("k-Means Clustering", "kmeans-clustering"); } // Choose initial means if (LOG.isStatistics()) { LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString())); } double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction()); // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<>(); for (int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k))); } WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage( relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1); double[] varsum = new double[k]; IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null; DoubleStatistic varstat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null; int iteration = 0; for (; maxiter <= 0 || iteration < maxiter; iteration++) { LOG.incrementProcessed(prog); boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum); logVarstat(varstat, varsum); // Stop if no cluster assignment changed. if (!changed) { break; } // Recompute means. means = means(clusters, means, relation); } LOG.setCompleted(prog); if (LOG.isStatistics()) { LOG.statistics(new LongStatistic(KEY + ".iterations", iteration)); } // Wrap result Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); for (int i = 0; i < clusters.size(); i++) { DBIDs ids = clusters.get(i); if (ids.size() == 0) { continue; } KMeansModel model = new KMeansModel(means[i], varsum[i]); result.addToplevelCluster(new Cluster<>(ids, model)); } return result; } @Override protected Logging getLogger() { return LOG; } /** * Parameterization class. 
* * @author Erich Schubert * @apiviz.exclude */ public static class Parameterizer<V extends NumberVector> extends AbstractKMeans.Parameterizer<V> { @Override protected Logging getLogger() { return LOG; } @Override protected KMeansLloyd<V> makeInstance() { return new KMeansLloyd<>(distanceFunction, k, maxiter, initializer); } } }
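A minimal sketch of running Lloyd's k-means from Java follows. The database setup helpers, the k-means++ initializer class, and the RandomFactory location are assumptions based on an ELKI 0.7.x layout and may need adjusting for other versions; k, the iteration limit, and the toy data are placeholders.

import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeansLloyd;
import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansPlusPlusInitialMeans;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.KMeansModel;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.StaticArrayDatabase;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.datasource.ArrayAdapterDatabaseConnection;
import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.utilities.random.RandomFactory;

public class KMeansLloydExample {
  public static void main(String[] args) {
    // Small toy data set; rows are objects, columns are attributes.
    double[][] data = { {1., 1.}, {1.1, 0.9}, {5., 5.}, {5.2, 4.8}, {9., 1.} };
    Database db = new StaticArrayDatabase(new ArrayAdapterDatabaseConnection(data), null);
    db.initialize();
    Relation<NumberVector> rel = db.getRelation(TypeUtil.NUMBER_VECTOR_FIELD);

    // k = 2, no iteration limit (maxiter <= 0), k-means++ seeding.
    KMeansLloyd<NumberVector> kmeans = new KMeansLloyd<>(
        SquaredEuclideanDistanceFunction.STATIC, 2, 0,
        new KMeansPlusPlusInitialMeans<NumberVector>(RandomFactory.DEFAULT));
    Clustering<KMeansModel> clustering = kmeans.run(db, rel);

    // Each cluster carries a KMeansModel holding its mean and variance contribution.
    for (Cluster<KMeansModel> c : clustering.getAllClusters()) {
      System.out.println("Cluster of size " + c.size());
    }
  }
}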
/** * Fast Outlier Detection Using the "Local Correlation Integral". * * <p>Exact implementation only, not aLOCI. See {@link ALOCI}. * * <p>Outlier detection using multiple epsilon neighborhoods. * * <p>This implementation has O(n<sup>3</sup> log n) runtime complexity! * * <p>Based on: S. Papadimitriou, H. Kitagawa, P. B. Gibbons and C. Faloutsos: LOCI: Fast Outlier * Detection Using the Local Correlation Integral. In: Proc. 19th IEEE Int. Conf. on Data * Engineering (ICDE '03), Bangalore, India, 2003. * * @author Erich Schubert * @apiviz.has RangeQuery * @param <O> Object type */ @Title("LOCI: Fast Outlier Detection Using the Local Correlation Integral") @Description("Algorithm to compute outliers based on the Local Correlation Integral") @Reference( authors = "S. Papadimitriou, H. Kitagawa, P. B. Gibbons, C. Faloutsos", title = "LOCI: Fast Outlier Detection Using the Local Correlation Integral", booktitle = "Proc. 19th IEEE Int. Conf. on Data Engineering (ICDE '03), Bangalore, India, 2003", url = "http://dx.doi.org/10.1109/ICDE.2003.1260802") @Alias({"de.lmu.ifi.dbs.elki.algorithm.outlier.LOCI"}) public class LOCI<O> extends AbstractDistanceBasedAlgorithm<O, OutlierResult> implements OutlierAlgorithm { /** The logger for this class. */ private static final Logging LOG = Logging.getLogger(LOCI.class); /** * Parameter to specify the maximum radius of the neighborhood to be considered, must be suitable * to the distance function specified. */ public static final OptionID RMAX_ID = new OptionID("loci.rmax", "The maximum radius of the neighborhood to be considered."); /** Parameter to specify the minimum neighborhood size */ public static final OptionID NMIN_ID = new OptionID("loci.nmin", "Minimum neighborhood size to be considered."); /** Parameter to specify the averaging neighborhood scaling. */ public static final OptionID ALPHA_ID = new OptionID("loci.alpha", "Scaling factor for averaging neighborhood"); /** Holds the value of {@link #RMAX_ID}. */ private double rmax; /** Holds the value of {@link #NMIN_ID}. */ private int nmin; /** Holds the value of {@link #ALPHA_ID}. */ private double alpha; /** * Constructor. * * @param distanceFunction Distance function * @param rmax Maximum radius * @param nmin Minimum neighborhood size * @param alpha Alpha value */ public LOCI(DistanceFunction<? super O> distanceFunction, double rmax, int nmin, double alpha) { super(distanceFunction); this.rmax = rmax; this.nmin = nmin; this.alpha = alpha; } /** * Run the algorithm * * @param database Database to process * @param relation Relation to process * @return Outlier result */ public OutlierResult run(Database database, Relation<O> relation) { DistanceQuery<O> distFunc = database.getDistanceQuery(relation, getDistanceFunction()); RangeQuery<O> rangeQuery = database.getRangeQuery(distFunc); DBIDs ids = relation.getDBIDs(); // LOCI preprocessing step WritableDataStore<DoubleIntArrayList> interestingDistances = DataStoreUtil.makeStorage( relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_SORTED, DoubleIntArrayList.class); precomputeInterestingRadii(ids, rangeQuery, interestingDistances); // LOCI main step FiniteProgress progressLOCI = LOG.isVerbose() ? 
new FiniteProgress("LOCI scores", relation.size(), LOG) : null; WritableDoubleDataStore mdef_norm = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); WritableDoubleDataStore mdef_radius = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); DoubleMinMax minmax = new DoubleMinMax(); // Shared instance, to save allocations. MeanVariance mv_n_r_alpha = new MeanVariance(); for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) { final DoubleIntArrayList cdist = interestingDistances.get(iditer); final double maxdist = cdist.getDouble(cdist.size() - 1); final int maxneig = cdist.getInt(cdist.size() - 1); double maxmdefnorm = 0.0; double maxnormr = 0; if (maxneig >= nmin) { // Compute the largest neighborhood we will need. DoubleDBIDList maxneighbors = rangeQuery.getRangeForDBID(iditer, maxdist); // TODO: Ensure the result is sorted. This is currently implied. // For any critical distance, compute the normalized MDEF score. for (int i = 0, size = cdist.size(); i < size; i++) { // Only start when minimum size is fulfilled if (cdist.getInt(i) < nmin) { continue; } final double r = cdist.getDouble(i); final double alpha_r = alpha * r; // compute n(p_i, \alpha * r) from list (note: alpha_r is not cdist!) final int n_alphar = cdist.getInt(cdist.find(alpha_r)); // compute \hat{n}(p_i, r, \alpha) and the corresponding \simga_{MDEF} mv_n_r_alpha.reset(); for (DoubleDBIDListIter neighbor = maxneighbors.iter(); neighbor.valid(); neighbor.advance()) { // Stop at radius r if (neighbor.doubleValue() > r) { break; } DoubleIntArrayList cdist2 = interestingDistances.get(neighbor); int rn_alphar = cdist2.getInt(cdist2.find(alpha_r)); mv_n_r_alpha.put(rn_alphar); } // We only use the average and standard deviation final double nhat_r_alpha = mv_n_r_alpha.getMean(); final double sigma_nhat_r_alpha = mv_n_r_alpha.getNaiveStddev(); // Redundant divisions by nhat_r_alpha removed. final double mdef = nhat_r_alpha - n_alphar; final double sigmamdef = sigma_nhat_r_alpha; final double mdefnorm = mdef / sigmamdef; if (mdefnorm > maxmdefnorm) { maxmdefnorm = mdefnorm; maxnormr = r; } } } else { // FIXME: when nmin was not fulfilled - what is the proper value then? maxmdefnorm = Double.POSITIVE_INFINITY; maxnormr = maxdist; } mdef_norm.putDouble(iditer, maxmdefnorm); mdef_radius.putDouble(iditer, maxnormr); minmax.put(maxmdefnorm); LOG.incrementProcessed(progressLOCI); } LOG.ensureCompleted(progressLOCI); DoubleRelation scoreResult = new MaterializedDoubleRelation( "LOCI normalized MDEF", "loci-mdef-outlier", mdef_norm, relation.getDBIDs()); OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta( minmax.getMin(), minmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 0.0); OutlierResult result = new OutlierResult(scoreMeta, scoreResult); result.addChildResult( new MaterializedDoubleRelation( "LOCI MDEF Radius", "loci-critical-radius", mdef_radius, relation.getDBIDs())); return result; } /** * Preprocessing step: determine the radii of interest for each point. * * @param ids IDs to process * @param rangeQuery Range query * @param interestingDistances Distances of interest */ protected void precomputeInterestingRadii( DBIDs ids, RangeQuery<O> rangeQuery, WritableDataStore<DoubleIntArrayList> interestingDistances) { FiniteProgress progressPreproc = LOG.isVerbose() ? 
new FiniteProgress("LOCI preprocessing", ids.size(), LOG) : null; for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) { DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(iditer, rmax); // build list of critical distances DoubleIntArrayList cdist = new DoubleIntArrayList(neighbors.size() << 1); { int i = 0; DoubleDBIDListIter ni = neighbors.iter(); while (ni.valid()) { final double curdist = ni.doubleValue(); ++i; ni.advance(); // Skip, if tied to the next object: if (ni.valid() && curdist == ni.doubleValue()) { continue; } cdist.append(curdist, i); // Scale radius, and reinsert if (alpha != 1.) { final double ri = curdist / alpha; if (ri <= rmax) { cdist.append(ri, Integer.MIN_VALUE); } } } } cdist.sort(); // fill the gaps to have fast lookups of number of neighbors at a given // distance. int lastk = 0; for (int i = 0, size = cdist.size(); i < size; i++) { final int k = cdist.getInt(i); if (k == Integer.MIN_VALUE) { cdist.setValue(i, lastk); } else { lastk = k; } } // TODO: shrink the list, removing duplicate radii? interestingDistances.put(iditer, cdist); LOG.incrementProcessed(progressPreproc); } LOG.ensureCompleted(progressPreproc); } /** * Array of double-int values. * * @author Erich Schubert * @apiviz.exclude */ protected static class DoubleIntArrayList { /** Double keys */ double[] keys; /** Integer values */ int[] vals; /** Used size */ int size = 0; /** * Constructor. * * @param alloc Initial allocation. */ public DoubleIntArrayList(int alloc) { keys = new double[alloc]; vals = new int[alloc]; size = 0; } /** * Collection size. * * @return Size */ public int size() { return size; } /** * Get the key at the given position. * * @param i Position * @return Key */ public double getDouble(int i) { return keys[i]; } /** * Get the value at the given position. * * @param i Position * @return Value */ public int getInt(int i) { return vals[i]; } /** * Get the value at the given position. * * @param i Position * @param val New value */ public void setValue(int i, int val) { vals[i] = val; } /** * Append a key-value pair. * * @param key Key to append * @param val Value to append. */ public void append(double key, int val) { if (size == keys.length) { keys = Arrays.copyOf(keys, size << 1); vals = Arrays.copyOf(vals, size << 1); } keys[size] = key; vals[size] = val; ++size; } /** * Find the last position with a smaller or equal key. * * @param search Key * @return Position */ public int find(final double search) { int a = 0, b = size - 1; while (a <= b) { final int mid = (a + b) >>> 1; final double cur = keys[mid]; if (cur > search) { b = mid - 1; } else { // less or equal! a = mid + 1; } } return b; } /** Sort the array list. */ public void sort() { DoubleIntegerArrayQuickSort.sort(keys, vals, size); } } @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(getDistanceFunction().getInputTypeRestriction()); } @Override protected Logging getLogger() { return LOG; } /** * Parameterization class. 
* * @author Erich Schubert * @apiviz.exclude * @param <O> Object type */ public static class Parameterizer<O> extends AbstractDistanceBasedAlgorithm.Parameterizer<O> { /** Maximum radius of the neighborhood. */ protected double rmax; /** Minimum neighborhood size. */ protected int nmin = 0; /** Scaling factor for the averaging neighborhood. */ protected double alpha = 0.5; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); final DoubleParameter rmaxP = new DoubleParameter(RMAX_ID); if (config.grab(rmaxP)) { rmax = rmaxP.doubleValue(); } final IntParameter nminP = new IntParameter(NMIN_ID, 20); if (config.grab(nminP)) { nmin = nminP.intValue(); } final DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 0.5); if (config.grab(alphaP)) { alpha = alphaP.doubleValue(); } } @Override protected LOCI<O> makeInstance() { return new LOCI<>(distanceFunction, rmax, nmin, alpha); } } }
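Below is a hedged usage sketch for LOCI. The package of the LOCI class (it was moved at some point, see the @Alias above), the data-loading helpers, and the chosen rmax/nmin/alpha values are assumptions for an ELKI 0.7.x setup, not something prescribed by the implementation above.

import de.lmu.ifi.dbs.elki.algorithm.outlier.lof.LOCI;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.StaticArrayDatabase;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.datasource.ArrayAdapterDatabaseConnection;
import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;

public class LOCIExample {
  public static void main(String[] args) {
    // Two dense groups plus one isolated point that should receive a high MDEF score.
    double[][] data = { {1., 1.}, {1.1, 1.}, {0.9, 1.1}, {5., 5.}, {5.1, 4.9}, {9., 9.} };
    Database db = new StaticArrayDatabase(new ArrayAdapterDatabaseConnection(data), null);
    db.initialize();
    Relation<NumberVector> rel = db.getRelation(TypeUtil.NUMBER_VECTOR_FIELD);

    // rmax bounds the sampled radius, nmin the minimum neighborhood size, alpha the averaging scale.
    LOCI<NumberVector> loci = new LOCI<>(EuclideanDistanceFunction.STATIC, 10.0, 3, 0.5);
    OutlierResult result = loci.run(db, rel);

    // Normalized MDEF scores, one per object; larger means more outlying.
    for (DBIDIter it = rel.iterDBIDs(); it.valid(); it.advance()) {
      System.out.println("MDEF score: " + result.getScores().doubleValue(it));
    }
  }
}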
/** * Implementation of the SUBCLU algorithm, an algorithm to detect arbitrarily shaped and positioned * clusters in subspaces. SUBCLU delivers for each subspace the same clusters DBSCAN would have * found, when applied to this subspace separately. * * <p>Reference: <br> * K. Kailing, H.-P. Kriegel, P. Kröger:<br> * Density connected Subspace Clustering for High Dimensional Data<br> * In Proc. SIAM Int. Conf. on Data Mining (SDM'04), Lake Buena Vista, FL, 2004. * * @author Elke Achtert * @apiviz.uses DBSCAN * @apiviz.uses DimensionSelectingSubspaceDistanceFunction * @apiviz.has SubspaceModel * @param <V> the type of FeatureVector handled by this Algorithm */ @Title("SUBCLU: Density connected Subspace Clustering") @Description( "Algorithm to detect arbitrarily shaped and positioned clusters in subspaces. " // + "SUBCLU delivers for each subspace the same clusters DBSCAN would have found, " // + "when applied to this subspace seperately.") @Reference( authors = "K. Kailing, H.-P. Kriegel, P. Kröger", // title = "Density connected Subspace Clustering for High Dimensional Data", // booktitle = "Proc. SIAM Int. Conf. on Data Mining (SDM'04), Lake Buena Vista, FL, 2004", // url = "http://www.siam.org/meetings/sdm04/proceedings/sdm04_023.pdf") public class SUBCLU<V extends NumberVector> extends AbstractAlgorithm<Clustering<SubspaceModel>> implements SubspaceClusteringAlgorithm<SubspaceModel> { /** The logger for this class. */ private static final Logging LOG = Logging.getLogger(SUBCLU.class); /** * The distance function to determine the distance between database objects. * * <p>Default value: {@link SubspaceEuclideanDistanceFunction} * * <p>Key: {@code -subclu.distancefunction} */ public static final OptionID DISTANCE_FUNCTION_ID = new OptionID( "subclu.distancefunction", "Distance function to determine the distance between database objects."); /** * Parameter to specify the maximum radius of the neighborhood to be considered, must be suitable * to {@link DimensionSelectingSubspaceDistanceFunction}. * * <p>Key: {@code -subclu.epsilon} */ public static final OptionID EPSILON_ID = new OptionID("subclu.epsilon", "The maximum radius of the neighborhood to be considered."); /** * Parameter to specify the threshold for minimum number of points in the epsilon-neighborhood of * a point, must be an integer greater than 0. * * <p>Key: {@code -subclu.minpts} */ public static final OptionID MINPTS_ID = new OptionID( "subclu.minpts", "Threshold for minimum number of points in the epsilon-neighborhood of a point."); /** Holds the instance of the distance function specified by {@link #DISTANCE_FUNCTION_ID}. */ private DimensionSelectingSubspaceDistanceFunction<V> distanceFunction; /** Holds the value of {@link #EPSILON_ID}. */ private double epsilon; /** Holds the value of {@link #MINPTS_ID}. */ private int minpts; /** Holds the result; */ private Clustering<SubspaceModel> result; /** * Constructor. * * @param distanceFunction Distance function * @param epsilon Epsilon value * @param minpts Minpts value */ public SUBCLU( DimensionSelectingSubspaceDistanceFunction<V> distanceFunction, double epsilon, int minpts) { super(); this.distanceFunction = distanceFunction; this.epsilon = epsilon; this.minpts = minpts; } /** * Performs the SUBCLU algorithm on the given database. * * @param relation Relation to process * @return Clustering result */ public Clustering<SubspaceModel> run(Relation<V> relation) { final int dimensionality = RelationUtil.dimensionality(relation); StepProgress stepprog = LOG.isVerbose() ? 
new StepProgress(dimensionality) : null; // Generate all 1-dimensional clusters LOG.beginStep(stepprog, 1, "Generate all 1-dimensional clusters."); // mapping of dimensionality to set of subspaces HashMap<Integer, List<Subspace>> subspaceMap = new HashMap<>(); // list of 1-dimensional subspaces containing clusters List<Subspace> s_1 = new ArrayList<>(); subspaceMap.put(0, s_1); // mapping of subspaces to list of clusters TreeMap<Subspace, List<Cluster<Model>>> clusterMap = new TreeMap<>(new Subspace.DimensionComparator()); for (int d = 0; d < dimensionality; d++) { Subspace currentSubspace = new Subspace(d); List<Cluster<Model>> clusters = runDBSCAN(relation, null, currentSubspace); if (LOG.isDebuggingFiner()) { StringBuilder msg = new StringBuilder(); msg.append('\n') .append(clusters.size()) .append(" clusters in subspace ") .append(currentSubspace.dimensonsToString()) .append(": \n"); for (Cluster<Model> cluster : clusters) { msg.append(" " + cluster.getIDs() + "\n"); } LOG.debugFiner(msg.toString()); } if (!clusters.isEmpty()) { s_1.add(currentSubspace); clusterMap.put(currentSubspace, clusters); } } // Generate (d+1)-dimensional clusters from d-dimensional clusters for (int d = 0; d < dimensionality - 1; d++) { if (stepprog != null) { stepprog.beginStep( d + 2, "Generate " + (d + 2) + "-dimensional clusters from " + (d + 1) + "-dimensional clusters.", LOG); } List<Subspace> subspaces = subspaceMap.get(d); if (subspaces == null || subspaces.isEmpty()) { if (stepprog != null) { for (int dim = d + 1; dim < dimensionality - 1; dim++) { stepprog.beginStep( dim + 2, "Generation of" + (dim + 2) + "-dimensional clusters not applicable, because no more " + (d + 2) + "-dimensional subspaces found.", LOG); } } break; } List<Subspace> candidates = generateSubspaceCandidates(subspaces); List<Subspace> s_d = new ArrayList<>(); for (Subspace candidate : candidates) { Subspace bestSubspace = bestSubspace(subspaces, candidate, clusterMap); if (LOG.isDebuggingFine()) { LOG.debugFine( "best subspace of " + candidate.dimensonsToString() + ": " + bestSubspace.dimensonsToString()); } List<Cluster<Model>> bestSubspaceClusters = clusterMap.get(bestSubspace); List<Cluster<Model>> clusters = new ArrayList<>(); for (Cluster<Model> cluster : bestSubspaceClusters) { List<Cluster<Model>> candidateClusters = runDBSCAN(relation, cluster.getIDs(), candidate); if (!candidateClusters.isEmpty()) { clusters.addAll(candidateClusters); } } if (LOG.isDebuggingFine()) { StringBuilder msg = new StringBuilder(); msg.append(clusters.size() + " cluster(s) in subspace " + candidate + ": \n"); for (Cluster<Model> c : clusters) { msg.append(" " + c.getIDs() + "\n"); } LOG.debugFine(msg.toString()); } if (!clusters.isEmpty()) { s_d.add(candidate); clusterMap.put(candidate, clusters); } } if (!s_d.isEmpty()) { subspaceMap.put(d + 1, s_d); } } // build result int numClusters = 1; result = new Clustering<>("SUBCLU clustering", "subclu-clustering"); for (Subspace subspace : clusterMap.descendingKeySet()) { List<Cluster<Model>> clusters = clusterMap.get(subspace); for (Cluster<Model> cluster : clusters) { Cluster<SubspaceModel> newCluster = new Cluster<>(cluster.getIDs()); newCluster.setModel(new SubspaceModel(subspace, Centroid.make(relation, cluster.getIDs()))); newCluster.setName("cluster_" + numClusters++); result.addToplevelCluster(newCluster); } } LOG.setCompleted(stepprog); return result; } /** * Returns the result of the algorithm. 
* * @return the result of the algorithm */ public Clustering<SubspaceModel> getResult() { return result; } /** * Runs the DBSCAN algorithm on the specified partition of the database in the given subspace. If * parameter {@code ids} is null DBSCAN will be applied to the whole database. * * @param relation the database holding the objects to run DBSCAN on * @param ids the IDs of the database defining the partition to run DBSCAN on - if this parameter * is null DBSCAN will be applied to the whole database * @param subspace the subspace to run DBSCAN on * @return the clustering result of the DBSCAN run */ private List<Cluster<Model>> runDBSCAN(Relation<V> relation, DBIDs ids, Subspace subspace) { // distance function distanceFunction.setSelectedDimensions(subspace.getDimensions()); ProxyDatabase proxy; if (ids == null) { // TODO: in this case, we might want to use an index - the proxy below // will prevent this! ids = relation.getDBIDs(); } proxy = new ProxyDatabase(ids, relation); DBSCAN<V> dbscan = new DBSCAN<>(distanceFunction, epsilon, minpts); // run DBSCAN if (LOG.isVerbose()) { LOG.verbose("\nRun DBSCAN on subspace " + subspace.dimensonsToString()); } Clustering<Model> dbsres = dbscan.run(proxy); // separate cluster and noise List<Cluster<Model>> clusterAndNoise = dbsres.getAllClusters(); List<Cluster<Model>> clusters = new ArrayList<>(); for (Cluster<Model> c : clusterAndNoise) { if (!c.isNoise()) { clusters.add(c); } } return clusters; } /** * Generates {@code d+1}-dimensional subspace candidates from the specified {@code d}-dimensional * subspaces. * * @param subspaces the {@code d}-dimensional subspaces * @return the {@code d+1}-dimensional subspace candidates */ private List<Subspace> generateSubspaceCandidates(List<Subspace> subspaces) { List<Subspace> candidates = new ArrayList<>(); if (subspaces.isEmpty()) { return candidates; } // Generate (d+1)-dimensional candidate subspaces int d = subspaces.get(0).dimensionality(); StringBuilder msgFine = new StringBuilder("\n"); if (LOG.isDebuggingFiner()) { msgFine.append("subspaces ").append(subspaces).append('\n'); } for (int i = 0; i < subspaces.size(); i++) { Subspace s1 = subspaces.get(i); for (int j = i + 1; j < subspaces.size(); j++) { Subspace s2 = subspaces.get(j); Subspace candidate = s1.join(s2); if (candidate != null) { if (LOG.isDebuggingFiner()) { msgFine.append("candidate: ").append(candidate.dimensonsToString()).append('\n'); } // prune irrelevant candidate subspaces List<Subspace> lowerSubspaces = lowerSubspaces(candidate); if (LOG.isDebuggingFiner()) { msgFine.append("lowerSubspaces: ").append(lowerSubspaces).append('\n'); } boolean irrelevantCandidate = false; for (Subspace s : lowerSubspaces) { if (!subspaces.contains(s)) { irrelevantCandidate = true; break; } } if (!irrelevantCandidate) { candidates.add(candidate); } } } } if (LOG.isDebuggingFiner()) { LOG.debugFiner(msgFine.toString()); } if (LOG.isDebugging()) { StringBuilder msg = new StringBuilder(); msg.append(d + 1).append("-dimensional candidate subspaces: "); for (Subspace candidate : candidates) { msg.append(candidate.dimensonsToString()).append(' '); } LOG.debug(msg.toString()); } return candidates; } /** * Returns the list of all {@code (d-1)}-dimensional subspaces of the specified {@code * d}-dimensional subspace. 
* * @param subspace the {@code d}-dimensional subspace * @return a list of all {@code (d-1)}-dimensional subspaces */ private List<Subspace> lowerSubspaces(Subspace subspace) { int dimensionality = subspace.dimensionality(); if (dimensionality <= 1) { return null; } // order result according to the dimensions List<Subspace> result = new ArrayList<>(); long[] dimensions = subspace.getDimensions(); for (int dim = BitsUtil.nextSetBit(dimensions, 0); dim >= 0; dim = BitsUtil.nextSetBit(dimensions, dim + 1)) { long[] newDimensions = dimensions.clone(); BitsUtil.clearI(newDimensions, dim); result.add(new Subspace(newDimensions)); } return result; } /** * Determines the {@code d}-dimensional subspace of the {@code (d+1)} -dimensional candidate with * minimal number of objects in the cluster. * * @param subspaces the list of {@code d}-dimensional subspaces containing clusters * @param candidate the {@code (d+1)}-dimensional candidate subspace * @param clusterMap the mapping of subspaces to clusters * @return the {@code d}-dimensional subspace of the {@code (d+1)} -dimensional candidate with * minimal number of objects in the cluster */ private Subspace bestSubspace( List<Subspace> subspaces, Subspace candidate, TreeMap<Subspace, List<Cluster<Model>>> clusterMap) { Subspace bestSubspace = null; for (Subspace subspace : subspaces) { int min = Integer.MAX_VALUE; if (subspace.isSubspace(candidate)) { List<Cluster<Model>> clusters = clusterMap.get(subspace); for (Cluster<Model> cluster : clusters) { int clusterSize = cluster.size(); if (clusterSize < min) { min = clusterSize; bestSubspace = subspace; } } } } return bestSubspace; } @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); } @Override protected Logging getLogger() { return LOG; } /** * Parameterization class. * * @author Erich Schubert * @apiviz.exclude */ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer { protected int minpts = 0; protected double epsilon; protected DimensionSelectingSubspaceDistanceFunction<V> distance = null; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); ObjectParameter<DimensionSelectingSubspaceDistanceFunction<V>> param = new ObjectParameter<>( DISTANCE_FUNCTION_ID, DimensionSelectingSubspaceDistanceFunction.class, SubspaceEuclideanDistanceFunction.class); if (config.grab(param)) { distance = param.instantiateClass(config); } DoubleParameter epsilonP = new DoubleParameter(EPSILON_ID); if (config.grab(epsilonP)) { epsilon = epsilonP.getValue(); } IntParameter minptsP = new IntParameter(MINPTS_ID); minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if (config.grab(minptsP)) { minpts = minptsP.getValue(); } } @Override protected SUBCLU<V> makeInstance() { return new SUBCLU<>(distance, epsilon, minpts); } } }
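The following sketch shows one way SUBCLU could be configured through the parameterization API, relying on the EPSILON_ID and MINPTS_ID options defined above and leaving the distance function at its SubspaceEuclideanDistanceFunction default. The surrounding setup classes, package paths, and parameter values are assumptions for an ELKI 0.7.x environment.

import de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.SUBCLU;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.SubspaceModel;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.StaticArrayDatabase;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.datasource.ArrayAdapterDatabaseConnection;
import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;

public class SUBCLUExample {
  public static void main(String[] args) {
    // Toy 3-dimensional data; subspace clusters may only be visible in some attribute subsets.
    double[][] data = {
      {1., 9., 0.1}, {1.1, 2., 0.2}, {0.9, 7., 0.15},
      {5., 5., 8.}, {5.1, 4.9, 8.1}, {5.2, 5.1, 7.9}
    };
    Database db = new StaticArrayDatabase(new ArrayAdapterDatabaseConnection(data), null);
    db.initialize();
    Relation<NumberVector> rel = db.getRelation(TypeUtil.NUMBER_VECTOR_FIELD);

    // Epsilon and minpts are placeholders; the subspace distance keeps its default.
    ListParameterization params = new ListParameterization();
    params.addParameter(SUBCLU.EPSILON_ID, 0.5);
    params.addParameter(SUBCLU.MINPTS_ID, 2);
    SUBCLU<NumberVector> subclu = ClassGenericsUtil.parameterizeOrAbort(SUBCLU.class, params);

    Clustering<SubspaceModel> clustering = subclu.run(rel);
    for (Cluster<SubspaceModel> c : clustering.getAllClusters()) {
      System.out.println(c.getName() + ": " + c.size() + " objects");
    }
  }
}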
/** * Density-Based Clustering of Applications with Noise (DBSCAN), an algorithm to find * density-connected sets in a database. * * <p>Reference: <br> * M. Ester, H.-P. Kriegel, J. Sander, X. Xu<br> * A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise<br> * In Proc. 2nd Int. Conf. on Knowledge Discovery and Data Mining (KDD '96), Portland, OR, 1996. * * @author Arthur Zimek * @param <O> the type of Object the algorithm is applied to */ @Title("DBSCAN: Density-Based Clustering of Applications with Noise") @Description( "Algorithm to find density-connected sets in a database based on the parameters 'minpts' and 'epsilon' (specifying a volume). " + "These two parameters determine a density threshold for clustering.") @Reference( authors = "M. Ester, H.-P. Kriegel, J. Sander, X. Xu", // title = "A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise", // booktitle = "Proc. 2nd Int. Conf. on Knowledge Discovery and Data Mining (KDD '96), Portland, OR, 1996", // url = "http://www.aaai.org/Papers/KDD/1996/KDD96-037") public class DBSCAN<O> extends AbstractDistanceBasedAlgorithm<O, Clustering<Model>> implements ClusteringAlgorithm<Clustering<Model>> { /** The logger for this class. */ private static final Logging LOG = Logging.getLogger(DBSCAN.class); /** Holds the epsilon radius threshold. */ protected double epsilon; /** Holds the minimum cluster size. */ protected int minpts; /** Holds a list of clusters found. */ protected List<ModifiableDBIDs> resultList; /** Holds a set of noise. */ protected ModifiableDBIDs noise; /** Holds a set of processed ids. */ protected ModifiableDBIDs processedIDs; /** Number of neighbors. */ protected long ncounter; /** * Constructor with parameters. * * @param distanceFunction Distance function * @param epsilon Epsilon value * @param minpts Minpts parameter */ public DBSCAN(DistanceFunction<? super O> distanceFunction, double epsilon, int minpts) { super(distanceFunction); this.epsilon = epsilon; this.minpts = minpts; } /** Performs the DBSCAN algorithm on the given database. */ public Clustering<Model> run(Relation<O> relation) { final int size = relation.size(); if (size < minpts) { Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering"); result.addToplevelCluster( new Cluster<Model>(relation.getDBIDs(), true, ClusterModel.CLUSTER)); return result; } RangeQuery<O> rangeQuery = QueryUtil.getRangeQuery(relation, getDistanceFunction()); resultList = new ArrayList<>(); noise = DBIDUtil.newHashSet(); runDBSCAN(relation, rangeQuery); double averagen = ncounter / (double) relation.size(); LOG.statistics(new DoubleStatistic(DBSCAN.class.getName() + ".average-neighbors", averagen)); if (averagen < 1 + 0.1 * (minpts - 1)) { LOG.warning("There are very few neighbors found. Epsilon may be too small."); } if (averagen > 100 * minpts) { LOG.warning("There are very many neighbors found. Epsilon may be too large."); } Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering"); for (ModifiableDBIDs res : resultList) { result.addToplevelCluster(new Cluster<Model>(res, ClusterModel.CLUSTER)); } result.addToplevelCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER)); return result; } /** * Run the DBSCAN algorithm * * @param relation Data relation * @param rangeQuery Range query class */ protected void runDBSCAN(Relation<O> relation, RangeQuery<O> rangeQuery) { final int size = relation.size(); FiniteProgress objprog = LOG.isVerbose() ? 
new FiniteProgress("Processing objects", size, LOG) : null; IndefiniteProgress clusprog = LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null; processedIDs = DBIDUtil.newHashSet(size); for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { if (!processedIDs.contains(iditer)) { expandCluster(relation, rangeQuery, iditer, objprog, clusprog); } if (objprog != null && clusprog != null) { objprog.setProcessed(processedIDs.size(), LOG); clusprog.setProcessed(resultList.size(), LOG); } if (processedIDs.size() == size) { break; } } // Finish progress logging LOG.ensureCompleted(objprog); LOG.setCompleted(clusprog); } /** * DBSCAN-function expandCluster. * * <p>Border-Objects become members of the first possible cluster. * * @param relation Database relation to run on * @param rangeQuery Range query to use * @param startObjectID potential seed of a new potential cluster * @param objprog the progress object for logging the current status */ protected void expandCluster( Relation<O> relation, RangeQuery<O> rangeQuery, DBIDRef startObjectID, FiniteProgress objprog, IndefiniteProgress clusprog) { DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(startObjectID, epsilon); ncounter += neighbors.size(); // startObject is no core-object if (neighbors.size() < minpts) { noise.add(startObjectID); processedIDs.add(startObjectID); if (objprog != null) { objprog.incrementProcessed(LOG); } return; } ModifiableDBIDs currentCluster = DBIDUtil.newArray(); currentCluster.add(startObjectID); processedIDs.add(startObjectID); // try to expand the cluster HashSetModifiableDBIDs seeds = DBIDUtil.newHashSet(); processNeighbors(neighbors.iter(), currentCluster, seeds); DBIDVar o = DBIDUtil.newVar(); while (!seeds.isEmpty()) { seeds.pop(o); neighbors = rangeQuery.getRangeForDBID(o, epsilon); ncounter += neighbors.size(); if (neighbors.size() >= minpts) { processNeighbors(neighbors.iter(), currentCluster, seeds); } if (objprog != null) { objprog.incrementProcessed(LOG); } } resultList.add(currentCluster); if (clusprog != null) { clusprog.setProcessed(resultList.size(), LOG); } } /** * Process a single core point. * * @param neighbor Iterator over neighbors * @param currentCluster Current cluster * @param seeds Seed set */ private void processNeighbors( DBIDIter neighbor, ModifiableDBIDs currentCluster, HashSetModifiableDBIDs seeds) { for (; neighbor.valid(); neighbor.advance()) { if (processedIDs.add(neighbor)) { seeds.add(neighbor); } else if (!noise.remove(neighbor)) { continue; } currentCluster.add(neighbor); } } @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(getDistanceFunction().getInputTypeRestriction()); } @Override protected Logging getLogger() { return LOG; } /** * Parameterization class. * * @author Erich Schubert * @apiviz.exclude */ public static class Parameterizer<O> extends AbstractDistanceBasedAlgorithm.Parameterizer<O> { /** * Parameter to specify the maximum radius of the neighborhood to be considered, must be * suitable to the distance function specified. */ public static final OptionID EPSILON_ID = new OptionID("dbscan.epsilon", "The maximum radius of the neighborhood to be considered."); /** * Parameter to specify the threshold for minimum number of points in the epsilon-neighborhood * of a point, must be an integer greater than 0. */ public static final OptionID MINPTS_ID = new OptionID( "dbscan.minpts", "Threshold for minimum number of points in the epsilon-neighborhood of a point. 
The suggested value is '2 * dim - 1'."); /** Holds the epsilon radius threshold. */ protected double epsilon; /** Holds the minimum cluster size. */ protected int minpts; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); DoubleParameter epsilonP = new DoubleParameter(EPSILON_ID).addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); if (config.grab(epsilonP)) { epsilon = epsilonP.doubleValue(); } IntParameter minptsP = new IntParameter(MINPTS_ID).addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if (config.grab(minptsP)) { minpts = minptsP.intValue(); if (minpts <= 2) { LOG.warning( "DBSCAN with minPts <= 2 is equivalent to single-link clustering at a single height. Consider using larger values of minPts."); } } } @Override protected DBSCAN<O> makeInstance() { return new DBSCAN<>(distanceFunction, epsilon, minpts); } } }
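To close, a small usage sketch for DBSCAN itself. The database construction helpers and the Euclidean distance class follow the usual ELKI 0.7.x layout (an assumption on my part), and epsilon and minpts are placeholder values chosen for the toy data.

import de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.StaticArrayDatabase;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.datasource.ArrayAdapterDatabaseConnection;
import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction;

public class DBSCANExample {
  public static void main(String[] args) {
    // Two dense groups and one straggler that should end up in the noise cluster.
    double[][] data = {
      {1., 1.}, {1.2, 0.8}, {0.9, 1.1},
      {4., 4.}, {4.1, 3.9}, {4.2, 4.1},
      {9., 0.}
    };
    Database db = new StaticArrayDatabase(new ArrayAdapterDatabaseConnection(data), null);
    db.initialize();
    Relation<NumberVector> rel = db.getRelation(TypeUtil.NUMBER_VECTOR_FIELD);

    // epsilon = 0.5, minpts = 3; tune both to the scale and density of the data.
    DBSCAN<NumberVector> dbscan = new DBSCAN<>(EuclideanDistanceFunction.STATIC, 0.5, 3);
    Clustering<Model> clustering = dbscan.run(rel);

    for (Cluster<Model> c : clustering.getAllClusters()) {
      System.out.println((c.isNoise() ? "noise" : "cluster") + " with " + c.size() + " objects");
    }
  }
}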