public Result run(Database database, Relation<O> rel) { DistanceQuery<O> dq = rel.getDistanceQuery(getDistanceFunction()); int size = rel.size(); long pairs = (size * (long) size) >> 1; final long ssize = sampling <= 1 ? (long) Math.ceil(sampling * pairs) : (long) sampling; if (ssize > Integer.MAX_VALUE) { throw new AbortException("Sampling size too large."); } final int qsize = quantile <= 0 ? 1 : (int) Math.ceil(quantile * ssize); DoubleMaxHeap heap = new DoubleMaxHeap(qsize); ArrayDBIDs ids = DBIDUtil.ensureArray(rel.getDBIDs()); DBIDArrayIter i1 = ids.iter(), i2 = ids.iter(); Random r = rand.getSingleThreadedRandom(); FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Sampling", (int) ssize, LOG) : null; for (long i = 0; i < ssize; i++) { int x = r.nextInt(size - 1) + 1, y = r.nextInt(x); double dist = dq.distance(i1.seek(x), i2.seek(y)); // Skip NaN, and/or zeros. if (dist != dist || (nozeros && dist < Double.MIN_NORMAL)) { continue; } heap.add(dist, qsize); LOG.incrementProcessed(prog); } LOG.statistics(new DoubleStatistic(PREFIX + ".quantile", quantile)); LOG.statistics(new LongStatistic(PREFIX + ".samplesize", ssize)); LOG.statistics(new DoubleStatistic(PREFIX + ".distance", heap.peek())); LOG.ensureCompleted(prog); Collection<String> header = Arrays.asList(new String[] {"Distance"}); Collection<Vector> data = Arrays.asList(new Vector[] {new Vector(heap.peek())}); return new CollectionResult<Vector>("Distances sample", "distance-sample", data, header); }
/**
 * Constructor: stores the probability and obtains the random generator from the given factory
 * via {@code getSingleThreadedRandom()}.
 *
 * @param prob Probability
 * @param rnd Random generator
 */
public RandomSamplingStreamFilter(double prob, RandomFactory rnd) {
  // The no-argument superclass constructor is invoked implicitly.
  this.random = rnd.getSingleThreadedRandom();
  this.prob = prob;
}
/** * Performs a single run of FastDOC, finding a single cluster. * * @param database Database context * @param relation used to get actual values for DBIDs. * @param S The set of points we're working on. * @param d Dimensionality of the data set we're currently working on. * @param r Size of random samples. * @param m Number of inner iterations (per seed point). * @param n Number of outer iterations (seed points). * @return a cluster, if one is found, else <code>null</code>. */ private Cluster<SubspaceModel> runFastDOC( Database database, Relation<V> relation, ArrayModifiableDBIDs S, int d, int n, int m, int r) { // Relevant attributes of highest cardinality. long[] D = null; // The seed point for the best dimensions. DBIDVar dV = DBIDUtil.newVar(); // Inform the user about the progress in the current iteration. FiniteProgress iprogress = LOG.isVerbose() ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG) : null; Random random = rnd.getSingleThreadedRandom(); DBIDArrayIter iter = S.iter(); outer: for (int i = 0; i < n; ++i) { // Pick a random seed point. iter.seek(random.nextInt(S.size())); for (int j = 0; j < m; ++j) { // Choose a set of random points. DBIDs randomSet = DBIDUtil.randomSample(S, r, random); // Initialize cluster info. long[] nD = BitsUtil.zero(d); // Test each dimension. for (int k = 0; k < d; ++k) { if (dimensionIsRelevant(k, relation, randomSet)) { BitsUtil.setI(nD, k); } } if (D == null || BitsUtil.cardinality(nD) > BitsUtil.cardinality(D)) { D = nD; dV.set(iter); if (BitsUtil.cardinality(D) >= d_zero) { if (iprogress != null) { iprogress.setProcessed(iprogress.getTotal(), LOG); } break outer; } } LOG.incrementProcessed(iprogress); } } LOG.ensureCompleted(iprogress); // If no relevant dimensions were found, skip it. if (D == null || BitsUtil.cardinality(D) == 0) { return null; } // Get all points in the box. 
SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(D); DistanceQuery<V> dq = database.getDistanceQuery(relation, df); RangeQuery<V> rq = database.getRangeQuery(dq, DatabaseQuery.HINT_SINGLE); // TODO: add filtering capabilities into query API! DBIDs C = DBIDUtil.intersection(S, rq.getRangeForDBID(dV, w)); // If we have a non-empty cluster, return it. return (C.size() > 0) ? makeCluster(relation, C, D) : null; }
/** * Performs a single run of DOC, finding a single cluster. * * @param database Database context * @param relation used to get actual values for DBIDs. * @param S The set of points we're working on. * @param d Dimensionality of the data set we're currently working on. * @param r Size of random samples. * @param m Number of inner iterations (per seed point). * @param n Number of outer iterations (seed points). * @param minClusterSize Minimum size a cluster must have to be accepted. * @return a cluster, if one is found, else <code>null</code>. */ private Cluster<SubspaceModel> runDOC( Database database, Relation<V> relation, ArrayModifiableDBIDs S, final int d, int n, int m, int r, int minClusterSize) { // Best cluster for the current run. DBIDs C = null; // Relevant attributes for the best cluster. long[] D = null; // Quality of the best cluster. double quality = Double.NEGATIVE_INFINITY; // Bounds for our cluster. // ModifiableHyperBoundingBox bounds = new ModifiableHyperBoundingBox(new // double[d], new double[d]); // Weights for distance (= rectangle query) SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(BitsUtil.zero(d)); DistanceQuery<V> dq = database.getDistanceQuery(relation, df); RangeQuery<V> rq = database.getRangeQuery(dq); // Inform the user about the progress in the current iteration. FiniteProgress iprogress = LOG.isVerbose() ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG) : null; Random random = rnd.getSingleThreadedRandom(); DBIDArrayIter iter = S.iter(); for (int i = 0; i < n; ++i) { // Pick a random seed point. iter.seek(random.nextInt(S.size())); for (int j = 0; j < m; ++j) { // Choose a set of random points. DBIDs randomSet = DBIDUtil.randomSample(S, r, random); // Initialize cluster info. long[] nD = BitsUtil.zero(d); // Test each dimension and build bounding box. 
for (int k = 0; k < d; ++k) { if (dimensionIsRelevant(k, relation, randomSet)) { BitsUtil.setI(nD, k); } } if (BitsUtil.cardinality(nD) > 0) { // Get all points in the box. df.setSelectedDimensions(nD); // TODO: add filtering capabilities into query API! DBIDs nC = DBIDUtil.intersection(S, rq.getRangeForDBID(iter, w)); if (LOG.isDebuggingFiner()) { LOG.finer( "Testing a cluster candidate, |C| = " + nC.size() + ", |D| = " + BitsUtil.cardinality(nD)); } // Is the cluster large enough? if (nC.size() < minClusterSize) { // Too small. if (LOG.isDebuggingFiner()) { LOG.finer("... but it's too small."); } } else { // Better cluster than before? double nQuality = computeClusterQuality(nC.size(), BitsUtil.cardinality(nD)); if (nQuality > quality) { if (LOG.isDebuggingFiner()) { LOG.finer("... and it's the best so far: " + nQuality + " vs. " + quality); } C = nC; D = nD; quality = nQuality; } else { if (LOG.isDebuggingFiner()) { LOG.finer("... but we already have a better one."); } } } } LOG.incrementProcessed(iprogress); } } LOG.ensureCompleted(iprogress); return (C != null) ? makeCluster(relation, C, D) : null; }