/**
 * DBSCAN-function expandCluster.
 *
 * <p>Border objects become members of the first possible cluster.
 *
 * @param relation Database relation to run on
 * @param rangeQuery Range query to use
 * @param startObjectID potential seed of a new cluster
 * @param objprog the progress object for logging the current status
 * @param clusprog the progress object for logging the number of clusters
 */
protected void expandCluster(Relation<O> relation, RangeQuery<O> rangeQuery, DBIDRef startObjectID, FiniteProgress objprog, IndefiniteProgress clusprog) {
  DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(startObjectID, epsilon);
  ncounter += neighbors.size();

  // startObjectID is not a core object:
  if (neighbors.size() < minpts) {
    noise.add(startObjectID);
    processedIDs.add(startObjectID);
    if (objprog != null) {
      objprog.incrementProcessed(LOG);
    }
    return;
  }

  ModifiableDBIDs currentCluster = DBIDUtil.newArray();
  currentCluster.add(startObjectID);
  processedIDs.add(startObjectID);
  // try to expand the cluster
  HashSetModifiableDBIDs seeds = DBIDUtil.newHashSet();
  processNeighbors(neighbors.iter(), currentCluster, seeds);

  DBIDVar o = DBIDUtil.newVar();
  while (!seeds.isEmpty()) {
    seeds.pop(o);
    neighbors = rangeQuery.getRangeForDBID(o, epsilon);
    ncounter += neighbors.size();
    if (neighbors.size() >= minpts) {
      processNeighbors(neighbors.iter(), currentCluster, seeds);
    }
    if (objprog != null) {
      objprog.incrementProcessed(LOG);
    }
  }
  resultList.add(currentCluster);
  if (clusprog != null) {
    clusprog.setProcessed(resultList.size(), LOG);
  }
}
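The helper processNeighbors is referenced but not listed here. A minimal sketch consistent with its call sites, assuming the processedIDs and noise collections used above (an illustration, not necessarily the library's exact implementation): unprocessed neighbors are queued as seeds for further expansion, and objects previously classified as noise are reclaimed as border objects of the current cluster.

/**
 * Hypothetical sketch of the processNeighbors helper used above.
 * Unseen neighbors become new seeds; former noise objects become
 * border objects of the current cluster.
 */
private void processNeighbors(DoubleDBIDListIter neighbor, ModifiableDBIDs currentCluster, HashSetModifiableDBIDs seeds) {
  for (; neighbor.valid(); neighbor.advance()) {
    if (processedIDs.add(neighbor)) {
      // Newly discovered object: remember it for later expansion.
      seeds.add(neighbor);
    }
    else if (!noise.remove(neighbor)) {
      // Already a member of another cluster; leave it there.
      continue;
    }
    // Border objects become members of the first possible cluster.
    currentCluster.add(neighbor);
  }
}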
/**
 * Main loop of OUTRES. Run for each object.
 *
 * @param s start dimension
 * @param subspace Current subspace
 * @param id Current object ID
 * @param kernel Kernel
 * @return Score
 */
public double outresScore(final int s, long[] subspace, DBIDRef id, KernelDensityEstimator kernel) {
  double score = 1.0; // Initial score is 1.0
  final SubspaceEuclideanDistanceFunction df = new SubspaceEuclideanDistanceFunction(subspace);
  MeanVariance meanv = new MeanVariance();

  for (int i = s; i < kernel.dim; i++) {
    if (BitsUtil.get(subspace, i)) { // TODO: needed? Or should we always start with i=0?
      continue;
    }
    BitsUtil.setI(subspace, i);
    df.setSelectedDimensions(subspace);
    final double adjustedEps = kernel.adjustedEps(kernel.dim);
    // Query with a larger window, to also get neighbors of neighbors.
    // Subspace Euclidean is metric!
    final double range = adjustedEps * 2.;
    RangeQuery<V> rq = QueryUtil.getRangeQuery(kernel.relation, df, range);

    DoubleDBIDList neighc = rq.getRangeForDBID(id, range);
    DoubleDBIDList neigh = refineRange(neighc, adjustedEps);
    if (neigh.size() > 2) {
      // Relevance test
      if (relevantSubspace(subspace, neigh, kernel)) {
        final double density = kernel.subspaceDensity(subspace, neigh);
        // Compute mean and standard deviation for densities of neighbors.
        meanv.reset();
        for (DoubleDBIDListIter neighbor = neigh.iter(); neighbor.valid(); neighbor.advance()) {
          DoubleDBIDList n2 = subsetNeighborhoodQuery(neighc, neighbor, df, adjustedEps, kernel);
          meanv.put(kernel.subspaceDensity(subspace, n2));
        }
        final double deviation = (meanv.getMean() - density) / (2. * meanv.getSampleStddev());
        // High deviation:
        if (deviation >= 1) {
          score *= (density / deviation);
        }
        // Recursion
        score *= outresScore(i + 1, subspace, id, kernel);
      }
    }
    BitsUtil.clearI(subspace, i);
  }
  return score;
}
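The helper refineRange is also not listed. Since the query above deliberately uses twice the adjusted radius (so neighc can be reused for the neighbors-of-neighbors lookups), a plausible sketch simply filters the candidate list down to the true epsilon-neighborhood, assuming the list carries the subspace distances computed by the range query:

/**
 * Hypothetical sketch of refineRange: keep only candidates that are truly
 * within adjustedEps (the query used range = 2 * adjustedEps).
 */
private DoubleDBIDList refineRange(DoubleDBIDList neighc, double adjustedEps) {
  ModifiableDoubleDBIDList n = DBIDUtil.newDistanceDBIDList(neighc.size());
  for (DoubleDBIDListIter it = neighc.iter(); it.valid(); it.advance()) {
    if (it.doubleValue() <= adjustedEps) {
      n.add(it.doubleValue(), it);
    }
  }
  return n;
}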
/**
 * Preprocessing step: determine the radii of interest for each point.
 *
 * @param ids IDs to process
 * @param rangeQuery Range query
 * @param interestingDistances Distances of interest
 */
protected void precomputeInterestingRadii(DBIDs ids, RangeQuery<O> rangeQuery, WritableDataStore<DoubleIntArrayList> interestingDistances) {
  FiniteProgress progressPreproc = LOG.isVerbose() ? new FiniteProgress("LOCI preprocessing", ids.size(), LOG) : null;
  for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
    DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(iditer, rmax);
    // build list of critical distances
    DoubleIntArrayList cdist = new DoubleIntArrayList(neighbors.size() << 1);
    {
      int i = 0;
      DoubleDBIDListIter ni = neighbors.iter();
      while (ni.valid()) {
        final double curdist = ni.doubleValue();
        ++i;
        ni.advance();
        // Skip, if tied to the next object:
        if (ni.valid() && curdist == ni.doubleValue()) {
          continue;
        }
        cdist.append(curdist, i);
        // Scale radius, and reinsert
        if (alpha != 1.) {
          final double ri = curdist / alpha;
          if (ri <= rmax) {
            cdist.append(ri, Integer.MIN_VALUE);
          }
        }
      }
    }
    cdist.sort();

    // Fill the gaps, to have fast lookups of the number of neighbors at a
    // given distance.
    int lastk = 0;
    for (int i = 0, size = cdist.size(); i < size; i++) {
      final int k = cdist.getInt(i);
      if (k == Integer.MIN_VALUE) {
        cdist.setValue(i, lastk);
      }
      else {
        lastk = k;
      }
    }
    // TODO: shrink the list, removing duplicate radii?

    interestingDistances.put(iditer, cdist);
    LOG.incrementProcessed(progressPreproc);
  }
  LOG.ensureCompleted(progressPreproc);
}
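To illustrate the resulting structure with made-up numbers: for alpha = 0.5, rmax = 10, and sorted neighbor distances 1.0, 3.0, 4.0, the list after appending, sorting, and gap filling is (1.0, 1), (2.0, 1), (3.0, 2), (4.0, 3), (6.0, 3), (8.0, 3). Each pair (r, k) records that the point has k neighbors within radius r, so the n(p, alpha*r) lookups in the main step below reduce to a binary search via cdist.find().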
/**
 * Run the algorithm.
 *
 * @param database Database to process
 * @param relation Relation to process
 * @return Outlier result
 */
public OutlierResult run(Database database, Relation<O> relation) {
  DistanceQuery<O> distFunc = database.getDistanceQuery(relation, getDistanceFunction());
  RangeQuery<O> rangeQuery = database.getRangeQuery(distFunc);
  DBIDs ids = relation.getDBIDs();

  // LOCI preprocessing step
  WritableDataStore<DoubleIntArrayList> interestingDistances = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_SORTED, DoubleIntArrayList.class);
  precomputeInterestingRadii(ids, rangeQuery, interestingDistances);

  // LOCI main step
  FiniteProgress progressLOCI = LOG.isVerbose() ? new FiniteProgress("LOCI scores", relation.size(), LOG) : null;
  WritableDoubleDataStore mdef_norm = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
  WritableDoubleDataStore mdef_radius = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
  DoubleMinMax minmax = new DoubleMinMax();

  // Shared instance, to save allocations.
  MeanVariance mv_n_r_alpha = new MeanVariance();
  for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
    final DoubleIntArrayList cdist = interestingDistances.get(iditer);
    final double maxdist = cdist.getDouble(cdist.size() - 1);
    final int maxneig = cdist.getInt(cdist.size() - 1);

    double maxmdefnorm = 0.0;
    double maxnormr = 0;
    if (maxneig >= nmin) {
      // Compute the largest neighborhood we will need.
      DoubleDBIDList maxneighbors = rangeQuery.getRangeForDBID(iditer, maxdist);
      // TODO: Ensure the result is sorted. This is currently implied.

      // For any critical distance, compute the normalized MDEF score.
      for (int i = 0, size = cdist.size(); i < size; i++) {
        // Only start when the minimum size is fulfilled
        if (cdist.getInt(i) < nmin) {
          continue;
        }
        final double r = cdist.getDouble(i);
        final double alpha_r = alpha * r;
        // compute n(p_i, \alpha * r) from the list (note: alpha_r is not cdist!)
        final int n_alphar = cdist.getInt(cdist.find(alpha_r));
        // compute \hat{n}(p_i, r, \alpha) and the corresponding \sigma_{MDEF}
        mv_n_r_alpha.reset();
        for (DoubleDBIDListIter neighbor = maxneighbors.iter(); neighbor.valid(); neighbor.advance()) {
          // Stop at radius r
          if (neighbor.doubleValue() > r) {
            break;
          }
          DoubleIntArrayList cdist2 = interestingDistances.get(neighbor);
          int rn_alphar = cdist2.getInt(cdist2.find(alpha_r));
          mv_n_r_alpha.put(rn_alphar);
        }
        // We only use the average and standard deviation
        final double nhat_r_alpha = mv_n_r_alpha.getMean();
        final double sigma_nhat_r_alpha = mv_n_r_alpha.getNaiveStddev();

        // Redundant divisions by nhat_r_alpha removed.
        final double mdef = nhat_r_alpha - n_alphar;
        final double sigmamdef = sigma_nhat_r_alpha;
        final double mdefnorm = mdef / sigmamdef;

        if (mdefnorm > maxmdefnorm) {
          maxmdefnorm = mdefnorm;
          maxnormr = r;
        }
      }
    }
    else {
      // FIXME: when nmin was not fulfilled - what is the proper value then?
      maxmdefnorm = Double.POSITIVE_INFINITY;
      maxnormr = maxdist;
    }
    mdef_norm.putDouble(iditer, maxmdefnorm);
    mdef_radius.putDouble(iditer, maxnormr);
    minmax.put(maxmdefnorm);
    LOG.incrementProcessed(progressLOCI);
  }
  LOG.ensureCompleted(progressLOCI);

  DoubleRelation scoreResult = new MaterializedDoubleRelation("LOCI normalized MDEF", "loci-mdef-outlier", mdef_norm, relation.getDBIDs());
  OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 0.0);
  OutlierResult result = new OutlierResult(scoreMeta, scoreResult);
  result.addChildResult(new MaterializedDoubleRelation("LOCI MDEF Radius", "loci-critical-radius", mdef_radius, relation.getDBIDs()));
  return result;
}
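For reference, the score computed in the inner loop is the normalized MDEF of the LOCI paper (Papadimitriou et al.). With n(p_i, \alpha r) the neighbor count at radius \alpha r, and \hat{n}, \sigma_{\hat{n}} the mean and standard deviation of these counts over the r-neighborhood:

\[
\mathrm{MDEF}(p_i, r, \alpha) = 1 - \frac{n(p_i, \alpha r)}{\hat{n}(p_i, r, \alpha)},
\qquad
\sigma_{\mathrm{MDEF}}(p_i, r, \alpha) = \frac{\sigma_{\hat{n}}(p_i, r, \alpha)}{\hat{n}(p_i, r, \alpha)}
\]

In the quotient \mathrm{MDEF} / \sigma_{\mathrm{MDEF}}, the 1/\hat{n} factors cancel, which is exactly the simplification noted by the comment "Redundant divisions by nhat_r_alpha removed": the code evaluates (\hat{n} - n(p_i, \alpha r)) / \sigma_{\hat{n}} directly.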
/**
 * Performs a single run of FastDOC, finding a single cluster.
 *
 * @param database Database context
 * @param relation used to get actual values for DBIDs.
 * @param S The set of points we're working on.
 * @param d Dimensionality of the data set we're currently working on.
 * @param n Number of outer iterations (seed points).
 * @param m Number of inner iterations (per seed point).
 * @param r Size of random samples.
 * @return a cluster, if one is found, else <code>null</code>.
 */
private Cluster<SubspaceModel> runFastDOC(Database database, Relation<V> relation, ArrayModifiableDBIDs S, int d, int n, int m, int r) {
  // Relevant attributes of highest cardinality.
  long[] D = null;
  // The seed point for the best dimensions.
  DBIDVar dV = DBIDUtil.newVar();

  // Inform the user about the progress in the current iteration.
  FiniteProgress iprogress = LOG.isVerbose() ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG) : null;

  Random random = rnd.getSingleThreadedRandom();
  DBIDArrayIter iter = S.iter();

  outer: for (int i = 0; i < n; ++i) {
    // Pick a random seed point.
    iter.seek(random.nextInt(S.size()));

    for (int j = 0; j < m; ++j) {
      // Choose a set of random points.
      DBIDs randomSet = DBIDUtil.randomSample(S, r, random);

      // Initialize cluster info.
      long[] nD = BitsUtil.zero(d);

      // Test each dimension.
      for (int k = 0; k < d; ++k) {
        if (dimensionIsRelevant(k, relation, randomSet)) {
          BitsUtil.setI(nD, k);
        }
      }

      if (D == null || BitsUtil.cardinality(nD) > BitsUtil.cardinality(D)) {
        D = nD;
        dV.set(iter);

        if (BitsUtil.cardinality(D) >= d_zero) {
          if (iprogress != null) {
            iprogress.setProcessed(iprogress.getTotal(), LOG);
          }
          break outer;
        }
      }
      LOG.incrementProcessed(iprogress);
    }
  }
  LOG.ensureCompleted(iprogress);

  // If no relevant dimensions were found, skip it.
  if (D == null || BitsUtil.cardinality(D) == 0) {
    return null;
  }

  // Get all points in the box.
  SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(D);
  DistanceQuery<V> dq = database.getDistanceQuery(relation, df);
  RangeQuery<V> rq = database.getRangeQuery(dq, DatabaseQuery.HINT_SINGLE);

  // TODO: add filtering capabilities into query API!
  DBIDs C = DBIDUtil.intersection(S, rq.getRangeForDBID(dV, w));

  // If we have a non-empty cluster, return it.
  return (C.size() > 0) ? makeCluster(relation, C, D) : null;
}
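dimensionIsRelevant is shared by runFastDOC and runDOC but not listed here. A minimal sketch under one plausible reading of the DOC relevance test (the sample's spread in the dimension must stay within the window width w used in the range queries above; the DOC paper measures the window around the seed point instead):

/**
 * Hypothetical sketch of dimensionIsRelevant: a dimension counts as
 * relevant when the random sample's spread in that dimension does not
 * exceed the window width w.
 */
private boolean dimensionIsRelevant(int dimension, Relation<V> relation, DBIDs points) {
  double min = Double.POSITIVE_INFINITY, max = Double.NEGATIVE_INFINITY;
  for (DBIDIter iter = points.iter(); iter.valid(); iter.advance()) {
    double v = relation.get(iter).doubleValue(dimension);
    min = v < min ? v : min;
    max = v > max ? v : max;
    // Early exit: the sample already spans more than w in this dimension.
    if (max - min > w) {
      return false;
    }
  }
  return true;
}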
/**
 * Performs a single run of DOC, finding a single cluster.
 *
 * @param database Database context
 * @param relation used to get actual values for DBIDs.
 * @param S The set of points we're working on.
 * @param d Dimensionality of the data set we're currently working on.
 * @param n Number of outer iterations (seed points).
 * @param m Number of inner iterations (per seed point).
 * @param r Size of random samples.
 * @param minClusterSize Minimum size a cluster must have to be accepted.
 * @return a cluster, if one is found, else <code>null</code>.
 */
private Cluster<SubspaceModel> runDOC(Database database, Relation<V> relation, ArrayModifiableDBIDs S, final int d, int n, int m, int r, int minClusterSize) {
  // Best cluster for the current run.
  DBIDs C = null;
  // Relevant attributes for the best cluster.
  long[] D = null;
  // Quality of the best cluster.
  double quality = Double.NEGATIVE_INFINITY;

  // Bounds for our cluster.
  // ModifiableHyperBoundingBox bounds = new ModifiableHyperBoundingBox(new double[d], new double[d]);

  // Weights for distance (= rectangle query)
  SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(BitsUtil.zero(d));
  DistanceQuery<V> dq = database.getDistanceQuery(relation, df);
  RangeQuery<V> rq = database.getRangeQuery(dq);

  // Inform the user about the progress in the current iteration.
  FiniteProgress iprogress = LOG.isVerbose() ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG) : null;

  Random random = rnd.getSingleThreadedRandom();
  DBIDArrayIter iter = S.iter();

  for (int i = 0; i < n; ++i) {
    // Pick a random seed point.
    iter.seek(random.nextInt(S.size()));

    for (int j = 0; j < m; ++j) {
      // Choose a set of random points.
      DBIDs randomSet = DBIDUtil.randomSample(S, r, random);

      // Initialize cluster info.
      long[] nD = BitsUtil.zero(d);

      // Test each dimension and build the bounding box.
      for (int k = 0; k < d; ++k) {
        if (dimensionIsRelevant(k, relation, randomSet)) {
          BitsUtil.setI(nD, k);
        }
      }
      if (BitsUtil.cardinality(nD) > 0) {
        // Get all points in the box.
        df.setSelectedDimensions(nD);
        // TODO: add filtering capabilities into query API!
        DBIDs nC = DBIDUtil.intersection(S, rq.getRangeForDBID(iter, w));

        if (LOG.isDebuggingFiner()) {
          LOG.finer("Testing a cluster candidate, |C| = " + nC.size() + ", |D| = " + BitsUtil.cardinality(nD));
        }

        // Is the cluster large enough?
        if (nC.size() < minClusterSize) {
          // Too small.
          if (LOG.isDebuggingFiner()) {
            LOG.finer("... but it's too small.");
          }
        }
        else {
          // Better cluster than before?
          double nQuality = computeClusterQuality(nC.size(), BitsUtil.cardinality(nD));
          if (nQuality > quality) {
            if (LOG.isDebuggingFiner()) {
              LOG.finer("... and it's the best so far: " + nQuality + " vs. " + quality);
            }
            C = nC;
            D = nD;
            quality = nQuality;
          }
          else {
            if (LOG.isDebuggingFiner()) {
              LOG.finer("... but we already have a better one.");
            }
          }
        }
      }
      LOG.incrementProcessed(iprogress);
    }
  }
  LOG.ensureCompleted(iprogress);

  return (C != null) ? makeCluster(relation, C, D) : null;
}
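computeClusterQuality corresponds to the DOC quality function mu(a, b) = a * (1/beta)^b, which trades cluster size against the number of relevant dimensions. A minimal sketch, assuming a field beta holding the DOC beta parameter:

/**
 * Hypothetical sketch of the DOC quality function
 * mu(|C|, |D|) = |C| * (1/beta)^|D|.
 */
private double computeClusterQuality(int clusterSize, int numRelevantDimensions) {
  return clusterSize * Math.pow(1. / beta, numRelevantDimensions);
}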