@Override
public ProjectedIndex<O, O> instantiate(Relation<O> relation) {
  if (!proj.getInputDataTypeInformation().isAssignableFromType(relation.getDataTypeInformation())) {
    return null;
  }
  proj.initialize(relation.getDataTypeInformation());
  final Relation<O> view;
  if (materialize) {
    DBIDs ids = relation.getDBIDs();
    WritableDataStore<O> content = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_DB,
        proj.getOutputDataTypeInformation().getRestrictionClass());
    for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
      content.put(iter, proj.project(relation.get(iter)));
    }
    view = new MaterializedRelation<>("ECEF Projection", "ecef-projection",
        proj.getOutputDataTypeInformation(), content, ids);
  } else {
    view = new ProjectedView<>(relation, proj);
  }
  Index inneri = inner.instantiate(view);
  if (inneri == null) {
    return null;
  }
  return new LngLatAsECEFIndex<>(relation, proj, view, inneri, norefine);
}
private void extractItemsets(DBIDs iset, DBIDs[] idx, int[] buf, int depth, int start, int minsupp, List<Itemset> solution) {
  // TODO: reuse arrays.
  for (int i = start; i < idx.length; i++) {
    if (idx[i] == null) {
      continue;
    }
    DBIDs ids = mergeJoin(iset, idx[i]);
    if (ids.size() < minsupp) {
      continue;
    }
    buf[depth] = i;
    int[] items = Arrays.copyOf(buf, depth + 1);
    if (depth >= minlength) {
      solution.add(new SparseItemset(items, ids.size()));
    }
    if (depth < maxlength) {
      extractItemsets(ids, idx, buf, depth + 1, i + 1, minsupp, solution);
    }
  }
}
@Override
public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
  if (relation.size() <= 0) {
    return new Clustering<>("k-Means Clustering", "kmeans-clustering");
  }
  // Choose initial means
  if (LOG.isStatistics()) {
    LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
  }
  double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
  // Setup cluster assignment store
  List<ModifiableDBIDs> clusters = new ArrayList<>();
  for (int i = 0; i < k; i++) {
    clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
  }
  WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(),
      DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
  double[] varsum = new double[k];
  IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
  DoubleStatistic varstat = LOG.isStatistics()
      ? new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null;
  int iteration = 0;
  for (; maxiter <= 0 || iteration < maxiter; iteration++) {
    LOG.incrementProcessed(prog);
    boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum);
    logVarstat(varstat, varsum);
    // Stop if no cluster assignment changed.
    if (!changed) {
      break;
    }
    // Recompute means.
    means = means(clusters, means, relation);
  }
  LOG.setCompleted(prog);
  if (LOG.isStatistics()) {
    LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
  }
  // Wrap result
  Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
  for (int i = 0; i < clusters.size(); i++) {
    DBIDs ids = clusters.get(i);
    if (ids.size() == 0) {
      continue;
    }
    KMeansModel model = new KMeansModel(means[i], varsum[i]);
    result.addToplevelCluster(new Cluster<>(ids, model));
  }
  return result;
}
/**
 * Run the algorithm on the given relation.
 *
 * @param database Database
 * @param relation Relation to process
 * @return Outlier result
 */
public OutlierResult run(Database database, Relation<? extends NumberVector> relation) {
  @SuppressWarnings("unchecked")
  PrimitiveDistanceQuery<? super NumberVector> distq =
      (PrimitiveDistanceQuery<? super NumberVector>) database.getDistanceQuery(relation, distanceFunction);
  Collection<? extends NumberVector> refPoints = refp.getReferencePoints(relation);
  if (refPoints.size() < 1) {
    throw new AbortException("Cannot compute ROS without reference points!");
  }
  DBIDs ids = relation.getDBIDs();
  if (k >= ids.size()) {
    throw new AbortException("k must not be chosen larger than the database size!");
  }
  // storage of distance/score values.
  WritableDoubleDataStore rbod_score = DataStoreUtil.makeDoubleStorage(ids,
      DataStoreFactory.HINT_STATIC | DataStoreFactory.HINT_HOT, Double.NaN);
  // Compute density estimation:
  for (NumberVector refPoint : refPoints) {
    DoubleDBIDList referenceDists = computeDistanceVector(refPoint, relation, distq);
    updateDensities(rbod_score, referenceDists);
  }
  // compute maximum density
  DoubleMinMax mm = new DoubleMinMax();
  for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
    mm.put(rbod_score.doubleValue(iditer));
  }
  // compute ROS
  double scale = mm.getMax() > 0. ? 1. / mm.getMax() : 1.;
  mm.reset(); // Reuse
  for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
    double score = 1 - (rbod_score.doubleValue(iditer) * scale);
    mm.put(score);
    rbod_score.putDouble(iditer, score);
  }
  DoubleRelation scoreResult = new MaterializedDoubleRelation("Reference-points Outlier Scores",
      "reference-outlier", rbod_score, relation.getDBIDs());
  OutlierScoreMeta scoreMeta = new BasicOutlierScoreMeta(mm.getMin(), mm.getMax(), 0., 1., 0.);
  OutlierResult result = new OutlierResult(scoreMeta, scoreResult);
  // adds reference points to the result. header information for the
  // visualizer to find the reference points in the result
  result.addChildResult(new ReferencePointsResult<>("Reference points", "reference-points", refPoints));
  return result;
}
/**
 * Preprocessing step: determine the radii of interest for each point.
 *
 * @param ids IDs to process
 * @param rangeQuery Range query
 * @param interestingDistances Distances of interest
 */
protected void precomputeInterestingRadii(DBIDs ids, RangeQuery<O> rangeQuery,
    WritableDataStore<DoubleIntArrayList> interestingDistances) {
  FiniteProgress progressPreproc = LOG.isVerbose()
      ? new FiniteProgress("LOCI preprocessing", ids.size(), LOG) : null;
  for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
    DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(iditer, rmax);
    // build list of critical distances
    DoubleIntArrayList cdist = new DoubleIntArrayList(neighbors.size() << 1);
    {
      int i = 0;
      DoubleDBIDListIter ni = neighbors.iter();
      while (ni.valid()) {
        final double curdist = ni.doubleValue();
        ++i;
        ni.advance();
        // Skip, if tied to the next object:
        if (ni.valid() && curdist == ni.doubleValue()) {
          continue;
        }
        cdist.append(curdist, i);
        // Scale radius, and reinsert
        if (alpha != 1.) {
          final double ri = curdist / alpha;
          if (ri <= rmax) {
            cdist.append(ri, Integer.MIN_VALUE);
          }
        }
      }
    }
    cdist.sort();
    // fill the gaps to have fast lookups of number of neighbors at a given distance.
    int lastk = 0;
    for (int i = 0, size = cdist.size(); i < size; i++) {
      final int k = cdist.getInt(i);
      if (k == Integer.MIN_VALUE) {
        cdist.setValue(i, lastk);
      } else {
        lastk = k;
      }
    }
    // TODO: shrink the list, removing duplicate radii?
    interestingDistances.put(iditer, cdist);
    LOG.incrementProcessed(progressPreproc);
  }
  LOG.ensureCompleted(progressPreproc);
}
// TODO: implement diffsets.
private void extractItemsets(DBIDs[] idx, int start, int minsupp, List<Itemset> solution) {
  int[] buf = new int[idx.length];
  DBIDs iset = idx[start];
  if (iset == null || iset.size() < minsupp) {
    return;
  }
  if (minlength <= 1) {
    solution.add(new OneItemset(start, iset.size()));
  }
  if (maxlength > 1) {
    buf[0] = start;
    extractItemsets(iset, idx, buf, 1, start + 1, minsupp, solution);
  }
}
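// Note: taken together, this entry point, the recursive extractItemsets() above, and
// mergeJoin() below appear to implement an Eclat-style depth-first enumeration over
// transaction-ID lists: idx[i] holds the sorted IDs of the transactions containing item i,
// an itemset is extended by intersecting its ID list with the next item's list, and any
// branch whose intersection drops below minsupp is pruned before recursing.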
/**
 * Compute the intersection size.
 *
 * @param neighbors1 SORTED neighbor ids of first
 * @param neighbors2 SORTED neighbor ids of second
 * @return Intersection size
 */
protected static int countSharedNeighbors(DBIDs neighbors1, DBIDs neighbors2) {
  int intersection = 0;
  DBIDIter iter1 = neighbors1.iter();
  DBIDIter iter2 = neighbors2.iter();
  while (iter1.valid() && iter2.valid()) {
    final int comp = DBIDUtil.compare(iter1, iter2);
    if (comp == 0) {
      intersection++;
      iter1.advance();
      iter2.advance();
    } else if (comp < 0) {
      iter1.advance();
    } else { // iter2 < iter1
      iter2.advance();
    }
  }
  return intersection;
}
private DBIDs mergeJoin(DBIDs first, DBIDs second) {
  assert (!(first instanceof HashSetDBIDs));
  assert (!(second instanceof HashSetDBIDs));
  ArrayModifiableDBIDs ids = DBIDUtil.newArray();
  DBIDIter i1 = first.iter(), i2 = second.iter();
  while (i1.valid() && i2.valid()) {
    int c = DBIDUtil.compare(i1, i2);
    if (c < 0) {
      i1.advance();
    } else if (c > 0) {
      i2.advance();
    } else {
      ids.add(i1);
      i1.advance();
      i2.advance();
    }
  }
  return ids;
}
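// Both countSharedNeighbors() and mergeJoin() above rely on the same sorted-merge pattern
// over two ascending ID sequences. A minimal, self-contained sketch of that pattern on plain
// sorted int[] arrays (illustrative only, not part of the original code; the method name is
// made up):
static int[] sortedIntersection(int[] a, int[] b) {
  int[] out = new int[Math.min(a.length, b.length)];
  int n = 0;
  for (int i = 0, j = 0; i < a.length && j < b.length; ) {
    if (a[i] < b[j]) {
      i++; // a is behind: advance it
    } else if (a[i] > b[j]) {
      j++; // b is behind: advance it
    } else {
      out[n++] = a[i]; // common element: keep it, advance both sides
      i++;
      j++;
    }
  }
  return java.util.Arrays.copyOf(out, n);
}
// Example: sortedIntersection(new int[] {1, 3, 5, 9}, new int[] {3, 4, 5}) yields {3, 5};
// its length is what countSharedNeighbors() would report for the corresponding ID sets.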
/**
 * Inserts the specified objects into this index. If a bulk load mode is implemented, the
 * objects are inserted in one bulk.
 *
 * @param ids the objects to be inserted
 */
@Override
public void insertAll(DBIDs ids) {
  if (ids.isEmpty() || (ids.size() == 1)) {
    return;
  }
  // Make an example leaf
  if (canBulkLoad()) {
    List<SpatialEntry> leafs = new ArrayList<>(ids.size());
    for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
      leafs.add(createNewLeafEntry(iter));
    }
    bulkLoad(leafs);
  } else {
    for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
      insert(DBIDUtil.deref(iter));
    }
  }
  doExtraIntegrityChecks();
}
/**
 * Utility method to test if a given dimension is relevant as determined via a set of reference
 * points (i.e. if the variance along the attribute is lower than the threshold).
 *
 * @param dimension the dimension to test.
 * @param relation used to get actual values for DBIDs.
 * @param points the points to test.
 * @return <code>true</code> if the dimension is relevant.
 */
private boolean dimensionIsRelevant(int dimension, Relation<V> relation, DBIDs points) {
  double min = Double.POSITIVE_INFINITY, max = Double.NEGATIVE_INFINITY;
  for (DBIDIter iter = points.iter(); iter.valid(); iter.advance()) {
    double xV = relation.get(iter).doubleValue(dimension);
    min = (xV < min) ? xV : min;
    max = (xV > max) ? xV : max;
    if (max - min > w) {
      return false;
    }
  }
  return true;
}
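// The relevance test above is a simple width check: a dimension is kept for the current sample
// iff all sampled values fall inside a window of width w (the loop bails out as soon as
// max - min exceeds w). For example, with w = 1.0 and sampled values {2.1, 2.9, 2.6} the spread
// is 0.8 <= w, so the dimension counts as relevant; adding a value of 3.5 would push the spread
// to 1.4 and the method would return false.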
/**
 * Run the ODIN algorithm
 *
 * @param database Database to run on.
 * @param relation Relation to process.
 * @return ODIN outlier result.
 */
public OutlierResult run(Database database, Relation<O> relation) {
  // Get the query functions:
  DistanceQuery<O> dq = database.getDistanceQuery(relation, getDistanceFunction());
  KNNQuery<O> knnq = database.getKNNQuery(dq, k);
  // Get the objects to process, and a data storage for counting and output:
  DBIDs ids = relation.getDBIDs();
  WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_DB, 0.);
  double inc = 1. / (k - 1);
  double min = Double.POSITIVE_INFINITY, max = 0.0;
  // Process all objects
  for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
    // Find the nearest neighbors (using an index, if available!)
    DBIDs neighbors = knnq.getKNNForDBID(iter, k);
    // For each neighbor, except ourselves, increase the in-degree:
    for (DBIDIter nei = neighbors.iter(); nei.valid(); nei.advance()) {
      if (DBIDUtil.equal(iter, nei)) {
        continue;
      }
      final double value = scores.doubleValue(nei) + inc;
      if (value < min) {
        min = value;
      }
      if (value > max) {
        max = value;
      }
      scores.put(nei, value);
    }
  }
  // Wrap the result and add metadata.
  OutlierScoreMeta meta = new InvertedOutlierScoreMeta(min, max, 0., inc * (ids.size() - 1), 1);
  DoubleRelation rel = new MaterializedDoubleRelation("ODIN In-Degree", "odin", scores, ids);
  return new OutlierResult(meta, rel);
}
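// Reading of the scoring above: every point receives inc = 1/(k-1) for each kNN list (other
// than its own) it appears in, so the stored value is its in-degree in the kNN graph divided
// by k - 1. With k = 5, for instance, a point referenced by two other points scores 2/4 = 0.5.
// The InvertedOutlierScoreMeta marks low values as outlying, matching ODIN's idea that rarely
// referenced points (low in-degree) are the outliers.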
/**
 * The main run method
 *
 * @param database Database to use (actually unused)
 * @param spatial Relation for neighborhood
 * @param relation Attributes to evaluate
 * @return Outlier result
 */
public OutlierResult run(Database database, Relation<N> spatial, Relation<O> relation) {
  final NeighborSetPredicate npred = getNeighborSetPredicateFactory().instantiate(database, spatial);
  DistanceQuery<O> distFunc = getNonSpatialDistanceFunction().instantiate(relation);
  WritableDoubleDataStore lrds = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(),
      DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT);
  WritableDoubleDataStore lofs = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(),
      DataStoreFactory.HINT_STATIC);
  DoubleMinMax lofminmax = new DoubleMinMax();
  // Compute densities
  for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
    DBIDs neighbors = npred.getNeighborDBIDs(iditer);
    double avg = 0;
    for (DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) {
      avg += distFunc.distance(iditer, iter);
    }
    double lrd = 1 / (avg / neighbors.size());
    if (Double.isNaN(lrd)) {
      lrd = 0;
    }
    lrds.putDouble(iditer, lrd);
  }
  // Compute density quotients
  for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
    DBIDs neighbors = npred.getNeighborDBIDs(iditer);
    double avg = 0;
    for (DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) {
      avg += lrds.doubleValue(iter);
    }
    final double lrd = (avg / neighbors.size()) / lrds.doubleValue(iditer);
    if (!Double.isNaN(lrd)) {
      lofs.putDouble(iditer, lrd);
      lofminmax.put(lrd);
    } else {
      lofs.putDouble(iditer, 0.0);
    }
  }
  // Build result representation.
  DoubleRelation scoreResult = new MaterializedDoubleRelation("Spatial Outlier Factor",
      "sof-outlier", lofs, relation.getDBIDs());
  OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(lofminmax.getMin(), lofminmax.getMax(),
      0.0, Double.POSITIVE_INFINITY, 1.0);
  OutlierResult or = new OutlierResult(scoreMeta, scoreResult);
  or.addChildResult(npred);
  return or;
}
/**
 * Evaluate a single outlier result as histogram.
 *
 * @param database Database to process
 * @param or Outlier result
 * @return Result
 */
public HistogramResult<DoubleVector> evaluateOutlierResult(Database database, OutlierResult or) {
  if (scaling instanceof OutlierScalingFunction) {
    OutlierScalingFunction oscaling = (OutlierScalingFunction) scaling;
    oscaling.prepare(or);
  }
  ModifiableDBIDs ids = DBIDUtil.newHashSet(or.getScores().getDBIDs());
  DBIDs outlierIds = DatabaseUtil.getObjectsByLabelMatch(database, positiveClassName);
  // first value for outliers, second for each object
  // If we have useful (finite) min/max, use these for binning.
  double min = scaling.getMin();
  double max = scaling.getMax();
  final ObjHistogram<DoubleDoublePair> hist;
  if (Double.isInfinite(min) || Double.isNaN(min) || Double.isInfinite(max) || Double.isNaN(max)) {
    hist = new AbstractObjDynamicHistogram<DoubleDoublePair>(bins) {
      @Override
      public DoubleDoublePair aggregate(DoubleDoublePair first, DoubleDoublePair second) {
        first.first += second.first;
        first.second += second.second;
        return first;
      }

      @Override
      protected DoubleDoublePair makeObject() {
        return new DoubleDoublePair(0., 0.);
      }

      @Override
      protected DoubleDoublePair cloneForCache(DoubleDoublePair data) {
        return new DoubleDoublePair(data.first, data.second);
      }

      @Override
      protected DoubleDoublePair downsample(Object[] data, int start, int end, int size) {
        DoubleDoublePair sum = new DoubleDoublePair(0, 0);
        for (int i = start; i < end; i++) {
          DoubleDoublePair p = (DoubleDoublePair) data[i];
          if (p != null) {
            sum.first += p.first;
            sum.second += p.second;
          }
        }
        return sum;
      }
    };
  } else {
    hist = new AbstractObjStaticHistogram<DoubleDoublePair>(bins, min, max) {
      @Override
      protected DoubleDoublePair makeObject() {
        return new DoubleDoublePair(0., 0.);
      }

      @Override
      public void putData(double coord, DoubleDoublePair data) {
        DoubleDoublePair exist = get(coord);
        exist.first += data.first;
        exist.second += data.second;
      }
    };
  }
  // first fill histogram only with values of outliers
  DoubleDoublePair negative, positive;
  if (!splitfreq) {
    negative = new DoubleDoublePair(1. / ids.size(), 0);
    positive = new DoubleDoublePair(0, 1. / ids.size());
  } else {
    negative = new DoubleDoublePair(1. / (ids.size() - outlierIds.size()), 0);
    positive = new DoubleDoublePair(0, 1. / outlierIds.size());
  }
  ids.removeDBIDs(outlierIds);
  // fill histogram with values of each object
  for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
    double result = or.getScores().doubleValue(iter);
    result = scaling.getScaled(result);
    if (result > Double.NEGATIVE_INFINITY && result < Double.POSITIVE_INFINITY) {
      hist.putData(result, negative);
    }
  }
  for (DBIDIter iter = outlierIds.iter(); iter.valid(); iter.advance()) {
    double result = or.getScores().doubleValue(iter);
    result = scaling.getScaled(result);
    if (result > Double.NEGATIVE_INFINITY && result < Double.POSITIVE_INFINITY) {
      hist.putData(result, positive);
    }
  }
  Collection<DoubleVector> collHist = new ArrayList<>(hist.getNumBins());
  for (ObjHistogram.Iter<DoubleDoublePair> iter = hist.iter(); iter.valid(); iter.advance()) {
    DoubleDoublePair data = iter.getValue();
    DoubleVector row = new DoubleVector(new double[] { iter.getCenter(), data.first, data.second });
    collHist.add(row);
  }
  return new HistogramResult<>("Outlier Score Histogram", "outlier-histogram", collHist);
}
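// As the code reads, each histogram bin accumulates a pair of weights: `first` collects mass
// from objects that do not match positiveClassName (after ids.removeDBIDs(outlierIds)), and
// `second` collects mass from the labeled outliers. Without splitfreq both weights are 1/N, so
// the two series share one normalization; with splitfreq each class is normalized by its own
// size, so each series sums to (at most) one on its own.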
@Override
public void deleteAll(DBIDs ids) {
  for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
    delete(iter);
  }
}
/**
 * Run the algorithm
 *
 * @param database Database to process
 * @param relation Relation to process
 * @return Outlier result
 */
public OutlierResult run(Database database, Relation<O> relation) {
  DistanceQuery<O> distFunc = database.getDistanceQuery(relation, getDistanceFunction());
  RangeQuery<O> rangeQuery = database.getRangeQuery(distFunc);
  DBIDs ids = relation.getDBIDs();
  // LOCI preprocessing step
  WritableDataStore<DoubleIntArrayList> interestingDistances = DataStoreUtil.makeStorage(
      relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_SORTED,
      DoubleIntArrayList.class);
  precomputeInterestingRadii(ids, rangeQuery, interestingDistances);
  // LOCI main step
  FiniteProgress progressLOCI = LOG.isVerbose()
      ? new FiniteProgress("LOCI scores", relation.size(), LOG) : null;
  WritableDoubleDataStore mdef_norm = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(),
      DataStoreFactory.HINT_STATIC);
  WritableDoubleDataStore mdef_radius = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(),
      DataStoreFactory.HINT_STATIC);
  DoubleMinMax minmax = new DoubleMinMax();
  // Shared instance, to save allocations.
  MeanVariance mv_n_r_alpha = new MeanVariance();
  for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
    final DoubleIntArrayList cdist = interestingDistances.get(iditer);
    final double maxdist = cdist.getDouble(cdist.size() - 1);
    final int maxneig = cdist.getInt(cdist.size() - 1);
    double maxmdefnorm = 0.0;
    double maxnormr = 0;
    if (maxneig >= nmin) {
      // Compute the largest neighborhood we will need.
      DoubleDBIDList maxneighbors = rangeQuery.getRangeForDBID(iditer, maxdist);
      // TODO: Ensure the result is sorted. This is currently implied.
      // For any critical distance, compute the normalized MDEF score.
      for (int i = 0, size = cdist.size(); i < size; i++) {
        // Only start when minimum size is fulfilled
        if (cdist.getInt(i) < nmin) {
          continue;
        }
        final double r = cdist.getDouble(i);
        final double alpha_r = alpha * r;
        // compute n(p_i, \alpha * r) from list (note: alpha_r is not cdist!)
        final int n_alphar = cdist.getInt(cdist.find(alpha_r));
        // compute \hat{n}(p_i, r, \alpha) and the corresponding \sigma_{MDEF}
        mv_n_r_alpha.reset();
        for (DoubleDBIDListIter neighbor = maxneighbors.iter(); neighbor.valid(); neighbor.advance()) {
          // Stop at radius r
          if (neighbor.doubleValue() > r) {
            break;
          }
          DoubleIntArrayList cdist2 = interestingDistances.get(neighbor);
          int rn_alphar = cdist2.getInt(cdist2.find(alpha_r));
          mv_n_r_alpha.put(rn_alphar);
        }
        // We only use the average and standard deviation
        final double nhat_r_alpha = mv_n_r_alpha.getMean();
        final double sigma_nhat_r_alpha = mv_n_r_alpha.getNaiveStddev();
        // Redundant divisions by nhat_r_alpha removed.
        final double mdef = nhat_r_alpha - n_alphar;
        final double sigmamdef = sigma_nhat_r_alpha;
        final double mdefnorm = mdef / sigmamdef;
        if (mdefnorm > maxmdefnorm) {
          maxmdefnorm = mdefnorm;
          maxnormr = r;
        }
      }
    } else {
      // FIXME: when nmin was not fulfilled - what is the proper value then?
      maxmdefnorm = Double.POSITIVE_INFINITY;
      maxnormr = maxdist;
    }
    mdef_norm.putDouble(iditer, maxmdefnorm);
    mdef_radius.putDouble(iditer, maxnormr);
    minmax.put(maxmdefnorm);
    LOG.incrementProcessed(progressLOCI);
  }
  LOG.ensureCompleted(progressLOCI);
  DoubleRelation scoreResult = new MaterializedDoubleRelation("LOCI normalized MDEF",
      "loci-mdef-outlier", mdef_norm, relation.getDBIDs());
  OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(minmax.getMin(), minmax.getMax(),
      0.0, Double.POSITIVE_INFINITY, 0.0);
  OutlierResult result = new OutlierResult(scoreMeta, scoreResult);
  result.addChildResult(new MaterializedDoubleRelation("LOCI MDEF Radius", "loci-critical-radius",
      mdef_radius, relation.getDBIDs()));
  return result;
}
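// For reference, assuming the standard LOCI definitions (Papadimitriou et al.): the
// multi-granularity deviation factor is MDEF(p, r, alpha) = 1 - n(p, alpha*r) / nhat(p, r, alpha),
// with sigma_MDEF(p, r, alpha) = sigma_nhat(p, r, alpha) / nhat(p, r, alpha). The normalized
// score used above is MDEF / sigma_MDEF; since nhat appears in both numerator and denominator it
// cancels, leaving (nhat - n) / sigma_nhat, which is exactly the quantity computed after the
// "Redundant divisions by nhat_r_alpha removed" comment.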
/**
 * Performs a single run of DOC, finding a single cluster.
 *
 * @param database Database context
 * @param relation used to get actual values for DBIDs.
 * @param S The set of points we're working on.
 * @param d Dimensionality of the data set we're currently working on.
 * @param n Number of outer iterations (seed points).
 * @param m Number of inner iterations (per seed point).
 * @param r Size of random samples.
 * @param minClusterSize Minimum size a cluster must have to be accepted.
 * @return a cluster, if one is found, else <code>null</code>.
 */
private Cluster<SubspaceModel> runDOC(Database database, Relation<V> relation,
    ArrayModifiableDBIDs S, final int d, int n, int m, int r, int minClusterSize) {
  // Best cluster for the current run.
  DBIDs C = null;
  // Relevant attributes for the best cluster.
  long[] D = null;
  // Quality of the best cluster.
  double quality = Double.NEGATIVE_INFINITY;
  // Bounds for our cluster.
  // ModifiableHyperBoundingBox bounds = new ModifiableHyperBoundingBox(new double[d], new double[d]);
  // Weights for distance (= rectangle query)
  SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(BitsUtil.zero(d));
  DistanceQuery<V> dq = database.getDistanceQuery(relation, df);
  RangeQuery<V> rq = database.getRangeQuery(dq);
  // Inform the user about the progress in the current iteration.
  FiniteProgress iprogress = LOG.isVerbose()
      ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG) : null;
  Random random = rnd.getSingleThreadedRandom();
  DBIDArrayIter iter = S.iter();
  for (int i = 0; i < n; ++i) {
    // Pick a random seed point.
    iter.seek(random.nextInt(S.size()));
    for (int j = 0; j < m; ++j) {
      // Choose a set of random points.
      DBIDs randomSet = DBIDUtil.randomSample(S, r, random);
      // Initialize cluster info.
      long[] nD = BitsUtil.zero(d);
      // Test each dimension and build bounding box.
      for (int k = 0; k < d; ++k) {
        if (dimensionIsRelevant(k, relation, randomSet)) {
          BitsUtil.setI(nD, k);
        }
      }
      if (BitsUtil.cardinality(nD) > 0) {
        // Get all points in the box.
        df.setSelectedDimensions(nD);
        // TODO: add filtering capabilities into query API!
        DBIDs nC = DBIDUtil.intersection(S, rq.getRangeForDBID(iter, w));
        if (LOG.isDebuggingFiner()) {
          LOG.finer("Testing a cluster candidate, |C| = " + nC.size()
              + ", |D| = " + BitsUtil.cardinality(nD));
        }
        // Is the cluster large enough?
        if (nC.size() < minClusterSize) {
          // Too small.
          if (LOG.isDebuggingFiner()) {
            LOG.finer("... but it's too small.");
          }
        } else {
          // Better cluster than before?
          double nQuality = computeClusterQuality(nC.size(), BitsUtil.cardinality(nD));
          if (nQuality > quality) {
            if (LOG.isDebuggingFiner()) {
              LOG.finer("... and it's the best so far: " + nQuality + " vs. " + quality);
            }
            C = nC;
            D = nD;
            quality = nQuality;
          } else {
            if (LOG.isDebuggingFiner()) {
              LOG.finer("... but we already have a better one.");
            }
          }
        }
      }
      LOG.incrementProcessed(iprogress);
    }
  }
  LOG.ensureCompleted(iprogress);
  return (C != null) ? makeCluster(relation, C, D) : null;
}
/**
 * Performs a single run of FastDOC, finding a single cluster.
 *
 * @param database Database context
 * @param relation used to get actual values for DBIDs.
 * @param S The set of points we're working on.
 * @param d Dimensionality of the data set we're currently working on.
 * @param n Number of outer iterations (seed points).
 * @param m Number of inner iterations (per seed point).
 * @param r Size of random samples.
 * @return a cluster, if one is found, else <code>null</code>.
 */
private Cluster<SubspaceModel> runFastDOC(Database database, Relation<V> relation,
    ArrayModifiableDBIDs S, int d, int n, int m, int r) {
  // Relevant attributes of highest cardinality.
  long[] D = null;
  // The seed point for the best dimensions.
  DBIDVar dV = DBIDUtil.newVar();
  // Inform the user about the progress in the current iteration.
  FiniteProgress iprogress = LOG.isVerbose()
      ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG) : null;
  Random random = rnd.getSingleThreadedRandom();
  DBIDArrayIter iter = S.iter();
  outer:
  for (int i = 0; i < n; ++i) {
    // Pick a random seed point.
    iter.seek(random.nextInt(S.size()));
    for (int j = 0; j < m; ++j) {
      // Choose a set of random points.
      DBIDs randomSet = DBIDUtil.randomSample(S, r, random);
      // Initialize cluster info.
      long[] nD = BitsUtil.zero(d);
      // Test each dimension.
      for (int k = 0; k < d; ++k) {
        if (dimensionIsRelevant(k, relation, randomSet)) {
          BitsUtil.setI(nD, k);
        }
      }
      if (D == null || BitsUtil.cardinality(nD) > BitsUtil.cardinality(D)) {
        D = nD;
        dV.set(iter);
        if (BitsUtil.cardinality(D) >= d_zero) {
          if (iprogress != null) {
            iprogress.setProcessed(iprogress.getTotal(), LOG);
          }
          break outer;
        }
      }
      LOG.incrementProcessed(iprogress);
    }
  }
  LOG.ensureCompleted(iprogress);
  // If no relevant dimensions were found, skip it.
  if (D == null || BitsUtil.cardinality(D) == 0) {
    return null;
  }
  // Get all points in the box.
  SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(D);
  DistanceQuery<V> dq = database.getDistanceQuery(relation, df);
  RangeQuery<V> rq = database.getRangeQuery(dq, DatabaseQuery.HINT_SINGLE);
  // TODO: add filtering capabilities into query API!
  DBIDs C = DBIDUtil.intersection(S, rq.getRangeForDBID(dV, w));
  // If we have a non-empty cluster, return it.
  return (C.size() > 0) ? makeCluster(relation, C, D) : null;
}