/**
 * Main loop for OUTRES
 *
 * @param relation Relation to process
 * @return Outlier detection result
 */
public OutlierResult run(Relation<V> relation) {
  WritableDoubleDataStore ranks = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
  DoubleMinMax minmax = new DoubleMinMax();

  KernelDensityEstimator kernel = new KernelDensityEstimator(relation);
  long[] subspace = BitsUtil.zero(kernel.dim);

  FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("OUTRES scores", relation.size(), LOG) : null;

  for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
    BitsUtil.zeroI(subspace);
    double score = outresScore(0, subspace, iditer, kernel);
    ranks.putDouble(iditer, score);
    minmax.put(score);
    LOG.incrementProcessed(progress);
  }
  LOG.ensureCompleted(progress);

  OutlierScoreMeta meta = new InvertedOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0., 1., 1.);
  OutlierResult outresResult = new OutlierResult(meta, new MaterializedDoubleRelation("OUTRES", "outres-score", ranks, relation.getDBIDs()));
  return outresResult;
}
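The scores returned by outresScore are based on subspace kernel density estimates (hence the inverted score meta: low density means outlier). As a rough illustration of the kind of estimate involved, here is a minimal one-dimensional sketch assuming an Epanechnikov kernel; this is not the KernelDensityEstimator API used above.

// Illustrative 1-d kernel density estimate with the Epanechnikov kernel.
// A sketch only; the actual estimator works per subspace with adaptive
// bandwidths not shown here.
static double epanechnikovKDE(double x, double[] data, double h) {
  double sum = 0;
  for (double xi : data) {
    double u = (x - xi) / h;
    if (u > -1 && u < 1) {
      sum += 0.75 * (1 - u * u); // Epanechnikov kernel K(u) = 3/4 (1 - u^2)
    }
  }
  return sum / (data.length * h);
}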
/**
 * Run the Eclat algorithm
 *
 * @param db Database to process
 * @param relation Bit vector relation
 * @return Frequent patterns found
 */
public FrequentItemsetsResult run(Database db, final Relation<BitVector> relation) {
  // TODO: implement with resizable arrays, to not need dim.
  final int dim = RelationUtil.dimensionality(relation);
  final VectorFieldTypeInformation<BitVector> meta = RelationUtil.assumeVectorField(relation);
  // Compute absolute minsupport
  final int minsupp = getMinimumSupport(relation.size());

  LOG.verbose("Build 1-dimensional transaction lists.");
  Duration ctime = LOG.newDuration(STAT + "eclat.transposition.time").begin();
  DBIDs[] idx = buildIndex(relation, dim, minsupp);
  LOG.statistics(ctime.end());

  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Building frequent itemsets", idx.length, LOG) : null;
  Duration etime = LOG.newDuration(STAT + "eclat.extraction.time").begin();
  final List<Itemset> solution = new ArrayList<>();
  for (int i = 0; i < idx.length; i++) {
    LOG.incrementProcessed(prog);
    extractItemsets(idx, i, minsupp, solution);
  }
  LOG.ensureCompleted(prog);
  Collections.sort(solution);
  LOG.statistics(etime.end());

  LOG.statistics(new LongStatistic(STAT + "frequent-itemsets", solution.size()));
  return new FrequentItemsetsResult("Eclat", "eclat", solution, meta);
}
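The core idea of Eclat, which extractItemsets presumably applies to the per-item transaction lists built by buildIndex, is to grow itemsets depth-first by intersecting sorted transaction-ID lists: the length of the intersection is the support of the combined itemset. A minimal plain-Java sketch of that core operation (illustrative only, not the ELKI DBIDs API):

import java.util.Arrays;

// Intersect two sorted transaction-ID lists; the result's length is the
// support of the itemset obtained by combining the two parent itemsets.
static int[] intersect(int[] a, int[] b) {
  int[] out = new int[Math.min(a.length, b.length)];
  int i = 0, j = 0, n = 0;
  while (i < a.length && j < b.length) {
    if (a[i] < b[j]) {
      i++;
    } else if (a[i] > b[j]) {
      j++;
    } else {
      out[n++] = a[i];
      i++;
      j++;
    }
  }
  return Arrays.copyOf(out, n);
}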
@Override
public void run() {
  Database database = input.getDatabase();
  Relation<O> relation = database.getRelation(distance.getInputTypeRestriction());
  DistanceQuery<O> distanceQuery = database.getDistanceQuery(relation, distance);
  KNNQuery<O> knnQ = database.getKNNQuery(distanceQuery, DatabaseQuery.HINT_HEAVY_USE);

  // open file.
  try (RandomAccessFile file = new RandomAccessFile(out, "rw");
      FileChannel channel = file.getChannel();
      // and acquire a file write lock
      FileLock lock = channel.lock()) {
    // write magic header
    file.writeInt(KNN_CACHE_MAGIC);

    int bufsize = k * 12 * 2 + 10; // Initial size, enough for 2 kNN.
    ByteBuffer buffer = ByteBuffer.allocateDirect(bufsize);

    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Computing kNN", relation.size(), LOG) : null;

    for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
      final KNNList nn = knnQ.getKNNForDBID(it, k);
      final int nnsize = nn.size();

      // Grow the buffer when needed:
      if (nnsize * 12 + 10 > bufsize) {
        while (nnsize * 12 + 10 > bufsize) {
          bufsize <<= 1;
        }
        buffer = ByteBuffer.allocateDirect(bufsize);
      }

      buffer.clear();
      ByteArrayUtil.writeUnsignedVarint(buffer, it.internalGetIndex());
      ByteArrayUtil.writeUnsignedVarint(buffer, nnsize);
      int c = 0;
      for (DoubleDBIDListIter ni = nn.iter(); ni.valid(); ni.advance(), c++) {
        ByteArrayUtil.writeUnsignedVarint(buffer, ni.internalGetIndex());
        buffer.putDouble(ni.doubleValue());
      }
      if (c != nn.size()) {
        throw new AbortException("Sizes did not agree. Cache is invalid.");
      }

      buffer.flip();
      channel.write(buffer);
      LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    lock.release();
  } catch (IOException e) {
    LOG.exception(e);
  }
}
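Each record written above is a varint-encoded DBID index and neighbor count, followed by (varint index, 8-byte double) pairs; the `k * 12` sizing estimate budgets up to 4 varint bytes plus 8 distance bytes per neighbor. For reference, a plain-Java sketch of the usual unsigned varint (LEB128-style) encoding; ELKI's ByteArrayUtil provides the real implementation, which may differ in detail:

import java.nio.ByteBuffer;

// Encode a non-negative int as a base-128 varint: 7 payload bits per byte,
// high bit set on all bytes except the last.
static void writeUnsignedVarintSketch(ByteBuffer buf, int value) {
  while ((value & ~0x7F) != 0) {
    buf.put((byte) ((value & 0x7F) | 0x80)); // payload + continuation bit
    value >>>= 7;
  }
  buf.put((byte) value); // final byte, continuation bit clear
}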
@Override
public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
  if (relation.size() <= 0) {
    return new Clustering<>("k-Means Clustering", "kmeans-clustering");
  }
  // Choose initial means
  if (LOG.isStatistics()) {
    LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
  }
  double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());

  // Setup cluster assignment store
  List<ModifiableDBIDs> clusters = new ArrayList<>();
  for (int i = 0; i < k; i++) {
    clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
  }
  WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
  double[] varsum = new double[k];

  IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
  DoubleStatistic varstat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null;

  int iteration = 0;
  for (; maxiter <= 0 || iteration < maxiter; iteration++) {
    LOG.incrementProcessed(prog);
    boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum);
    logVarstat(varstat, varsum);
    // Stop if no cluster assignment changed.
    if (!changed) {
      break;
    }
    // Recompute means.
    means = means(clusters, means, relation);
  }
  LOG.setCompleted(prog);
  if (LOG.isStatistics()) {
    LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
  }

  // Wrap result
  Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
  for (int i = 0; i < clusters.size(); i++) {
    DBIDs ids = clusters.get(i);
    if (ids.size() == 0) {
      continue;
    }
    KMeansModel model = new KMeansModel(means[i], varsum[i]);
    result.addToplevelCluster(new Cluster<>(ids, model));
  }
  return result;
}
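The assignToNearestCluster helper is not shown above. The argmin step it must perform per point can be sketched in plain Java as follows (a sketch assuming squared Euclidean distance, which yields the same argmin as Euclidean; the ELKI version also updates the per-cluster variance sums):

// Return the index of the closest mean to the given point, under squared
// Euclidean distance (squaring preserves the argmin).
static int nearest(double[] point, double[][] means) {
  int best = 0;
  double bestDist = Double.POSITIVE_INFINITY;
  for (int i = 0; i < means.length; i++) {
    double d = 0;
    for (int j = 0; j < point.length; j++) {
      double diff = point[j] - means[i][j];
      d += diff * diff;
    }
    if (d < bestDist) {
      bestDist = d;
      best = i;
    }
  }
  return best;
}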
@Override
public Clustering<BiclusterWithInversionsModel> biclustering() {
  double[][] mat = RelationUtil.relationAsMatrix(relation, rowIDs);

  BiclusterCandidate cand = new BiclusterCandidate(getRowDim(), getColDim());

  Clustering<BiclusterWithInversionsModel> result = new Clustering<>("Cheng-and-Church", "Cheng and Church Biclustering");
  ModifiableDBIDs noise = DBIDUtil.newHashSet(relation.getDBIDs());

  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Extracting Cluster", n, LOG) : null;
  for (int i = 0; i < n; i++) {
    cand.reset();
    multipleNodeDeletion(mat, cand);
    if (LOG.isVeryVerbose()) {
      LOG.veryverbose("Residue after Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
    }
    singleNodeDeletion(mat, cand);
    if (LOG.isVeryVerbose()) {
      LOG.veryverbose("Residue after Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
    }
    nodeAddition(mat, cand);
    if (LOG.isVeryVerbose()) {
      LOG.veryverbose("Residue after Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
    }
    cand.maskMatrix(mat, dist);
    BiclusterWithInversionsModel model = new BiclusterWithInversionsModel(colsBitsetToIDs(cand.cols), rowsBitsetToIDs(cand.irow));
    final ArrayDBIDs cids = rowsBitsetToIDs(cand.rows);
    noise.removeDBIDs(cids);
    result.addToplevelCluster(new Cluster<>(cids, model));

    if (LOG.isVerbose()) {
      LOG.verbose("Score of bicluster " + (i + 1) + ": " + cand.residue + "\n");
      LOG.verbose("Number of rows: " + cand.rowcard + "\n");
      LOG.verbose("Number of columns: " + cand.colcard + "\n");
      // LOG.verbose("Total number of masked values: " + maskedVals.size() + "\n");
    }
    LOG.incrementProcessed(prog);
  }
  // Add a noise cluster, full-dimensional.
  if (!noise.isEmpty()) {
    long[] allcols = BitsUtil.ones(getColDim());
    BiclusterWithInversionsModel model = new BiclusterWithInversionsModel(colsBitsetToIDs(allcols), DBIDUtil.EMPTYDBIDS);
    result.addToplevelCluster(new Cluster<>(noise, true, model));
  }
  LOG.ensureCompleted(prog);
  return result;
}
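The residue that the deletion and addition phases above drive down is the mean squared residue H(I, J) from Cheng and Church. A plain-Java sketch following that definition (illustrative only, not the BiclusterCandidate API, which tracks these means incrementally):

// Mean squared residue H(I, J) = mean over (i, j) of
// (a_ij - rowMean_i - colMean_j + overallMean)^2.
static double meanSquaredResidue(double[][] m, int[] rows, int[] cols) {
  double all = 0;
  double[] rowMean = new double[rows.length], colMean = new double[cols.length];
  for (int i = 0; i < rows.length; i++) {
    for (int j = 0; j < cols.length; j++) {
      double v = m[rows[i]][cols[j]];
      rowMean[i] += v;
      colMean[j] += v;
      all += v;
    }
  }
  for (int i = 0; i < rows.length; i++) {
    rowMean[i] /= cols.length;
  }
  for (int j = 0; j < cols.length; j++) {
    colMean[j] /= rows.length;
  }
  all /= rows.length * (double) cols.length;
  double h = 0;
  for (int i = 0; i < rows.length; i++) {
    for (int j = 0; j < cols.length; j++) {
      double r = m[rows[i]][cols[j]] - rowMean[i] - colMean[j] + all;
      h += r * r;
    }
  }
  return h / (rows.length * (double) cols.length);
}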
/**
 * Preprocessing step: determine the radii of interest for each point.
 *
 * @param ids IDs to process
 * @param rangeQuery Range query
 * @param interestingDistances Distances of interest
 */
protected void precomputeInterestingRadii(DBIDs ids, RangeQuery<O> rangeQuery, WritableDataStore<DoubleIntArrayList> interestingDistances) {
  FiniteProgress progressPreproc = LOG.isVerbose() ? new FiniteProgress("LOCI preprocessing", ids.size(), LOG) : null;
  for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
    DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(iditer, rmax);
    // build list of critical distances
    DoubleIntArrayList cdist = new DoubleIntArrayList(neighbors.size() << 1);
    {
      int i = 0;
      DoubleDBIDListIter ni = neighbors.iter();
      while (ni.valid()) {
        final double curdist = ni.doubleValue();
        ++i;
        ni.advance();
        // Skip, if tied to the next object:
        if (ni.valid() && curdist == ni.doubleValue()) {
          continue;
        }
        cdist.append(curdist, i);
        // Scale radius, and reinsert
        if (alpha != 1.) {
          final double ri = curdist / alpha;
          if (ri <= rmax) {
            cdist.append(ri, Integer.MIN_VALUE);
          }
        }
      }
    }
    cdist.sort();

    // fill the gaps to have fast lookups of number of neighbors at a given
    // distance.
    int lastk = 0;
    for (int i = 0, size = cdist.size(); i < size; i++) {
      final int k = cdist.getInt(i);
      if (k == Integer.MIN_VALUE) {
        cdist.setValue(i, lastk);
      } else {
        lastk = k;
      }
    }
    // TODO: shrink the list, removing duplicate radii?

    interestingDistances.put(iditer, cdist);
    LOG.incrementProcessed(progressPreproc);
  }
  LOG.ensureCompleted(progressPreproc);
}
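The gap-filled (distance, count) lists built here are consumed in the main step via cdist.find(r). The intended lookup semantics, the neighborhood count n(p, r) at an arbitrary radius r, can be sketched in plain Java (illustration only, not the DoubleIntArrayList API):

// Given sorted distances and the gap-filled counts, return n(p, r): the
// count stored at the last entry with distance <= r.
static int neighborsAt(double[] dists, int[] counts, double r) {
  int lo = 0, hi = dists.length; // binary search: first index with dists[idx] > r
  while (lo < hi) {
    int mid = (lo + hi) >>> 1;
    if (dists[mid] <= r) {
      lo = mid + 1;
    } else {
      hi = mid;
    }
  }
  return lo == 0 ? 0 : counts[lo - 1];
}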
/**
 * Process a database
 *
 * @param database Database to process
 * @param relation Relation to process
 * @return Histogram of ranking qualities
 */
public HistogramResult<DoubleVector> run(Database database, Relation<O> relation) {
  final DistanceQuery<O> distanceQuery = database.getDistanceQuery(relation, getDistanceFunction());
  final KNNQuery<O> knnQuery = database.getKNNQuery(distanceQuery, relation.size());

  if (LOG.isVerbose()) {
    LOG.verbose("Preprocessing clusters...");
  }
  // Cluster by labels
  Collection<Cluster<Model>> split = (new ByLabelOrAllInOneClustering()).run(database).getAllClusters();

  DoubleStaticHistogram hist = new DoubleStaticHistogram(numbins, 0.0, 1.0);

  if (LOG.isVerbose()) {
    LOG.verbose("Processing points...");
  }
  FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Computing ROC AUC values", relation.size(), LOG) : null;

  MeanVariance mv = new MeanVariance();
  // sort neighbors
  for (Cluster<?> clus : split) {
    for (DBIDIter iter = clus.getIDs().iter(); iter.valid(); iter.advance()) {
      KNNList knn = knnQuery.getKNNForDBID(iter, relation.size());
      double result = new ROCEvaluation().evaluate(clus, knn);

      mv.put(result);
      hist.increment(result, 1. / relation.size());

      LOG.incrementProcessed(progress);
    }
  }
  LOG.ensureCompleted(progress);

  // Transform Histogram into a Double Vector array.
  Collection<DoubleVector> res = new ArrayList<>(relation.size());
  for (DoubleStaticHistogram.Iter iter = hist.iter(); iter.valid(); iter.advance()) {
    DoubleVector row = new DoubleVector(new double[] { iter.getCenter(), iter.getValue() });
    res.add(row);
  }
  HistogramResult<DoubleVector> result = new HistogramResult<>("Ranking Quality Histogram", "ranking-histogram", res);
  result.addHeader("Mean: " + mv.getMean() + " Variance: " + mv.getSampleVariance());
  return result;
}
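ROCEvaluation above computes the ROC AUC of the kNN ranking with the cluster members as positives. For intuition, AUC is the fraction of (positive, negative) pairs ranked in the correct order; a compact plain-Java sketch of that counting view (ignoring ties, and not the ELKI implementation):

// AUC over a ranked list, best first: for each negative encountered, all
// positives seen so far are ranked above it.
static double rocAuc(boolean[] ranked) { // true = relevant
  long pos = 0, neg = 0, above = 0;
  for (boolean isPos : ranked) {
    if (isPos) {
      pos++;
    } else {
      neg++;
      above += pos;
    }
  }
  return (pos == 0 || neg == 0) ? 0.5 : above / (double) (pos * neg);
}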
public Result run(Database database, Relation<O> rel) {
  DistanceQuery<O> dq = rel.getDistanceQuery(getDistanceFunction());
  int size = rel.size();
  long pairs = (size * (long) size) >> 1;

  final long ssize = sampling <= 1 ? (long) Math.ceil(sampling * pairs) : (long) sampling;
  if (ssize > Integer.MAX_VALUE) {
    throw new AbortException("Sampling size too large.");
  }
  final int qsize = quantile <= 0 ? 1 : (int) Math.ceil(quantile * ssize);

  DoubleMaxHeap heap = new DoubleMaxHeap(qsize);

  ArrayDBIDs ids = DBIDUtil.ensureArray(rel.getDBIDs());
  DBIDArrayIter i1 = ids.iter(), i2 = ids.iter();
  Random r = rand.getSingleThreadedRandom();

  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Sampling", (int) ssize, LOG) : null;
  for (long i = 0; i < ssize; i++) {
    int x = r.nextInt(size - 1) + 1, y = r.nextInt(x);
    double dist = dq.distance(i1.seek(x), i2.seek(y));
    // Skip NaN, and/or zeros.
    if (dist != dist || (nozeros && dist < Double.MIN_NORMAL)) {
      continue;
    }
    heap.add(dist, qsize);
    LOG.incrementProcessed(prog);
  }

  LOG.statistics(new DoubleStatistic(PREFIX + ".quantile", quantile));
  LOG.statistics(new LongStatistic(PREFIX + ".samplesize", ssize));
  LOG.statistics(new DoubleStatistic(PREFIX + ".distance", heap.peek()));
  LOG.ensureCompleted(prog);

  Collection<String> header = Arrays.asList(new String[] { "Distance" });
  Collection<Vector> data = Arrays.asList(new Vector[] { new Vector(heap.peek()) });
  return new CollectionResult<Vector>("Distances sample", "distance-sample", data, header);
}
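The bounded add on the max-heap, heap.add(dist, qsize), presumably keeps only the qsize smallest sampled distances, so heap.peek() approximates the requested quantile of the pairwise distance distribution. A plain-JDK sketch of the same idea, under that assumption about DoubleMaxHeap's semantics:

import java.util.Comparator;
import java.util.PriorityQueue;

// Keep the qsize smallest values in a max-heap; its top is then the
// qsize-th smallest sample, i.e. the approximate quantile.
static double sampleQuantile(double[] samples, int qsize) {
  PriorityQueue<Double> maxHeap = new PriorityQueue<>(Comparator.reverseOrder());
  for (double d : samples) {
    if (maxHeap.size() < qsize) {
      maxHeap.add(d);
    } else if (d < maxHeap.peek()) {
      maxHeap.poll(); // evict the largest of the kept values
      maxHeap.add(d);
    }
  }
  return maxHeap.peek();
}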
/**
 * Run the algorithm
 *
 * @param database Database to process
 * @param relation Relation to process
 * @return Outlier result
 */
public OutlierResult run(Database database, Relation<O> relation) {
  DistanceQuery<O> distFunc = database.getDistanceQuery(relation, getDistanceFunction());
  RangeQuery<O> rangeQuery = database.getRangeQuery(distFunc);
  DBIDs ids = relation.getDBIDs();

  // LOCI preprocessing step
  WritableDataStore<DoubleIntArrayList> interestingDistances = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_SORTED, DoubleIntArrayList.class);
  precomputeInterestingRadii(ids, rangeQuery, interestingDistances);

  // LOCI main step
  FiniteProgress progressLOCI = LOG.isVerbose() ? new FiniteProgress("LOCI scores", relation.size(), LOG) : null;
  WritableDoubleDataStore mdef_norm = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
  WritableDoubleDataStore mdef_radius = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
  DoubleMinMax minmax = new DoubleMinMax();

  // Shared instance, to save allocations.
  MeanVariance mv_n_r_alpha = new MeanVariance();

  for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
    final DoubleIntArrayList cdist = interestingDistances.get(iditer);
    final double maxdist = cdist.getDouble(cdist.size() - 1);
    final int maxneig = cdist.getInt(cdist.size() - 1);

    double maxmdefnorm = 0.0;
    double maxnormr = 0;
    if (maxneig >= nmin) {
      // Compute the largest neighborhood we will need.
      DoubleDBIDList maxneighbors = rangeQuery.getRangeForDBID(iditer, maxdist);
      // TODO: Ensure the result is sorted. This is currently implied.

      // For any critical distance, compute the normalized MDEF score.
      for (int i = 0, size = cdist.size(); i < size; i++) {
        // Only start when minimum size is fulfilled
        if (cdist.getInt(i) < nmin) {
          continue;
        }
        final double r = cdist.getDouble(i);
        final double alpha_r = alpha * r;
        // compute n(p_i, \alpha * r) from list (note: alpha_r is not cdist!)
        final int n_alphar = cdist.getInt(cdist.find(alpha_r));
        // compute \hat{n}(p_i, r, \alpha) and the corresponding \sigma_{MDEF}
        mv_n_r_alpha.reset();
        for (DoubleDBIDListIter neighbor = maxneighbors.iter(); neighbor.valid(); neighbor.advance()) {
          // Stop at radius r
          if (neighbor.doubleValue() > r) {
            break;
          }
          DoubleIntArrayList cdist2 = interestingDistances.get(neighbor);
          int rn_alphar = cdist2.getInt(cdist2.find(alpha_r));
          mv_n_r_alpha.put(rn_alphar);
        }
        // We only use the average and standard deviation
        final double nhat_r_alpha = mv_n_r_alpha.getMean();
        final double sigma_nhat_r_alpha = mv_n_r_alpha.getNaiveStddev();

        // Redundant divisions by nhat_r_alpha removed.
        final double mdef = nhat_r_alpha - n_alphar;
        final double sigmamdef = sigma_nhat_r_alpha;
        final double mdefnorm = mdef / sigmamdef;

        if (mdefnorm > maxmdefnorm) {
          maxmdefnorm = mdefnorm;
          maxnormr = r;
        }
      }
    } else {
      // FIXME: when nmin was not fulfilled - what is the proper value then?
      maxmdefnorm = Double.POSITIVE_INFINITY;
      maxnormr = maxdist;
    }
    mdef_norm.putDouble(iditer, maxmdefnorm);
    mdef_radius.putDouble(iditer, maxnormr);
    minmax.put(maxmdefnorm);
    LOG.incrementProcessed(progressLOCI);
  }
  LOG.ensureCompleted(progressLOCI);

  DoubleRelation scoreResult = new MaterializedDoubleRelation("LOCI normalized MDEF", "loci-mdef-outlier", mdef_norm, relation.getDBIDs());
  OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 0.0);
  OutlierResult result = new OutlierResult(scoreMeta, scoreResult);
  result.addChildResult(new MaterializedDoubleRelation("LOCI MDEF Radius", "loci-critical-radius", mdef_radius, relation.getDBIDs()));
  return result;
}
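For reference, the simplification behind the comment "Redundant divisions by nhat_r_alpha removed": the LOCI paper defines

\[
\mathrm{MDEF}(p, r, \alpha) = 1 - \frac{n(p, \alpha r)}{\hat{n}(p, r, \alpha)},
\qquad
\sigma_{\mathrm{MDEF}}(p, r, \alpha) = \frac{\sigma_{\hat{n}}(p, r, \alpha)}{\hat{n}(p, r, \alpha)},
\]

so the normalized score simplifies to

\[
\frac{\mathrm{MDEF}}{\sigma_{\mathrm{MDEF}}}
= \frac{\hat{n}(p, r, \alpha) - n(p, \alpha r)}{\sigma_{\hat{n}}(p, r, \alpha)},
\]

since the common factor \(1/\hat{n}(p, r, \alpha)\) cancels. This is exactly what mdef / sigmamdef computes above.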
/**
 * Run the algorithm
 *
 * @param db Database
 * @param relation Relation
 * @return Clustering hierarchy
 */
public PointerHierarchyRepresentationResult run(Database db, Relation<O> relation) {
  DistanceQuery<O> dq = db.getDistanceQuery(relation, getDistanceFunction());
  ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
  final int size = ids.size();

  if (size > 0x10000) {
    throw new AbortException("This implementation does not scale to data sets larger than " + 0x10000 + " instances (~17 GB RAM), which results in an integer overflow.");
  }
  if (Linkage.SINGLE.equals(linkage)) {
    LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!");
  }

  // Compute the initial (lower triangular) distance matrix.
  double[] scratch = new double[triangleSize(size)];
  DBIDArrayIter ix = ids.iter(), iy = ids.iter(), ij = ids.iter();
  // Position counter - must agree with computeOffset!
  int pos = 0;
  boolean square = Linkage.WARD.equals(linkage) && !(SquaredEuclideanDistanceFunction.class.isInstance(getDistanceFunction()));
  for (int x = 0; ix.valid(); x++, ix.advance()) {
    iy.seek(0);
    for (int y = 0; y < x; y++, iy.advance()) {
      scratch[pos] = dq.distance(ix, iy);
      // Ward uses variances -- i.e. squared values
      if (square) {
        scratch[pos] *= scratch[pos];
      }
      pos++;
    }
  }

  // Initialize space for result:
  WritableDBIDDataStore parent = DataStoreUtil.makeDBIDStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
  WritableDoubleDataStore height = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
  WritableIntegerDataStore csize = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
  for (DBIDIter it = ids.iter(); it.valid(); it.advance()) {
    parent.put(it, it);
    height.put(it, Double.POSITIVE_INFINITY);
    csize.put(it, 1);
  }

  // Repeat until everything merged, except the desired number of clusters:
  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", size - 1, LOG) : null;
  for (int i = 1; i < size; i++) {
    double min = Double.POSITIVE_INFINITY;
    int minx = -1, miny = -1;
    for (ix.seek(0); ix.valid(); ix.advance()) {
      if (height.doubleValue(ix) < Double.POSITIVE_INFINITY) {
        continue;
      }
      final int xbase = triangleSize(ix.getOffset());
      for (iy.seek(0); iy.getOffset() < ix.getOffset(); iy.advance()) {
        if (height.doubleValue(iy) < Double.POSITIVE_INFINITY) {
          continue;
        }
        final int idx = xbase + iy.getOffset();
        if (scratch[idx] <= min) {
          min = scratch[idx];
          minx = ix.getOffset();
          miny = iy.getOffset();
        }
      }
    }
    assert (minx >= 0 && miny >= 0);
    // Avoid allocating memory, by reusing existing iterators:
    ix.seek(minx);
    iy.seek(miny);
    // Perform merge in data structure: x -> y
    // Since y < x, prefer keeping y, dropping x.
    int sizex = csize.intValue(ix), sizey = csize.intValue(iy);
    height.put(ix, min);
    parent.put(ix, iy);
    csize.put(iy, sizex + sizey);

    // Update distance matrix. Note: miny < minx
    final int xbase = triangleSize(minx), ybase = triangleSize(miny);

    // Write to (y, j), with j < y
    for (ij.seek(0); ij.getOffset() < miny; ij.advance()) {
      if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) {
        continue;
      }
      final int sizej = csize.intValue(ij);
      scratch[ybase + ij.getOffset()] = linkage.combine(sizex, scratch[xbase + ij.getOffset()], sizey, scratch[ybase + ij.getOffset()], sizej, min);
    }
    // Write to (j, y), with y < j < x
    for (ij.seek(miny + 1); ij.getOffset() < minx; ij.advance()) {
      if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) {
        continue;
      }
      final int jbase = triangleSize(ij.getOffset());
      final int sizej = csize.intValue(ij);
      scratch[jbase + miny] = linkage.combine(sizex, scratch[xbase + ij.getOffset()], sizey, scratch[jbase + miny], sizej, min);
    }
    // Write to (j, y), with y < x < j
    for (ij.seek(minx + 1); ij.valid(); ij.advance()) {
      if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) {
        continue;
      }
      final int jbase = triangleSize(ij.getOffset());
      final int sizej = csize.intValue(ij);
      scratch[jbase + miny] = linkage.combine(sizex, scratch[jbase + minx], sizey, scratch[jbase + miny], sizej, min);
    }
    LOG.incrementProcessed(prog);
  }
  LOG.ensureCompleted(prog);

  return new PointerHierarchyRepresentationResult(ids, parent, height);
}
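The packed lower-triangular layout above stores entry (x, y) with y < x at offset triangleSize(x) + y. A sketch of the offset helper this indexing assumes (the actual helper is defined elsewhere in the class):

// Offset of row x in the packed lower-triangular matrix: x*(x-1)/2.
// The unsigned shift matters: for x = 0x10000, x*(x-1) overflows a signed
// int, but >>> 1 still yields the correct 2,147,450,880.
protected static int triangleSize(int x) {
  return (x * (x - 1)) >>> 1;
}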
@Override
public void parse(InputStream in, DistanceCacheWriter cache) {
  reader.reset(in);
  int min = Integer.MAX_VALUE, max = Integer.MIN_VALUE;

  IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("Parsing distance matrix", LOG) : null;
  try {
    while (reader.nextLineExceptComments()) {
      LOG.incrementProcessed(prog);
      if (!tokenizer.valid()) {
        throw new IllegalArgumentException("Less than three values in line " + reader.getLineNumber());
      }
      int id1, id2;
      try {
        id1 = (int) tokenizer.getLongBase10();
        tokenizer.advance();
      } catch (NumberFormatException e) {
        throw new IllegalArgumentException("Error in line " + reader.getLineNumber() + ": id1 is not an integer!");
      }
      if (!tokenizer.valid()) {
        throw new IllegalArgumentException("Less than three values in line " + reader.getLineNumber());
      }

      try {
        id2 = (int) tokenizer.getLongBase10();
        tokenizer.advance();
      } catch (NumberFormatException e) {
        throw new IllegalArgumentException("Error in line " + reader.getLineNumber() + ": id2 is not an integer!");
      }
      if (!tokenizer.valid()) {
        throw new IllegalArgumentException("Less than three values in line " + reader.getLineNumber());
      }

      // Track minimum and maximum
      if (id1 < id2) {
        min = (id1 < min) ? id1 : min;
        max = (id2 > max) ? id2 : max;
      } else {
        min = (id2 < min) ? id2 : min;
        max = (id1 > max) ? id1 : max;
      }

      try {
        double distance = tokenizer.getDouble();
        cache.put(id1, id2, distance);
      } catch (IllegalArgumentException e) {
        throw new IllegalArgumentException("Error in line " + reader.getLineNumber() + ":" + e.getMessage(), e);
      }
      tokenizer.advance();
      if (tokenizer.valid()) {
        throw new IllegalArgumentException("More than three values in line " + reader.getLineNumber());
      }
    }
  } catch (IOException e) {
    throw new IllegalArgumentException("Error while parsing line " + reader.getLineNumber() + ".");
  }
  LOG.setCompleted(prog);

  // check if all distance values are specified
  for (int i1 = min; i1 <= max; i1++) {
    for (int i2 = i1 + 1; i2 <= max; i2++) {
      if (!cache.containsKey(i1, i2)) {
        throw new IllegalArgumentException("Distance value for " + i1 + " to " + i2 + " is missing!");
      }
    }
  }
}
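For reference, the parser accepts exactly one whitespace-separated triple per non-comment line, two integer IDs followed by one distance value, and requires every pair in the ID range [min, max] to appear. An input fragment of the expected shape:

1 2 0.25
1 3 1.5
2 3 0.75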
/**
 * Performs a single run of FastDOC, finding a single cluster.
 *
 * @param database Database context
 * @param relation used to get actual values for DBIDs.
 * @param S The set of points we're working on.
 * @param d Dimensionality of the data set we're currently working on.
 * @param n Number of outer iterations (seed points).
 * @param m Number of inner iterations (per seed point).
 * @param r Size of random samples.
 * @return a cluster, if one is found, else <code>null</code>.
 */
private Cluster<SubspaceModel> runFastDOC(Database database, Relation<V> relation, ArrayModifiableDBIDs S, int d, int n, int m, int r) {
  // Relevant attributes of highest cardinality.
  long[] D = null;
  // The seed point for the best dimensions.
  DBIDVar dV = DBIDUtil.newVar();

  // Inform the user about the progress in the current iteration.
  FiniteProgress iprogress = LOG.isVerbose() ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG) : null;

  Random random = rnd.getSingleThreadedRandom();
  DBIDArrayIter iter = S.iter();

  outer: for (int i = 0; i < n; ++i) {
    // Pick a random seed point.
    iter.seek(random.nextInt(S.size()));

    for (int j = 0; j < m; ++j) {
      // Choose a set of random points.
      DBIDs randomSet = DBIDUtil.randomSample(S, r, random);

      // Initialize cluster info.
      long[] nD = BitsUtil.zero(d);

      // Test each dimension.
      for (int k = 0; k < d; ++k) {
        if (dimensionIsRelevant(k, relation, randomSet)) {
          BitsUtil.setI(nD, k);
        }
      }

      if (D == null || BitsUtil.cardinality(nD) > BitsUtil.cardinality(D)) {
        D = nD;
        dV.set(iter);

        if (BitsUtil.cardinality(D) >= d_zero) {
          if (iprogress != null) {
            iprogress.setProcessed(iprogress.getTotal(), LOG);
          }
          break outer;
        }
      }
      LOG.incrementProcessed(iprogress);
    }
  }
  LOG.ensureCompleted(iprogress);

  // If no relevant dimensions were found, skip it.
  if (D == null || BitsUtil.cardinality(D) == 0) {
    return null;
  }

  // Get all points in the box.
  SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(D);
  DistanceQuery<V> dq = database.getDistanceQuery(relation, df);
  RangeQuery<V> rq = database.getRangeQuery(dq, DatabaseQuery.HINT_SINGLE);

  // TODO: add filtering capabilities into query API!
  DBIDs C = DBIDUtil.intersection(S, rq.getRangeForDBID(dV, w));

  // If we have a non-empty cluster, return it.
  return (C.size() > 0) ? makeCluster(relation, C, D) : null;
}
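Both this method and runDOC below rely on dimensionIsRelevant, which is not shown. A hedged sketch of its likely semantics, assuming the DOC definition that a dimension is relevant iff all sampled points fall within a window of width w in that dimension; the actual ELKI helper may differ in detail:

// A dimension is relevant to the sample iff the sample's spread in that
// dimension does not exceed the window width w (a field of the class).
private boolean dimensionIsRelevant(int dimension, Relation<V> relation, DBIDs points) {
  double min = Double.POSITIVE_INFINITY, max = Double.NEGATIVE_INFINITY;
  for (DBIDIter iter = points.iter(); iter.valid(); iter.advance()) {
    double v = relation.get(iter).doubleValue(dimension);
    min = v < min ? v : min;
    max = v > max ? v : max;
    if (max - min > w) {
      return false; // spread already exceeds the window width
    }
  }
  return true;
}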
/**
 * Performs a single run of DOC, finding a single cluster.
 *
 * @param database Database context
 * @param relation used to get actual values for DBIDs.
 * @param S The set of points we're working on.
 * @param d Dimensionality of the data set we're currently working on.
 * @param n Number of outer iterations (seed points).
 * @param m Number of inner iterations (per seed point).
 * @param r Size of random samples.
 * @param minClusterSize Minimum size a cluster must have to be accepted.
 * @return a cluster, if one is found, else <code>null</code>.
 */
private Cluster<SubspaceModel> runDOC(Database database, Relation<V> relation, ArrayModifiableDBIDs S, final int d, int n, int m, int r, int minClusterSize) {
  // Best cluster for the current run.
  DBIDs C = null;
  // Relevant attributes for the best cluster.
  long[] D = null;
  // Quality of the best cluster.
  double quality = Double.NEGATIVE_INFINITY;

  // Bounds for our cluster.
  // ModifiableHyperBoundingBox bounds = new ModifiableHyperBoundingBox(new double[d], new double[d]);

  // Weights for distance (= rectangle query)
  SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(BitsUtil.zero(d));
  DistanceQuery<V> dq = database.getDistanceQuery(relation, df);
  RangeQuery<V> rq = database.getRangeQuery(dq);

  // Inform the user about the progress in the current iteration.
  FiniteProgress iprogress = LOG.isVerbose() ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG) : null;

  Random random = rnd.getSingleThreadedRandom();
  DBIDArrayIter iter = S.iter();

  for (int i = 0; i < n; ++i) {
    // Pick a random seed point.
    iter.seek(random.nextInt(S.size()));

    for (int j = 0; j < m; ++j) {
      // Choose a set of random points.
      DBIDs randomSet = DBIDUtil.randomSample(S, r, random);

      // Initialize cluster info.
      long[] nD = BitsUtil.zero(d);

      // Test each dimension and build bounding box.
      for (int k = 0; k < d; ++k) {
        if (dimensionIsRelevant(k, relation, randomSet)) {
          BitsUtil.setI(nD, k);
        }
      }
      if (BitsUtil.cardinality(nD) > 0) {
        // Get all points in the box.
        df.setSelectedDimensions(nD);
        // TODO: add filtering capabilities into query API!
        DBIDs nC = DBIDUtil.intersection(S, rq.getRangeForDBID(iter, w));

        if (LOG.isDebuggingFiner()) {
          LOG.finer("Testing a cluster candidate, |C| = " + nC.size() + ", |D| = " + BitsUtil.cardinality(nD));
        }

        // Is the cluster large enough?
        if (nC.size() < minClusterSize) {
          // Too small.
          if (LOG.isDebuggingFiner()) {
            LOG.finer("... but it's too small.");
          }
        } else {
          // Better cluster than before?
          double nQuality = computeClusterQuality(nC.size(), BitsUtil.cardinality(nD));
          if (nQuality > quality) {
            if (LOG.isDebuggingFiner()) {
              LOG.finer("... and it's the best so far: " + nQuality + " vs. " + quality);
            }
            C = nC;
            D = nD;
            quality = nQuality;
          } else {
            if (LOG.isDebuggingFiner()) {
              LOG.finer("... but we already have a better one.");
            }
          }
        }
      }
      LOG.incrementProcessed(iprogress);
    }
  }
  LOG.ensureCompleted(iprogress);

  return (C != null) ? makeCluster(relation, C, D) : null;
}
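The computeClusterQuality helper is not shown. A hedged sketch, assuming the standard DOC quality measure mu(|C|, |D|) = |C| * (1/beta)^|D|, which trades cluster size against subspace dimensionality; the actual ELKI helper may differ:

// Quality of a candidate: larger clusters and more relevant dimensions
// are better; beta (a field of the class, 0 < beta < 1) controls the
// trade-off between the two.
private double computeClusterQuality(int clusterSize, int clusterDimensionality) {
  return clusterSize * Math.pow(1. / beta, clusterDimensionality);
}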