public Result run(Database database, Relation<O> rel) { DistanceQuery<O> dq = rel.getDistanceQuery(getDistanceFunction()); int size = rel.size(); long pairs = (size * (long) size) >> 1; final long ssize = sampling <= 1 ? (long) Math.ceil(sampling * pairs) : (long) sampling; if (ssize > Integer.MAX_VALUE) { throw new AbortException("Sampling size too large."); } final int qsize = quantile <= 0 ? 1 : (int) Math.ceil(quantile * ssize); DoubleMaxHeap heap = new DoubleMaxHeap(qsize); ArrayDBIDs ids = DBIDUtil.ensureArray(rel.getDBIDs()); DBIDArrayIter i1 = ids.iter(), i2 = ids.iter(); Random r = rand.getSingleThreadedRandom(); FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Sampling", (int) ssize, LOG) : null; for (long i = 0; i < ssize; i++) { int x = r.nextInt(size - 1) + 1, y = r.nextInt(x); double dist = dq.distance(i1.seek(x), i2.seek(y)); // Skip NaN, and/or zeros. if (dist != dist || (nozeros && dist < Double.MIN_NORMAL)) { continue; } heap.add(dist, qsize); LOG.incrementProcessed(prog); } LOG.statistics(new DoubleStatistic(PREFIX + ".quantile", quantile)); LOG.statistics(new LongStatistic(PREFIX + ".samplesize", ssize)); LOG.statistics(new DoubleStatistic(PREFIX + ".distance", heap.peek())); LOG.ensureCompleted(prog); Collection<String> header = Arrays.asList(new String[] {"Distance"}); Collection<Vector> data = Arrays.asList(new Vector[] {new Vector(heap.peek())}); return new CollectionResult<Vector>("Distances sample", "distance-sample", data, header); }
/** * Run the algorithm * * @param db Database * @param relation Relation * @return Clustering hierarchy */ public PointerHierarchyRepresentationResult run(Database db, Relation<O> relation) { DistanceQuery<O> dq = db.getDistanceQuery(relation, getDistanceFunction()); ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs()); final int size = ids.size(); if (size > 0x10000) { throw new AbortException( "This implementation does not scale to data sets larger than " + 0x10000 + " instances (~17 GB RAM), which results in an integer overflow."); } if (Linkage.SINGLE.equals(linkage)) { LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!"); } // Compute the initial (lower triangular) distance matrix. double[] scratch = new double[triangleSize(size)]; DBIDArrayIter ix = ids.iter(), iy = ids.iter(), ij = ids.iter(); // Position counter - must agree with computeOffset! int pos = 0; boolean square = Linkage.WARD.equals(linkage) && !(SquaredEuclideanDistanceFunction.class.isInstance(getDistanceFunction())); for (int x = 0; ix.valid(); x++, ix.advance()) { iy.seek(0); for (int y = 0; y < x; y++, iy.advance()) { scratch[pos] = dq.distance(ix, iy); // Ward uses variances -- i.e. squared values if (square) { scratch[pos] *= scratch[pos]; } pos++; } } // Initialize space for result: WritableDBIDDataStore parent = DataStoreUtil.makeDBIDStorage( ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC); WritableDoubleDataStore height = DataStoreUtil.makeDoubleStorage( ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC); WritableIntegerDataStore csize = DataStoreUtil.makeIntegerStorage( ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); for (DBIDIter it = ids.iter(); it.valid(); it.advance()) { parent.put(it, it); height.put(it, Double.POSITIVE_INFINITY); csize.put(it, 1); } // Repeat until everything merged, except the desired number of clusters: FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", size - 1, LOG) : null; for (int i = 1; i < size; i++) { double min = Double.POSITIVE_INFINITY; int minx = -1, miny = -1; for (ix.seek(0); ix.valid(); ix.advance()) { if (height.doubleValue(ix) < Double.POSITIVE_INFINITY) { continue; } final int xbase = triangleSize(ix.getOffset()); for (iy.seek(0); iy.getOffset() < ix.getOffset(); iy.advance()) { if (height.doubleValue(iy) < Double.POSITIVE_INFINITY) { continue; } final int idx = xbase + iy.getOffset(); if (scratch[idx] <= min) { min = scratch[idx]; minx = ix.getOffset(); miny = iy.getOffset(); } } } assert (minx >= 0 && miny >= 0); // Avoid allocating memory, by reusing existing iterators: ix.seek(minx); iy.seek(miny); // Perform merge in data structure: x -> y // Since y < x, prefer keeping y, dropping x. int sizex = csize.intValue(ix), sizey = csize.intValue(iy); height.put(ix, min); parent.put(ix, iy); csize.put(iy, sizex + sizey); // Update distance matrix. Note: miny < minx final int xbase = triangleSize(minx), ybase = triangleSize(miny); // Write to (y, j), with j < y for (ij.seek(0); ij.getOffset() < miny; ij.advance()) { if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) { continue; } final int sizej = csize.intValue(ij); scratch[ybase + ij.getOffset()] = linkage.combine( sizex, scratch[xbase + ij.getOffset()], sizey, scratch[ybase + ij.getOffset()], sizej, min); } // Write to (j, y), with y < j < x for (ij.seek(miny + 1); ij.getOffset() < minx; ij.advance()) { if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) { continue; } final int jbase = triangleSize(ij.getOffset()); final int sizej = csize.intValue(ij); scratch[jbase + miny] = linkage.combine( sizex, scratch[xbase + ij.getOffset()], sizey, scratch[jbase + miny], sizej, min); } // Write to (j, y), with y < x < j for (ij.seek(minx + 1); ij.valid(); ij.advance()) { if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) { continue; } final int jbase = triangleSize(ij.getOffset()); final int sizej = csize.intValue(ij); scratch[jbase + miny] = linkage.combine(sizex, scratch[jbase + minx], sizey, scratch[jbase + miny], sizej, min); } LOG.incrementProcessed(prog); } LOG.ensureCompleted(prog); return new PointerHierarchyRepresentationResult(ids, parent, height); }