Пример #1
0
  public Result run(Database database, Relation<O> rel) {
    DistanceQuery<O> dq = rel.getDistanceQuery(getDistanceFunction());
    int size = rel.size();
    long pairs = (size * (long) size) >> 1;

    final long ssize = sampling <= 1 ? (long) Math.ceil(sampling * pairs) : (long) sampling;
    if (ssize > Integer.MAX_VALUE) {
      throw new AbortException("Sampling size too large.");
    }
    final int qsize = quantile <= 0 ? 1 : (int) Math.ceil(quantile * ssize);

    DoubleMaxHeap heap = new DoubleMaxHeap(qsize);

    ArrayDBIDs ids = DBIDUtil.ensureArray(rel.getDBIDs());
    DBIDArrayIter i1 = ids.iter(), i2 = ids.iter();
    Random r = rand.getSingleThreadedRandom();

    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Sampling", (int) ssize, LOG) : null;
    for (long i = 0; i < ssize; i++) {
      int x = r.nextInt(size - 1) + 1, y = r.nextInt(x);
      double dist = dq.distance(i1.seek(x), i2.seek(y));
      // Skip NaN, and/or zeros.
      if (dist != dist || (nozeros && dist < Double.MIN_NORMAL)) {
        continue;
      }
      heap.add(dist, qsize);
      LOG.incrementProcessed(prog);
    }

    LOG.statistics(new DoubleStatistic(PREFIX + ".quantile", quantile));
    LOG.statistics(new LongStatistic(PREFIX + ".samplesize", ssize));
    LOG.statistics(new DoubleStatistic(PREFIX + ".distance", heap.peek()));
    LOG.ensureCompleted(prog);
    Collection<String> header = Arrays.asList(new String[] {"Distance"});
    Collection<Vector> data = Arrays.asList(new Vector[] {new Vector(heap.peek())});
    return new CollectionResult<Vector>("Distances sample", "distance-sample", data, header);
  }
  /**
   * Run the algorithm
   *
   * @param db Database
   * @param relation Relation
   * @return Clustering hierarchy
   */
  public PointerHierarchyRepresentationResult run(Database db, Relation<O> relation) {
    DistanceQuery<O> dq = db.getDistanceQuery(relation, getDistanceFunction());
    ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
    final int size = ids.size();

    if (size > 0x10000) {
      throw new AbortException(
          "This implementation does not scale to data sets larger than "
              + 0x10000
              + " instances (~17 GB RAM), which results in an integer overflow.");
    }
    if (Linkage.SINGLE.equals(linkage)) {
      LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!");
    }

    // Compute the initial (lower triangular) distance matrix.
    double[] scratch = new double[triangleSize(size)];
    DBIDArrayIter ix = ids.iter(), iy = ids.iter(), ij = ids.iter();
    // Position counter - must agree with computeOffset!
    int pos = 0;
    boolean square =
        Linkage.WARD.equals(linkage)
            && !(SquaredEuclideanDistanceFunction.class.isInstance(getDistanceFunction()));
    for (int x = 0; ix.valid(); x++, ix.advance()) {
      iy.seek(0);
      for (int y = 0; y < x; y++, iy.advance()) {
        scratch[pos] = dq.distance(ix, iy);
        // Ward uses variances -- i.e. squared values
        if (square) {
          scratch[pos] *= scratch[pos];
        }
        pos++;
      }
    }

    // Initialize space for result:
    WritableDBIDDataStore parent =
        DataStoreUtil.makeDBIDStorage(
            ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
    WritableDoubleDataStore height =
        DataStoreUtil.makeDoubleStorage(
            ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
    WritableIntegerDataStore csize =
        DataStoreUtil.makeIntegerStorage(
            ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
    for (DBIDIter it = ids.iter(); it.valid(); it.advance()) {
      parent.put(it, it);
      height.put(it, Double.POSITIVE_INFINITY);
      csize.put(it, 1);
    }

    // Repeat until everything merged, except the desired number of clusters:
    FiniteProgress prog =
        LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", size - 1, LOG) : null;
    for (int i = 1; i < size; i++) {
      double min = Double.POSITIVE_INFINITY;
      int minx = -1, miny = -1;
      for (ix.seek(0); ix.valid(); ix.advance()) {
        if (height.doubleValue(ix) < Double.POSITIVE_INFINITY) {
          continue;
        }
        final int xbase = triangleSize(ix.getOffset());
        for (iy.seek(0); iy.getOffset() < ix.getOffset(); iy.advance()) {
          if (height.doubleValue(iy) < Double.POSITIVE_INFINITY) {
            continue;
          }
          final int idx = xbase + iy.getOffset();
          if (scratch[idx] <= min) {
            min = scratch[idx];
            minx = ix.getOffset();
            miny = iy.getOffset();
          }
        }
      }
      assert (minx >= 0 && miny >= 0);
      // Avoid allocating memory, by reusing existing iterators:
      ix.seek(minx);
      iy.seek(miny);
      // Perform merge in data structure: x -> y
      // Since y < x, prefer keeping y, dropping x.
      int sizex = csize.intValue(ix), sizey = csize.intValue(iy);
      height.put(ix, min);
      parent.put(ix, iy);
      csize.put(iy, sizex + sizey);

      // Update distance matrix. Note: miny < minx
      final int xbase = triangleSize(minx), ybase = triangleSize(miny);
      // Write to (y, j), with j < y
      for (ij.seek(0); ij.getOffset() < miny; ij.advance()) {
        if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) {
          continue;
        }
        final int sizej = csize.intValue(ij);
        scratch[ybase + ij.getOffset()] =
            linkage.combine(
                sizex,
                scratch[xbase + ij.getOffset()],
                sizey,
                scratch[ybase + ij.getOffset()],
                sizej,
                min);
      }
      // Write to (j, y), with y < j < x
      for (ij.seek(miny + 1); ij.getOffset() < minx; ij.advance()) {
        if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) {
          continue;
        }
        final int jbase = triangleSize(ij.getOffset());
        final int sizej = csize.intValue(ij);
        scratch[jbase + miny] =
            linkage.combine(
                sizex, scratch[xbase + ij.getOffset()], sizey, scratch[jbase + miny], sizej, min);
      }
      // Write to (j, y), with y < x < j
      for (ij.seek(minx + 1); ij.valid(); ij.advance()) {
        if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) {
          continue;
        }
        final int jbase = triangleSize(ij.getOffset());
        final int sizej = csize.intValue(ij);
        scratch[jbase + miny] =
            linkage.combine(sizex, scratch[jbase + minx], sizey, scratch[jbase + miny], sizej, min);
      }
      LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);

    return new PointerHierarchyRepresentationResult(ids, parent, height);
  }