Beispiel #1
0
 /**
  * Constructor.
  *
  * @param size Size
  * @param idmap ID map
  */
 public ArrayDBIDStore(int size, DataStoreIDMap idmap) {
   super();
   this.data = DBIDUtil.newArray(size);
   // Initialize
   DBIDRef inv = DBIDUtil.invalid();
   for (int i = 0; i < size; i++) {
     data.add(inv);
   }
   this.idmap = idmap;
 }
Beispiel #2
0
  /**
   * DBSCAN-function expandCluster.
   *
   * <p>Border-Objects become members of the first possible cluster.
   *
   * @param relation Database relation to run on
   * @param rangeQuery Range query to use
   * @param startObjectID potential seed of a new potential cluster
   * @param objprog the progress object for logging the current status
   */
  protected void expandCluster(
      Relation<O> relation,
      RangeQuery<O> rangeQuery,
      DBIDRef startObjectID,
      FiniteProgress objprog,
      IndefiniteProgress clusprog) {
    DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(startObjectID, epsilon);
    ncounter += neighbors.size();

    // startObject is no core-object
    if (neighbors.size() < minpts) {
      noise.add(startObjectID);
      processedIDs.add(startObjectID);
      if (objprog != null) {
        objprog.incrementProcessed(LOG);
      }
      return;
    }

    ModifiableDBIDs currentCluster = DBIDUtil.newArray();
    currentCluster.add(startObjectID);
    processedIDs.add(startObjectID);

    // try to expand the cluster
    HashSetModifiableDBIDs seeds = DBIDUtil.newHashSet();
    processNeighbors(neighbors.iter(), currentCluster, seeds);

    DBIDVar o = DBIDUtil.newVar();
    while (!seeds.isEmpty()) {
      seeds.pop(o);
      neighbors = rangeQuery.getRangeForDBID(o, epsilon);
      ncounter += neighbors.size();

      if (neighbors.size() >= minpts) {
        processNeighbors(neighbors.iter(), currentCluster, seeds);
      }

      if (objprog != null) {
        objprog.incrementProcessed(LOG);
      }
    }
    resultList.add(currentCluster);
    if (clusprog != null) {
      clusprog.setProcessed(resultList.size(), LOG);
    }
  }
Beispiel #3
0
  private DBIDs mergeJoin(DBIDs first, DBIDs second) {
    assert (!(first instanceof HashSetDBIDs));
    assert (!(second instanceof HashSetDBIDs));
    ArrayModifiableDBIDs ids = DBIDUtil.newArray();

    DBIDIter i1 = first.iter(), i2 = second.iter();
    while (i1.valid() && i2.valid()) {
      int c = DBIDUtil.compare(i1, i2);
      if (c < 0) {
        i1.advance();
      } else if (c > 0) {
        i2.advance();
      } else {
        ids.add(i1);
        i1.advance();
        i2.advance();
      }
    }
    return ids;
  }
Beispiel #4
0
 private DBIDs[] buildIndex(Relation<BitVector> relation, int dim, int minsupp) {
   ArrayModifiableDBIDs[] idx = new ArrayModifiableDBIDs[dim];
   for (int i = 0; i < dim; i++) {
     idx[i] = DBIDUtil.newArray();
   }
   for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
     SparseFeatureVector<?> bv = relation.get(iter);
     // TODO: only count those which satisfy minlength?
     for (int it = bv.iter(); bv.iterValid(it); it = bv.iterAdvance(it)) {
       idx[bv.iterDim(it)].add(iter);
     }
   }
   // Forget non-frequent 1-itemsets.
   for (int i = 0; i < dim; i++) {
     if (idx[i].size() < minsupp) {
       idx[i] = null;
     } else {
       idx[i].sort();
     }
   }
   return idx;
 }
Beispiel #5
0
  /**
   * Performs the DOC or FastDOC (as configured) algorithm on the given Database.
   *
   * <p>This will run exhaustively, i.e. run DOC until no clusters are found anymore / the database
   * size has shrunk below the threshold for minimum cluster size.
   *
   * @param database Database
   * @param relation Data relation
   */
  public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
    // Dimensionality of our set.
    final int d = RelationUtil.dimensionality(relation);

    // Get available DBIDs as a set we can remove items from.
    ArrayModifiableDBIDs S = DBIDUtil.newArray(relation.getDBIDs());

    // Precompute values as described in Figure 2.
    double r = Math.abs(Math.log(d + d) / Math.log(beta * .5));
    // Outer loop count.
    int n = (int) (2. / alpha);
    // Inner loop count.
    int m = (int) (Math.pow(2. / alpha, r) * Math.log(4));
    if (heuristics) {
      m = Math.min(m, Math.min(1000000, d * d));
    }

    // Minimum size for a cluster for it to be accepted.
    int minClusterSize = (int) (alpha * S.size());

    // List of all clusters we found.
    Clustering<SubspaceModel> result = new Clustering<>("DOC Clusters", "DOC");

    // Inform the user about the number of actual clusters found so far.
    IndefiniteProgress cprogress =
        LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null;

    // To not only find a single cluster, we continue running until our set
    // of points is empty.
    while (S.size() > minClusterSize) {
      Cluster<SubspaceModel> C;
      if (heuristics) {
        C = runFastDOC(database, relation, S, d, n, m, (int) r);
      } else {
        C = runDOC(database, relation, S, d, n, m, (int) r, minClusterSize);
      }

      if (C == null) {
        // Stop trying if we couldn't find a cluster.
        break;
      }
      // Found a cluster, remember it, remove its points from the set.
      result.addToplevelCluster(C);

      // Remove all points of the cluster from the set and continue.
      S.removeDBIDs(C.getIDs());

      if (cprogress != null) {
        cprogress.setProcessed(result.getAllClusters().size(), LOG);
      }
    }

    // Add the remainder as noise.
    if (S.size() > 0) {
      long[] alldims = BitsUtil.ones(d);
      result.addToplevelCluster(
          new Cluster<>(
              S,
              true,
              new SubspaceModel(new Subspace(alldims), Centroid.make(relation, S).getArrayRef())));
    }
    LOG.setCompleted(cprogress);
    return result;
  }