Пример #1
0
  /** Performs the DBSCAN algorithm on the given database. */
  public Clustering<Model> run(Relation<O> relation) {
    final int size = relation.size();
    if (size < minpts) {
      Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
      result.addToplevelCluster(
          new Cluster<Model>(relation.getDBIDs(), true, ClusterModel.CLUSTER));
      return result;
    }

    RangeQuery<O> rangeQuery = QueryUtil.getRangeQuery(relation, getDistanceFunction());
    resultList = new ArrayList<>();
    noise = DBIDUtil.newHashSet();
    runDBSCAN(relation, rangeQuery);

    double averagen = ncounter / (double) relation.size();
    LOG.statistics(new DoubleStatistic(DBSCAN.class.getName() + ".average-neighbors", averagen));
    if (averagen < 1 + 0.1 * (minpts - 1)) {
      LOG.warning("There are very few neighbors found. Epsilon may be too small.");
    }
    if (averagen > 100 * minpts) {
      LOG.warning("There are very many neighbors found. Epsilon may be too large.");
    }

    Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
    for (ModifiableDBIDs res : resultList) {
      result.addToplevelCluster(new Cluster<Model>(res, ClusterModel.CLUSTER));
    }
    result.addToplevelCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER));
    return result;
  }
Пример #2
0
  /**
   * Handles a DataStoreEvent with the specified type. If the current event type is not equal to the
   * specified type, the events accumulated up to now will be fired first.
   *
   * <p>The new event will be aggregated and fired on demand if {@link #accumulateDataStoreEvents}
   * is set, otherwise all registered <code>DataStoreListener</code> will be notified immediately
   * that the content of the database has been changed.
   *
   * @param objects the objects that have been changed, i.e. inserted, deleted or updated
   */
  private void fireObjectsChanged(DBIDs objects, Type type) {
    // flush first
    if (currentDataStoreEventType != null && !currentDataStoreEventType.equals(type)) {
      flushDataStoreEvents();
    }
    if (accumulateDataStoreEvents) {
      if (this.dataStoreObjects == null) {
        this.dataStoreObjects = DBIDUtil.newHashSet();
      }
      this.dataStoreObjects.addDBIDs(objects);
      currentDataStoreEventType = type;
      return;
    }
    // Execute immediately:
    DataStoreEvent e;
    switch (type) {
      case INSERT:
        e = DataStoreEvent.insertionEvent(objects);
        break;
      case REMOVE:
        e = DataStoreEvent.removalEvent(objects);
        break;
      case UPDATE:
        e = DataStoreEvent.updateEvent(objects);
        break;
      default:
        return;
    }

    for (int i = dataListenerList.size(); --i >= 0; ) {
      dataListenerList.get(i).contentChanged(e);
    }
  }
Пример #3
0
 /**
  * Utility method to create a subspace cluster from a list of DBIDs and the relevant attributes.
  *
  * @param relation to compute a centroid.
  * @param C the cluster points.
  * @param D the relevant dimensions.
  * @return an object representing the subspace cluster.
  */
 private Cluster<SubspaceModel> makeCluster(Relation<V> relation, DBIDs C, long[] D) {
   DBIDs ids = DBIDUtil.newHashSet(C); // copy, also to lose distance values!
   Cluster<SubspaceModel> cluster = new Cluster<>(ids);
   cluster.setModel(
       new SubspaceModel(new Subspace(D), Centroid.make(relation, ids).getArrayRef()));
   return cluster;
 }
Пример #4
0
  @Override
  public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
    if (relation.size() <= 0) {
      return new Clustering<>("k-Means Clustering", "kmeans-clustering");
    }
    // Choose initial means
    if (LOG.isStatistics()) {
      LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
    // Setup cluster assignment store
    List<ModifiableDBIDs> clusters = new ArrayList<>();
    for (int i = 0; i < k; i++) {
      clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
    }
    WritableIntegerDataStore assignment =
        DataStoreUtil.makeIntegerStorage(
            relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
    double[] varsum = new double[k];

    IndefiniteProgress prog =
        LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
    DoubleStatistic varstat =
        LOG.isStatistics()
            ? new DoubleStatistic(this.getClass().getName() + ".variance-sum")
            : null;
    int iteration = 0;
    for (; maxiter <= 0 || iteration < maxiter; iteration++) {
      LOG.incrementProcessed(prog);
      boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum);
      logVarstat(varstat, varsum);
      // Stop if no cluster assignment changed.
      if (!changed) {
        break;
      }
      // Recompute means.
      means = means(clusters, means, relation);
    }
    LOG.setCompleted(prog);
    if (LOG.isStatistics()) {
      LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
    }

    // Wrap result
    Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
    for (int i = 0; i < clusters.size(); i++) {
      DBIDs ids = clusters.get(i);
      if (ids.size() == 0) {
        continue;
      }
      KMeansModel model = new KMeansModel(means[i], varsum[i]);
      result.addToplevelCluster(new Cluster<>(ids, model));
    }
    return result;
  }
Пример #5
0
  @Override
  public Clustering<BiclusterWithInversionsModel> biclustering() {
    double[][] mat = RelationUtil.relationAsMatrix(relation, rowIDs);

    BiclusterCandidate cand = new BiclusterCandidate(getRowDim(), getColDim());

    Clustering<BiclusterWithInversionsModel> result =
        new Clustering<>("Cheng-and-Church", "Cheng and Church Biclustering");
    ModifiableDBIDs noise = DBIDUtil.newHashSet(relation.getDBIDs());

    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Extracting Cluster", n, LOG) : null;
    for (int i = 0; i < n; i++) {
      cand.reset();
      multipleNodeDeletion(mat, cand);
      if (LOG.isVeryVerbose()) {
        LOG.veryverbose(
            "Residue after Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
      singleNodeDeletion(mat, cand);
      if (LOG.isVeryVerbose()) {
        LOG.veryverbose(
            "Residue after Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
      nodeAddition(mat, cand);
      if (LOG.isVeryVerbose()) {
        LOG.veryverbose(
            "Residue after Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
      cand.maskMatrix(mat, dist);
      BiclusterWithInversionsModel model =
          new BiclusterWithInversionsModel(colsBitsetToIDs(cand.cols), rowsBitsetToIDs(cand.irow));
      final ArrayDBIDs cids = rowsBitsetToIDs(cand.rows);
      noise.removeDBIDs(cids);
      result.addToplevelCluster(new Cluster<>(cids, model));

      if (LOG.isVerbose()) {
        LOG.verbose("Score of bicluster " + (i + 1) + ": " + cand.residue + "\n");
        LOG.verbose("Number of rows: " + cand.rowcard + "\n");
        LOG.verbose("Number of columns: " + cand.colcard + "\n");
        // LOG.verbose("Total number of masked values: " + maskedVals.size() +
        // "\n");
      }
      LOG.incrementProcessed(prog);
    }
    // Add a noise cluster, full-dimensional.
    if (!noise.isEmpty()) {
      long[] allcols = BitsUtil.ones(getColDim());
      BiclusterWithInversionsModel model =
          new BiclusterWithInversionsModel(colsBitsetToIDs(allcols), DBIDUtil.EMPTYDBIDS);
      result.addToplevelCluster(new Cluster<>(noise, true, model));
    }
    LOG.ensureCompleted(prog);
    return result;
  }
Пример #6
0
  /**
   * Handles a DataStoreEvent with the specified type. If the current event type is not equal to the
   * specified type, the events accumulated up to now will be fired first.
   *
   * <p>The new event will be aggregated and fired on demand if {@link #accumulateDataStoreEvents}
   * is set, otherwise all registered <code>DataStoreListener</code> will be notified immediately
   * that the content of the database has been changed.
   *
   * @param object the object that has been changed, i.e. inserted, deleted or updated
   */
  private void fireObjectChanged(DBIDRef object, Type type) {
    // flush first
    if (currentDataStoreEventType != null && !currentDataStoreEventType.equals(type)) {
      flushDataStoreEvents();
    }
    if (this.dataStoreObjects == null) {
      this.dataStoreObjects = DBIDUtil.newHashSet();
    }
    this.dataStoreObjects.add(object);
    currentDataStoreEventType = type;

    if (!accumulateDataStoreEvents) {
      flushDataStoreEvents();
    }
  }
Пример #7
0
  /**
   * DBSCAN-function expandCluster.
   *
   * <p>Border-Objects become members of the first possible cluster.
   *
   * @param relation Database relation to run on
   * @param rangeQuery Range query to use
   * @param startObjectID potential seed of a new potential cluster
   * @param objprog the progress object for logging the current status
   */
  protected void expandCluster(
      Relation<O> relation,
      RangeQuery<O> rangeQuery,
      DBIDRef startObjectID,
      FiniteProgress objprog,
      IndefiniteProgress clusprog) {
    DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(startObjectID, epsilon);
    ncounter += neighbors.size();

    // startObject is no core-object
    if (neighbors.size() < minpts) {
      noise.add(startObjectID);
      processedIDs.add(startObjectID);
      if (objprog != null) {
        objprog.incrementProcessed(LOG);
      }
      return;
    }

    ModifiableDBIDs currentCluster = DBIDUtil.newArray();
    currentCluster.add(startObjectID);
    processedIDs.add(startObjectID);

    // try to expand the cluster
    HashSetModifiableDBIDs seeds = DBIDUtil.newHashSet();
    processNeighbors(neighbors.iter(), currentCluster, seeds);

    DBIDVar o = DBIDUtil.newVar();
    while (!seeds.isEmpty()) {
      seeds.pop(o);
      neighbors = rangeQuery.getRangeForDBID(o, epsilon);
      ncounter += neighbors.size();

      if (neighbors.size() >= minpts) {
        processNeighbors(neighbors.iter(), currentCluster, seeds);
      }

      if (objprog != null) {
        objprog.incrementProcessed(LOG);
      }
    }
    resultList.add(currentCluster);
    if (clusprog != null) {
      clusprog.setProcessed(resultList.size(), LOG);
    }
  }
Пример #8
0
  /**
   * Run the DBSCAN algorithm
   *
   * @param relation Data relation
   * @param rangeQuery Range query class
   */
  protected void runDBSCAN(Relation<O> relation, RangeQuery<O> rangeQuery) {
    final int size = relation.size();
    FiniteProgress objprog =
        LOG.isVerbose() ? new FiniteProgress("Processing objects", size, LOG) : null;
    IndefiniteProgress clusprog =
        LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null;

    processedIDs = DBIDUtil.newHashSet(size);
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      if (!processedIDs.contains(iditer)) {
        expandCluster(relation, rangeQuery, iditer, objprog, clusprog);
      }
      if (objprog != null && clusprog != null) {
        objprog.setProcessed(processedIDs.size(), LOG);
        clusprog.setProcessed(resultList.size(), LOG);
      }
      if (processedIDs.size() == size) {
        break;
      }
    }
    // Finish progress logging
    LOG.ensureCompleted(objprog);
    LOG.setCompleted(clusprog);
  }
Пример #9
0
  /**
   * Evaluate a single outlier result as histogram.
   *
   * @param database Database to process
   * @param or Outlier result
   * @return Result
   */
  public HistogramResult<DoubleVector> evaluateOutlierResult(Database database, OutlierResult or) {
    if (scaling instanceof OutlierScalingFunction) {
      OutlierScalingFunction oscaling = (OutlierScalingFunction) scaling;
      oscaling.prepare(or);
    }

    ModifiableDBIDs ids = DBIDUtil.newHashSet(or.getScores().getDBIDs());
    DBIDs outlierIds = DatabaseUtil.getObjectsByLabelMatch(database, positiveClassName);
    // first value for outliers, second for each object
    // If we have useful (finite) min/max, use these for binning.
    double min = scaling.getMin();
    double max = scaling.getMax();
    final ObjHistogram<DoubleDoublePair> hist;
    if (Double.isInfinite(min)
        || Double.isNaN(min)
        || Double.isInfinite(max)
        || Double.isNaN(max)) {
      hist =
          new AbstractObjDynamicHistogram<DoubleDoublePair>(bins) {
            @Override
            public DoubleDoublePair aggregate(DoubleDoublePair first, DoubleDoublePair second) {
              first.first += second.first;
              first.second += second.second;
              return first;
            }

            @Override
            protected DoubleDoublePair makeObject() {
              return new DoubleDoublePair(0., 0.);
            }

            @Override
            protected DoubleDoublePair cloneForCache(DoubleDoublePair data) {
              return new DoubleDoublePair(data.first, data.second);
            }

            @Override
            protected DoubleDoublePair downsample(Object[] data, int start, int end, int size) {
              DoubleDoublePair sum = new DoubleDoublePair(0, 0);
              for (int i = start; i < end; i++) {
                DoubleDoublePair p = (DoubleDoublePair) data[i];
                if (p != null) {
                  sum.first += p.first;
                  sum.second += p.second;
                }
              }
              return sum;
            }
          };
    } else {
      hist =
          new AbstractObjStaticHistogram<DoubleDoublePair>(bins, min, max) {
            @Override
            protected DoubleDoublePair makeObject() {
              return new DoubleDoublePair(0., 0.);
            }

            @Override
            public void putData(double coord, DoubleDoublePair data) {
              DoubleDoublePair exist = get(coord);
              exist.first += data.first;
              exist.second += data.second;
            }
          };
    }

    // first fill histogram only with values of outliers
    DoubleDoublePair negative, positive;
    if (!splitfreq) {
      negative = new DoubleDoublePair(1. / ids.size(), 0);
      positive = new DoubleDoublePair(0, 1. / ids.size());
    } else {
      negative = new DoubleDoublePair(1. / (ids.size() - outlierIds.size()), 0);
      positive = new DoubleDoublePair(0, 1. / outlierIds.size());
    }
    ids.removeDBIDs(outlierIds);
    // fill histogram with values of each object
    for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
      double result = or.getScores().doubleValue(iter);
      result = scaling.getScaled(result);
      if (result > Double.NEGATIVE_INFINITY && result < Double.POSITIVE_INFINITY) {
        hist.putData(result, negative);
      }
    }
    for (DBIDIter iter = outlierIds.iter(); iter.valid(); iter.advance()) {
      double result = or.getScores().doubleValue(iter);
      result = scaling.getScaled(result);
      if (result > Double.NEGATIVE_INFINITY && result < Double.POSITIVE_INFINITY) {
        hist.putData(result, positive);
      }
    }
    Collection<DoubleVector> collHist = new ArrayList<>(hist.getNumBins());
    for (ObjHistogram.Iter<DoubleDoublePair> iter = hist.iter(); iter.valid(); iter.advance()) {
      DoubleDoublePair data = iter.getValue();
      DoubleVector row = new DoubleVector(new double[] {iter.getCenter(), data.first, data.second});
      collHist.add(row);
    }
    return new HistogramResult<>("Outlier Score Histogram", "outlier-histogram", collHist);
  }