コード例 #1
0
ファイル: SUBCLU.java プロジェクト: fjfd/elki
  /**
   * Runs the DBSCAN algorithm on the specified partition of the database in the given subspace. If
   * parameter {@code ids} is null DBSCAN will be applied to the whole database.
   *
   * @param relation the database holding the objects to run DBSCAN on
   * @param ids the IDs of the database defining the partition to run DBSCAN on - if this parameter
   *     is null DBSCAN will be applied to the whole database
   * @param subspace the subspace to run DBSCAN on
   * @return the clustering result of the DBSCAN run
   */
  private List<Cluster<Model>> runDBSCAN(Relation<V> relation, DBIDs ids, Subspace subspace) {
    // distance function
    distanceFunction.setSelectedDimensions(subspace.getDimensions());

    ProxyDatabase proxy;
    if (ids == null) {
      // TODO: in this case, we might want to use an index - the proxy below
      // will prevent this!
      ids = relation.getDBIDs();
    }

    proxy = new ProxyDatabase(ids, relation);

    DBSCAN<V> dbscan = new DBSCAN<>(distanceFunction, epsilon, minpts);
    // run DBSCAN
    if (LOG.isVerbose()) {
      LOG.verbose("\nRun DBSCAN on subspace " + subspace.dimensonsToString());
    }
    Clustering<Model> dbsres = dbscan.run(proxy);

    // separate cluster and noise
    List<Cluster<Model>> clusterAndNoise = dbsres.getAllClusters();
    List<Cluster<Model>> clusters = new ArrayList<>();
    for (Cluster<Model> c : clusterAndNoise) {
      if (!c.isNoise()) {
        clusters.add(c);
      }
    }
    return clusters;
  }
コード例 #2
0
  protected double[] computeWithinDistances(
      Relation<? extends NumberVector> rel, List<? extends Cluster<?>> clusters, int withinPairs) {
    double[] concordant = new double[withinPairs];
    int i = 0;
    for (Cluster<?> cluster : clusters) {
      if (cluster.size() <= 1 || cluster.isNoise()) {
        switch (noiseHandling) {
          case IGNORE_NOISE:
            continue;
          case TREAT_NOISE_AS_SINGLETONS:
            continue; // No concordant distances.
          case MERGE_NOISE:
            break; // Treat like a cluster below.
        }
      }

      for (DBIDIter it1 = cluster.getIDs().iter(); it1.valid(); it1.advance()) {
        NumberVector obj = rel.get(it1);
        for (DBIDIter it2 = cluster.getIDs().iter(); it2.valid(); it2.advance()) {
          if (DBIDUtil.compare(it1, it2) <= 0) {
            continue;
          }
          concordant[i++] = distanceFunction.distance(obj, rel.get(it2));
        }
      }
    }
    assert (concordant.length == i);
    Arrays.sort(concordant);
    return concordant;
  }
コード例 #3
0
 /**
  * Compute centroids.
  *
  * @param rel Data relation
  * @param clusters Clusters
  * @param centroids Output array for centroids
  * @return Number of ignored noise elements.
  */
 public static int centroids(
     Relation<? extends NumberVector> rel,
     List<? extends Cluster<?>> clusters,
     NumberVector[] centroids,
     NoiseHandling noiseOption) {
   assert (centroids.length == clusters.size());
   int ignorednoise = 0;
   Iterator<? extends Cluster<?>> ci = clusters.iterator();
   for (int i = 0; ci.hasNext(); i++) {
     Cluster<?> cluster = ci.next();
     if (cluster.size() <= 1 || cluster.isNoise()) {
       switch (noiseOption) {
         case IGNORE_NOISE:
           ignorednoise += cluster.size();
         case TREAT_NOISE_AS_SINGLETONS:
           centroids[i] = null;
           continue;
         case MERGE_NOISE:
           break; // Treat as cluster below
       }
     }
     centroids[i] = ModelUtil.getPrototypeOrCentroid(cluster.getModel(), rel, cluster.getIDs());
   }
   return ignorednoise;
 }
コード例 #4
0
  /**
   * Evaluate a single clustering.
   *
   * @param db Database
   * @param rel Data relation
   * @param c Clustering
   * @return Gamma index
   */
  public double evaluateClustering(
      Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
    List<? extends Cluster<?>> clusters = c.getAllClusters();

    int ignorednoise = 0, withinPairs = 0;
    for (Cluster<?> cluster : clusters) {
      if ((cluster.size() <= 1 || cluster.isNoise())) {
        switch (noiseHandling) {
          case IGNORE_NOISE:
            ignorednoise += cluster.size();
            continue;
          case TREAT_NOISE_AS_SINGLETONS:
            continue; // No concordant distances.
          case MERGE_NOISE:
            break; // Treat like a cluster below.
        }
      }
      withinPairs += (cluster.size() * (cluster.size() - 1)) >>> 1;
      if (withinPairs < 0) {
        throw new AbortException(
            "Integer overflow - clusters too large to compute pairwise distances.");
      }
    }
    // Materialize within-cluster distances (sorted):
    double[] withinDistances = computeWithinDistances(rel, clusters, withinPairs);
    int[] withinTies = new int[withinDistances.length];
    // Count ties within
    countTies(withinDistances, withinTies);

    long concordantPairs = 0, discordantPairs = 0, betweenPairs = 0;

    // Step two, compute discordant distances:
    for (int i = 0; i < clusters.size(); i++) {
      Cluster<?> ocluster1 = clusters.get(i);
      if ((ocluster1.size() <= 1 || ocluster1.isNoise()) //
          && noiseHandling.equals(NoiseHandling.IGNORE_NOISE)) {
        continue;
      }
      for (int j = i + 1; j < clusters.size(); j++) {
        Cluster<?> ocluster2 = clusters.get(j);
        if ((ocluster2.size() <= 1 || ocluster2.isNoise()) //
            && noiseHandling.equals(NoiseHandling.IGNORE_NOISE)) {
          continue;
        }
        betweenPairs += ocluster1.size() * ocluster2.size();
        for (DBIDIter oit1 = ocluster1.getIDs().iter(); oit1.valid(); oit1.advance()) {
          NumberVector obj = rel.get(oit1);
          for (DBIDIter oit2 = ocluster2.getIDs().iter(); oit2.valid(); oit2.advance()) {
            double dist = distanceFunction.distance(obj, rel.get(oit2));
            int p = Arrays.binarySearch(withinDistances, dist);
            if (p >= 0) { // Tied distances:
              while (p > 0 && withinDistances[p - 1] >= dist) {
                --p;
              }
              concordantPairs += p;
              discordantPairs += withinDistances.length - p - withinTies[p];
              continue;
            }
            p = -p - 1;
            concordantPairs += p;
            discordantPairs += withinDistances.length - p;
          }
        }
      }
    }

    // Total number of pairs possible:
    final long t = ((rel.size() - ignorednoise) * (long) (rel.size() - ignorednoise - 1)) >>> 1;
    final long tt = (t * (t - 1)) >>> 1;

    final double gamma =
        (concordantPairs - discordantPairs) / (double) (concordantPairs + discordantPairs);
    final double tau =
        computeTau(concordantPairs, discordantPairs, tt, withinDistances.length, betweenPairs);

    if (LOG.isStatistics()) {
      LOG.statistics(new StringStatistic(key + ".pbm.noise-handling", noiseHandling.toString()));
      if (ignorednoise > 0) {
        LOG.statistics(new LongStatistic(key + ".pbm.ignored", ignorednoise));
      }
      LOG.statistics(new DoubleStatistic(key + ".gamma", gamma));
      LOG.statistics(new DoubleStatistic(key + ".tau", tau));
    }

    EvaluationResult ev =
        EvaluationResult.findOrCreate(
            db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup g = ev.findOrCreateGroup("Concordance-based Evaluation");
    g.addMeasure("Gamma", gamma, -1., 1., 0., false);
    g.addMeasure("Tau", tau, -1., +1., 0., false);
    db.getHierarchy().resultChanged(ev);
    return gamma;
  }
コード例 #5
0
  /**
   * Evaluate a single clustering.
   *
   * @param db Database
   * @param rel Data relation
   * @param c Clustering
   * @return Mean simplified silhouette
   */
  public double evaluateClustering(
      Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
    List<? extends Cluster<?>> clusters = c.getAllClusters();
    NumberVector[] centroids = new NumberVector[clusters.size()];
    int ignorednoise = centroids(rel, clusters, centroids, noiseOption);

    MeanVariance mssil = new MeanVariance();

    Iterator<? extends Cluster<?>> ci = clusters.iterator();
    for (int i = 0; ci.hasNext(); i++) {
      Cluster<?> cluster = ci.next();
      if (cluster.size() <= 1) {
        // As suggested in Rousseeuw, we use 0 for singletons.
        mssil.put(0., cluster.size());
        continue;
      }
      if (cluster.isNoise()) {
        switch (noiseOption) {
          case IGNORE_NOISE:
            continue; // Ignore elements
          case TREAT_NOISE_AS_SINGLETONS:
            // As suggested in Rousseeuw, we use 0 for singletons.
            mssil.put(0., cluster.size());
            continue;
          case MERGE_NOISE:
            break; // Treat as cluster below
        }
      }

      // Cluster center:
      final NumberVector center = centroids[i];
      assert (center != null);
      for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
        NumberVector obj = rel.get(it);
        // a: Distance to own centroid
        double a = distance.distance(center, obj);

        // b: Distance to other clusters centroids:
        double min = Double.POSITIVE_INFINITY;
        Iterator<? extends Cluster<?>> cj = clusters.iterator();
        for (int j = 0; cj.hasNext(); j++) {
          Cluster<?> ocluster = cj.next();
          if (i == j) {
            continue;
          }
          NumberVector other = centroids[j];
          if (other == null) { // Noise!
            switch (noiseOption) {
              case IGNORE_NOISE:
                continue;
              case TREAT_NOISE_AS_SINGLETONS:
                // Treat each object like a centroid!
                for (DBIDIter it2 = ocluster.getIDs().iter(); it2.valid(); it2.advance()) {
                  double dist = distance.distance(rel.get(it2), obj);
                  min = dist < min ? dist : min;
                }
                continue;
              case MERGE_NOISE:
                break; // Treat as cluster below, but should not be reachable.
            }
          }
          // Clusters: use centroid.
          double dist = distance.distance(other, obj);
          min = dist < min ? dist : min;
        }

        // One 'real' cluster only?
        min = min < Double.POSITIVE_INFINITY ? min : a;
        mssil.put((min - a) / (min > a ? min : a));
      }
    }

    double penalty = 1.;
    // Only if {@link NoiseHandling#IGNORE_NOISE}:
    if (penalize && ignorednoise > 0) {
      penalty = (rel.size() - ignorednoise) / (double) rel.size();
    }
    final double meanssil = penalty * mssil.getMean();
    final double stdssil = penalty * mssil.getSampleStddev();
    if (LOG.isStatistics()) {
      LOG.statistics(
          new StringStatistic(
              key + ".simplified-silhouette.noise-handling", noiseOption.toString()));
      if (ignorednoise > 0) {
        LOG.statistics(new LongStatistic(key + ".simplified-silhouette.ignored", ignorednoise));
      }
      LOG.statistics(new DoubleStatistic(key + ".simplified-silhouette.mean", meanssil));
      LOG.statistics(new DoubleStatistic(key + ".simplified-silhouette.stddev", stdssil));
    }

    EvaluationResult ev =
        EvaluationResult.findOrCreate(
            db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
    g.addMeasure(
        "Simp. Silhouette +-" + FormatUtil.NF2.format(stdssil), meanssil, -1., 1., 0., false);
    db.getHierarchy().resultChanged(ev);
    return meanssil;
  }
コード例 #6
0
ファイル: EvaluateCIndex.java プロジェクト: victordov/elki
  /**
   * Evaluate a single clustering.
   *
   * @param db Database
   * @param rel Data relation
   * @param c Clustering
   * @return C-Index
   */
  public double evaluateClustering(
      Database db, Relation<? extends O> rel, DistanceQuery<O> dq, Clustering<?> c) {
    List<? extends Cluster<?>> clusters = c.getAllClusters();

    // theta is the sum, w the number of within group distances
    double theta = 0;
    int w = 0;
    int ignorednoise = 0;
    int isize = clusters.size() <= 1 ? rel.size() : rel.size() / (clusters.size() - 1);
    DoubleArray pairDists = new DoubleArray(isize);

    for (int i = 0; i < clusters.size(); i++) {
      Cluster<?> cluster = clusters.get(i);
      if (cluster.size() <= 1 || cluster.isNoise()) {
        switch (noiseOption) {
          case IGNORE_NOISE:
            ignorednoise += cluster.size();
            continue; // Ignore
          case TREAT_NOISE_AS_SINGLETONS:
            continue; // No within-cluster distances!
          case MERGE_NOISE:
            break; // Treat like a cluster
        }
      }
      for (DBIDIter it1 = cluster.getIDs().iter(); it1.valid(); it1.advance()) {
        O obj = rel.get(it1);
        // Compare object to every cluster, but only once
        for (int j = i; j < clusters.size(); j++) {
          Cluster<?> ocluster = clusters.get(j);
          if (ocluster.size() <= 1 || ocluster.isNoise()) {
            switch (noiseOption) {
              case IGNORE_NOISE:
                continue; // Ignore this cluster.
              case TREAT_NOISE_AS_SINGLETONS:
              case MERGE_NOISE:
                break; // Treat like a cluster
            }
          }
          for (DBIDIter it2 = ocluster.getIDs().iter(); it2.valid(); it2.advance()) {
            if (DBIDUtil.compare(it1, it2) <= 0) { // Only once.
              continue;
            }
            double dist = dq.distance(obj, rel.get(it2));
            pairDists.add(dist);
            if (ocluster == cluster) { // Within-cluster distances.
              theta += dist;
              w++;
            }
          }
        }
      }
    }

    // Simulate best and worst cases:
    pairDists.sort();
    double min = 0, max = 0;
    for (int i = 0, j = pairDists.size() - 1; i < w; i++, j--) {
      min += pairDists.get(i);
      max += pairDists.get(j);
    }

    double cIndex = (max > min) ? (theta - min) / (max - min) : 0.;

    if (LOG.isStatistics()) {
      LOG.statistics(new StringStatistic(key + ".c-index.noise-handling", noiseOption.toString()));
      if (ignorednoise > 0) {
        LOG.statistics(new LongStatistic(key + ".c-index.ignored", ignorednoise));
      }
      LOG.statistics(new DoubleStatistic(key + ".c-index", cIndex));
    }

    EvaluationResult ev =
        EvaluationResult.findOrCreate(
            db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
    g.addMeasure("C-Index", cIndex, 0., 1., 0., true);
    db.getHierarchy().resultChanged(ev);
    return cIndex;
  }