Beispiel #1
0
  /** Performs the DBSCAN algorithm on the given database. */
  public Clustering<Model> run(Relation<O> relation) {
    final int size = relation.size();
    if (size < minpts) {
      Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
      result.addToplevelCluster(
          new Cluster<Model>(relation.getDBIDs(), true, ClusterModel.CLUSTER));
      return result;
    }

    RangeQuery<O> rangeQuery = QueryUtil.getRangeQuery(relation, getDistanceFunction());
    resultList = new ArrayList<>();
    noise = DBIDUtil.newHashSet();
    runDBSCAN(relation, rangeQuery);

    double averagen = ncounter / (double) relation.size();
    LOG.statistics(new DoubleStatistic(DBSCAN.class.getName() + ".average-neighbors", averagen));
    if (averagen < 1 + 0.1 * (minpts - 1)) {
      LOG.warning("There are very few neighbors found. Epsilon may be too small.");
    }
    if (averagen > 100 * minpts) {
      LOG.warning("There are very many neighbors found. Epsilon may be too large.");
    }

    Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
    for (ModifiableDBIDs res : resultList) {
      result.addToplevelCluster(new Cluster<Model>(res, ClusterModel.CLUSTER));
    }
    result.addToplevelCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER));
    return result;
  }
 protected void autoEvaluateClusterings(ResultHierarchy hier, Result newResult) {
   Collection<Clustering<?>> clusterings =
       ResultUtil.filterResults(hier, newResult, Clustering.class);
   if (LOG.isDebugging()) {
     LOG.warning("Number of new clustering results: " + clusterings.size());
   }
   for (Iterator<Clustering<?>> c = clusterings.iterator(); c.hasNext(); ) {
     Clustering<?> test = c.next();
     if ("allinone-clustering".equals(test.getShortName())) {
       c.remove();
     } else if ("allinnoise-clustering".equals(test.getShortName())) {
       c.remove();
     } else if ("bylabel-clustering".equals(test.getShortName())) {
       c.remove();
     } else if ("bymodel-clustering".equals(test.getShortName())) {
       c.remove();
     }
   }
   if (clusterings.size() > 0) {
     try {
       new EvaluateClustering(new ByLabelClustering(), false, true)
           .processNewResult(hier, newResult);
     } catch (NoSupportedDataTypeException e) {
       // Pass - the data probably did not have labels.
     }
   }
 }
Beispiel #3
0
  /**
   * Runs the DBSCAN algorithm on the specified partition of the database in the given subspace. If
   * parameter {@code ids} is null DBSCAN will be applied to the whole database.
   *
   * @param relation the database holding the objects to run DBSCAN on
   * @param ids the IDs of the database defining the partition to run DBSCAN on - if this parameter
   *     is null DBSCAN will be applied to the whole database
   * @param subspace the subspace to run DBSCAN on
   * @return the clustering result of the DBSCAN run
   */
  private List<Cluster<Model>> runDBSCAN(Relation<V> relation, DBIDs ids, Subspace subspace) {
    // distance function
    distanceFunction.setSelectedDimensions(subspace.getDimensions());

    ProxyDatabase proxy;
    if (ids == null) {
      // TODO: in this case, we might want to use an index - the proxy below
      // will prevent this!
      ids = relation.getDBIDs();
    }

    proxy = new ProxyDatabase(ids, relation);

    DBSCAN<V> dbscan = new DBSCAN<>(distanceFunction, epsilon, minpts);
    // run DBSCAN
    if (LOG.isVerbose()) {
      LOG.verbose("\nRun DBSCAN on subspace " + subspace.dimensonsToString());
    }
    Clustering<Model> dbsres = dbscan.run(proxy);

    // separate cluster and noise
    List<Cluster<Model>> clusterAndNoise = dbsres.getAllClusters();
    List<Cluster<Model>> clusters = new ArrayList<>();
    for (Cluster<Model> c : clusterAndNoise) {
      if (!c.isNoise()) {
        clusters.add(c);
      }
    }
    return clusters;
  }
Beispiel #4
0
  @Override
  public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
    if (relation.size() <= 0) {
      return new Clustering<>("k-Means Clustering", "kmeans-clustering");
    }
    // Choose initial means
    if (LOG.isStatistics()) {
      LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
    // Setup cluster assignment store
    List<ModifiableDBIDs> clusters = new ArrayList<>();
    for (int i = 0; i < k; i++) {
      clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
    }
    WritableIntegerDataStore assignment =
        DataStoreUtil.makeIntegerStorage(
            relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
    double[] varsum = new double[k];

    IndefiniteProgress prog =
        LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
    DoubleStatistic varstat =
        LOG.isStatistics()
            ? new DoubleStatistic(this.getClass().getName() + ".variance-sum")
            : null;
    int iteration = 0;
    for (; maxiter <= 0 || iteration < maxiter; iteration++) {
      LOG.incrementProcessed(prog);
      boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum);
      logVarstat(varstat, varsum);
      // Stop if no cluster assignment changed.
      if (!changed) {
        break;
      }
      // Recompute means.
      means = means(clusters, means, relation);
    }
    LOG.setCompleted(prog);
    if (LOG.isStatistics()) {
      LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
    }

    // Wrap result
    Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
    for (int i = 0; i < clusters.size(); i++) {
      DBIDs ids = clusters.get(i);
      if (ids.size() == 0) {
        continue;
      }
      KMeansModel model = new KMeansModel(means[i], varsum[i]);
      result.addToplevelCluster(new Cluster<>(ids, model));
    }
    return result;
  }
Beispiel #5
0
 @Override
 public void processNewResult(ResultHierarchy hier, Result newResult) {
   // We may just have added this result.
   if (newResult instanceof Clustering && isReferenceResult((Clustering<?>) newResult)) {
     return;
   }
   Database db = ResultUtil.findDatabase(hier);
   List<Clustering<?>> crs = ResultUtil.getClusteringResults(newResult);
   if (crs == null || crs.size() < 1) {
     return;
   }
   // Compute the reference clustering
   Clustering<?> refc = null;
   // Try to find an existing reference clustering (globally)
   {
     Collection<Clustering<?>> cs = ResultUtil.filterResults(hier, db, Clustering.class);
     for (Clustering<?> test : cs) {
       if (isReferenceResult(test)) {
         refc = test;
         break;
       }
     }
   }
   // Try to find an existing reference clustering (locally)
   if (refc == null) {
     Collection<Clustering<?>> cs = ResultUtil.filterResults(hier, newResult, Clustering.class);
     for (Clustering<?> test : cs) {
       if (isReferenceResult(test)) {
         refc = test;
         break;
       }
     }
   }
   if (refc == null) {
     LOG.debug("Generating a new reference clustering.");
     Result refres = referencealg.run(db);
     List<Clustering<?>> refcrs = ResultUtil.getClusteringResults(refres);
     if (refcrs.size() == 0) {
       LOG.warning("Reference algorithm did not return a clustering result!");
       return;
     }
     if (refcrs.size() > 1) {
       LOG.warning("Reference algorithm returned more than one result!");
     }
     refc = refcrs.get(0);
   } else {
     LOG.debug("Using existing clustering: " + refc.getLongName() + " " + refc.getShortName());
   }
   for (Clustering<?> c : crs) {
     if (c == refc) {
       continue;
     }
     evaluteResult(db, c, refc);
   }
 }
Beispiel #6
0
  @Override
  public Clustering<BiclusterWithInversionsModel> biclustering() {
    double[][] mat = RelationUtil.relationAsMatrix(relation, rowIDs);

    BiclusterCandidate cand = new BiclusterCandidate(getRowDim(), getColDim());

    Clustering<BiclusterWithInversionsModel> result =
        new Clustering<>("Cheng-and-Church", "Cheng and Church Biclustering");
    ModifiableDBIDs noise = DBIDUtil.newHashSet(relation.getDBIDs());

    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Extracting Cluster", n, LOG) : null;
    for (int i = 0; i < n; i++) {
      cand.reset();
      multipleNodeDeletion(mat, cand);
      if (LOG.isVeryVerbose()) {
        LOG.veryverbose(
            "Residue after Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
      singleNodeDeletion(mat, cand);
      if (LOG.isVeryVerbose()) {
        LOG.veryverbose(
            "Residue after Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
      nodeAddition(mat, cand);
      if (LOG.isVeryVerbose()) {
        LOG.veryverbose(
            "Residue after Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
      cand.maskMatrix(mat, dist);
      BiclusterWithInversionsModel model =
          new BiclusterWithInversionsModel(colsBitsetToIDs(cand.cols), rowsBitsetToIDs(cand.irow));
      final ArrayDBIDs cids = rowsBitsetToIDs(cand.rows);
      noise.removeDBIDs(cids);
      result.addToplevelCluster(new Cluster<>(cids, model));

      if (LOG.isVerbose()) {
        LOG.verbose("Score of bicluster " + (i + 1) + ": " + cand.residue + "\n");
        LOG.verbose("Number of rows: " + cand.rowcard + "\n");
        LOG.verbose("Number of columns: " + cand.colcard + "\n");
        // LOG.verbose("Total number of masked values: " + maskedVals.size() +
        // "\n");
      }
      LOG.incrementProcessed(prog);
    }
    // Add a noise cluster, full-dimensional.
    if (!noise.isEmpty()) {
      long[] allcols = BitsUtil.ones(getColDim());
      BiclusterWithInversionsModel model =
          new BiclusterWithInversionsModel(colsBitsetToIDs(allcols), DBIDUtil.EMPTYDBIDS);
      result.addToplevelCluster(new Cluster<>(noise, true, model));
    }
    LOG.ensureCompleted(prog);
    return result;
  }
Beispiel #7
0
 /**
  * Test if a clustering result is a valid reference result.
  *
  * @param t Clustering to test.
  * @return {@code true} if it is considered to be a reference result.
  */
 private boolean isReferenceResult(Clustering<?> t) {
   // FIXME: don't hard-code strings
   if ("bylabel-clustering".equals(t.getShortName())) {
     return true;
   }
   if ("bymodel-clustering".equals(t.getShortName())) {
     return true;
   }
   if ("allinone-clustering".equals(t.getShortName())) {
     return true;
   }
   if ("allinnoise-clustering".equals(t.getShortName())) {
     return true;
   }
   return false;
 }
Beispiel #8
0
  /**
   * Evaluate a clustering result.
   *
   * @param db Database
   * @param c Clustering
   * @param refc Reference clustering
   */
  protected void evaluteResult(Database db, Clustering<?> c, Clustering<?> refc) {
    ClusterContingencyTable contmat =
        new ClusterContingencyTable(selfPairing, noiseSpecialHandling);
    contmat.process(refc, c);

    ScoreResult sr = new ScoreResult(contmat);
    sr.addHeader(c.getLongName());
    db.getHierarchy().add(c, sr);
  }
Beispiel #9
0
  /**
   * Performs the SUBCLU algorithm on the given database.
   *
   * @param relation Relation to process
   * @return Clustering result
   */
  public Clustering<SubspaceModel> run(Relation<V> relation) {
    final int dimensionality = RelationUtil.dimensionality(relation);

    StepProgress stepprog = LOG.isVerbose() ? new StepProgress(dimensionality) : null;

    // Generate all 1-dimensional clusters
    LOG.beginStep(stepprog, 1, "Generate all 1-dimensional clusters.");

    // mapping of dimensionality to set of subspaces
    HashMap<Integer, List<Subspace>> subspaceMap = new HashMap<>();

    // list of 1-dimensional subspaces containing clusters
    List<Subspace> s_1 = new ArrayList<>();
    subspaceMap.put(0, s_1);

    // mapping of subspaces to list of clusters
    TreeMap<Subspace, List<Cluster<Model>>> clusterMap =
        new TreeMap<>(new Subspace.DimensionComparator());

    for (int d = 0; d < dimensionality; d++) {
      Subspace currentSubspace = new Subspace(d);
      List<Cluster<Model>> clusters = runDBSCAN(relation, null, currentSubspace);

      if (LOG.isDebuggingFiner()) {
        StringBuilder msg = new StringBuilder();
        msg.append('\n')
            .append(clusters.size())
            .append(" clusters in subspace ")
            .append(currentSubspace.dimensonsToString())
            .append(": \n");
        for (Cluster<Model> cluster : clusters) {
          msg.append("      " + cluster.getIDs() + "\n");
        }
        LOG.debugFiner(msg.toString());
      }

      if (!clusters.isEmpty()) {
        s_1.add(currentSubspace);
        clusterMap.put(currentSubspace, clusters);
      }
    }

    // Generate (d+1)-dimensional clusters from d-dimensional clusters
    for (int d = 0; d < dimensionality - 1; d++) {
      if (stepprog != null) {
        stepprog.beginStep(
            d + 2,
            "Generate "
                + (d + 2)
                + "-dimensional clusters from "
                + (d + 1)
                + "-dimensional clusters.",
            LOG);
      }

      List<Subspace> subspaces = subspaceMap.get(d);
      if (subspaces == null || subspaces.isEmpty()) {
        if (stepprog != null) {
          for (int dim = d + 1; dim < dimensionality - 1; dim++) {
            stepprog.beginStep(
                dim + 2,
                "Generation of"
                    + (dim + 2)
                    + "-dimensional clusters not applicable, because no more "
                    + (d + 2)
                    + "-dimensional subspaces found.",
                LOG);
          }
        }
        break;
      }

      List<Subspace> candidates = generateSubspaceCandidates(subspaces);
      List<Subspace> s_d = new ArrayList<>();

      for (Subspace candidate : candidates) {
        Subspace bestSubspace = bestSubspace(subspaces, candidate, clusterMap);
        if (LOG.isDebuggingFine()) {
          LOG.debugFine(
              "best subspace of "
                  + candidate.dimensonsToString()
                  + ": "
                  + bestSubspace.dimensonsToString());
        }

        List<Cluster<Model>> bestSubspaceClusters = clusterMap.get(bestSubspace);
        List<Cluster<Model>> clusters = new ArrayList<>();
        for (Cluster<Model> cluster : bestSubspaceClusters) {
          List<Cluster<Model>> candidateClusters = runDBSCAN(relation, cluster.getIDs(), candidate);
          if (!candidateClusters.isEmpty()) {
            clusters.addAll(candidateClusters);
          }
        }

        if (LOG.isDebuggingFine()) {
          StringBuilder msg = new StringBuilder();
          msg.append(clusters.size() + " cluster(s) in subspace " + candidate + ": \n");
          for (Cluster<Model> c : clusters) {
            msg.append("      " + c.getIDs() + "\n");
          }
          LOG.debugFine(msg.toString());
        }

        if (!clusters.isEmpty()) {
          s_d.add(candidate);
          clusterMap.put(candidate, clusters);
        }
      }

      if (!s_d.isEmpty()) {
        subspaceMap.put(d + 1, s_d);
      }
    }

    // build result
    int numClusters = 1;
    result = new Clustering<>("SUBCLU clustering", "subclu-clustering");
    for (Subspace subspace : clusterMap.descendingKeySet()) {
      List<Cluster<Model>> clusters = clusterMap.get(subspace);
      for (Cluster<Model> cluster : clusters) {
        Cluster<SubspaceModel> newCluster = new Cluster<>(cluster.getIDs());
        newCluster.setModel(new SubspaceModel(subspace, Centroid.make(relation, cluster.getIDs())));
        newCluster.setName("cluster_" + numClusters++);
        result.addToplevelCluster(newCluster);
      }
    }

    LOG.setCompleted(stepprog);
    return result;
  }
  /**
   * Evaluate a single clustering.
   *
   * @param db Database
   * @param rel Data relation
   * @param c Clustering
   * @return Gamma index
   */
  public double evaluateClustering(
      Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
    List<? extends Cluster<?>> clusters = c.getAllClusters();

    int ignorednoise = 0, withinPairs = 0;
    for (Cluster<?> cluster : clusters) {
      if ((cluster.size() <= 1 || cluster.isNoise())) {
        switch (noiseHandling) {
          case IGNORE_NOISE:
            ignorednoise += cluster.size();
            continue;
          case TREAT_NOISE_AS_SINGLETONS:
            continue; // No concordant distances.
          case MERGE_NOISE:
            break; // Treat like a cluster below.
        }
      }
      withinPairs += (cluster.size() * (cluster.size() - 1)) >>> 1;
      if (withinPairs < 0) {
        throw new AbortException(
            "Integer overflow - clusters too large to compute pairwise distances.");
      }
    }
    // Materialize within-cluster distances (sorted):
    double[] withinDistances = computeWithinDistances(rel, clusters, withinPairs);
    int[] withinTies = new int[withinDistances.length];
    // Count ties within
    countTies(withinDistances, withinTies);

    long concordantPairs = 0, discordantPairs = 0, betweenPairs = 0;

    // Step two, compute discordant distances:
    for (int i = 0; i < clusters.size(); i++) {
      Cluster<?> ocluster1 = clusters.get(i);
      if ((ocluster1.size() <= 1 || ocluster1.isNoise()) //
          && noiseHandling.equals(NoiseHandling.IGNORE_NOISE)) {
        continue;
      }
      for (int j = i + 1; j < clusters.size(); j++) {
        Cluster<?> ocluster2 = clusters.get(j);
        if ((ocluster2.size() <= 1 || ocluster2.isNoise()) //
            && noiseHandling.equals(NoiseHandling.IGNORE_NOISE)) {
          continue;
        }
        betweenPairs += ocluster1.size() * ocluster2.size();
        for (DBIDIter oit1 = ocluster1.getIDs().iter(); oit1.valid(); oit1.advance()) {
          NumberVector obj = rel.get(oit1);
          for (DBIDIter oit2 = ocluster2.getIDs().iter(); oit2.valid(); oit2.advance()) {
            double dist = distanceFunction.distance(obj, rel.get(oit2));
            int p = Arrays.binarySearch(withinDistances, dist);
            if (p >= 0) { // Tied distances:
              while (p > 0 && withinDistances[p - 1] >= dist) {
                --p;
              }
              concordantPairs += p;
              discordantPairs += withinDistances.length - p - withinTies[p];
              continue;
            }
            p = -p - 1;
            concordantPairs += p;
            discordantPairs += withinDistances.length - p;
          }
        }
      }
    }

    // Total number of pairs possible:
    final long t = ((rel.size() - ignorednoise) * (long) (rel.size() - ignorednoise - 1)) >>> 1;
    final long tt = (t * (t - 1)) >>> 1;

    final double gamma =
        (concordantPairs - discordantPairs) / (double) (concordantPairs + discordantPairs);
    final double tau =
        computeTau(concordantPairs, discordantPairs, tt, withinDistances.length, betweenPairs);

    if (LOG.isStatistics()) {
      LOG.statistics(new StringStatistic(key + ".pbm.noise-handling", noiseHandling.toString()));
      if (ignorednoise > 0) {
        LOG.statistics(new LongStatistic(key + ".pbm.ignored", ignorednoise));
      }
      LOG.statistics(new DoubleStatistic(key + ".gamma", gamma));
      LOG.statistics(new DoubleStatistic(key + ".tau", tau));
    }

    EvaluationResult ev =
        EvaluationResult.findOrCreate(
            db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup g = ev.findOrCreateGroup("Concordance-based Evaluation");
    g.addMeasure("Gamma", gamma, -1., 1., 0., false);
    g.addMeasure("Tau", tau, -1., +1., 0., false);
    db.getHierarchy().resultChanged(ev);
    return gamma;
  }
Beispiel #11
0
  /**
   * Performs the DOC or FastDOC (as configured) algorithm on the given Database.
   *
   * <p>This will run exhaustively, i.e. run DOC until no clusters are found anymore / the database
   * size has shrunk below the threshold for minimum cluster size.
   *
   * @param database Database
   * @param relation Data relation
   */
  public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
    // Dimensionality of our set.
    final int d = RelationUtil.dimensionality(relation);

    // Get available DBIDs as a set we can remove items from.
    ArrayModifiableDBIDs S = DBIDUtil.newArray(relation.getDBIDs());

    // Precompute values as described in Figure 2.
    double r = Math.abs(Math.log(d + d) / Math.log(beta * .5));
    // Outer loop count.
    int n = (int) (2. / alpha);
    // Inner loop count.
    int m = (int) (Math.pow(2. / alpha, r) * Math.log(4));
    if (heuristics) {
      m = Math.min(m, Math.min(1000000, d * d));
    }

    // Minimum size for a cluster for it to be accepted.
    int minClusterSize = (int) (alpha * S.size());

    // List of all clusters we found.
    Clustering<SubspaceModel> result = new Clustering<>("DOC Clusters", "DOC");

    // Inform the user about the number of actual clusters found so far.
    IndefiniteProgress cprogress =
        LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null;

    // To not only find a single cluster, we continue running until our set
    // of points is empty.
    while (S.size() > minClusterSize) {
      Cluster<SubspaceModel> C;
      if (heuristics) {
        C = runFastDOC(database, relation, S, d, n, m, (int) r);
      } else {
        C = runDOC(database, relation, S, d, n, m, (int) r, minClusterSize);
      }

      if (C == null) {
        // Stop trying if we couldn't find a cluster.
        break;
      }
      // Found a cluster, remember it, remove its points from the set.
      result.addToplevelCluster(C);

      // Remove all points of the cluster from the set and continue.
      S.removeDBIDs(C.getIDs());

      if (cprogress != null) {
        cprogress.setProcessed(result.getAllClusters().size(), LOG);
      }
    }

    // Add the remainder as noise.
    if (S.size() > 0) {
      long[] alldims = BitsUtil.ones(d);
      result.addToplevelCluster(
          new Cluster<>(
              S,
              true,
              new SubspaceModel(new Subspace(alldims), Centroid.make(relation, S).getArrayRef())));
    }
    LOG.setCompleted(cprogress);
    return result;
  }
  /**
   * Evaluate a single clustering.
   *
   * @param db Database
   * @param rel Data relation
   * @param c Clustering
   * @return Mean simplified silhouette
   */
  public double evaluateClustering(
      Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
    List<? extends Cluster<?>> clusters = c.getAllClusters();
    NumberVector[] centroids = new NumberVector[clusters.size()];
    int ignorednoise = centroids(rel, clusters, centroids, noiseOption);

    MeanVariance mssil = new MeanVariance();

    Iterator<? extends Cluster<?>> ci = clusters.iterator();
    for (int i = 0; ci.hasNext(); i++) {
      Cluster<?> cluster = ci.next();
      if (cluster.size() <= 1) {
        // As suggested in Rousseeuw, we use 0 for singletons.
        mssil.put(0., cluster.size());
        continue;
      }
      if (cluster.isNoise()) {
        switch (noiseOption) {
          case IGNORE_NOISE:
            continue; // Ignore elements
          case TREAT_NOISE_AS_SINGLETONS:
            // As suggested in Rousseeuw, we use 0 for singletons.
            mssil.put(0., cluster.size());
            continue;
          case MERGE_NOISE:
            break; // Treat as cluster below
        }
      }

      // Cluster center:
      final NumberVector center = centroids[i];
      assert (center != null);
      for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
        NumberVector obj = rel.get(it);
        // a: Distance to own centroid
        double a = distance.distance(center, obj);

        // b: Distance to other clusters centroids:
        double min = Double.POSITIVE_INFINITY;
        Iterator<? extends Cluster<?>> cj = clusters.iterator();
        for (int j = 0; cj.hasNext(); j++) {
          Cluster<?> ocluster = cj.next();
          if (i == j) {
            continue;
          }
          NumberVector other = centroids[j];
          if (other == null) { // Noise!
            switch (noiseOption) {
              case IGNORE_NOISE:
                continue;
              case TREAT_NOISE_AS_SINGLETONS:
                // Treat each object like a centroid!
                for (DBIDIter it2 = ocluster.getIDs().iter(); it2.valid(); it2.advance()) {
                  double dist = distance.distance(rel.get(it2), obj);
                  min = dist < min ? dist : min;
                }
                continue;
              case MERGE_NOISE:
                break; // Treat as cluster below, but should not be reachable.
            }
          }
          // Clusters: use centroid.
          double dist = distance.distance(other, obj);
          min = dist < min ? dist : min;
        }

        // One 'real' cluster only?
        min = min < Double.POSITIVE_INFINITY ? min : a;
        mssil.put((min - a) / (min > a ? min : a));
      }
    }

    double penalty = 1.;
    // Only if {@link NoiseHandling#IGNORE_NOISE}:
    if (penalize && ignorednoise > 0) {
      penalty = (rel.size() - ignorednoise) / (double) rel.size();
    }
    final double meanssil = penalty * mssil.getMean();
    final double stdssil = penalty * mssil.getSampleStddev();
    if (LOG.isStatistics()) {
      LOG.statistics(
          new StringStatistic(
              key + ".simplified-silhouette.noise-handling", noiseOption.toString()));
      if (ignorednoise > 0) {
        LOG.statistics(new LongStatistic(key + ".simplified-silhouette.ignored", ignorednoise));
      }
      LOG.statistics(new DoubleStatistic(key + ".simplified-silhouette.mean", meanssil));
      LOG.statistics(new DoubleStatistic(key + ".simplified-silhouette.stddev", stdssil));
    }

    EvaluationResult ev =
        EvaluationResult.findOrCreate(
            db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
    g.addMeasure(
        "Simp. Silhouette +-" + FormatUtil.NF2.format(stdssil), meanssil, -1., 1., 0., false);
    db.getHierarchy().resultChanged(ev);
    return meanssil;
  }
Beispiel #13
0
 protected void autoEvaluateOutliers(ResultHierarchy hier, Result newResult) {
   Collection<OutlierResult> outliers =
       ResultUtil.filterResults(hier, newResult, OutlierResult.class);
   if (LOG.isDebugging()) {
     LOG.debug("Number of new outlier results: " + outliers.size());
   }
   if (outliers.size() > 0) {
     Database db = ResultUtil.findDatabase(hier);
     ResultUtil.ensureClusteringResult(db, db);
     Collection<Clustering<?>> clusterings = ResultUtil.filterResults(hier, db, Clustering.class);
     if (clusterings.size() == 0) {
       LOG.warning(
           "Could not find a clustering result, even after running 'ensureClusteringResult'?!?");
       return;
     }
     Clustering<?> basec = clusterings.iterator().next();
     // Find minority class label
     int min = Integer.MAX_VALUE;
     int total = 0;
     String label = null;
     if (basec.getAllClusters().size() > 1) {
       for (Cluster<?> c : basec.getAllClusters()) {
         final int csize = c.getIDs().size();
         total += csize;
         if (csize < min) {
           min = csize;
           label = c.getName();
         }
       }
     }
     if (label == null) {
       LOG.warning("Could not evaluate outlier results, as I could not find a minority label.");
       return;
     }
     if (min == 1) {
       LOG.warning(
           "The minority class label had a single object. Try using 'ClassLabelFilter' to identify the class label column.");
     }
     if (min > 0.05 * total) {
       LOG.warning(
           "The minority class I discovered (labeled '"
               + label
               + "') has "
               + (min * 100. / total)
               + "% of objects. Outlier classes should be more rare!");
     }
     LOG.verbose("Evaluating using minority class: " + label);
     Pattern pat = Pattern.compile("^" + Pattern.quote(label) + "$");
     // Evaluate rankings.
     new OutlierRankingEvaluation(pat).processNewResult(hier, newResult);
     // Compute ROC curve
     new OutlierROCCurve(pat).processNewResult(hier, newResult);
     // Compute Precision at k
     new OutlierPrecisionAtKCurve(pat, min << 1).processNewResult(hier, newResult);
     // Compute ROC curve
     new OutlierPrecisionRecallCurve(pat).processNewResult(hier, newResult);
     // Compute outlier histogram
     new ComputeOutlierHistogram(pat, 50, new LinearScaling(), false)
         .processNewResult(hier, newResult);
   }
 }
Beispiel #14
0
  /**
   * Evaluate a single clustering.
   *
   * @param db Database
   * @param rel Data relation
   * @param c Clustering
   * @return C-Index
   */
  public double evaluateClustering(
      Database db, Relation<? extends O> rel, DistanceQuery<O> dq, Clustering<?> c) {
    List<? extends Cluster<?>> clusters = c.getAllClusters();

    // theta is the sum, w the number of within group distances
    double theta = 0;
    int w = 0;
    int ignorednoise = 0;
    int isize = clusters.size() <= 1 ? rel.size() : rel.size() / (clusters.size() - 1);
    DoubleArray pairDists = new DoubleArray(isize);

    for (int i = 0; i < clusters.size(); i++) {
      Cluster<?> cluster = clusters.get(i);
      if (cluster.size() <= 1 || cluster.isNoise()) {
        switch (noiseOption) {
          case IGNORE_NOISE:
            ignorednoise += cluster.size();
            continue; // Ignore
          case TREAT_NOISE_AS_SINGLETONS:
            continue; // No within-cluster distances!
          case MERGE_NOISE:
            break; // Treat like a cluster
        }
      }
      for (DBIDIter it1 = cluster.getIDs().iter(); it1.valid(); it1.advance()) {
        O obj = rel.get(it1);
        // Compare object to every cluster, but only once
        for (int j = i; j < clusters.size(); j++) {
          Cluster<?> ocluster = clusters.get(j);
          if (ocluster.size() <= 1 || ocluster.isNoise()) {
            switch (noiseOption) {
              case IGNORE_NOISE:
                continue; // Ignore this cluster.
              case TREAT_NOISE_AS_SINGLETONS:
              case MERGE_NOISE:
                break; // Treat like a cluster
            }
          }
          for (DBIDIter it2 = ocluster.getIDs().iter(); it2.valid(); it2.advance()) {
            if (DBIDUtil.compare(it1, it2) <= 0) { // Only once.
              continue;
            }
            double dist = dq.distance(obj, rel.get(it2));
            pairDists.add(dist);
            if (ocluster == cluster) { // Within-cluster distances.
              theta += dist;
              w++;
            }
          }
        }
      }
    }

    // Simulate best and worst cases:
    pairDists.sort();
    double min = 0, max = 0;
    for (int i = 0, j = pairDists.size() - 1; i < w; i++, j--) {
      min += pairDists.get(i);
      max += pairDists.get(j);
    }

    double cIndex = (max > min) ? (theta - min) / (max - min) : 0.;

    if (LOG.isStatistics()) {
      LOG.statistics(new StringStatistic(key + ".c-index.noise-handling", noiseOption.toString()));
      if (ignorednoise > 0) {
        LOG.statistics(new LongStatistic(key + ".c-index.ignored", ignorednoise));
      }
      LOG.statistics(new DoubleStatistic(key + ".c-index", cIndex));
    }

    EvaluationResult ev =
        EvaluationResult.findOrCreate(
            db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
    g.addMeasure("C-Index", cIndex, 0., 1., 0., true);
    db.getHierarchy().resultChanged(ev);
    return cIndex;
  }