Exemple #1
0
  /**
   * Generates {@code d+1}-dimensional subspace candidates from the specified {@code d}-dimensional
   * subspaces.
   *
   * @param subspaces the {@code d}-dimensional subspaces
   * @return the {@code d+1}-dimensional subspace candidates
   */
  private List<Subspace> generateSubspaceCandidates(List<Subspace> subspaces) {
    List<Subspace> candidates = new ArrayList<>();

    if (subspaces.isEmpty()) {
      return candidates;
    }

    // Generate (d+1)-dimensional candidate subspaces
    int d = subspaces.get(0).dimensionality();

    StringBuilder msgFine = new StringBuilder("\n");
    if (LOG.isDebuggingFiner()) {
      msgFine.append("subspaces ").append(subspaces).append('\n');
    }

    for (int i = 0; i < subspaces.size(); i++) {
      Subspace s1 = subspaces.get(i);
      for (int j = i + 1; j < subspaces.size(); j++) {
        Subspace s2 = subspaces.get(j);
        Subspace candidate = s1.join(s2);

        if (candidate != null) {
          if (LOG.isDebuggingFiner()) {
            msgFine.append("candidate: ").append(candidate.dimensonsToString()).append('\n');
          }
          // prune irrelevant candidate subspaces
          List<Subspace> lowerSubspaces = lowerSubspaces(candidate);
          if (LOG.isDebuggingFiner()) {
            msgFine.append("lowerSubspaces: ").append(lowerSubspaces).append('\n');
          }
          boolean irrelevantCandidate = false;
          for (Subspace s : lowerSubspaces) {
            if (!subspaces.contains(s)) {
              irrelevantCandidate = true;
              break;
            }
          }
          if (!irrelevantCandidate) {
            candidates.add(candidate);
          }
        }
      }
    }

    if (LOG.isDebuggingFiner()) {
      LOG.debugFiner(msgFine.toString());
    }
    if (LOG.isDebugging()) {
      StringBuilder msg = new StringBuilder();
      msg.append(d + 1).append("-dimensional candidate subspaces: ");
      for (Subspace candidate : candidates) {
        msg.append(candidate.dimensonsToString()).append(' ');
      }
      LOG.debug(msg.toString());
    }

    return candidates;
  }
Exemple #2
0
  /**
   * Performs the SUBCLU algorithm on the given database.
   *
   * @param relation Relation to process
   * @return Clustering result
   */
  public Clustering<SubspaceModel> run(Relation<V> relation) {
    final int dimensionality = RelationUtil.dimensionality(relation);

    StepProgress stepprog = LOG.isVerbose() ? new StepProgress(dimensionality) : null;

    // Generate all 1-dimensional clusters
    LOG.beginStep(stepprog, 1, "Generate all 1-dimensional clusters.");

    // mapping of dimensionality to set of subspaces
    HashMap<Integer, List<Subspace>> subspaceMap = new HashMap<>();

    // list of 1-dimensional subspaces containing clusters
    List<Subspace> s_1 = new ArrayList<>();
    subspaceMap.put(0, s_1);

    // mapping of subspaces to list of clusters
    TreeMap<Subspace, List<Cluster<Model>>> clusterMap =
        new TreeMap<>(new Subspace.DimensionComparator());

    for (int d = 0; d < dimensionality; d++) {
      Subspace currentSubspace = new Subspace(d);
      List<Cluster<Model>> clusters = runDBSCAN(relation, null, currentSubspace);

      if (LOG.isDebuggingFiner()) {
        StringBuilder msg = new StringBuilder();
        msg.append('\n')
            .append(clusters.size())
            .append(" clusters in subspace ")
            .append(currentSubspace.dimensonsToString())
            .append(": \n");
        for (Cluster<Model> cluster : clusters) {
          msg.append("      " + cluster.getIDs() + "\n");
        }
        LOG.debugFiner(msg.toString());
      }

      if (!clusters.isEmpty()) {
        s_1.add(currentSubspace);
        clusterMap.put(currentSubspace, clusters);
      }
    }

    // Generate (d+1)-dimensional clusters from d-dimensional clusters
    for (int d = 0; d < dimensionality - 1; d++) {
      if (stepprog != null) {
        stepprog.beginStep(
            d + 2,
            "Generate "
                + (d + 2)
                + "-dimensional clusters from "
                + (d + 1)
                + "-dimensional clusters.",
            LOG);
      }

      List<Subspace> subspaces = subspaceMap.get(d);
      if (subspaces == null || subspaces.isEmpty()) {
        if (stepprog != null) {
          for (int dim = d + 1; dim < dimensionality - 1; dim++) {
            stepprog.beginStep(
                dim + 2,
                "Generation of"
                    + (dim + 2)
                    + "-dimensional clusters not applicable, because no more "
                    + (d + 2)
                    + "-dimensional subspaces found.",
                LOG);
          }
        }
        break;
      }

      List<Subspace> candidates = generateSubspaceCandidates(subspaces);
      List<Subspace> s_d = new ArrayList<>();

      for (Subspace candidate : candidates) {
        Subspace bestSubspace = bestSubspace(subspaces, candidate, clusterMap);
        if (LOG.isDebuggingFine()) {
          LOG.debugFine(
              "best subspace of "
                  + candidate.dimensonsToString()
                  + ": "
                  + bestSubspace.dimensonsToString());
        }

        List<Cluster<Model>> bestSubspaceClusters = clusterMap.get(bestSubspace);
        List<Cluster<Model>> clusters = new ArrayList<>();
        for (Cluster<Model> cluster : bestSubspaceClusters) {
          List<Cluster<Model>> candidateClusters = runDBSCAN(relation, cluster.getIDs(), candidate);
          if (!candidateClusters.isEmpty()) {
            clusters.addAll(candidateClusters);
          }
        }

        if (LOG.isDebuggingFine()) {
          StringBuilder msg = new StringBuilder();
          msg.append(clusters.size() + " cluster(s) in subspace " + candidate + ": \n");
          for (Cluster<Model> c : clusters) {
            msg.append("      " + c.getIDs() + "\n");
          }
          LOG.debugFine(msg.toString());
        }

        if (!clusters.isEmpty()) {
          s_d.add(candidate);
          clusterMap.put(candidate, clusters);
        }
      }

      if (!s_d.isEmpty()) {
        subspaceMap.put(d + 1, s_d);
      }
    }

    // build result
    int numClusters = 1;
    result = new Clustering<>("SUBCLU clustering", "subclu-clustering");
    for (Subspace subspace : clusterMap.descendingKeySet()) {
      List<Cluster<Model>> clusters = clusterMap.get(subspace);
      for (Cluster<Model> cluster : clusters) {
        Cluster<SubspaceModel> newCluster = new Cluster<>(cluster.getIDs());
        newCluster.setModel(new SubspaceModel(subspace, Centroid.make(relation, cluster.getIDs())));
        newCluster.setName("cluster_" + numClusters++);
        result.addToplevelCluster(newCluster);
      }
    }

    LOG.setCompleted(stepprog);
    return result;
  }
Exemple #3
0
  /**
   * Performs a single run of DOC, finding a single cluster.
   *
   * @param database Database context
   * @param relation used to get actual values for DBIDs.
   * @param S The set of points we're working on.
   * @param d Dimensionality of the data set we're currently working on.
   * @param r Size of random samples.
   * @param m Number of inner iterations (per seed point).
   * @param n Number of outer iterations (seed points).
   * @param minClusterSize Minimum size a cluster must have to be accepted.
   * @return a cluster, if one is found, else <code>null</code>.
   */
  private Cluster<SubspaceModel> runDOC(
      Database database,
      Relation<V> relation,
      ArrayModifiableDBIDs S,
      final int d,
      int n,
      int m,
      int r,
      int minClusterSize) {
    // Best cluster for the current run.
    DBIDs C = null;
    // Relevant attributes for the best cluster.
    long[] D = null;
    // Quality of the best cluster.
    double quality = Double.NEGATIVE_INFINITY;

    // Bounds for our cluster.
    // ModifiableHyperBoundingBox bounds = new ModifiableHyperBoundingBox(new
    // double[d], new double[d]);

    // Weights for distance (= rectangle query)
    SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(BitsUtil.zero(d));
    DistanceQuery<V> dq = database.getDistanceQuery(relation, df);
    RangeQuery<V> rq = database.getRangeQuery(dq);

    // Inform the user about the progress in the current iteration.
    FiniteProgress iprogress =
        LOG.isVerbose()
            ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG)
            : null;

    Random random = rnd.getSingleThreadedRandom();
    DBIDArrayIter iter = S.iter();

    for (int i = 0; i < n; ++i) {
      // Pick a random seed point.
      iter.seek(random.nextInt(S.size()));

      for (int j = 0; j < m; ++j) {
        // Choose a set of random points.
        DBIDs randomSet = DBIDUtil.randomSample(S, r, random);

        // Initialize cluster info.
        long[] nD = BitsUtil.zero(d);

        // Test each dimension and build bounding box.
        for (int k = 0; k < d; ++k) {
          if (dimensionIsRelevant(k, relation, randomSet)) {
            BitsUtil.setI(nD, k);
          }
        }
        if (BitsUtil.cardinality(nD) > 0) {
          // Get all points in the box.
          df.setSelectedDimensions(nD);
          // TODO: add filtering capabilities into query API!
          DBIDs nC = DBIDUtil.intersection(S, rq.getRangeForDBID(iter, w));

          if (LOG.isDebuggingFiner()) {
            LOG.finer(
                "Testing a cluster candidate, |C| = "
                    + nC.size()
                    + ", |D| = "
                    + BitsUtil.cardinality(nD));
          }

          // Is the cluster large enough?
          if (nC.size() < minClusterSize) {
            // Too small.
            if (LOG.isDebuggingFiner()) {
              LOG.finer("... but it's too small.");
            }
          } else {
            // Better cluster than before?
            double nQuality = computeClusterQuality(nC.size(), BitsUtil.cardinality(nD));
            if (nQuality > quality) {
              if (LOG.isDebuggingFiner()) {
                LOG.finer("... and it's the best so far: " + nQuality + " vs. " + quality);
              }
              C = nC;
              D = nD;
              quality = nQuality;
            } else {
              if (LOG.isDebuggingFiner()) {
                LOG.finer("... but we already have a better one.");
              }
            }
          }
        }
        LOG.incrementProcessed(iprogress);
      }
    }
    LOG.ensureCompleted(iprogress);

    return (C != null) ? makeCluster(relation, C, D) : null;
  }