Example #1
  /** Performs the DBSCAN algorithm on the given database. */
  public Clustering<Model> run(Relation<O> relation) {
    final int size = relation.size();
    if (size < minpts) {
      Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
      result.addToplevelCluster(
          new Cluster<Model>(relation.getDBIDs(), true, ClusterModel.CLUSTER));
      return result;
    }

    RangeQuery<O> rangeQuery = QueryUtil.getRangeQuery(relation, getDistanceFunction());
    resultList = new ArrayList<>();
    noise = DBIDUtil.newHashSet();
    runDBSCAN(relation, rangeQuery);

    double averagen = ncounter / (double) relation.size();
    LOG.statistics(new DoubleStatistic(DBSCAN.class.getName() + ".average-neighbors", averagen));
    if (averagen < 1 + 0.1 * (minpts - 1)) {
      LOG.warning("There are very few neighbors found. Epsilon may be too small.");
    }
    if (averagen > 100 * minpts) {
      LOG.warning("There are very many neighbors found. Epsilon may be too large.");
    }

    Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
    for (ModifiableDBIDs res : resultList) {
      result.addToplevelCluster(new Cluster<Model>(res, ClusterModel.CLUSTER));
    }
    result.addToplevelCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER));
    return result;
  }
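The two warnings above implement a simple parameter sanity heuristic: compare the average neighborhood size against minpts. Below is a minimal stand-alone sketch of the same check, assuming only a plain array of per-point neighbor counts (a hypothetical input, not the ELKI API):

// Sketch of the epsilon sanity heuristic used in run() above,
// on a hypothetical array of per-point neighbor counts.
public class EpsilonSanityCheck {
  static void check(int[] neighborCounts, int minpts) {
    long total = 0;
    for (int c : neighborCounts) {
      total += c;
    }
    double average = total / (double) neighborCounts.length;
    if (average < 1 + 0.1 * (minpts - 1)) {
      System.err.println("Very few neighbors found. Epsilon may be too small.");
    } else if (average > 100. * minpts) {
      System.err.println("Very many neighbors found. Epsilon may be too large.");
    }
  }

  public static void main(String[] args) {
    check(new int[] {1, 1, 2, 1, 1}, 5); // average 1.2 < 1.4: triggers the "too small" warning
  }
}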
Example #2
  /**
   * Main loop for OUTRES
   *
   * @param relation Relation to process
   * @return Outlier detection result
   */
  public OutlierResult run(Relation<V> relation) {
    WritableDoubleDataStore ranks =
        DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
    DoubleMinMax minmax = new DoubleMinMax();

    KernelDensityEstimator kernel = new KernelDensityEstimator(relation);
    long[] subspace = BitsUtil.zero(kernel.dim);

    FiniteProgress progress =
        LOG.isVerbose() ? new FiniteProgress("OUTRES scores", relation.size(), LOG) : null;

    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      BitsUtil.zeroI(subspace);
      double score = outresScore(0, subspace, iditer, kernel);
      ranks.putDouble(iditer, score);
      minmax.put(score);
      LOG.incrementProcessed(progress);
    }
    LOG.ensureCompleted(progress);

    OutlierScoreMeta meta =
        new InvertedOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0., 1., 1.);
    OutlierResult outresResult =
        new OutlierResult(
            meta,
            new MaterializedDoubleRelation("OUTRES", "outres-score", ranks, relation.getDBIDs()));
    return outresResult;
  }
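OUTRES scores are "inverted": small values indicate outliers, which is why the result is wrapped in an InvertedOutlierScoreMeta. A sketch of the min-max flip such a meta implies, for illustration only (ELKI's actual normalization is defined by the meta class, not shown here):

// Sketch: min-max inversion so that larger normalized values mean "more
// outlying", given raw scores where small values are suspicious.
public class ScoreInversion {
  static double[] invert(double[] raw) {
    double min = Double.POSITIVE_INFINITY, max = Double.NEGATIVE_INFINITY;
    for (double s : raw) {
      min = Math.min(min, s);
      max = Math.max(max, s);
    }
    double range = max - min;
    double[] out = new double[raw.length];
    for (int i = 0; i < raw.length; i++) {
      out[i] = range > 0 ? (max - raw[i]) / range : 0.;
    }
    return out;
  }

  public static void main(String[] args) {
    double[] inv = invert(new double[] {0.2, 0.9, 0.5});
    System.out.println(java.util.Arrays.toString(inv)); // [1.0, 0.0, ~0.571]
  }
}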
Example #3
  /**
   * Runs the DBSCAN algorithm on the specified partition of the database in the given subspace. If
   * parameter {@code ids} is null, DBSCAN will be applied to the whole database.
   *
   * @param relation the database holding the objects to run DBSCAN on
   * @param ids the IDs of the database defining the partition to run DBSCAN on - if this parameter
   *     is null DBSCAN will be applied to the whole database
   * @param subspace the subspace to run DBSCAN on
   * @return the clustering result of the DBSCAN run
   */
  private List<Cluster<Model>> runDBSCAN(Relation<V> relation, DBIDs ids, Subspace subspace) {
    // distance function
    distanceFunction.setSelectedDimensions(subspace.getDimensions());

    ProxyDatabase proxy;
    if (ids == null) {
      // TODO: in this case, we might want to use an index - the proxy below
      // will prevent this!
      ids = relation.getDBIDs();
    }

    proxy = new ProxyDatabase(ids, relation);

    DBSCAN<V> dbscan = new DBSCAN<>(distanceFunction, epsilon, minpts);
    // run DBSCAN
    if (LOG.isVerbose()) {
      LOG.verbose("\nRun DBSCAN on subspace " + subspace.dimensonsToString());
    }
    Clustering<Model> dbsres = dbscan.run(proxy);

    // separate cluster and noise
    List<Cluster<Model>> clusterAndNoise = dbsres.getAllClusters();
    List<Cluster<Model>> clusters = new ArrayList<>();
    for (Cluster<Model> c : clusterAndNoise) {
      if (!c.isNoise()) {
        clusters.add(c);
      }
    }
    return clusters;
  }
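setSelectedDimensions above restricts the distance computation to the chosen subspace. A self-contained sketch of such a subspace-restricted Euclidean distance, with a java.util.BitSet standing in for ELKI's dimension mask:

import java.util.BitSet;

// Sketch: Euclidean distance evaluated only on the dimensions selected in
// the mask, as a subspace distance function would do.
public class SubspaceDistance {
  static double distance(double[] a, double[] b, BitSet dims) {
    double sum = 0;
    for (int d = dims.nextSetBit(0); d >= 0; d = dims.nextSetBit(d + 1)) {
      final double diff = a[d] - b[d];
      sum += diff * diff;
    }
    return Math.sqrt(sum);
  }

  public static void main(String[] args) {
    BitSet dims = new BitSet();
    dims.set(0);
    dims.set(2); // subspace {0, 2}
    System.out.println(distance(new double[] {1, 9, 2}, new double[] {4, 0, 6}, dims)); // 5.0
  }
}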
Example #4
 protected void autoEvaluateClusterings(ResultHierarchy hier, Result newResult) {
   Collection<Clustering<?>> clusterings =
       ResultUtil.filterResults(hier, newResult, Clustering.class);
   if (LOG.isDebugging()) {
     LOG.debug("Number of new clustering results: " + clusterings.size());
   }
   for (Iterator<Clustering<?>> c = clusterings.iterator(); c.hasNext(); ) {
     Clustering<?> test = c.next();
     if ("allinone-clustering".equals(test.getShortName())) {
       c.remove();
     } else if ("allinnoise-clustering".equals(test.getShortName())) {
       c.remove();
     } else if ("bylabel-clustering".equals(test.getShortName())) {
       c.remove();
     } else if ("bymodel-clustering".equals(test.getShortName())) {
       c.remove();
     }
   }
   if (clusterings.size() > 0) {
     try {
       new EvaluateClustering(new ByLabelClustering(), false, true)
           .processNewResult(hier, newResult);
     } catch (NoSupportedDataTypeException e) {
       // Pass - the data probably did not have labels.
     }
   }
 }
Example #5
  @Override
  public void run() {
    Database database = input.getDatabase();
    Relation<O> relation = database.getRelation(distance.getInputTypeRestriction());
    DistanceQuery<O> distanceQuery = database.getDistanceQuery(relation, distance);
    KNNQuery<O> knnQ = database.getKNNQuery(distanceQuery, DatabaseQuery.HINT_HEAVY_USE);

    // open file.
    try (RandomAccessFile file = new RandomAccessFile(out, "rw");
        FileChannel channel = file.getChannel();
        // and acquire a file write lock
        FileLock lock = channel.lock()) {
      // write magic header
      file.writeInt(KNN_CACHE_MAGIC);

      int bufsize = k * 12 * 2 + 10; // Initial size, enough for 2 kNN.
      ByteBuffer buffer = ByteBuffer.allocateDirect(bufsize);

      FiniteProgress prog =
          LOG.isVerbose() ? new FiniteProgress("Computing kNN", relation.size(), LOG) : null;

      for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
        final KNNList nn = knnQ.getKNNForDBID(it, k);
        final int nnsize = nn.size();

        // Grow the buffer when needed:
        if (nnsize * 12 + 10 > bufsize) {
          while (nnsize * 12 + 10 > bufsize) {
            bufsize <<= 1;
          }
          buffer = ByteBuffer.allocateDirect(bufsize);
        }

        buffer.clear();
        ByteArrayUtil.writeUnsignedVarint(buffer, it.internalGetIndex());
        ByteArrayUtil.writeUnsignedVarint(buffer, nnsize);
        int c = 0;
        for (DoubleDBIDListIter ni = nn.iter(); ni.valid(); ni.advance(), c++) {
          ByteArrayUtil.writeUnsignedVarint(buffer, ni.internalGetIndex());
          buffer.putDouble(ni.doubleValue());
        }
        if (c != nn.size()) {
          throw new AbortException("Sizes did not agree. Cache is invalid.");
        }

        buffer.flip();
        channel.write(buffer);
        LOG.incrementProcessed(prog);
      }
      LOG.ensureCompleted(prog);
      lock.release();
    } catch (IOException e) {
      LOG.exception(e);
    }
    // FIXME: close!
  }
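The cache format above keeps the file compact with unsigned varint encoding. A minimal sketch of the usual encoding (7 payload bits per byte, high bit marks continuation), assumed but not guaranteed to match ByteArrayUtil.writeUnsignedVarint:

import java.nio.ByteBuffer;

// Sketch: unsigned varint encoding as commonly used for compact integer
// storage (7 payload bits per byte, MSB set on all but the last byte).
public class Varint {
  static void writeUnsignedVarint(ByteBuffer buf, int value) {
    while ((value & ~0x7F) != 0) {
      buf.put((byte) ((value & 0x7F) | 0x80));
      value >>>= 7;
    }
    buf.put((byte) value);
  }

  static int readUnsignedVarint(ByteBuffer buf) {
    int value = 0, shift = 0;
    byte b;
    do {
      b = buf.get();
      value |= (b & 0x7F) << shift;
      shift += 7;
    } while ((b & 0x80) != 0);
    return value;
  }

  public static void main(String[] args) {
    ByteBuffer buf = ByteBuffer.allocate(8);
    writeUnsignedVarint(buf, 300);
    buf.flip();
    System.out.println(readUnsignedVarint(buf)); // 300
  }
}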
Example #6
 @Override
 public void processNewResult(ResultHierarchy hier, Result newResult) {
   // We may just have added this result.
   if (newResult instanceof Clustering && isReferenceResult((Clustering<?>) newResult)) {
     return;
   }
   Database db = ResultUtil.findDatabase(hier);
   List<Clustering<?>> crs = ResultUtil.getClusteringResults(newResult);
   if (crs == null || crs.size() < 1) {
     return;
   }
   // Compute the reference clustering
   Clustering<?> refc = null;
   // Try to find an existing reference clustering (globally)
   {
     Collection<Clustering<?>> cs = ResultUtil.filterResults(hier, db, Clustering.class);
     for (Clustering<?> test : cs) {
       if (isReferenceResult(test)) {
         refc = test;
         break;
       }
     }
   }
   // Try to find an existing reference clustering (locally)
   if (refc == null) {
     Collection<Clustering<?>> cs = ResultUtil.filterResults(hier, newResult, Clustering.class);
     for (Clustering<?> test : cs) {
       if (isReferenceResult(test)) {
         refc = test;
         break;
       }
     }
   }
   if (refc == null) {
     LOG.debug("Generating a new reference clustering.");
     Result refres = referencealg.run(db);
     List<Clustering<?>> refcrs = ResultUtil.getClusteringResults(refres);
     if (refcrs.size() == 0) {
       LOG.warning("Reference algorithm did not return a clustering result!");
       return;
     }
     if (refcrs.size() > 1) {
       LOG.warning("Reference algorithm returned more than one result!");
     }
     refc = refcrs.get(0);
   } else {
     LOG.debug("Using existing clustering: " + refc.getLongName() + " " + refc.getShortName());
   }
   for (Clustering<?> c : crs) {
     if (c == refc) {
       continue;
     }
     evaluteResult(db, c, refc);
   }
 }
Example #7
  /**
   * Run the Eclat algorithm
   *
   * @param db Database to process
   * @param relation Bit vector relation
   * @return Frequent patterns found
   */
  public FrequentItemsetsResult run(Database db, final Relation<BitVector> relation) {
    // TODO: implement with resizable arrays, to not need dim.
    final int dim = RelationUtil.dimensionality(relation);
    final VectorFieldTypeInformation<BitVector> meta = RelationUtil.assumeVectorField(relation);
    // Compute absolute minsupport
    final int minsupp = getMinimumSupport(relation.size());

    LOG.verbose("Build 1-dimensional transaction lists.");
    Duration ctime = LOG.newDuration(STAT + "eclat.transposition.time").begin();
    DBIDs[] idx = buildIndex(relation, dim, minsupp);
    LOG.statistics(ctime.end());

    FiniteProgress prog =
        LOG.isVerbose() ? new FiniteProgress("Building frequent itemsets", idx.length, LOG) : null;
    Duration etime = LOG.newDuration(STAT + "eclat.extraction.time").begin();
    final List<Itemset> solution = new ArrayList<>();
    for (int i = 0; i < idx.length; i++) {
      LOG.incrementProcessed(prog);
      extractItemsets(idx, i, minsupp, solution);
    }
    LOG.ensureCompleted(prog);
    Collections.sort(solution);
    LOG.statistics(etime.end());

    LOG.statistics(new LongStatistic(STAT + "frequent-itemsets", solution.size()));
    return new FrequentItemsetsResult("Eclat", "eclat", solution, meta);
  }
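Eclat's defining trait is that support is counted by intersecting transaction-ID lists rather than re-scanning the transactions. The core step, sketched on sorted int arrays (a hypothetical stand-in for the DBIDs lists in the index above):

import java.util.Arrays;

// Sketch: intersect two sorted transaction-ID lists; the support of the
// joined itemset is the size of the intersection.
public class TidIntersect {
  static int[] intersect(int[] a, int[] b) {
    int[] out = new int[Math.min(a.length, b.length)];
    int i = 0, j = 0, n = 0;
    while (i < a.length && j < b.length) {
      if (a[i] < b[j]) {
        i++;
      } else if (a[i] > b[j]) {
        j++;
      } else {
        out[n++] = a[i++];
        j++;
      }
    }
    return Arrays.copyOf(out, n);
  }

  public static void main(String[] args) {
    // support({x, y}) == intersect(tids(x), tids(y)).length
    System.out.println(intersect(new int[] {1, 3, 5, 7}, new int[] {3, 4, 5}).length); // 2
  }
}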
Example #8
  /**
   * Preprocessing step: determine the radii of interest for each point.
   *
   * @param ids IDs to process
   * @param rangeQuery Range query
   * @param interestingDistances Distances of interest
   */
  protected void precomputeInterestingRadii(
      DBIDs ids,
      RangeQuery<O> rangeQuery,
      WritableDataStore<DoubleIntArrayList> interestingDistances) {
    FiniteProgress progressPreproc =
        LOG.isVerbose() ? new FiniteProgress("LOCI preprocessing", ids.size(), LOG) : null;
    for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
      DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(iditer, rmax);
      // build list of critical distances
      DoubleIntArrayList cdist = new DoubleIntArrayList(neighbors.size() << 1);
      {
        int i = 0;
        DoubleDBIDListIter ni = neighbors.iter();
        while (ni.valid()) {
          final double curdist = ni.doubleValue();
          ++i;
          ni.advance();
          // Skip, if tied to the next object:
          if (ni.valid() && curdist == ni.doubleValue()) {
            continue;
          }
          cdist.append(curdist, i);
          // Scale radius, and reinsert
          if (alpha != 1.) {
            final double ri = curdist / alpha;
            if (ri <= rmax) {
              cdist.append(ri, Integer.MIN_VALUE);
            }
          }
        }
      }
      cdist.sort();

      // fill the gaps to have fast lookups of number of neighbors at a given
      // distance.
      int lastk = 0;
      for (int i = 0, size = cdist.size(); i < size; i++) {
        final int k = cdist.getInt(i);
        if (k == Integer.MIN_VALUE) {
          cdist.setValue(i, lastk);
        } else {
          lastk = k;
        }
      }
      // TODO: shrink the list, removing duplicate radii?

      interestingDistances.put(iditer, cdist);
      LOG.incrementProcessed(progressPreproc);
    }
    LOG.ensureCompleted(progressPreproc);
  }
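The gap-filling loop above turns the critical-distance list into a structure that answers "how many neighbors within radius r" in O(log n). A sketch of that lookup on parallel arrays, a hypothetical stand-in for DoubleIntArrayList:

import java.util.Arrays;

// Sketch: dists is sorted ascending, counts[i] is the neighbor count at
// dists[i]; binary search yields the count for an arbitrary radius.
public class CriticalDistanceLookup {
  static int neighborsWithin(double[] dists, int[] counts, double r) {
    int pos = Arrays.binarySearch(dists, r);
    if (pos < 0) {
      pos = -pos - 2; // index of the last distance <= r
    }
    return pos < 0 ? 0 : counts[pos];
  }

  public static void main(String[] args) {
    double[] d = {0.5, 1.0, 2.5};
    int[] c = {1, 3, 4};
    System.out.println(neighborsWithin(d, c, 1.7)); // 3
  }
}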
Example #9
 @Override
 public void checkRange(DBIDRange range) {
   final int size = max + 1 - min;
   if (size < range.size()) {
     LOG.warning("Distance matrix has size " + size + " but range has size: " + range.size());
   }
 }
Example #10
  @Override
  public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
    if (relation.size() <= 0) {
      return new Clustering<>("k-Means Clustering", "kmeans-clustering");
    }
    // Choose initial means
    if (LOG.isStatistics()) {
      LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
    // Setup cluster assignment store
    List<ModifiableDBIDs> clusters = new ArrayList<>();
    for (int i = 0; i < k; i++) {
      clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
    }
    WritableIntegerDataStore assignment =
        DataStoreUtil.makeIntegerStorage(
            relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
    double[] varsum = new double[k];

    IndefiniteProgress prog =
        LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
    DoubleStatistic varstat =
        LOG.isStatistics()
            ? new DoubleStatistic(this.getClass().getName() + ".variance-sum")
            : null;
    int iteration = 0;
    for (; maxiter <= 0 || iteration < maxiter; iteration++) {
      LOG.incrementProcessed(prog);
      boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum);
      logVarstat(varstat, varsum);
      // Stop if no cluster assignment changed.
      if (!changed) {
        break;
      }
      // Recompute means.
      means = means(clusters, means, relation);
    }
    LOG.setCompleted(prog);
    if (LOG.isStatistics()) {
      LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
    }

    // Wrap result
    Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
    for (int i = 0; i < clusters.size(); i++) {
      DBIDs ids = clusters.get(i);
      if (ids.size() == 0) {
        continue;
      }
      KMeansModel model = new KMeansModel(means[i], varsum[i]);
      result.addToplevelCluster(new Cluster<>(ids, model));
    }
    return result;
  }
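The loop above is standard Lloyd iteration: assign each point to its nearest mean, stop when no assignment changed, otherwise recompute the means. The same skeleton in plain Java on a dense double[][] (a sketch, not the ELKI implementation, which additionally tracks variance statistics):

import java.util.Arrays;

// Sketch: textbook Lloyd iteration, mirroring the assign / check-changed /
// recompute-means structure of run() above, on dense arrays.
public class LloydSketch {
  static int[] lloyd(double[][] data, double[][] means, int maxiter) {
    int[] assign = new int[data.length];
    Arrays.fill(assign, -1);
    for (int iter = 0; maxiter <= 0 || iter < maxiter; iter++) {
      boolean changed = false;
      for (int i = 0; i < data.length; i++) {
        int best = 0;
        double bestd = Double.POSITIVE_INFINITY;
        for (int c = 0; c < means.length; c++) {
          double d = 0;
          for (int j = 0; j < data[i].length; j++) {
            final double diff = data[i][j] - means[c][j];
            d += diff * diff;
          }
          if (d < bestd) {
            bestd = d;
            best = c;
          }
        }
        if (assign[i] != best) {
          assign[i] = best;
          changed = true;
        }
      }
      if (!changed) {
        break; // converged: no assignment changed
      }
      // Recompute means (empty clusters keep a zero mean in this sketch):
      int[] sizes = new int[means.length];
      for (double[] m : means) {
        Arrays.fill(m, 0);
      }
      for (int i = 0; i < data.length; i++) {
        sizes[assign[i]]++;
        for (int j = 0; j < data[i].length; j++) {
          means[assign[i]][j] += data[i][j];
        }
      }
      for (int c = 0; c < means.length; c++) {
        for (int j = 0; sizes[c] > 0 && j < means[c].length; j++) {
          means[c][j] /= sizes[c];
        }
      }
    }
    return assign;
  }
}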
Example #11
  /**
   * Generates {@code d+1}-dimensional subspace candidates from the specified {@code d}-dimensional
   * subspaces.
   *
   * @param subspaces the {@code d}-dimensional subspaces
   * @return the {@code d+1}-dimensional subspace candidates
   */
  private List<Subspace> generateSubspaceCandidates(List<Subspace> subspaces) {
    List<Subspace> candidates = new ArrayList<>();

    if (subspaces.isEmpty()) {
      return candidates;
    }

    // Generate (d+1)-dimensional candidate subspaces
    int d = subspaces.get(0).dimensionality();

    StringBuilder msgFine = new StringBuilder("\n");
    if (LOG.isDebuggingFiner()) {
      msgFine.append("subspaces ").append(subspaces).append('\n');
    }

    for (int i = 0; i < subspaces.size(); i++) {
      Subspace s1 = subspaces.get(i);
      for (int j = i + 1; j < subspaces.size(); j++) {
        Subspace s2 = subspaces.get(j);
        Subspace candidate = s1.join(s2);

        if (candidate != null) {
          if (LOG.isDebuggingFiner()) {
            msgFine.append("candidate: ").append(candidate.dimensonsToString()).append('\n');
          }
          // prune irrelevant candidate subspaces
          List<Subspace> lowerSubspaces = lowerSubspaces(candidate);
          if (LOG.isDebuggingFiner()) {
            msgFine.append("lowerSubspaces: ").append(lowerSubspaces).append('\n');
          }
          boolean irrelevantCandidate = false;
          for (Subspace s : lowerSubspaces) {
            if (!subspaces.contains(s)) {
              irrelevantCandidate = true;
              break;
            }
          }
          if (!irrelevantCandidate) {
            candidates.add(candidate);
          }
        }
      }
    }

    if (LOG.isDebuggingFiner()) {
      LOG.debugFiner(msgFine.toString());
    }
    if (LOG.isDebugging()) {
      StringBuilder msg = new StringBuilder();
      msg.append(d + 1).append("-dimensional candidate subspaces: ");
      for (Subspace candidate : candidates) {
        msg.append(candidate.dimensonsToString()).append(' ');
      }
      LOG.debug(msg.toString());
    }

    return candidates;
  }
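This is classic apriori-style candidate generation: join two d-dimensional subspaces, then prune any candidate that has a d-dimensional subset missing from the frequent list. A sketch on subspaces encoded as sorted int[] dimension sets (a hypothetical stand-in for the Subspace class; the join condition here is the common equal-(d-1)-prefix convention, while Subspace.join may differ in detail):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Sketch: apriori-style join + prune on subspaces represented as sorted
// dimension arrays.
public class CandidateGeneration {
  static List<int[]> generate(List<int[]> frequent) {
    Set<String> known = new HashSet<>();
    for (int[] s : frequent) {
      known.add(Arrays.toString(s));
    }
    List<int[]> candidates = new ArrayList<>();
    for (int i = 0; i < frequent.size(); i++) {
      for (int j = i + 1; j < frequent.size(); j++) {
        int[] a = frequent.get(i), b = frequent.get(j);
        final int d = a.length;
        // Join condition: equal (d-1)-prefix, distinct, ordered last dimension.
        if (!Arrays.equals(Arrays.copyOf(a, d - 1), Arrays.copyOf(b, d - 1))
            || a[d - 1] >= b[d - 1]) {
          continue;
        }
        int[] cand = Arrays.copyOf(a, d + 1);
        cand[d] = b[d - 1];
        // Prune: every d-dimensional subset must itself be frequent.
        boolean relevant = true;
        for (int skip = 0; relevant && skip <= d; skip++) {
          int[] sub = new int[d];
          for (int k = 0, p = 0; k <= d; k++) {
            if (k != skip) {
              sub[p++] = cand[k];
            }
          }
          relevant = known.contains(Arrays.toString(sub));
        }
        if (relevant) {
          candidates.add(cand);
        }
      }
    }
    return candidates;
  }
}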
Example #12
 @Override
 protected void prepareComplete() {
   StringBuilder buf = LOG.isDebuggingFine() ? new StringBuilder() : null;
   scalingreferencevalues = new double[dimensionality];
   randomPerAttribute = new Random[dimensionality];
   if (scalingreference == ScalingReference.STDDEV) {
     if (buf != null) {
       buf.append("Standard deviation per attribute: ");
     }
     for (int d = 0; d < dimensionality; d++) {
       scalingreferencevalues[d] = mvs[d].getSampleStddev() * percentage;
       if (scalingreferencevalues[d] == 0 || Double.isNaN(scalingreferencevalues[d])) {
         scalingreferencevalues[d] = percentage;
       }
       randomPerAttribute[d] = new Random(RANDOM.nextLong());
       if (buf != null) {
         buf.append(" ").append(d).append(": ").append(scalingreferencevalues[d] / percentage);
       }
     }
   } else if (scalingreference == ScalingReference.MINMAX
       && minima.length == 0
       && maxima.length == 0) {
     if (buf != null) {
       buf.append("extension per attribute: ");
     }
     for (int d = 0; d < dimensionality; d++) {
       scalingreferencevalues[d] = (mvs[d].getMax() - mvs[d].getMin()) * percentage;
       if (scalingreferencevalues[d] == 0 || Double.isNaN(scalingreferencevalues[d])) {
         scalingreferencevalues[d] = percentage;
       }
       randomPerAttribute[d] = new Random(RANDOM.nextLong());
       if (buf != null) {
         buf.append(" ").append(d).append(": ").append(scalingreferencevalues[d] / percentage);
       }
     }
   }
   mvs = null;
   if (buf != null) {
     LOG.debugFine(buf.toString());
   }
 }
Example #13
  /** Runs the wrapper with the specified arguments. */
  @Override
  public void run() throws UnableToComplyException {
    MultipleObjectsBundle data = generator.loadData();
    if (LOG.isVerbose()) {
      LOG.verbose("Writing output ...");
    }
    try {
      if (outputFile.exists()) {
        if (LOG.isVerbose()) {
          LOG.verbose(
              "The file "
                  + outputFile
                  + " already exists, "
                  + "the generator result will be APPENDED.");
        }
      }

      try (OutputStreamWriter outStream = new FileWriter(outputFile, true)) {
        writeClusters(outStream, data);
      }
    } catch (FileNotFoundException e) {
      throw new UnableToComplyException(e);
    } catch (IOException e) {
      throw new UnableToComplyException(e);
    }
    if (LOG.isVerbose()) {
      LOG.verbose("Done.");
    }
  }
Example #14
  /**
   * Run the DBSCAN algorithm
   *
   * @param relation Data relation
   * @param rangeQuery Range query class
   */
  protected void runDBSCAN(Relation<O> relation, RangeQuery<O> rangeQuery) {
    final int size = relation.size();
    FiniteProgress objprog =
        LOG.isVerbose() ? new FiniteProgress("Processing objects", size, LOG) : null;
    IndefiniteProgress clusprog =
        LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null;

    processedIDs = DBIDUtil.newHashSet(size);
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      if (!processedIDs.contains(iditer)) {
        expandCluster(relation, rangeQuery, iditer, objprog, clusprog);
      }
      if (objprog != null && clusprog != null) {
        objprog.setProcessed(processedIDs.size(), LOG);
        clusprog.setProcessed(resultList.size(), LOG);
      }
      if (processedIDs.size() == size) {
        break;
      }
    }
    // Finish progress logging
    LOG.ensureCompleted(objprog);
    LOG.setCompleted(clusprog);
  }
Example #15
  /**
   * Process a database
   *
   * @param database Database to process
   * @param relation Relation to process
   * @return Histogram of ranking qualities
   */
  public HistogramResult<DoubleVector> run(Database database, Relation<O> relation) {
    final DistanceQuery<O> distanceQuery =
        database.getDistanceQuery(relation, getDistanceFunction());
    final KNNQuery<O> knnQuery = database.getKNNQuery(distanceQuery, relation.size());

    if (LOG.isVerbose()) {
      LOG.verbose("Preprocessing clusters...");
    }
    // Cluster by labels
    Collection<Cluster<Model>> split =
        (new ByLabelOrAllInOneClustering()).run(database).getAllClusters();

    DoubleStaticHistogram hist = new DoubleStaticHistogram(numbins, 0.0, 1.0);

    if (LOG.isVerbose()) {
      LOG.verbose("Processing points...");
    }
    FiniteProgress progress =
        LOG.isVerbose()
            ? new FiniteProgress("Computing ROC AUC values", relation.size(), LOG)
            : null;

    MeanVariance mv = new MeanVariance();
    // sort neighbors
    for (Cluster<?> clus : split) {
      for (DBIDIter iter = clus.getIDs().iter(); iter.valid(); iter.advance()) {
        KNNList knn = knnQuery.getKNNForDBID(iter, relation.size());
        double result = new ROCEvaluation().evaluate(clus, knn);

        mv.put(result);
        hist.increment(result, 1. / relation.size());

        LOG.incrementProcessed(progress);
      }
    }
    LOG.ensureCompleted(progress);

    // Transform Histogram into a Double Vector array.
    Collection<DoubleVector> res = new ArrayList<>(relation.size());
    for (DoubleStaticHistogram.Iter iter = hist.iter(); iter.valid(); iter.advance()) {
      DoubleVector row = new DoubleVector(new double[] {iter.getCenter(), iter.getValue()});
      res.add(row);
    }
    HistogramResult<DoubleVector> result =
        new HistogramResult<>("Ranking Quality Histogram", "ranking-histogram", res);
    result.addHeader("Mean: " + mv.getMean() + " Variance: " + mv.getSampleVariance());
    return result;
  }
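The ROC AUC evaluation can be reproduced on a plain ranked list: walk the ranking from best to worst and count how many positive/negative pairs are ordered correctly. A self-contained sketch (score ties are ignored for brevity; ELKI's ROCEvaluation handles them explicitly):

// Sketch: ROC AUC of a ranking. labels[i] is true if the i-th ranked
// object (best rank first) belongs to the target cluster.
public class RankingAUC {
  static double auc(boolean[] labels) {
    long pos = 0, neg = 0, pairs = 0, seenPos = 0;
    for (boolean l : labels) {
      if (l) {
        pos++;
        seenPos++;
      } else {
        neg++;
        pairs += seenPos; // positives ranked above this negative
      }
    }
    return (pos == 0 || neg == 0) ? Double.NaN : pairs / (double) (pos * neg);
  }

  public static void main(String[] args) {
    System.out.println(auc(new boolean[] {true, true, false, true, false})); // 0.8333...
  }
}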
Example #16
    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      DoubleParameter epsilonP =
          new DoubleParameter(EPSILON_ID) //
              .addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
      if (config.grab(epsilonP)) {
        epsilon = epsilonP.getValue();
      }

      IntParameter minptsP =
          new IntParameter(MINPTS_ID) //
              .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
      if (config.grab(minptsP)) {
        minpts = minptsP.getValue();
        if (minpts <= 2) {
          LOG.warning(
              "DBSCAN with minPts <= 2 is equivalent to single-link clustering at a single height. Consider using larger values of minPts.");
        }
      }
    }
Example #17
  private void loadCache(DistanceParser parser, File matrixfile) throws IOException {
    InputStream in =
        new BufferedInputStream(FileUtil.tryGzipInput(new FileInputStream(matrixfile)));
    cache =
        new TLongFloatHashMap(
            Constants.DEFAULT_CAPACITY,
            Constants.DEFAULT_LOAD_FACTOR,
            -1L,
            Float.POSITIVE_INFINITY);
    min = Integer.MAX_VALUE;
    max = Integer.MIN_VALUE;
    parser.parse(
        in,
        new DistanceCacheWriter() {
          @Override
          public void put(int id1, int id2, double distance) {
            if (id1 < id2) {
              min = id1 < min ? id1 : min;
              max = id2 > max ? id2 : max;
            } else {
              min = id2 < min ? id2 : min;
              max = id1 > max ? id1 : max;
            }
            cache.put(makeKey(id1, id2), (float) distance);
          }

          @Override
          public boolean containsKey(int id1, int id2) {
            return cache.containsKey(makeKey(id1, id2));
          }
        });
    if (min != 0) {
      LOG.verbose(
          "Distance matrix is supposed to be 0-indexed. Choosing offset "
              + min
              + " to compensate.");
    }
  }
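makeKey itself is not shown in this excerpt. A plausible sketch of such a symmetric two-int key packing (an assumption about the implementation, kept order-insensitive so that put(id1, id2) and put(id2, id1) address the same entry):

// Sketch (assumption): pack two ids into one long, order-insensitive, so
// that the TLongFloatHashMap stores each unordered pair exactly once.
public class PairKey {
  static long makeKey(int id1, int id2) {
    final int lo = Math.min(id1, id2), hi = Math.max(id1, id2);
    return (((long) hi) << 32) | (lo & 0xFFFFFFFFL);
  }

  public static void main(String[] args) {
    System.out.println(makeKey(3, 7) == makeKey(7, 3)); // true
  }
}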
Example #18
  public Result run(Database database, Relation<O> rel) {
    DistanceQuery<O> dq = rel.getDistanceQuery(getDistanceFunction());
    int size = rel.size();
    long pairs = (size * (long) size) >> 1;

    final long ssize = sampling <= 1 ? (long) Math.ceil(sampling * pairs) : (long) sampling;
    if (ssize > Integer.MAX_VALUE) {
      throw new AbortException("Sampling size too large.");
    }
    final int qsize = quantile <= 0 ? 1 : (int) Math.ceil(quantile * ssize);

    DoubleMaxHeap heap = new DoubleMaxHeap(qsize);

    ArrayDBIDs ids = DBIDUtil.ensureArray(rel.getDBIDs());
    DBIDArrayIter i1 = ids.iter(), i2 = ids.iter();
    Random r = rand.getSingleThreadedRandom();

    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Sampling", (int) ssize, LOG) : null;
    for (long i = 0; i < ssize; i++) {
      int x = r.nextInt(size - 1) + 1, y = r.nextInt(x);
      double dist = dq.distance(i1.seek(x), i2.seek(y));
      // Skip NaN, and/or zeros.
      if (dist != dist || (nozeros && dist < Double.MIN_NORMAL)) {
        continue;
      }
      heap.add(dist, qsize);
      LOG.incrementProcessed(prog);
    }

    LOG.statistics(new DoubleStatistic(PREFIX + ".quantile", quantile));
    LOG.statistics(new LongStatistic(PREFIX + ".samplesize", ssize));
    LOG.statistics(new DoubleStatistic(PREFIX + ".distance", heap.peek()));
    LOG.ensureCompleted(prog);
    Collection<String> header = Arrays.asList(new String[] {"Distance"});
    Collection<Vector> data = Arrays.asList(new Vector[] {new Vector(heap.peek())});
    return new CollectionResult<Vector>("Distances sample", "distance-sample", data, header);
  }
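The DoubleMaxHeap above keeps only the qsize smallest sampled distances, so after sampling, heap.peek() is the requested quantile. The same idea with java.util.PriorityQueue (a boxed, slower sketch of what heap.add(dist, qsize) does):

import java.util.Collections;
import java.util.PriorityQueue;

// Sketch: keep the q smallest values of a stream in a bounded max-heap;
// the heap top is then the q-th smallest value, i.e. the (q/n)-quantile.
public class StreamQuantile {
  static double quantile(double[] stream, int q) {
    PriorityQueue<Double> heap = new PriorityQueue<>(Collections.reverseOrder());
    for (double v : stream) {
      if (heap.size() < q) {
        heap.add(v);
      } else if (v < heap.peek()) {
        heap.poll();
        heap.add(v);
      }
    }
    return heap.peek(); // largest of the q smallest values seen
  }

  public static void main(String[] args) {
    System.out.println(quantile(new double[] {5, 1, 4, 2, 3}, 2)); // 2.0
  }
}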
Example #19
/**
 * A filter to perturb the values by adding micro-noise.
 *
 * <p>The added noise is generated, attribute-wise, by a Gaussian with mean=0 and a specified
 * standard deviation or by a uniform distribution with a specified range. The standard deviation or
 * the range can be scaled, attribute-wise, to a given percentage of the original standard deviation
 * in the data distribution (assuming a Gaussian distribution there), or to a percentage of the
 * extension in each attribute ({@code maximumValue - minimumValue}).
 *
 * <p>This filter is potentially widely applicable, but has been implemented for the following
 * publication:
 *
 * <p>Reference:
 *
 * <p>A. Zimek, R. J. G. B. Campello, J. Sander<br>
 * Data Perturbation for Outlier Detection Ensembles<br>
 * In: Proc. 26th International Conference on Scientific and Statistical Database Management
 * (SSDBM), Aalborg, Denmark, 2014.
 *
 * @author Arthur Zimek
 */
@Title("Data Perturbation for Outlier Detection Ensembles")
@Description(
    "A filter to perturb a datasset on read by an additive noise component, implemented for use in an outlier ensemble (this reference).")
@Reference(
    authors = "A. Zimek, R. J. G. B. Campello, J. Sander", //
    title = "Data Perturbation for Outlier Detection Ensembles", //
    booktitle =
        "Proc. 26th International Conference on Scientific and Statistical Database Management (SSDBM), Aalborg, Denmark, 2014", //
    url = "http://dx.doi.org/10.1145/2618243.2618257")
public class PerturbationFilter<V extends NumberVector>
    extends AbstractVectorConversionFilter<V, V> {
  /** Class logger */
  private static final Logging LOG = Logging.getLogger(PerturbationFilter.class);

  /**
   * Scaling reference options.
   *
   * @author Arthur Zimek
   * @apiviz.exclude
   */
  public static enum ScalingReference {
    UNITCUBE,
    STDDEV,
    MINMAX
  }

  /**
   * Nature of the noise distribution.
   *
   * @author Arthur Zimek
   * @apiviz.exclude
   */
  public static enum NoiseDistribution {
    GAUSSIAN,
    UNIFORM
  }

  /** Which reference to use for scaling the noise. */
  private ScalingReference scalingreference;

  /** Nature of the noise distribution. */
  private NoiseDistribution noisedistribution;

  /** Random object to generate the attribute-wise seeds for the noise. */
  private final Random RANDOM;

  /**
   * Percentage of the variance of the random noise generation, given the variance of the
   * corresponding attribute in the data.
   */
  private double percentage;

  /** Temporary storage used during initialization. */
  private MeanVarianceMinMax[] mvs = null;

  /** Stores the scaling reference in each dimension. */
  private double[] scalingreferencevalues = new double[0];

  /** The random objects to generate noise distributions independently for each attribute. */
  private Random[] randomPerAttribute = null;

  /** Stores the maximum in each dimension. */
  private double[] maxima;

  /** Stores the minimum in each dimension. */
  private double[] minima;

  /** Stores the dimensionality from the preprocessing. */
  private int dimensionality = 0;

  /**
   * Constructor.
   *
   * @param seed Seed value, may be {@code null} for a random seed.
   * @param percentage Relative amount of jitter to add
   * @param scalingreference Scaling reference
   * @param minima Preset minimum values. May be {@code null}.
   * @param maxima Preset maximum values. May be {@code null}.
   * @param noisedistribution Nature of the noise distribution.
   */
  public PerturbationFilter(
      Long seed,
      double percentage,
      ScalingReference scalingreference,
      double[] minima,
      double[] maxima,
      NoiseDistribution noisedistribution) {
    super();
    this.percentage = percentage;
    this.scalingreference = scalingreference;
    this.minima = minima;
    this.maxima = maxima;
    this.noisedistribution = noisedistribution;
    this.RANDOM = (seed == null) ? new Random() : new Random(seed);
  }

  @Override
  protected boolean prepareStart(SimpleTypeInformation<V> in) {
    if (scalingreference == ScalingReference.MINMAX && minima.length != 0 && maxima.length != 0) {
      dimensionality = minima.length;
      scalingreferencevalues = new double[dimensionality];
      randomPerAttribute = new Random[dimensionality];
      for (int d = 0; d < dimensionality; d++) {
        scalingreferencevalues[d] = (maxima[d] - minima[d]) * percentage;
        if (scalingreferencevalues[d] == 0 || Double.isNaN(scalingreferencevalues[d])) {
          scalingreferencevalues[d] = percentage;
        }
        randomPerAttribute[d] = new Random(RANDOM.nextLong());
      }
      return false;
    }
    if (scalingreference == ScalingReference.UNITCUBE) {
      return false;
    }
    return (scalingreferencevalues.length == 0);
  }

  @Override
  protected void prepareProcessInstance(V featureVector) {
    // First object? Then init. (We didn't have a dimensionality before!)
    if (mvs == null) {
      dimensionality = featureVector.getDimensionality();
      mvs = MeanVarianceMinMax.newArray(dimensionality);
    }
    for (int d = 0; d < featureVector.getDimensionality(); d++) {
      mvs[d].put(featureVector.doubleValue(d));
    }
  }

  @Override
  protected void prepareComplete() {
    StringBuilder buf = LOG.isDebuggingFine() ? new StringBuilder() : null;
    scalingreferencevalues = new double[dimensionality];
    randomPerAttribute = new Random[dimensionality];
    if (scalingreference == ScalingReference.STDDEV) {
      if (buf != null) {
        buf.append("Standard deviation per attribute: ");
      }
      for (int d = 0; d < dimensionality; d++) {
        scalingreferencevalues[d] = mvs[d].getSampleStddev() * percentage;
        if (scalingreferencevalues[d] == 0 || Double.isNaN(scalingreferencevalues[d])) {
          scalingreferencevalues[d] = percentage;
        }
        randomPerAttribute[d] = new Random(RANDOM.nextLong());
        if (buf != null) {
          buf.append(" ").append(d).append(": ").append(scalingreferencevalues[d] / percentage);
        }
      }
    } else if (scalingreference == ScalingReference.MINMAX
        && minima.length == 0
        && maxima.length == 0) {
      if (buf != null) {
        buf.append("extension per attribute: ");
      }
      for (int d = 0; d < dimensionality; d++) {
        scalingreferencevalues[d] = (mvs[d].getMax() - mvs[d].getMin()) * percentage;
        if (scalingreferencevalues[d] == 0 || Double.isNaN(scalingreferencevalues[d])) {
          scalingreferencevalues[d] = percentage;
        }
        randomPerAttribute[d] = new Random(RANDOM.nextLong());
        if (buf != null) {
          buf.append(" ").append(d).append(": ").append(scalingreferencevalues[d] / percentage);
        }
      }
    }
    mvs = null;
    if (buf != null) {
      LOG.debugFine(buf.toString());
    }
  }

  @Override
  protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
    return TypeUtil.NUMBER_VECTOR_FIELD;
  }

  @Override
  protected V filterSingleObject(V featureVector) {
    if (scalingreference == ScalingReference.UNITCUBE && dimensionality == 0) {
      dimensionality = featureVector.getDimensionality();
      scalingreferencevalues = new double[dimensionality];
      randomPerAttribute = new Random[dimensionality];
      for (int d = 0; d < dimensionality; d++) {
        scalingreferencevalues[d] = percentage;
        randomPerAttribute[d] = new Random(RANDOM.nextLong());
      }
    }
    if (scalingreferencevalues.length != featureVector.getDimensionality()) {
      throw new IllegalArgumentException(
          "FeatureVectors and given Minima/Maxima differ in length.");
    }
    double[] values = new double[featureVector.getDimensionality()];
    for (int d = 0; d < featureVector.getDimensionality(); d++) {
      if (this.noisedistribution.equals(NoiseDistribution.GAUSSIAN)) {
        values[d] =
            featureVector.doubleValue(d)
                + randomPerAttribute[d].nextGaussian() * scalingreferencevalues[d];
      } else if (this.noisedistribution.equals(NoiseDistribution.UNIFORM)) {
        values[d] =
            featureVector.doubleValue(d)
                + randomPerAttribute[d].nextDouble() * scalingreferencevalues[d];
      }
    }
    return factory.newNumberVector(values);
  }

  @Override
  protected SimpleTypeInformation<? super V> convertedType(SimpleTypeInformation<V> in) {
    initializeOutputType(in);
    return in;
  }

  @Override
  protected Logging getLogger() {
    return LOG;
  }

  /**
   * Parameterization class.
   *
   * @author Arthur Zimek
   * @apiviz.exclude
   */
  public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
    /** Parameter for minimum. */
    public static final OptionID MINIMA_ID =
        new OptionID(
            "perturbationfilter.min",
            "Only used, if "
                + ScalingReference.MINMAX
                + " is set as scaling reference: a comma separated concatenation of the minimum values in each dimension assumed as a reference. If no value is specified, the minimum value of the attribute range in this dimension will be taken.");

    /** Parameter for maximum. */
    public static final OptionID MAXIMA_ID =
        new OptionID(
            "perturbationfilter.max",
            "Only used, if "
                + ScalingReference.MINMAX
                + " is set as scaling reference: a comma separated concatenation of the maximum values in each dimension assumed as a reference. If no value is specified, the maximum value of the attribute range in this dimension will be taken.");

    /** Stores the maximum in each dimension. */
    private double[] maxima = new double[0];

    /** Stores the minimum in each dimension. */
    private double[] minima = new double[0];

    /**
     * Optional parameter to specify a seed for random Gaussian noise generation. If unused, system
     * time is used as seed.
     *
     * <p>Key: {@code -perturbationfilter.seed}
     */
    public static final OptionID SEED_ID =
        new OptionID("perturbationfilter.seed", "Seed for random noise generation.");

    /**
     * Seed for random noise generation. If {@code null}, system time is used as seed.
     */
    protected Long seed = null;

    /**
     * Optional parameter to specify a percentage of the standard deviation of the random Gaussian
     * noise generation, given the standard deviation of the corresponding attribute in the original
     * data distribution (assuming a Gaussian there).
     *
     * <p>Key: {@code -perturbationfilter.percentage}
     *
     * <p>Default: <code>0.01</code>
     *
     * <p>Constraint: 0 &lt; percentage &le; 1
     */
    public static final OptionID PERCENTAGE_ID =
        new OptionID(
            "perturbationfilter.percentage",
            "Percentage of the standard deviation of the random Gaussian noise generation per attribute, given the standard deviation of the corresponding attribute in the original data distribution (assuming a Gaussian distribution there).");

    /**
     * Parameter for selecting scaling reference.
     *
     * <p>Key: {@code -perturbationfilter.scalingreference}
     *
     * <p>Default: <code>ScalingReference.UNITCUBE</code>
     */
    public static final OptionID SCALINGREFERENCE_ID =
        new OptionID(
            "perturbationfilter.scalingreference",
            "The reference for scaling the Gaussian noise. Default is "
                + ScalingReference.UNITCUBE
                + ", parameter "
                + PERCENTAGE_ID.getName()
                + " will then directly define the standard deviation of all noise Gaussians. For options "
                + ScalingReference.STDDEV
                + " and  "
                + ScalingReference.MINMAX
                + ", the percentage of the attributewise standard deviation or extension, repectively, will define the attributewise standard deviation of the noise Gaussians.");

    /**
     * Parameter for selecting the noise distribution.
     *
     * <p>Key: {@code -perturbationfilter.noisedistribution}
     *
     * <p>Default: <code>NoiseDistribution.UNIFORM</code>
     */
    public static final OptionID NOISEDISTRIBUTION_ID =
        new OptionID(
            "perturbationfilter.noisedistribution",
            "The nature of the noise distribution, default is " + NoiseDistribution.UNIFORM);

    /**
     * Percentage of the variance of the random Gaussian noise generation or of the range of the
     * uniform distribution, given the variance of the corresponding attribute in the data.
     */
    protected double percentage;

    /** The option which reference to use for scaling the noise. */
    protected ScalingReference scalingreference;

    /** The option which nature of noise distribution to choose. */
    protected NoiseDistribution noisedistribution;

    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      EnumParameter<ScalingReference> scalingReferenceP =
          new EnumParameter<>(
              SCALINGREFERENCE_ID, ScalingReference.class, ScalingReference.UNITCUBE);
      if (config.grab(scalingReferenceP)) {
        scalingreference = scalingReferenceP.getValue();
      }
      EnumParameter<NoiseDistribution> noisedistributionP =
          new EnumParameter<>(
              NOISEDISTRIBUTION_ID, NoiseDistribution.class, NoiseDistribution.UNIFORM);
      if (config.grab(noisedistributionP)) {
        noisedistribution = noisedistributionP.getValue();
      }
      DoubleParameter percentageP = new DoubleParameter(PERCENTAGE_ID, .01);
      percentageP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
      percentageP.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE);
      if (config.grab(percentageP)) {
        percentage = percentageP.getValue();
      }
      LongParameter seedP = new LongParameter(SEED_ID);
      seedP.setOptional(true);
      if (config.grab(seedP)) {
        seed = seedP.getValue();
      }
      DoubleListParameter minimaP = new DoubleListParameter(MINIMA_ID);
      minimaP.setOptional(true);
      if (config.grab(minimaP)) {
        minima = minimaP.getValue().clone();
      }
      DoubleListParameter maximaP = new DoubleListParameter(MAXIMA_ID);
      maximaP.setOptional(true);
      if (config.grab(maximaP)) {
        maxima = maximaP.getValue().clone();
      }

      config.checkConstraint(new AllOrNoneMustBeSetGlobalConstraint(minimaP, maximaP));
      config.checkConstraint(new EqualSizeGlobalConstraint(minimaP, maximaP));
    }

    @Override
    protected PerturbationFilter<V> makeInstance() {
      return new PerturbationFilter<>(
          seed, percentage, scalingreference, minima, maxima, noisedistribution);
    }
  }
}
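Stripped of the parameterization and type plumbing, the filter's effect per vector is simple. A sketch of the Gaussian case on plain arrays (scale[d] corresponds to scalingreferencevalues[d] above):

import java.util.Random;

// Sketch: attribute-wise Gaussian jitter with one generator per attribute,
// as in filterSingleObject above.
public class GaussianJitter {
  static double[] jitter(double[] v, double[] scale, Random[] rngs) {
    double[] out = new double[v.length];
    for (int d = 0; d < v.length; d++) {
      out[d] = v[d] + rngs[d].nextGaussian() * scale[d];
    }
    return out;
  }
}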
Example #20
/**
 * The Spatial Outlier Factor (SOF) is a spatial {@link
 * de.lmu.ifi.dbs.elki.algorithm.outlier.lof.LOF LOF} variation.
 *
 * <p>Since the "reachability distance" of LOF cannot be used canonically in the bichromatic case,
 * this part of LOF is dropped and the exact distance is used instead.
 *
 * <p>Reference:
 *
 * <p>Huang, T., Qin, X.<br>
 * Detecting outliers in spatial database.<br>
 * In: Proc. 3rd International Conference on Image and Graphics, Hong Kong, China.
 *
 * <p>In short: a LOF variation simplified with reachDist(o,p) == dist(o,p).
 *
 * @author Ahmed Hettab
 * @since 0.4.0
 * @param <N> Neighborhood object type
 * @param <O> Attribute object type
 */
@Title("Spatial Outlier Factor")
@Reference(
    authors = "Huang, T., Qin, X.",
    title = "Detecting outliers in spatial database",
    booktitle = "Proc. 3rd International Conference on Image and Graphics",
    url = "http://dx.doi.org/10.1109/ICIG.2004.53")
public class SOF<N, O> extends AbstractDistanceBasedSpatialOutlier<N, O> {
  /** The logger for this class. */
  private static final Logging LOG = Logging.getLogger(SOF.class);

  /**
   * Constructor.
   *
   * @param npred Neighborhood predicate
   * @param nonSpatialDistanceFunction Distance function on non-spatial attributes
   */
  public SOF(
      NeighborSetPredicate.Factory<N> npred,
      PrimitiveDistanceFunction<O> nonSpatialDistanceFunction) {
    super(npred, nonSpatialDistanceFunction);
  }

  @Override
  protected Logging getLogger() {
    return LOG;
  }

  /**
   * The main run method
   *
   * @param database Database to use (actually unused)
   * @param spatial Relation for neighborhood
   * @param relation Attributes to evaluate
   * @return Outlier result
   */
  public OutlierResult run(Database database, Relation<N> spatial, Relation<O> relation) {
    final NeighborSetPredicate npred =
        getNeighborSetPredicateFactory().instantiate(database, spatial);
    DistanceQuery<O> distFunc = getNonSpatialDistanceFunction().instantiate(relation);

    WritableDoubleDataStore lrds =
        DataStoreUtil.makeDoubleStorage(
            relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT);
    WritableDoubleDataStore lofs =
        DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
    DoubleMinMax lofminmax = new DoubleMinMax();

    // Compute densities
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      DBIDs neighbors = npred.getNeighborDBIDs(iditer);
      double avg = 0;
      for (DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) {
        avg += distFunc.distance(iditer, iter);
      }
      double lrd = 1 / (avg / neighbors.size());
      if (Double.isNaN(lrd)) {
        lrd = 0;
      }
      lrds.putDouble(iditer, lrd);
    }

    // Compute density quotients
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      DBIDs neighbors = npred.getNeighborDBIDs(iditer);
      double avg = 0;
      for (DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) {
        avg += lrds.doubleValue(iter);
      }
      final double lrd = (avg / neighbors.size()) / lrds.doubleValue(iditer);
      if (!Double.isNaN(lrd)) {
        lofs.putDouble(iditer, lrd);
        lofminmax.put(lrd);
      } else {
        lofs.putDouble(iditer, 0.0);
      }
    }

    // Build result representation.
    DoubleRelation scoreResult =
        new MaterializedDoubleRelation(
            "Spatial Outlier Factor", "sof-outlier", lofs, relation.getDBIDs());
    OutlierScoreMeta scoreMeta =
        new QuotientOutlierScoreMeta(
            lofminmax.getMin(), lofminmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 1.0);
    OutlierResult or = new OutlierResult(scoreMeta, scoreResult);
    or.addChildResult(npred);
    return or;
  }

  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return TypeUtil.array(
        getNeighborSetPredicateFactory().getInputTypeRestriction(), TypeUtil.NUMBER_VECTOR_FIELD);
  }

  /**
   * Parameterization class
   *
   * @author Ahmed Hettab
   * @apiviz.exclude
   * @param <N> Neighborhood type
   * @param <O> Attribute object type
   */
  public static class Parameterizer<N, O>
      extends AbstractDistanceBasedSpatialOutlier.Parameterizer<N, O> {
    @Override
    protected SOF<N, O> makeInstance() {
      return new SOF<>(npredf, distanceFunction);
    }
  }
}
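The two passes in run() above are the simplified LOF: density is the inverse average distance to the neighbors, and the score is the average neighbor density divided by the object's own density. A plain-array sketch of the same computation (hypothetical adjacency representation, not the ELKI API):

// Sketch: the two SOF passes on plain arrays. neighbors[i] lists the
// neighborhood of object i, dist[i][k] the distance to its k-th neighbor.
public class SofSketch {
  static double[] scores(int[][] neighbors, double[][] dist) {
    final int n = neighbors.length;
    double[] density = new double[n];
    for (int i = 0; i < n; i++) {
      double avg = 0;
      for (double d : dist[i]) {
        avg += d;
      }
      avg /= dist[i].length;
      density[i] = avg > 0 ? 1 / avg : 0;
    }
    double[] score = new double[n];
    for (int i = 0; i < n; i++) {
      double avg = 0;
      for (int j : neighbors[i]) {
        avg += density[j];
      }
      avg /= neighbors[i].length;
      score[i] = density[i] > 0 ? avg / density[i] : 0;
    }
    return score;
  }
}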
Example #21
  /**
   * Algorithm 3 of Cheng and Church.
   *
   * <p>Try to re-add rows or columns that do not increase the overall score.
   *
   * <p>Also try adding inverted rows.
   *
   * @param mat Data matrix
   * @param cand Bicluster candidate
   */
  private void nodeAddition(final double[][] mat, final BiclusterCandidate cand) {
    cand.updateRowAndColumnMeans(mat, true);
    cand.computeMeanSquaredDeviation(mat);
    while (true) {
      // We need this to be final + mutable
      final boolean[] added = new boolean[] {false, false};

      // Step 2: add columns
      cand.visitRow(
          mat,
          0,
          CellVisitor.NOT_SELECTED,
          new CellVisitor() {
            @Override
            public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
              assert (!selcol);
              if (cand.computeColResidue(mat, col) <= cand.residue) {
                cand.selectColumn(col, true);
                added[0] = true;
              }
              return false;
            }
          });

      // Step 3: recompute values
      if (added[0]) {
        cand.updateRowAndColumnMeans(mat, true);
        cand.computeMeanSquaredDeviation(mat);
      }

      // Step 4: try adding rows.
      cand.visitColumn(
          mat,
          0,
          CellVisitor.NOT_SELECTED,
          new CellVisitor() {
            @Override
            public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
              assert (!selrow);
              if (cand.computeRowResidue(mat, row, false) <= cand.residue) {
                cand.selectRow(row, true);
                added[1] = true;
              }
              return false;
            }
          });

      // Step 5: try adding inverted rows.
      if (useinverted) {
        cand.visitColumn(
            mat,
            0,
            CellVisitor.NOT_SELECTED,
            new CellVisitor() {
              @Override
              public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
                assert (!selrow);
                if (cand.computeRowResidue(mat, row, true) <= cand.residue) {
                  cand.selectRow(row, true);
                  cand.invertRow(row, true);
                  added[1] = true;
                }
                return false;
              }
            });
      }
      if (added[1]) {
        cand.updateRowAndColumnMeans(mat, true);
        cand.computeMeanSquaredDeviation(mat);
        if (LOG.isDebuggingFine()) {
          LOG.debugFine(
              "Residue in Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
        }
      }
      if (!added[0] && !added[1]) {
        break;
      }
    }
  }
Example #22
  /**
   * Algorithm 2 of Cheng and Church.
   *
   * <p>Remove all rows and columns whose residue exceeds {@code alpha} times the overall residue.
   *
   * <p>Inverted rows are not supported in this method.
   *
   * @param mat Data matrix
   * @param cand Bicluster candidate
   */
  private void multipleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) {
    cand.updateRowAndColumnMeans(mat, false);
    cand.computeMeanSquaredDeviation(mat);

    // Note: assumes that cand.residue = H(I,J)
    while (cand.residue > delta) {
      final boolean[] modified = {false, false};

      // Step 2: remove rows above threshold
      if (cand.rowcard > MIN_ROW_REMOVE_THRESHOLD) {
        final double alphaResidue = alpha * cand.residue;
        cand.visitColumn(
            mat,
            0,
            CellVisitor.SELECTED,
            new CellVisitor() {
              @Override
              public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
                assert (selrow);
                if (cand.computeRowResidue(mat, row, false) > alphaResidue) {
                  cand.selectRow(row, false);
                  modified[0] = true;
                }
                return (cand.rowcard > MIN_ROW_REMOVE_THRESHOLD);
              }
            });

        // Step 3: update residue
        if (modified[0]) {
          cand.updateRowAndColumnMeans(mat, false);
          cand.computeMeanSquaredDeviation(mat);
        }
      }

      // Step 4: remove columns above threshold
      if (cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD) {
        final double alphaResidue = alpha * cand.residue;
        cand.visitRow(
            mat,
            0,
            CellVisitor.SELECTED,
            new CellVisitor() {
              @Override
              public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
                assert (selcol);
                if (cand.computeColResidue(mat, col) > alphaResidue) {
                  cand.selectColumn(col, false);
                  modified[1] = true;
                }
                return (cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD);
              }
            });
        if (modified[1]) {
          cand.updateRowAndColumnMeans(mat, false);
          cand.computeMeanSquaredDeviation(mat);
        }
      }

      if (LOG.isDebuggingFine()) {
        LOG.debugFine(
            "Residue in Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
      // Step 5: if nothing has been removed, try removing single nodes.
      if (!modified[0] && !modified[1]) {
        break;
        // Will be executed next in main loop, as per algorithm 4.
        // singleNodeDeletion();
      }
    }
  }
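Both deletion phases are driven by Cheng and Church's mean squared residue H(I,J) = (1/(|I||J|)) sum over i in I, j in J of (a_ij - a_iJ - a_Ij + a_IJ)^2. A sketch of that score for a fully selected matrix (the BiclusterCandidate class below maintains the same quantity incrementally over the selected rows and columns):

// Sketch: mean squared residue of a fully selected data matrix, using
// row means, column means, and the overall mean.
public class MeanSquaredResidue {
  static double residue(double[][] mat) {
    final int rows = mat.length, cols = mat[0].length;
    double[] rowM = new double[rows], colM = new double[cols];
    double allM = 0;
    for (int i = 0; i < rows; i++) {
      for (int j = 0; j < cols; j++) {
        rowM[i] += mat[i][j];
        colM[j] += mat[i][j];
        allM += mat[i][j];
      }
    }
    for (int i = 0; i < rows; i++) {
      rowM[i] /= cols;
    }
    for (int j = 0; j < cols; j++) {
      colM[j] /= rows;
    }
    allM /= rows * (double) cols;
    double h = 0;
    for (int i = 0; i < rows; i++) {
      for (int j = 0; j < cols; j++) {
        final double r = mat[i][j] - rowM[i] - colM[j] + allM;
        h += r * r;
      }
    }
    return h / (rows * (double) cols);
  }
}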
Example #23
/**
 * Perform Cheng and Church biclustering.
 *
 * <p>Reference: <br>
 * Y. Cheng and G. M. Church. Biclustering of expression data. In Proceedings of the 8th
 * International Conference on Intelligent Systems for Molecular Biology (ISMB), San Diego, CA,
 * 2000.
 *
 * @author Erich Schubert
 * @param <V> Vector type.
 */
@Reference(
    authors = "Y. Cheng, G. M. Church",
    title = "Biclustering of expression data",
    booktitle =
        "Proc. 8th International Conference on Intelligent Systems for Molecular Biology (ISMB)")
public class ChengAndChurch<V extends NumberVector>
    extends AbstractBiclustering<V, BiclusterWithInversionsModel> {
  /** The logger for this class. */
  private static final Logging LOG = Logging.getLogger(ChengAndChurch.class);

  /**
   * The minimum number of columns that the database must have so that a removal of columns is
   * performed in {@link #multipleNodeDeletion}.
   *
   * <p>Multiple-column deletion is only performed when the data matrix has more than 100 columns.
   */
  private static final int MIN_COLUMN_REMOVE_THRESHOLD = 100;

  /**
   * The minimum number of rows that the database must have so that a removal of rows is performed
   * in {@link #multipleNodeDeletion}.
   *
   * <p>Multiple-row deletion is only performed when the data matrix has more than 100 rows.
   * <!--
   * <p>
   * The value is set to 100 as this is not really described in the paper.
   * </p>
   * -->
   */
  private static final int MIN_ROW_REMOVE_THRESHOLD = 100;

  /** Threshold for the score. */
  private double delta;

  /**
   * The parameter for multiple node deletion.
   *
   * <p>It is used to magnify the {@link #delta} value in the {@link #multipleNodeDeletion} method.
   */
  private double alpha;

  /** Number of biclusters to be found. */
  private int n;

  /** Allow inversion of rows in the last phase. */
  private boolean useinverted = true;

  /** Distribution to sample random replacement values from. */
  private Distribution dist;

  /**
   * Constructor.
   *
   * @param delta Delta parameter: desired quality
   * @param alpha Alpha parameter: controls switching to single node deletion approach
   * @param n Number of clusters to detect
   * @param dist Distribution of random values to insert
   */
  public ChengAndChurch(double delta, double alpha, int n, Distribution dist) {
    super();
    this.delta = delta;
    this.alpha = alpha;
    this.n = n;
    this.dist = dist;
  }

  /**
   * Visitor pattern for processing cells.
   *
   * @author Erich Schubert
   * @apiviz.exclude
   */
  public static interface CellVisitor {
    /** Different modes of operation. */
    int ALL = 0, SELECTED = 1, NOT_SELECTED = 2;

    /**
     * Visit a cell.
     *
     * @param val Value
     * @param row Row Number
     * @param col Column number
     * @param selrow Boolean, whether row is selected
     * @param selcol Boolean, whether column is selected
     * @return Stop flag, return {@code true} to stop visiting
     */
    public boolean visit(double val, int row, int col, boolean selrow, boolean selcol);
  }

  /**
   * Bicluster candidate.
   *
   * @author Erich Schubert
   * @apiviz.exclude
   */
  protected static class BiclusterCandidate {
    /** Cardinalities. */
    int rowcard, colcard;

    /** Means. */
    double[] rowM, colM;

    /** Row and column bitmasks. */
    long[] rows, irow, cols;

    /** Mean of the current bicluster. */
    double allM;

    /** The current bicluster score (mean squared residue). */
    double residue;

    /**
     * Constructor.
     *
     * @param rows Row dimensionality.
     * @param cols Column dimensionality.
     */
    protected BiclusterCandidate(int rows, int cols) {
      super();
      this.rows = BitsUtil.ones(rows);
      this.irow = BitsUtil.zero(rows);
      this.rowcard = rows;
      this.rowM = new double[rows];
      this.cols = BitsUtil.ones(cols);
      this.colcard = cols;
      this.colM = new double[cols];
    }

    /** Resets the values for the next cluster search. */
    protected void reset() {
      rows = BitsUtil.ones(rowM.length);
      rowcard = rowM.length;
      cols = BitsUtil.ones(colM.length);
      colcard = colM.length;
      BitsUtil.zeroI(irow);
    }

    /**
     * Visit all selected cells in the data matrix.
     *
     * @param mat Data matrix
     * @param mode Operation mode
     * @param visitor Visitor function
     */
    protected void visitAll(double[][] mat, int mode, CellVisitor visitor) {
      // For efficiency, we manually iterate over the rows and column bitmasks.
      // This saves repeated shifting needed by the manual bit access.
      for (int rpos = 0, rlpos = 0; rlpos < rows.length; ++rlpos) {
        long rlong = rows[rlpos];
        // Fast skip blocks of 64 masked values.
        if ((mode == CellVisitor.SELECTED && rlong == 0L)
            || (mode == CellVisitor.NOT_SELECTED && rlong == -1L)) {
          rpos += Long.SIZE;
          continue;
        }
        for (int i = 0; i < Long.SIZE && rpos < rowM.length; ++i, ++rpos, rlong >>>= 1) {
          boolean rselected = ((rlong & 1L) == 1L);
          if ((mode == CellVisitor.SELECTED && !rselected)
              || (mode == CellVisitor.NOT_SELECTED && rselected)) {
            continue;
          }
          for (int cpos = 0, clpos = 0; clpos < cols.length; ++clpos) {
            long clong = cols[clpos];
            if ((mode == CellVisitor.SELECTED && clong == 0L)
                || (mode == CellVisitor.NOT_SELECTED && clong == -1L)) {
              cpos += Long.SIZE;
              continue;
            }
            for (int j = 0; j < Long.SIZE && cpos < colM.length; ++j, ++cpos, clong >>>= 1) {
              boolean cselected = ((clong & 1L) == 1L);
              if ((mode == CellVisitor.SELECTED && !cselected)
                  || (mode == CellVisitor.NOT_SELECTED && cselected)) {
                continue;
              }
              boolean stop = visitor.visit(mat[rpos][cpos], rpos, cpos, rselected, cselected);
              if (stop) {
                return;
              }
            }
          }
        }
      }
    }

    /**
     * Visit a column of the matrix.
     *
     * @param mat Data matrix
     * @param col Column to visit
     * @param mode Operation mode
     * @param visitor Visitor function
     */
    protected void visitColumn(double[][] mat, int col, int mode, CellVisitor visitor) {
      boolean cselected = BitsUtil.get(cols, col);
      // For efficiency, we manually iterate over the row and column bitmasks.
      // This avoids the repeated shifting that per-bit access would require.
      for (int rpos = 0, rlpos = 0; rlpos < rows.length; ++rlpos) {
        long rlong = rows[rlpos];
        // Fast skip blocks of 64 masked values.
        if (mode == CellVisitor.SELECTED && rlong == 0L) {
          rpos += Long.SIZE;
          continue;
        }
        if (mode == CellVisitor.NOT_SELECTED && rlong == -1L) {
          rpos += Long.SIZE;
          continue;
        }
        for (int i = 0; i < Long.SIZE && rpos < rowM.length; ++i, ++rpos, rlong >>>= 1) {
          boolean rselected = ((rlong & 1L) == 1L);
          if (mode == CellVisitor.SELECTED && !rselected) {
            continue;
          }
          if (mode == CellVisitor.NOT_SELECTED && rselected) {
            continue;
          }
          boolean stop = visitor.visit(mat[rpos][col], rpos, col, rselected, cselected);
          if (stop) {
            return;
          }
        }
      }
    }

    /**
     * Visit a row of the data matrix.
     *
     * @param mat Data matrix
     * @param row Row to visit
     * @param mode Operation mode
     * @param visitor Visitor function
     */
    protected void visitRow(double[][] mat, int row, int mode, CellVisitor visitor) {
      boolean rselected = BitsUtil.get(rows, row);
      final double[] rowdata = mat[row];
      for (int cpos = 0, clpos = 0; clpos < cols.length; ++clpos) {
        long clong = cols[clpos];
        // Fast skip blocks of 64 masked values.
        if (mode == CellVisitor.SELECTED && clong == 0L) {
          cpos += Long.SIZE;
          continue;
        }
        if (mode == CellVisitor.NOT_SELECTED && clong == -1L) {
          cpos += Long.SIZE;
          continue;
        }
        for (int j = 0; j < Long.SIZE && cpos < colM.length; ++j, ++cpos, clong >>>= 1) {
          boolean cselected = ((clong & 1L) == 1L);
          if (mode == CellVisitor.SELECTED && !cselected) {
            continue;
          }
          if (mode == CellVisitor.NOT_SELECTED && cselected) {
            continue;
          }
          boolean stop = visitor.visit(rowdata[cpos], row, cpos, rselected, cselected);
          if (stop) {
            return;
          }
        }
      }
    }

    /** Visitor for updating the means. */
    private final CellVisitor MEANVISITOR =
        new CellVisitor() {
          @Override
          public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
            if (selcol) {
              rowM[row] += val;
            }
            if (selrow) {
              colM[col] += val;
            }
            if (selcol && selrow) {
              allM += val;
            }
            return false;
          }
        };

    /**
     * Update the row means and column means.
     *
     * @param mat Data matrix
     * @param all When {@code true}, compute the means over all rows and columns; otherwise only
     *     over the selected ones
     * @return overall mean
     */
    protected double updateRowAndColumnMeans(final double[][] mat, boolean all) {
      final int mode = all ? CellVisitor.ALL : CellVisitor.SELECTED;
      Arrays.fill(rowM, 0.);
      Arrays.fill(colM, 0.);
      allM = 0.;
      visitAll(mat, mode, MEANVISITOR);
      visitColumn(
          mat,
          0,
          mode,
          new CellVisitor() {
            @Override
            public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
              rowM[row] /= colcard;
              return false;
            }
          });
      visitRow(
          mat,
          0,
          mode,
          new CellVisitor() {
            @Override
            public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
              colM[col] /= rowcard;
              return false;
            }
          });
      allM /= colcard * rowcard;
      return allM;
    }

    /**
     * Compute the mean squared residue.
     *
     * @param mat Data matrix
     * @return mean squared residue
     */
    protected double computeMeanSquaredDeviation(final double[][] mat) {
      final Mean msr = new Mean();
      visitAll(
          mat,
          CellVisitor.SELECTED,
          new CellVisitor() {
            @Override
            public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
              assert (selrow && selcol);
              double v = val - rowM[row] - colM[col] + allM;
              msr.put(v * v);
              return false;
            }
          });
      residue = msr.getMean();
      return residue;
    }
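
    // For reference (Cheng & Church): with row means a_iJ, column means a_Ij,
    // and overall mean a_IJ over the selected cells, the residue of cell (i,j)
    // is r(i,j) = a_ij - a_iJ - a_Ij + a_IJ, and the score computed above is
    // the mean squared residue H(I,J) = mean of r(i,j)^2 over the bicluster.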

    /**
     * Computes the <b>mean row residue</b> of the given <code>row</code>.
     *
     * @param mat Data matrix
     * @param row The row whose residue should be computed.
     * @param rowinverted Indicates if the row should be considered inverted.
     * @return The row residue of the given <code>row</code>.
     */
    protected double computeRowResidue(final double[][] mat, int row, final boolean rowinverted) {
      final Mean rowResidue = new Mean();
      visitRow(
          mat,
          row,
          CellVisitor.SELECTED,
          new CellVisitor() {
            @Override
            public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
              assert (selcol);
              final double rowMean = rowM[row];
              final double colMean = colM[col];
              double v = ((!rowinverted) ? (val - rowMean) : (rowMean - val)) - colMean + allM;
              rowResidue.put(v * v);
              return false;
            }
          });
      return rowResidue.getMean();
    }

    /**
     * Computes the <b>mean column residue</b> of the given <code>col</code>.
     *
     * @param mat Data matrix
     * @param col The column whose residue should be computed.
     * @return The column residue of the given <code>col</code>.
     */
    protected double computeColResidue(final double[][] mat, final int col) {
      final double bias = colM[col] - allM;
      final Mean colResidue = new Mean();
      visitColumn(
          mat,
          col,
          CellVisitor.SELECTED,
          new CellVisitor() {
            @Override
            public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
              assert (selrow);
              final double rowMean = rowM[row];
              double v = val - rowMean - bias;
              colResidue.put(v * v);
              return false;
            }
          });
      return colResidue.getMean();
    }

    /**
     * Masks the data matrix: replaces all values in the currently selected rows and columns with
     * random values, so that the same bicluster is not found again.
     *
     * @param mat Data matrix to update.
     * @param replacement Distribution to sample replacement values from.
     */
    protected void maskMatrix(final double[][] mat, final Distribution replacement) {
      visitAll(
          mat,
          CellVisitor.SELECTED,
          new CellVisitor() {
            @Override
            public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
              assert (selrow && selcol);
              mat[row][col] = replacement.nextRandom();
              return false;
            }
          });
    }

    /**
     * Select or deselect a column.
     *
     * @param cnum Column to select
     * @param set Value to set
     */
    protected void selectColumn(int cnum, boolean set) {
      if (set) {
        BitsUtil.setI(cols, cnum);
        colcard++;
      } else {
        BitsUtil.clearI(cols, cnum);
        colcard--;
      }
    }

    /**
     * Select or deselect a row.
     *
     * @param rnum Row to select
     * @param set Value to set
     */
    protected void selectRow(int rnum, boolean set) {
      if (set) {
        BitsUtil.setI(rows, rnum);
        rowcard++;
      } else {
        BitsUtil.clearI(rows, rnum);
        rowcard--;
      }
    }

    /**
     * Mark a row as inverted. Note that the {@code b} parameter is currently ignored; rows are
     * only ever marked, never unmarked.
     *
     * @param rnum Row number
     * @param b Value to set (ignored, assumed {@code true})
     */
    protected void invertRow(int rnum, boolean b) {
      BitsUtil.setI(irow, rnum);
    }
  }

  @Override
  public Clustering<BiclusterWithInversionsModel> biclustering() {
    double[][] mat = RelationUtil.relationAsMatrix(relation, rowIDs);

    BiclusterCandidate cand = new BiclusterCandidate(getRowDim(), getColDim());

    Clustering<BiclusterWithInversionsModel> result =
        new Clustering<>("Cheng-and-Church", "Cheng and Church Biclustering");
    ModifiableDBIDs noise = DBIDUtil.newHashSet(relation.getDBIDs());

    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Extracting Cluster", n, LOG) : null;
    for (int i = 0; i < n; i++) {
      cand.reset();
      multipleNodeDeletion(mat, cand);
      if (LOG.isVeryVerbose()) {
        LOG.veryverbose(
            "Residue after Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
      singleNodeDeletion(mat, cand);
      if (LOG.isVeryVerbose()) {
        LOG.veryverbose(
            "Residue after Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
      nodeAddition(mat, cand);
      if (LOG.isVeryVerbose()) {
        LOG.veryverbose(
            "Residue after Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
      cand.maskMatrix(mat, dist);
      BiclusterWithInversionsModel model =
          new BiclusterWithInversionsModel(colsBitsetToIDs(cand.cols), rowsBitsetToIDs(cand.irow));
      final ArrayDBIDs cids = rowsBitsetToIDs(cand.rows);
      noise.removeDBIDs(cids);
      result.addToplevelCluster(new Cluster<>(cids, model));

      if (LOG.isVerbose()) {
        LOG.verbose("Score of bicluster " + (i + 1) + ": " + cand.residue + "\n");
        LOG.verbose("Number of rows: " + cand.rowcard + "\n");
        LOG.verbose("Number of columns: " + cand.colcard + "\n");
        // LOG.verbose("Total number of masked values: " + maskedVals.size() +
        // "\n");
      }
      LOG.incrementProcessed(prog);
    }
    // Add a noise cluster, full-dimensional.
    if (!noise.isEmpty()) {
      long[] allcols = BitsUtil.ones(getColDim());
      BiclusterWithInversionsModel model =
          new BiclusterWithInversionsModel(colsBitsetToIDs(allcols), DBIDUtil.EMPTYDBIDS);
      result.addToplevelCluster(new Cluster<>(noise, true, model));
    }
    LOG.ensureCompleted(prog);
    return result;
  }

  /**
   * Algorithm 1 of Cheng and Church:
   *
   * <p>Remove single rows or columns.
   *
   * <p>Inverted rows are not supported in this method.
   *
   * @param mat Data matrix
   * @param cand Bicluster candidate
   */
  private void singleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) {
    // Assume that cand.residue is up to date!
    while (cand.residue > delta && (cand.colcard > 2 || cand.rowcard > 2)) {
      // Store current maximum. Need final mutable, so use arrays.
      final double[] max = {Double.NEGATIVE_INFINITY};
      final int[] best = {-1, -1};

      // Test rows
      if (cand.rowcard > 2) {
        cand.visitColumn(
            mat,
            0,
            CellVisitor.SELECTED,
            new CellVisitor() {
              @Override
              public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
                assert (selrow);
                double rowResidue = cand.computeRowResidue(mat, row, false);
                if (max[0] < rowResidue) {
                  max[0] = rowResidue;
                  best[0] = row;
                }
                return false;
              }
            });
      }

      // Test columns:
      if (cand.colcard > 2) {
        cand.visitRow(
            mat,
            0,
            CellVisitor.SELECTED,
            new CellVisitor() {
              @Override
              public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
                assert (selcol);
                double colResidue = cand.computeColResidue(mat, col);
                if (max[0] < colResidue) {
                  max[0] = colResidue;
                  best[1] = col;
                }
                return false;
              }
            });
      }

      if (best[1] >= 0) { // A column has the largest residue: remove it rather than the row.
        cand.selectColumn(best[1], false);
      } else {
        assert (best[0] >= 0);
        cand.selectRow(best[0], false);
      }
      // TODO: incremental update could be much faster?
      cand.updateRowAndColumnMeans(mat, false);
      cand.computeMeanSquaredDeviation(mat);
      if (LOG.isDebuggingFine()) {
        LOG.debugFine(
            "Residue in Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
    }
  }

  /**
   * Algorithm 2 of Cheng and Church.
   *
   * <p>Remove all rows and columns whose residue exceeds alpha times the current mean squared
   * residue.
   *
   * <p>Inverted rows are not supported in this method.
   *
   * @param mat Data matrix
   * @param cand Bicluster candidate
   */
  private void multipleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) {
    cand.updateRowAndColumnMeans(mat, false);
    cand.computeMeanSquaredDeviation(mat);

    // Note: assumes that cand.residue = H(I,J)
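    // Removal criterion (Cheng & Church, Alg. 2), as a sketch: drop row i when
    //   d(i) = (1/|J|) * sum_j (a_ij - a_iJ - a_Ij + a_IJ)^2 > alpha * H(I,J),
    // and analogously for columns; computeRowResidue / computeColResidue
    // compute exactly these per-row and per-column mean squared residues.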
    while (cand.residue > delta) {
      final boolean[] modified = {false, false};

      // Step 2: remove rows above threshold
      if (cand.rowcard > MIN_ROW_REMOVE_THRESHOLD) {
        final double alphaResidue = alpha * cand.residue;
        cand.visitColumn(
            mat,
            0,
            CellVisitor.SELECTED,
            new CellVisitor() {
              @Override
              public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
                assert (selrow);
                if (cand.computeRowResidue(mat, row, false) > alphaResidue) {
                  cand.selectRow(row, false);
                  modified[0] = true;
                }
                return (cand.rowcard > MIN_ROW_REMOVE_THRESHOLD);
              }
            });

        // Step 3: update residue
        if (modified[0]) {
          cand.updateRowAndColumnMeans(mat, false);
          cand.computeMeanSquaredDeviation(mat);
        }
      }

      // Step 4: remove columns above threshold
      if (cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD) {
        final double alphaResidue = alpha * cand.residue;
        cand.visitRow(
            mat,
            0,
            CellVisitor.SELECTED,
            new CellVisitor() {
              @Override
              public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
                assert (selcol);
                if (cand.computeColResidue(mat, col) > alphaResidue) {
                  cand.selectColumn(col, false);
                  modified[1] = true;
                }
                return (cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD);
              }
            });
        if (modified[1]) {
          cand.updateRowAndColumnMeans(mat, false);
          cand.computeMeanSquaredDeviation(mat);
        }
      }

      if (LOG.isDebuggingFine()) {
        LOG.debugFine(
            "Residue in Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
      // Step 5: if nothing has been removed, try removing single nodes.
      if (!modified[0] && !modified[1]) {
        break;
        // Will be executed next in main loop, as per algorithm 4.
        // singleNodeDeletion();
      }
    }
  }

  /**
   * Algorithm 3 of Cheng and Church.
   *
   * <p>Try to re-add rows or columns that decrease the overall score.
   *
   * <p>Also try adding inverted rows.
   *
   * @param mat Data matrix
   * @param cand Bicluster candidate
   */
  private void nodeAddition(final double[][] mat, final BiclusterCandidate cand) {
    cand.updateRowAndColumnMeans(mat, true);
    cand.computeMeanSquaredDeviation(mat);
    while (true) {
      // We need this to be final + mutable
      final boolean[] added = new boolean[] {false, false};

      // Step 2: add columns
      cand.visitRow(
          mat,
          0,
          CellVisitor.NOT_SELECTED,
          new CellVisitor() {
            @Override
            public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
              assert (!selcol);
              if (cand.computeColResidue(mat, col) <= cand.residue) {
                cand.selectColumn(col, true);
                added[0] = true;
              }
              return false;
            }
          });

      // Step 3: recompute values
      if (added[0]) {
        cand.updateRowAndColumnMeans(mat, true);
        cand.computeMeanSquaredDeviation(mat);
      }

      // Step 4: try adding rows.
      cand.visitColumn(
          mat,
          0,
          CellVisitor.NOT_SELECTED,
          new CellVisitor() {
            @Override
            public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
              assert (!selrow);
              if (cand.computeRowResidue(mat, row, false) <= cand.residue) {
                cand.selectRow(row, true);
                added[1] = true;
              }
              return false;
            }
          });

      // Step 5: try adding inverted rows.
      if (useinverted) {
        cand.visitColumn(
            mat,
            0,
            CellVisitor.NOT_SELECTED,
            new CellVisitor() {
              @Override
              public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
                assert (!selrow);
                if (cand.computeRowResidue(mat, row, true) <= cand.residue) {
                  cand.selectRow(row, true);
                  cand.invertRow(row, true);
                  added[1] = true;
                }
                return false;
              }
            });
      }
      if (added[1]) {
        cand.updateRowAndColumnMeans(mat, true);
        cand.computeMeanSquaredDeviation(mat);
        if (LOG.isDebuggingFine()) {
          LOG.debugFine(
              "Residue in Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
        }
      }
      if (!added[0] && !added[1]) {
        break;
      }
    }
  }

  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
  }

  @Override
  protected Logging getLogger() {
    return LOG;
  }

  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   * @apiviz.exclude
   * @param <V> Vector type
   */
  public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
    /** Parameter to specify the distribution of replacement values when masking a cluster. */
    public static final OptionID DIST_ID =
        new OptionID(
            "chengandchurch.replacement",
            "Distribution of replacement values when masking found clusters.");

    /**
     * Threshold value to determine the maximal acceptable score (mean squared residue) of a
     * bicluster.
     *
     * <p>Key: {@code -chengandchurch.delta}
     */
    public static final OptionID DELTA_ID =
        new OptionID(
            "chengandchurch.delta",
            "Threshold value to determine the maximal acceptable score (mean squared residue) of a bicluster.");

    /**
     * Parameter for multiple node deletion to accelerate the algorithm. (&gt;= 1)
     *
     * <p>Key: {@code -chengandchurch.alpha}
     */
    public static final OptionID ALPHA_ID =
        new OptionID(
            "chengandchurch.alpha",
            "Parameter for multiple node deletion to accelerate the algorithm.");

    /**
     * Number of biclusters to be found.
     *
     * <p>Default value: 1
     *
     * <p>Key: {@code -chengandchurch.n}
     */
    public static final OptionID N_ID =
        new OptionID("chengandchurch.n", "The number of biclusters to be found.");

    /** Threshold for the score ({@link #DELTA_ID}). */
    private double delta;

    /**
     * The parameter for multiple node deletion.
     *
     * <p>It is used to magnify the {@link #delta} value in the {@link
     * ChengAndChurch#multipleNodeDeletion} method.
     */
    private double alpha;

    /** Number of biclusters to be found. */
    private int n;

    /** Distribution of replacement values. */
    private Distribution dist;

    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      DoubleParameter deltaP = new DoubleParameter(DELTA_ID);
      // Add the constraint before grabbing, so it is checked during parameterization.
      deltaP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
      if (config.grab(deltaP)) {
        delta = deltaP.doubleValue();
      }

      IntParameter nP = new IntParameter(N_ID, 1);
      nP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
      if (config.grab(nP)) {
        n = nP.intValue();
      }

      DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 1.);
      alphaP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_DOUBLE);
      if (config.grab(alphaP)) {
        alpha = alphaP.doubleValue();
      }

      ObjectParameter<Distribution> distP =
          new ObjectParameter<>(DIST_ID, Distribution.class, UniformDistribution.class);
      if (config.grab(distP)) {
        dist = distP.instantiateClass(config);
      }
    }

    @Override
    protected ChengAndChurch<V> makeInstance() {
      return new ChengAndChurch<>(delta, alpha, n, dist);
    }
  }
}
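
// Usage sketch (assumptions: an ELKI 0.7-style API and a loaded Database `db`;
// all parameter values below are purely illustrative):
//
//   ChengAndChurch<NumberVector> cc = new ChengAndChurch<>(
//       0.5 /* delta */, 1.2 /* alpha */, 5 /* n */,
//       new UniformDistribution(-10., 10.));
//   Clustering<BiclusterWithInversionsModel> biclusters = cc.run(db);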
Example #26
/**
 * The standard k-means algorithm, using Lloyd-style bulk iterations.
 *
 * <p>Reference:<br>
 * S. Lloyd<br>
 * Least squares quantization in PCM<br>
 * IEEE Transactions on Information Theory 28 (2)<br>
 * previously published as Bell Telephone Laboratories Paper
 *
 * @author Arthur Zimek
 * @apiviz.landmark
 * @apiviz.has KMeansModel
 * @param <V> vector datatype
 */
@Title("K-Means")
@Description("Finds a least-squared partitioning into k clusters.")
@Reference(
    authors = "S. Lloyd", //
    title = "Least squares quantization in PCM", //
    booktitle = "IEEE Transactions on Information Theory 28 (2): 129–137.", //
    url = "http://dx.doi.org/10.1109/TIT.1982.1056489")
public class KMeansLloyd<V extends NumberVector> extends AbstractKMeans<V, KMeansModel> {
  /** The logger for this class. */
  private static final Logging LOG = Logging.getLogger(KMeansLloyd.class);

  /** Key for statistics logging. */
  private static final String KEY = KMeansLloyd.class.getName();

  /**
   * Constructor.
   *
   * @param distanceFunction distance function
   * @param k k parameter
   * @param maxiter Maxiter parameter
   * @param initializer Initialization method
   */
  public KMeansLloyd(
      NumberVectorDistanceFunction<? super V> distanceFunction,
      int k,
      int maxiter,
      KMeansInitialization<? super V> initializer) {
    super(distanceFunction, k, maxiter, initializer);
  }

  @Override
  public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
    if (relation.size() <= 0) {
      return new Clustering<>("k-Means Clustering", "kmeans-clustering");
    }
    // Choose initial means
    if (LOG.isStatistics()) {
      LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
    // Setup cluster assignment store
    List<ModifiableDBIDs> clusters = new ArrayList<>();
    for (int i = 0; i < k; i++) {
      clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
    }
    WritableIntegerDataStore assignment =
        DataStoreUtil.makeIntegerStorage(
            relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
    double[] varsum = new double[k];

    IndefiniteProgress prog =
        LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
    DoubleStatistic varstat =
        LOG.isStatistics()
            ? new DoubleStatistic(this.getClass().getName() + ".variance-sum")
            : null;
    int iteration = 0;
    for (; maxiter <= 0 || iteration < maxiter; iteration++) {
      LOG.incrementProcessed(prog);
      boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum);
      logVarstat(varstat, varsum);
      // Stop if no cluster assignment changed.
      if (!changed) {
        break;
      }
      // Recompute means.
      means = means(clusters, means, relation);
    }
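    // Lloyd iteration (sketch): each pass assigns every point to its nearest
    // mean, then recomputes each mean as the centroid of its cluster. The loop
    // ends when no assignment changes (a local optimum of the sum of squared
    // errors) or when maxiter is reached.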
    LOG.setCompleted(prog);
    if (LOG.isStatistics()) {
      LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
    }

    // Wrap result
    Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
    for (int i = 0; i < clusters.size(); i++) {
      DBIDs ids = clusters.get(i);
      if (ids.isEmpty()) {
        continue;
      }
      KMeansModel model = new KMeansModel(means[i], varsum[i]);
      result.addToplevelCluster(new Cluster<>(ids, model));
    }
    return result;
  }

  @Override
  protected Logging getLogger() {
    return LOG;
  }

  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   * @apiviz.exclude
   */
  public static class Parameterizer<V extends NumberVector>
      extends AbstractKMeans.Parameterizer<V> {
    @Override
    protected Logging getLogger() {
      return LOG;
    }

    @Override
    protected KMeansLloyd<V> makeInstance() {
      return new KMeansLloyd<>(distanceFunction, k, maxiter, initializer);
    }
  }
}
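
// Usage sketch (assumptions: an ELKI 0.7-style API; `db` and `rel` are a loaded
// Database and Relation; the initializer and parameter values are illustrative):
//
//   KMeansLloyd<NumberVector> km = new KMeansLloyd<>(
//       EuclideanDistanceFunction.STATIC, 10 /* k */, 100 /* maxiter */,
//       new RandomlyChosenInitialMeans<>(RandomFactory.DEFAULT));
//   Clustering<KMeansModel> clustering = km.run(db, rel);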
Example #27
/**
 * Fast Outlier Detection Using the "Local Correlation Integral".
 *
 * <p>Exact implementation only, not aLOCI. See {@link ALOCI}.
 *
 * <p>Outlier detection using multiple epsilon neighborhoods.
 *
 * <p>This implementation has O(n<sup>3</sup> log n) runtime complexity!
 *
 * <p>Based on: S. Papadimitriou, H. Kitagawa, P. B. Gibbons and C. Faloutsos: LOCI: Fast Outlier
 * Detection Using the Local Correlation Integral. In: Proc. 19th IEEE Int. Conf. on Data
 * Engineering (ICDE '03), Bangalore, India, 2003.
 *
 * @author Erich Schubert
 * @apiviz.has RangeQuery
 * @param <O> Object type
 */
@Title("LOCI: Fast Outlier Detection Using the Local Correlation Integral")
@Description("Algorithm to compute outliers based on the Local Correlation Integral")
@Reference(
    authors = "S. Papadimitriou, H. Kitagawa, P. B. Gibbons, C. Faloutsos",
    title = "LOCI: Fast Outlier Detection Using the Local Correlation Integral",
    booktitle = "Proc. 19th IEEE Int. Conf. on Data Engineering (ICDE '03), Bangalore, India, 2003",
    url = "http://dx.doi.org/10.1109/ICDE.2003.1260802")
@Alias({"de.lmu.ifi.dbs.elki.algorithm.outlier.LOCI"})
public class LOCI<O> extends AbstractDistanceBasedAlgorithm<O, OutlierResult>
    implements OutlierAlgorithm {
  /** The logger for this class. */
  private static final Logging LOG = Logging.getLogger(LOCI.class);

  /**
   * Parameter to specify the maximum radius of the neighborhood to be considered, must be suitable
   * to the distance function specified.
   */
  public static final OptionID RMAX_ID =
      new OptionID("loci.rmax", "The maximum radius of the neighborhood to be considered.");

  /** Parameter to specify the minimum neighborhood size */
  public static final OptionID NMIN_ID =
      new OptionID("loci.nmin", "Minimum neighborhood size to be considered.");

  /** Parameter to specify the averaging neighborhood scaling. */
  public static final OptionID ALPHA_ID =
      new OptionID("loci.alpha", "Scaling factor for averaging neighborhood");

  /** Holds the value of {@link #RMAX_ID}. */
  private double rmax;

  /** Holds the value of {@link #NMIN_ID}. */
  private int nmin;

  /** Holds the value of {@link #ALPHA_ID}. */
  private double alpha;

  /**
   * Constructor.
   *
   * @param distanceFunction Distance function
   * @param rmax Maximum radius
   * @param nmin Minimum neighborhood size
   * @param alpha Alpha value
   */
  public LOCI(DistanceFunction<? super O> distanceFunction, double rmax, int nmin, double alpha) {
    super(distanceFunction);
    this.rmax = rmax;
    this.nmin = nmin;
    this.alpha = alpha;
  }

  /**
   * Run the algorithm
   *
   * @param database Database to process
   * @param relation Relation to process
   * @return Outlier result
   */
  public OutlierResult run(Database database, Relation<O> relation) {
    DistanceQuery<O> distFunc = database.getDistanceQuery(relation, getDistanceFunction());
    RangeQuery<O> rangeQuery = database.getRangeQuery(distFunc);
    DBIDs ids = relation.getDBIDs();

    // LOCI preprocessing step
    WritableDataStore<DoubleIntArrayList> interestingDistances =
        DataStoreUtil.makeStorage(
            relation.getDBIDs(),
            DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_SORTED,
            DoubleIntArrayList.class);
    precomputeInterestingRadii(ids, rangeQuery, interestingDistances);
    // LOCI main step
    FiniteProgress progressLOCI =
        LOG.isVerbose() ? new FiniteProgress("LOCI scores", relation.size(), LOG) : null;
    WritableDoubleDataStore mdef_norm =
        DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
    WritableDoubleDataStore mdef_radius =
        DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
    DoubleMinMax minmax = new DoubleMinMax();

    // Shared instance, to save allocations.
    MeanVariance mv_n_r_alpha = new MeanVariance();

    for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
      final DoubleIntArrayList cdist = interestingDistances.get(iditer);
      final double maxdist = cdist.getDouble(cdist.size() - 1);
      final int maxneig = cdist.getInt(cdist.size() - 1);

      double maxmdefnorm = 0.0;
      double maxnormr = 0;
      if (maxneig >= nmin) {
        // Compute the largest neighborhood we will need.
        DoubleDBIDList maxneighbors = rangeQuery.getRangeForDBID(iditer, maxdist);
        // TODO: Ensure the result is sorted. This is currently implied.

        // For any critical distance, compute the normalized MDEF score.
        for (int i = 0, size = cdist.size(); i < size; i++) {
          // Only start when minimum size is fulfilled
          if (cdist.getInt(i) < nmin) {
            continue;
          }
          final double r = cdist.getDouble(i);
          final double alpha_r = alpha * r;
          // compute n(p_i, \alpha * r) from list (note: alpha_r is not cdist!)
          final int n_alphar = cdist.getInt(cdist.find(alpha_r));
          // compute \hat{n}(p_i, r, \alpha) and the corresponding \sigma_{MDEF}
          mv_n_r_alpha.reset();
          for (DoubleDBIDListIter neighbor = maxneighbors.iter();
              neighbor.valid();
              neighbor.advance()) {
            // Stop at radius r
            if (neighbor.doubleValue() > r) {
              break;
            }
            DoubleIntArrayList cdist2 = interestingDistances.get(neighbor);
            int rn_alphar = cdist2.getInt(cdist2.find(alpha_r));
            mv_n_r_alpha.put(rn_alphar);
          }
          // We only use the average and standard deviation
          final double nhat_r_alpha = mv_n_r_alpha.getMean();
          final double sigma_nhat_r_alpha = mv_n_r_alpha.getNaiveStddev();

          // Redundant divisions by nhat_r_alpha removed.
          final double mdef = nhat_r_alpha - n_alphar;
          final double sigmamdef = sigma_nhat_r_alpha;
          final double mdefnorm = mdef / sigmamdef;
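          // Sketch of the paper's definitions, for reference:
          //   MDEF(p, r, alpha)       = 1 - n(p, alpha*r) / nhat(p, r, alpha)
          //   sigma_MDEF(p, r, alpha) = sigma_nhat(p, r, alpha) / nhat(p, r, alpha)
          // Both share the factor 1/nhat, which cancels in mdef / sigmamdef
          // above - hence the "redundant divisions removed" note.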

          if (mdefnorm > maxmdefnorm) {
            maxmdefnorm = mdefnorm;
            maxnormr = r;
          }
        }
      } else {
        // FIXME: when nmin was not fulfilled - what is the proper value then?
        maxmdefnorm = Double.POSITIVE_INFINITY;
        maxnormr = maxdist;
      }
      mdef_norm.putDouble(iditer, maxmdefnorm);
      mdef_radius.putDouble(iditer, maxnormr);
      minmax.put(maxmdefnorm);
      LOG.incrementProcessed(progressLOCI);
    }
    LOG.ensureCompleted(progressLOCI);
    DoubleRelation scoreResult =
        new MaterializedDoubleRelation(
            "LOCI normalized MDEF", "loci-mdef-outlier", mdef_norm, relation.getDBIDs());
    OutlierScoreMeta scoreMeta =
        new QuotientOutlierScoreMeta(
            minmax.getMin(), minmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 0.0);
    OutlierResult result = new OutlierResult(scoreMeta, scoreResult);
    result.addChildResult(
        new MaterializedDoubleRelation(
            "LOCI MDEF Radius", "loci-critical-radius", mdef_radius, relation.getDBIDs()));
    return result;
  }

  /**
   * Preprocessing step: determine the radii of interest for each point.
   *
   * @param ids IDs to process
   * @param rangeQuery Range query
   * @param interestingDistances Distances of interest
   */
  protected void precomputeInterestingRadii(
      DBIDs ids,
      RangeQuery<O> rangeQuery,
      WritableDataStore<DoubleIntArrayList> interestingDistances) {
    FiniteProgress progressPreproc =
        LOG.isVerbose() ? new FiniteProgress("LOCI preprocessing", ids.size(), LOG) : null;
    for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
      DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(iditer, rmax);
      // build list of critical distances
      DoubleIntArrayList cdist = new DoubleIntArrayList(neighbors.size() << 1);
      {
        int i = 0;
        DoubleDBIDListIter ni = neighbors.iter();
        while (ni.valid()) {
          final double curdist = ni.doubleValue();
          ++i;
          ni.advance();
          // Skip, if tied to the next object:
          if (ni.valid() && curdist == ni.doubleValue()) {
            continue;
          }
          cdist.append(curdist, i);
          // Scale radius, and reinsert
          if (alpha != 1.) {
            final double ri = curdist / alpha;
            if (ri <= rmax) {
              cdist.append(ri, Integer.MIN_VALUE);
            }
          }
        }
      }
      cdist.sort();

      // fill the gaps to have fast lookups of number of neighbors at a given
      // distance.
      int lastk = 0;
      for (int i = 0, size = cdist.size(); i < size; i++) {
        final int k = cdist.getInt(i);
        if (k == Integer.MIN_VALUE) {
          cdist.setValue(i, lastk);
        } else {
          lastk = k;
        }
      }
      // TODO: shrink the list, removing duplicate radii?

      interestingDistances.put(iditer, cdist);
      LOG.incrementProcessed(progressPreproc);
    }
    LOG.ensureCompleted(progressPreproc);
  }
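
  // Preprocessing idea (sketch): the "critical distances" of a point are the
  // radii at which its neighbor count n(p, r) changes, plus each such radius
  // divided by alpha - so that n(p, alpha*r) can later be read off the same
  // sorted list via find() without a second range query.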

  /**
   * Array of double-int values.
   *
   * @author Erich Schubert
   * @apiviz.exclude
   */
  protected static class DoubleIntArrayList {
    /** Double keys */
    double[] keys;

    /** Integer values */
    int[] vals;

    /** Used size */
    int size = 0;

    /**
     * Constructor.
     *
     * @param alloc Initial allocation.
     */
    public DoubleIntArrayList(int alloc) {
      keys = new double[alloc];
      vals = new int[alloc];
      size = 0;
    }

    /**
     * Collection size.
     *
     * @return Size
     */
    public int size() {
      return size;
    }

    /**
     * Get the key at the given position.
     *
     * @param i Position
     * @return Key
     */
    public double getDouble(int i) {
      return keys[i];
    }

    /**
     * Get the value at the given position.
     *
     * @param i Position
     * @return Value
     */
    public int getInt(int i) {
      return vals[i];
    }

    /**
     * Get the value at the given position.
     *
     * @param i Position
     * @param val New value
     */
    public void setValue(int i, int val) {
      vals[i] = val;
    }

    /**
     * Append a key-value pair.
     *
     * @param key Key to append
     * @param val Value to append.
     */
    public void append(double key, int val) {
      if (size == keys.length) {
        keys = Arrays.copyOf(keys, size << 1);
        vals = Arrays.copyOf(vals, size << 1);
      }
      keys[size] = key;
      vals[size] = val;
      ++size;
    }

    /**
     * Find the last position with a smaller or equal key.
     *
     * @param search Key
     * @return Position, or {@code -1} if all keys are larger than {@code search}
     */
    public int find(final double search) {
      int a = 0, b = size - 1;
      while (a <= b) {
        final int mid = (a + b) >>> 1;
        final double cur = keys[mid];
        if (cur > search) {
          b = mid - 1;
        } else { // less or equal!
          a = mid + 1;
        }
      }
      return b;
    }

    /** Sort the array list. */
    public void sort() {
      DoubleIntegerArrayQuickSort.sort(keys, vals, size);
    }
  }
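
  // Usage sketch (illustrative values): append (radius, neighbor count) pairs
  // in any order, sort once, then read the count at the largest radius <= r:
  //
  //   DoubleIntArrayList cdist = new DoubleIntArrayList(8);
  //   cdist.append(2.0, 5);
  //   cdist.append(1.0, 3);
  //   cdist.sort();
  //   int n = cdist.getInt(cdist.find(1.5)); // -> 3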

  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return TypeUtil.array(getDistanceFunction().getInputTypeRestriction());
  }

  @Override
  protected Logging getLogger() {
    return LOG;
  }

  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   * @apiviz.exclude
   * @param <O> Object type
   */
  public static class Parameterizer<O> extends AbstractDistanceBasedAlgorithm.Parameterizer<O> {
    protected double rmax;

    protected int nmin = 0;

    protected double alpha = 0.5;

    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      final DoubleParameter rmaxP = new DoubleParameter(RMAX_ID);
      if (config.grab(rmaxP)) {
        rmax = rmaxP.doubleValue();
      }

      final IntParameter nminP = new IntParameter(NMIN_ID, 20);
      if (config.grab(nminP)) {
        nmin = nminP.intValue();
      }

      final DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 0.5);
      if (config.grab(alphaP)) {
        alpha = alphaP.getValue();
      }
    }

    @Override
    protected LOCI<O> makeInstance() {
      return new LOCI<>(distanceFunction, rmax, nmin, alpha);
    }
  }
}
Example #28
/**
 * Implementation of the SUBCLU algorithm, an algorithm to detect arbitrarily shaped and positioned
 * clusters in subspaces. SUBCLU delivers for each subspace the same clusters DBSCAN would have
 * found, when applied to this subspace separately.
 *
 * <p>Reference: <br>
 * K. Kailing, H.-P. Kriegel, P. Kröger:<br>
 * Density connected Subspace Clustering for High Dimensional Data<br>
 * In Proc. SIAM Int. Conf. on Data Mining (SDM'04), Lake Buena Vista, FL, 2004.
 *
 * @author Elke Achtert
 * @apiviz.uses DBSCAN
 * @apiviz.uses DimensionSelectingSubspaceDistanceFunction
 * @apiviz.has SubspaceModel
 * @param <V> the type of FeatureVector handled by this Algorithm
 */
@Title("SUBCLU: Density connected Subspace Clustering")
@Description(
    "Algorithm to detect arbitrarily shaped and positioned clusters in subspaces. " //
        + "SUBCLU delivers for each subspace the same clusters DBSCAN would have found, " //
        + "when applied to this subspace seperately.")
@Reference(
    authors = "K. Kailing, H.-P. Kriegel, P. Kröger", //
    title = "Density connected Subspace Clustering for High Dimensional Data", //
    booktitle = "Proc. SIAM Int. Conf. on Data Mining (SDM'04), Lake Buena Vista, FL, 2004", //
    url = "http://www.siam.org/meetings/sdm04/proceedings/sdm04_023.pdf")
public class SUBCLU<V extends NumberVector> extends AbstractAlgorithm<Clustering<SubspaceModel>>
    implements SubspaceClusteringAlgorithm<SubspaceModel> {
  /** The logger for this class. */
  private static final Logging LOG = Logging.getLogger(SUBCLU.class);

  /**
   * The distance function to determine the distance between database objects.
   *
   * <p>Default value: {@link SubspaceEuclideanDistanceFunction}
   *
   * <p>Key: {@code -subclu.distancefunction}
   */
  public static final OptionID DISTANCE_FUNCTION_ID =
      new OptionID(
          "subclu.distancefunction",
          "Distance function to determine the distance between database objects.");

  /**
   * Parameter to specify the maximum radius of the neighborhood to be considered, must be suitable
   * to {@link DimensionSelectingSubspaceDistanceFunction}.
   *
   * <p>Key: {@code -subclu.epsilon}
   */
  public static final OptionID EPSILON_ID =
      new OptionID("subclu.epsilon", "The maximum radius of the neighborhood to be considered.");

  /**
   * Parameter to specify the threshold for minimum number of points in the epsilon-neighborhood of
   * a point, must be an integer greater than 0.
   *
   * <p>Key: {@code -subclu.minpts}
   */
  public static final OptionID MINPTS_ID =
      new OptionID(
          "subclu.minpts",
          "Threshold for minimum number of points in the epsilon-neighborhood of a point.");

  /** Holds the instance of the distance function specified by {@link #DISTANCE_FUNCTION_ID}. */
  private DimensionSelectingSubspaceDistanceFunction<V> distanceFunction;

  /** Holds the value of {@link #EPSILON_ID}. */
  private double epsilon;

  /** Holds the value of {@link #MINPTS_ID}. */
  private int minpts;

  /** Holds the result. */
  private Clustering<SubspaceModel> result;

  /**
   * Constructor.
   *
   * @param distanceFunction Distance function
   * @param epsilon Epsilon value
   * @param minpts Minpts value
   */
  public SUBCLU(
      DimensionSelectingSubspaceDistanceFunction<V> distanceFunction, double epsilon, int minpts) {
    super();
    this.distanceFunction = distanceFunction;
    this.epsilon = epsilon;
    this.minpts = minpts;
  }

  /**
   * Performs the SUBCLU algorithm on the given database.
   *
   * @param relation Relation to process
   * @return Clustering result
   */
  public Clustering<SubspaceModel> run(Relation<V> relation) {
    final int dimensionality = RelationUtil.dimensionality(relation);

    StepProgress stepprog = LOG.isVerbose() ? new StepProgress(dimensionality) : null;

    // Generate all 1-dimensional clusters
    LOG.beginStep(stepprog, 1, "Generate all 1-dimensional clusters.");

    // mapping of dimensionality to set of subspaces
    HashMap<Integer, List<Subspace>> subspaceMap = new HashMap<>();

    // list of 1-dimensional subspaces containing clusters
    List<Subspace> s_1 = new ArrayList<>();
    subspaceMap.put(0, s_1);

    // mapping of subspaces to list of clusters
    TreeMap<Subspace, List<Cluster<Model>>> clusterMap =
        new TreeMap<>(new Subspace.DimensionComparator());

    for (int d = 0; d < dimensionality; d++) {
      Subspace currentSubspace = new Subspace(d);
      List<Cluster<Model>> clusters = runDBSCAN(relation, null, currentSubspace);

      if (LOG.isDebuggingFiner()) {
        StringBuilder msg = new StringBuilder();
        msg.append('\n')
            .append(clusters.size())
            .append(" clusters in subspace ")
            .append(currentSubspace.dimensonsToString())
            .append(": \n");
        for (Cluster<Model> cluster : clusters) {
          msg.append("      " + cluster.getIDs() + "\n");
        }
        LOG.debugFiner(msg.toString());
      }

      if (!clusters.isEmpty()) {
        s_1.add(currentSubspace);
        clusterMap.put(currentSubspace, clusters);
      }
    }

    // Generate (d+1)-dimensional clusters from d-dimensional clusters
    for (int d = 0; d < dimensionality - 1; d++) {
      if (stepprog != null) {
        stepprog.beginStep(
            d + 2,
            "Generate "
                + (d + 2)
                + "-dimensional clusters from "
                + (d + 1)
                + "-dimensional clusters.",
            LOG);
      }

      List<Subspace> subspaces = subspaceMap.get(d);
      if (subspaces == null || subspaces.isEmpty()) {
        if (stepprog != null) {
          for (int dim = d + 1; dim < dimensionality - 1; dim++) {
            stepprog.beginStep(
                dim + 2,
                "Generation of"
                    + (dim + 2)
                    + "-dimensional clusters not applicable, because no more "
                    + (d + 2)
                    + "-dimensional subspaces found.",
                LOG);
          }
        }
        break;
      }

      List<Subspace> candidates = generateSubspaceCandidates(subspaces);
      List<Subspace> s_d = new ArrayList<>();

      for (Subspace candidate : candidates) {
        Subspace bestSubspace = bestSubspace(subspaces, candidate, clusterMap);
        if (LOG.isDebuggingFine()) {
          LOG.debugFine(
              "best subspace of "
                  + candidate.dimensonsToString()
                  + ": "
                  + bestSubspace.dimensonsToString());
        }

        List<Cluster<Model>> bestSubspaceClusters = clusterMap.get(bestSubspace);
        List<Cluster<Model>> clusters = new ArrayList<>();
        for (Cluster<Model> cluster : bestSubspaceClusters) {
          List<Cluster<Model>> candidateClusters = runDBSCAN(relation, cluster.getIDs(), candidate);
          if (!candidateClusters.isEmpty()) {
            clusters.addAll(candidateClusters);
          }
        }

        if (LOG.isDebuggingFine()) {
          StringBuilder msg = new StringBuilder();
          msg.append(clusters.size() + " cluster(s) in subspace " + candidate + ": \n");
          for (Cluster<Model> c : clusters) {
            msg.append("      " + c.getIDs() + "\n");
          }
          LOG.debugFine(msg.toString());
        }

        if (!clusters.isEmpty()) {
          s_d.add(candidate);
          clusterMap.put(candidate, clusters);
        }
      }

      if (!s_d.isEmpty()) {
        subspaceMap.put(d + 1, s_d);
      }
    }

    // build result
    int numClusters = 1;
    result = new Clustering<>("SUBCLU clustering", "subclu-clustering");
    for (Subspace subspace : clusterMap.descendingKeySet()) {
      List<Cluster<Model>> clusters = clusterMap.get(subspace);
      for (Cluster<Model> cluster : clusters) {
        Cluster<SubspaceModel> newCluster = new Cluster<>(cluster.getIDs());
        newCluster.setModel(new SubspaceModel(subspace, Centroid.make(relation, cluster.getIDs())));
        newCluster.setName("cluster_" + numClusters++);
        result.addToplevelCluster(newCluster);
      }
    }

    LOG.setCompleted(stepprog);
    return result;
  }

  /**
   * Returns the result of the algorithm.
   *
   * @return the result of the algorithm
   */
  public Clustering<SubspaceModel> getResult() {
    return result;
  }

  /**
   * Runs the DBSCAN algorithm on the specified partition of the database in the given subspace. If
   * parameter {@code ids} is null DBSCAN will be applied to the whole database.
   *
   * @param relation the database holding the objects to run DBSCAN on
   * @param ids the IDs of the database defining the partition to run DBSCAN on - if this parameter
   *     is null DBSCAN will be applied to the whole database
   * @param subspace the subspace to run DBSCAN on
   * @return the clustering result of the DBSCAN run
   */
  private List<Cluster<Model>> runDBSCAN(Relation<V> relation, DBIDs ids, Subspace subspace) {
    // distance function
    distanceFunction.setSelectedDimensions(subspace.getDimensions());

    ProxyDatabase proxy;
    if (ids == null) {
      // TODO: in this case, we might want to use an index - the proxy below
      // will prevent this!
      ids = relation.getDBIDs();
    }

    proxy = new ProxyDatabase(ids, relation);

    DBSCAN<V> dbscan = new DBSCAN<>(distanceFunction, epsilon, minpts);
    // run DBSCAN
    if (LOG.isVerbose()) {
      LOG.verbose("\nRun DBSCAN on subspace " + subspace.dimensonsToString());
    }
    Clustering<Model> dbsres = dbscan.run(proxy);

    // separate cluster and noise
    List<Cluster<Model>> clusterAndNoise = dbsres.getAllClusters();
    List<Cluster<Model>> clusters = new ArrayList<>();
    for (Cluster<Model> c : clusterAndNoise) {
      if (!c.isNoise()) {
        clusters.add(c);
      }
    }
    return clusters;
  }
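
  // Note on the pruning (sketch): SUBCLU relies on an anti-monotonicity
  // property - every cluster in a (d+1)-dimensional subspace must be contained
  // in a cluster of each of its d-dimensional subspaces. This is why the call
  // above may restrict DBSCAN to the IDs of a best subspace's clusters.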

  /**
   * Generates {@code d+1}-dimensional subspace candidates from the specified {@code d}-dimensional
   * subspaces.
   *
   * @param subspaces the {@code d}-dimensional subspaces
   * @return the {@code d+1}-dimensional subspace candidates
   */
  private List<Subspace> generateSubspaceCandidates(List<Subspace> subspaces) {
    List<Subspace> candidates = new ArrayList<>();

    if (subspaces.isEmpty()) {
      return candidates;
    }
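    // Apriori-style join (sketch): two d-dimensional subspaces join into a
    // (d+1)-dimensional candidate when they differ in exactly one dimension,
    // e.g. {0,1} + {0,2} -> {0,1,2}. The pruning below keeps a candidate only
    // if every d-dimensional subset (here also {1,2}) contains clusters.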

    // Generate (d+1)-dimensional candidate subspaces
    int d = subspaces.get(0).dimensionality();

    StringBuilder msgFine = new StringBuilder("\n");
    if (LOG.isDebuggingFiner()) {
      msgFine.append("subspaces ").append(subspaces).append('\n');
    }

    for (int i = 0; i < subspaces.size(); i++) {
      Subspace s1 = subspaces.get(i);
      for (int j = i + 1; j < subspaces.size(); j++) {
        Subspace s2 = subspaces.get(j);
        Subspace candidate = s1.join(s2);

        if (candidate != null) {
          if (LOG.isDebuggingFiner()) {
            msgFine.append("candidate: ").append(candidate.dimensonsToString()).append('\n');
          }
          // prune irrelevant candidate subspaces
          List<Subspace> lowerSubspaces = lowerSubspaces(candidate);
          if (LOG.isDebuggingFiner()) {
            msgFine.append("lowerSubspaces: ").append(lowerSubspaces).append('\n');
          }
          boolean irrelevantCandidate = false;
          for (Subspace s : lowerSubspaces) {
            if (!subspaces.contains(s)) {
              irrelevantCandidate = true;
              break;
            }
          }
          if (!irrelevantCandidate) {
            candidates.add(candidate);
          }
        }
      }
    }

    if (LOG.isDebuggingFiner()) {
      LOG.debugFiner(msgFine.toString());
    }
    if (LOG.isDebugging()) {
      StringBuilder msg = new StringBuilder();
      msg.append(d + 1).append("-dimensional candidate subspaces: ");
      for (Subspace candidate : candidates) {
        msg.append(candidate.dimensonsToString()).append(' ');
      }
      LOG.debug(msg.toString());
    }

    return candidates;
  }

  /**
   * Returns the list of all {@code (d-1)}-dimensional subspaces of the specified {@code
   * d}-dimensional subspace.
   *
   * @param subspace the {@code d}-dimensional subspace
   * @return a list of all {@code (d-1)}-dimensional subspaces, or {@code null} if {@code
   *     subspace} is at most one-dimensional
   */
  private List<Subspace> lowerSubspaces(Subspace subspace) {
    int dimensionality = subspace.dimensionality();
    if (dimensionality <= 1) {
      return null;
    }

    // order result according to the dimensions
    List<Subspace> result = new ArrayList<>();
    long[] dimensions = subspace.getDimensions();
    for (int dim = BitsUtil.nextSetBit(dimensions, 0);
        dim >= 0;
        dim = BitsUtil.nextSetBit(dimensions, dim + 1)) {
      long[] newDimensions = dimensions.clone();
      BitsUtil.clearI(newDimensions, dim);
      result.add(new Subspace(newDimensions));
    }

    return result;
  }

  /**
   * Determines the {@code d}-dimensional subspace of the {@code (d+1)} -dimensional candidate with
   * minimal number of objects in the cluster.
   *
   * @param subspaces the list of {@code d}-dimensional subspaces containing clusters
   * @param candidate the {@code (d+1)}-dimensional candidate subspace
   * @param clusterMap the mapping of subspaces to clusters
   * @return the {@code d}-dimensional subspace of the {@code (d+1)} -dimensional candidate with
   *     minimal number of objects in the cluster
   */
  private Subspace bestSubspace(
      List<Subspace> subspaces,
      Subspace candidate,
      TreeMap<Subspace, List<Cluster<Model>>> clusterMap) {
    Subspace bestSubspace = null;
    // Track the minimum cluster size across all subspaces of the candidate.
    int min = Integer.MAX_VALUE;

    for (Subspace subspace : subspaces) {

      if (subspace.isSubspace(candidate)) {
        List<Cluster<Model>> clusters = clusterMap.get(subspace);
        for (Cluster<Model> cluster : clusters) {
          int clusterSize = cluster.size();
          if (clusterSize < min) {
            min = clusterSize;
            bestSubspace = subspace;
          }
        }
      }
    }

    return bestSubspace;
  }
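
  // Rationale (per the SUBCLU paper): the clusters of the subspace chosen
  // here seed the DBSCAN runs in the candidate subspace, so preferring a
  // subspace with few clustered objects keeps those runs cheap.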

  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
  }

  @Override
  protected Logging getLogger() {
    return LOG;
  }

  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   * @apiviz.exclude
   */
  public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
    /** Minimum number of points in the epsilon-neighborhood of a point. */
    protected int minpts = 0;

    /** Epsilon radius threshold. */
    protected double epsilon;

    /** Distance function to use on the selected subspace dimensions. */
    protected DimensionSelectingSubspaceDistanceFunction<V> distance = null;

    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      ObjectParameter<DimensionSelectingSubspaceDistanceFunction<V>> param =
          new ObjectParameter<>(
              DISTANCE_FUNCTION_ID,
              DimensionSelectingSubspaceDistanceFunction.class,
              SubspaceEuclideanDistanceFunction.class);
      if (config.grab(param)) {
        distance = param.instantiateClass(config);
      }

      DoubleParameter epsilonP = new DoubleParameter(EPSILON_ID);
      if (config.grab(epsilonP)) {
        epsilon = epsilonP.getValue();
      }

      IntParameter minptsP = new IntParameter(MINPTS_ID);
      minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
      if (config.grab(minptsP)) {
        minpts = minptsP.getValue();
      }
    }

    @Override
    protected SUBCLU<V> makeInstance() {
      return new SUBCLU<>(distance, epsilon, minpts);
    }
  }
}
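
An instance equivalent to the one built by makeInstance() above can also be obtained through ELKI's parameterization API. A minimal sketch, assuming EPSILON_ID and MINPTS_ID are exposed as public constants on SUBCLU; the parameter values are placeholders, not recommended defaults:

    // Sketch only: option-ID constants and values are assumptions.
    ListParameterization params = new ListParameterization();
    params.addParameter(SUBCLU.EPSILON_ID, 0.01);
    params.addParameter(SUBCLU.MINPTS_ID, 20);
    SUBCLU<DoubleVector> subclu =
        ClassGenericsUtil.parameterizeOrAbort(SUBCLU.class, params);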
Example #29
  /**
   * Run the algorithm
   *
   * @param database Database to process
   * @param relation Relation to process
   * @return Outlier result
   */
  public OutlierResult run(Database database, Relation<O> relation) {
    DistanceQuery<O> distFunc = database.getDistanceQuery(relation, getDistanceFunction());
    RangeQuery<O> rangeQuery = database.getRangeQuery(distFunc);
    DBIDs ids = relation.getDBIDs();

    // LOCI preprocessing step
    WritableDataStore<DoubleIntArrayList> interestingDistances =
        DataStoreUtil.makeStorage(
            relation.getDBIDs(),
            DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_SORTED,
            DoubleIntArrayList.class);
    precomputeInterestingRadii(ids, rangeQuery, interestingDistances);
    // LOCI main step
    FiniteProgress progressLOCI =
        LOG.isVerbose() ? new FiniteProgress("LOCI scores", relation.size(), LOG) : null;
    WritableDoubleDataStore mdef_norm =
        DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
    WritableDoubleDataStore mdef_radius =
        DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
    DoubleMinMax minmax = new DoubleMinMax();

    // Shared instance, to save allocations.
    MeanVariance mv_n_r_alpha = new MeanVariance();

    for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
      final DoubleIntArrayList cdist = interestingDistances.get(iditer);
      final double maxdist = cdist.getDouble(cdist.size() - 1);
      final int maxneig = cdist.getInt(cdist.size() - 1);

      double maxmdefnorm = 0.0;
      double maxnormr = 0;
      if (maxneig >= nmin) {
        // Compute the largest neighborhood we will need.
        DoubleDBIDList maxneighbors = rangeQuery.getRangeForDBID(iditer, maxdist);
        // TODO: Ensure the result is sorted. This is currently implied.

        // For any critical distance, compute the normalized MDEF score.
        for (int i = 0, size = cdist.size(); i < size; i++) {
          // Only start when minimum size is fulfilled
          if (cdist.getInt(i) < nmin) {
            continue;
          }
          final double r = cdist.getDouble(i);
          final double alpha_r = alpha * r;
          // look up n(p_i, \alpha * r) in the list (note: alpha_r itself need not occur in cdist)
          final int n_alphar = cdist.getInt(cdist.find(alpha_r));
          // compute \hat{n}(p_i, r, \alpha) and the corresponding \sigma_{MDEF}
          mv_n_r_alpha.reset();
          for (DoubleDBIDListIter neighbor = maxneighbors.iter();
              neighbor.valid();
              neighbor.advance()) {
            // Stop at radius r
            if (neighbor.doubleValue() > r) {
              break;
            }
            DoubleIntArrayList cdist2 = interestingDistances.get(neighbor);
            int rn_alphar = cdist2.getInt(cdist2.find(alpha_r));
            mv_n_r_alpha.put(rn_alphar);
          }
          // We only use the average and standard deviation
          final double nhat_r_alpha = mv_n_r_alpha.getMean();
          final double sigma_nhat_r_alpha = mv_n_r_alpha.getNaiveStddev();

          // Redundant divisions by nhat_r_alpha removed.
          final double mdef = nhat_r_alpha - n_alphar;
          final double sigmamdef = sigma_nhat_r_alpha;
          final double mdefnorm = mdef / sigmamdef;

          if (mdefnorm > maxmdefnorm) {
            maxmdefnorm = mdefnorm;
            maxnormr = r;
          }
        }
      } else {
        // FIXME: when nmin was not fulfilled - what is the proper value then?
        maxmdefnorm = Double.POSITIVE_INFINITY;
        maxnormr = maxdist;
      }
      mdef_norm.putDouble(iditer, maxmdefnorm);
      mdef_radius.putDouble(iditer, maxnormr);
      minmax.put(maxmdefnorm);
      LOG.incrementProcessed(progressLOCI);
    }
    LOG.ensureCompleted(progressLOCI);
    DoubleRelation scoreResult =
        new MaterializedDoubleRelation(
            "LOCI normalized MDEF", "loci-mdef-outlier", mdef_norm, relation.getDBIDs());
    OutlierScoreMeta scoreMeta =
        new QuotientOutlierScoreMeta(
            minmax.getMin(), minmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 0.0);
    OutlierResult result = new OutlierResult(scoreMeta, scoreResult);
    result.addChildResult(
        new MaterializedDoubleRelation(
            "LOCI MDEF Radius", "loci-critical-radius", mdef_radius, relation.getDBIDs()));
    return result;
  }
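
The shortcut flagged by the comment "Redundant divisions by nhat_r_alpha removed" is worth spelling out: the LOCI paper defines MDEF(p, r, \alpha) = 1 - n(p, \alpha r) / \hat{n}(p, r, \alpha) and normalizes it by \sigma_{MDEF} = \sigma_{\hat{n}} / \hat{n}. The common 1/\hat{n} factor cancels in the quotient, so the loop can compare \hat{n} - n(p, \alpha r) against \sigma_{\hat{n}} directly. A minimal sketch of the equivalence; the helper name is hypothetical:

    // Hypothetical helper: both formulations yield the same normalized score.
    static double normalizedMdef(double nhat, double nAlphaR, double sigmaNhat) {
      double mdef = 1.0 - nAlphaR / nhat;   // MDEF as defined in the paper
      double sigmaMdef = sigmaNhat / nhat;  // sigma_MDEF as defined in the paper
      return mdef / sigmaMdef;              // equals (nhat - nAlphaR) / sigmaNhat
    }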
Example #30
/**
 * Density-Based Clustering of Applications with Noise (DBSCAN), an algorithm to find
 * density-connected sets in a database.
 *
 * <p>Reference: <br>
 * M. Ester, H.-P. Kriegel, J. Sander, X. Xu<br>
 * A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise<br>
 * In Proc. 2nd Int. Conf. on Knowledge Discovery and Data Mining (KDD '96), Portland, OR, 1996.
 *
 * @author Arthur Zimek
 * @param <O> the type of Object the algorithm is applied to
 */
@Title("DBSCAN: Density-Based Clustering of Applications with Noise")
@Description(
    "Algorithm to find density-connected sets in a database based on the parameters 'minpts' and 'epsilon' (specifying a volume). "
        + "These two parameters determine a density threshold for clustering.")
@Reference(
    authors = "M. Ester, H.-P. Kriegel, J. Sander, X. Xu", //
    title =
        "A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise", //
    booktitle =
        "Proc. 2nd Int. Conf. on Knowledge Discovery and Data Mining (KDD '96), Portland, OR, 1996", //
    url = "http://www.aaai.org/Papers/KDD/1996/KDD96-037")
public class DBSCAN<O> extends AbstractDistanceBasedAlgorithm<O, Clustering<Model>>
    implements ClusteringAlgorithm<Clustering<Model>> {
  /** The logger for this class. */
  private static final Logging LOG = Logging.getLogger(DBSCAN.class);

  /** Holds the epsilon radius threshold. */
  protected double epsilon;

  /** Holds the minimum cluster size. */
  protected int minpts;

  /** Holds a list of clusters found. */
  protected List<ModifiableDBIDs> resultList;

  /** Holds a set of noise. */
  protected ModifiableDBIDs noise;

  /** Holds a set of processed ids. */
  protected ModifiableDBIDs processedIDs;

  /** Cumulative number of neighbors found, for the average-neighbors statistic. */
  protected long ncounter;

  /**
   * Constructor with parameters.
   *
   * @param distanceFunction Distance function
   * @param epsilon Epsilon value
   * @param minpts Minpts parameter
   */
  public DBSCAN(DistanceFunction<? super O> distanceFunction, double epsilon, int minpts) {
    super(distanceFunction);
    this.epsilon = epsilon;
    this.minpts = minpts;
  }

  /** Performs the DBSCAN algorithm on the given database. */
  public Clustering<Model> run(Relation<O> relation) {
    final int size = relation.size();
    if (size < minpts) {
      Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
      result.addToplevelCluster(
          new Cluster<Model>(relation.getDBIDs(), true, ClusterModel.CLUSTER));
      return result;
    }

    RangeQuery<O> rangeQuery = QueryUtil.getRangeQuery(relation, getDistanceFunction());
    resultList = new ArrayList<>();
    noise = DBIDUtil.newHashSet();
    runDBSCAN(relation, rangeQuery);

    double averagen = ncounter / (double) relation.size();
    LOG.statistics(new DoubleStatistic(DBSCAN.class.getName() + ".average-neighbors", averagen));
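    // Heuristic sanity checks (each object counts itself as a neighbor):
    // e.g. with minpts = 4, warn below an average of 1 + 0.1 * 3 = 1.3
    // neighbors (epsilon likely too small) or above 400 (likely too large).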
    if (averagen < 1 + 0.1 * (minpts - 1)) {
      LOG.warning("There are very few neighbors found. Epsilon may be too small.");
    }
    if (averagen > 100 * minpts) {
      LOG.warning("There are very many neighbors found. Epsilon may be too large.");
    }

    Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
    for (ModifiableDBIDs res : resultList) {
      result.addToplevelCluster(new Cluster<Model>(res, ClusterModel.CLUSTER));
    }
    result.addToplevelCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER));
    return result;
  }

  /**
   * Run the DBSCAN algorithm
   *
   * @param relation Data relation
   * @param rangeQuery Range query class
   */
  protected void runDBSCAN(Relation<O> relation, RangeQuery<O> rangeQuery) {
    final int size = relation.size();
    FiniteProgress objprog =
        LOG.isVerbose() ? new FiniteProgress("Processing objects", size, LOG) : null;
    IndefiniteProgress clusprog =
        LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null;

    processedIDs = DBIDUtil.newHashSet(size);
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      if (!processedIDs.contains(iditer)) {
        expandCluster(relation, rangeQuery, iditer, objprog, clusprog);
      }
      if (objprog != null && clusprog != null) {
        objprog.setProcessed(processedIDs.size(), LOG);
        clusprog.setProcessed(resultList.size(), LOG);
      }
      if (processedIDs.size() == size) {
        break;
      }
    }
    // Finish progress logging
    LOG.ensureCompleted(objprog);
    LOG.setCompleted(clusprog);
  }

  /**
   * DBSCAN function expandCluster.
   *
   * <p>Border objects become members of the first possible cluster.
   *
   * @param relation Database relation to run on
   * @param rangeQuery Range query to use
   * @param startObjectID potential seed of a new cluster
   * @param objprog the progress object for logging the current status
   * @param clusprog the progress object for logging the number of clusters
   */
  protected void expandCluster(
      Relation<O> relation,
      RangeQuery<O> rangeQuery,
      DBIDRef startObjectID,
      FiniteProgress objprog,
      IndefiniteProgress clusprog) {
    DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(startObjectID, epsilon);
    ncounter += neighbors.size();

    // startObject is not a core object
    if (neighbors.size() < minpts) {
      noise.add(startObjectID);
      processedIDs.add(startObjectID);
      if (objprog != null) {
        objprog.incrementProcessed(LOG);
      }
      return;
    }

    ModifiableDBIDs currentCluster = DBIDUtil.newArray();
    currentCluster.add(startObjectID);
    processedIDs.add(startObjectID);

    // try to expand the cluster
    HashSetModifiableDBIDs seeds = DBIDUtil.newHashSet();
    processNeighbors(neighbors.iter(), currentCluster, seeds);

    DBIDVar o = DBIDUtil.newVar();
    while (!seeds.isEmpty()) {
      seeds.pop(o);
      neighbors = rangeQuery.getRangeForDBID(o, epsilon);
      ncounter += neighbors.size();

      if (neighbors.size() >= minpts) {
        processNeighbors(neighbors.iter(), currentCluster, seeds);
      }

      if (objprog != null) {
        objprog.incrementProcessed(LOG);
      }
    }
    resultList.add(currentCluster);
    if (clusprog != null) {
      clusprog.setProcessed(resultList.size(), LOG);
    }
  }

  /**
   * Process the neighbors of a core point: newly seen objects are added to the
   * seed set and the cluster; former noise objects are reclaimed as border
   * points of the cluster.
   *
   * @param neighbor Iterator over neighbors
   * @param currentCluster Current cluster
   * @param seeds Seed set
   */
  private void processNeighbors(
      DBIDIter neighbor, ModifiableDBIDs currentCluster, HashSetModifiableDBIDs seeds) {
    for (; neighbor.valid(); neighbor.advance()) {
      if (processedIDs.add(neighbor)) {
        // Unseen object: schedule it for expansion.
        seeds.add(neighbor);
      } else if (!noise.remove(neighbor)) {
        // Already assigned to a cluster: nothing to do.
        continue;
      }
      // Newly seen objects and reclaimed noise (border objects) join the cluster.
      currentCluster.add(neighbor);
    }
  }

  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return TypeUtil.array(getDistanceFunction().getInputTypeRestriction());
  }

  @Override
  protected Logging getLogger() {
    return LOG;
  }

  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   * @apiviz.exclude
   */
  public static class Parameterizer<O> extends AbstractDistanceBasedAlgorithm.Parameterizer<O> {
    /**
     * Parameter to specify the maximum radius of the neighborhood to be considered, must be
     * suitable to the distance function specified.
     */
    public static final OptionID EPSILON_ID =
        new OptionID("dbscan.epsilon", "The maximum radius of the neighborhood to be considered.");

    /**
     * Parameter to specify the threshold for minimum number of points in the epsilon-neighborhood
     * of a point, must be an integer greater than 0.
     */
    public static final OptionID MINPTS_ID =
        new OptionID(
            "dbscan.minpts",
            "Threshold for minimum number of points in the epsilon-neighborhood of a point. The suggested value is '2 * dim - 1'.");

    /** Holds the epsilon radius threshold. */
    protected double epsilon;

    /** Holds the minimum cluster size. */
    protected int minpts;

    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      DoubleParameter epsilonP =
          new DoubleParameter(EPSILON_ID) //
              .addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
      if (config.grab(epsilonP)) {
        epsilon = epsilonP.getValue();
      }

      IntParameter minptsP =
          new IntParameter(MINPTS_ID) //
              .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
      if (config.grab(minptsP)) {
        minpts = minptsP.getValue();
        if (minpts <= 2) {
          LOG.warning(
              "DBSCAN with minPts <= 2 is equivalent to single-link clustering at a single height. Consider using larger values of minPts.");
        }
      }
    }

    @Override
    protected DBSCAN<O> makeInstance() {
      return new DBSCAN<>(distanceFunction, epsilon, minpts);
    }
  }
}
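
Finally, a minimal sketch of direct usage of this class, assuming an already initialized ELKI database; the database and relation variables are placeholders, and EuclideanDistanceFunction.STATIC stands in for whatever metric fits the data:

    // Sketch only: `database` is assumed to be initialized elsewhere.
    Relation<NumberVector> relation = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD);
    DBSCAN<NumberVector> dbscan =
        new DBSCAN<>(EuclideanDistanceFunction.STATIC, 0.5 /* epsilon */, 5 /* minpts */);
    Clustering<Model> clustering = dbscan.run(relation);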