Пример #1
  /**
   * Subspace relevance test.
   *
   * @param subspace Subspace to test
   * @param neigh Neighbor list
   * @param kernel Kernel density estimator
   * @return relevance test result
   */
  protected boolean relevantSubspace(
      long[] subspace, DoubleDBIDList neigh, KernelDensityEstimator kernel) {
    Relation<V> relation = kernel.relation;
    final double crit = K_S_CRITICAL001 / Math.sqrt(neigh.size());

    for (int dim = BitsUtil.nextSetBit(subspace, 0);
        dim > 0;
        dim = BitsUtil.nextSetBit(subspace, dim + 1)) {
      // TODO: can we save this copy somehow?
      double[] data = new double[neigh.size()];
        int count = 0;
        for (DBIDIter neighbor = neigh.iter(); neighbor.valid(); neighbor.advance()) {
          V vector = relation.get(neighbor);
          data[count] = vector.doubleValue(dim);
        assert (count == neigh.size());

      final double norm = data[data.length - 1] - data[0];
      final double min = data[0];

      // Kolmogorow-Smirnow-Test against uniform distribution:
      for (int j = 1; j < data.length - 2; j++) {
        double delta = (j / (data.length - 1.)) - ((data[j] - min) / norm);
        if (Math.abs(delta) > crit) {
          return false;
    return true;
Пример #2
 /** Resets the values for the next cluster search. */
 protected void reset() {
   rows = BitsUtil.ones(rowM.length);
   rowcard = rowM.length;
   cols = BitsUtil.ones(colM.length);
   colcard = colM.length;
Пример #3
  /**
   * Main loop for OUTRES
   *
   * @param relation Relation to process
   * @return Outlier detection result
   */
  public OutlierResult run(Relation<V> relation) {
    // Computes one score per object with outresScore(...), stores it in `ranks`, and
    // wraps the stored scores in an OutlierResult.
    // NOTE(review): this listing is visibly truncated (closing braces are absent), so
    // further statements may be missing; the notes below mark the suspicious spots.
    WritableDoubleDataStore ranks =
        DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
    DoubleMinMax minmax = new DoubleMinMax();

    KernelDensityEstimator kernel = new KernelDensityEstimator(relation);
    // Bitmask with one position per dimension; outresScore sets and clears bits in it.
    long[] subspace = BitsUtil.zero(kernel.dim);

    // The progress logger is only created in verbose mode; otherwise it stays null.
    FiniteProgress progress =
        LOG.isVerbose() ? new FiniteProgress("OUTRES scores", relation.size(), LOG) : null;

    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      // Every object starts the search at dimension 0 with the shared subspace mask.
      double score = outresScore(0, subspace, iditer, kernel);
      ranks.putDouble(iditer, score);
      // NOTE(review): nothing visible feeds `score` into `minmax` or advances
      // `progress` - presumably those statements were lost; confirm against upstream.

    // NOTE(review): `minmax` is read here although the visible code never updates it,
    // and `meta` is not passed to the OutlierResult constructed below - verify.
    OutlierScoreMeta meta =
        new InvertedOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0., 1., 1.);
    OutlierResult outresResult =
        new OutlierResult(
            new MaterializedDoubleRelation("OUTRES", "outres-score", ranks, relation.getDBIDs()));
    return outresResult;
  protected double minDistObject(SpatialComparable mbr, NumberVector v) {
    if (mbr.getDimensionality() != v.getDimensionality()) {
      throw new IllegalArgumentException(
          "Different dimensionality of objects\n  "
              + "first argument: "
              + mbr.toString()
              + "\n  "
              + "second argument: "
              + v.toString());

    double agg = 0.;
    for (int d = BitsUtil.nextSetBit(dimensions, 0);
        d >= 0;
        d = BitsUtil.nextSetBit(dimensions, d + 1)) {
      final double value = v.doubleValue(d);
      final double omin = mbr.getMin(d);
      final double diff1 = omin - value;
      if (diff1 > 0.) {
        if (diff1 > agg) {
          agg = diff1;
      } else {
        final double omax = mbr.getMax(d);
        final double diff2 = value - omax;
        if (diff2 > agg) {
          agg = diff2;
    return agg;
Пример #5
  /**
   * Builds the Delaunay triangulation of {@code points}, then walks the triangles
   * that have not been marked as used yet; for a triangle whose {@code r2} is at
   * most {@code alpha2} its three neighbor edges are handed to
   * {@code processNeighbor}, and whatever ends up in the working list is emitted as
   * one polygon.
   *
   * <p>NOTE(review): this listing is truncated (closing braces are absent), so the
   * exact nesting of the two inner {@code if} statements cannot be confirmed here.
   *
   * @return the polygons collected
   */
  public List<Polygon> compute() {
    // Compute delaunay triangulation:
    delaunay = (new SweepHullDelaunay2D(points)).getDelaunay();

    List<Polygon> polys = new ArrayList<>();

    // Working data
    // One bit per triangle, set once the triangle has been visited.
    long[] used = BitsUtil.zero(delaunay.size());
    // Points of the polygon currently being assembled.
    List<double[]> cur = new ArrayList<>();

    // Starts at triangle 0 and then jumps to the next triangle whose bit is still
    // clear; the "i >= 0" test guards against a negative result of nextClearBit.
    for (int i = 0 /* = used.nextClearBit(0) */;
        i < delaunay.size() && i >= 0;
        i = BitsUtil.nextClearBit(used, i + 1)) {
      if (!BitsUtil.get(used, i)) {
        BitsUtil.setI(used, i);
        SweepHullDelaunay2D.Triangle tri = delaunay.get(i);
        // NOTE(review): this test is inclusive (<=) while processNeighbor compares
        // with a strict (<) - confirm which boundary behavior is intended.
        if (tri.r2 <= alpha2) {
          // Check neighbors
          processNeighbor(cur, used, i, tri.ab, tri.b);
          processNeighbor(cur, used, i, tri.bc, tri.c);
          processNeighbor(cur, used, i, tri.ca, tri.a);
        // Emit the collected points as a polygon and start a fresh working list.
        if (cur.size() > 0) {
          polys.add(new Polygon(cur));
          cur = new ArrayList<>();

    return polys;
 public double minDist(SpatialComparable mbr1, SpatialComparable mbr2) {
   if (mbr1.getDimensionality() != mbr2.getDimensionality()) {
     throw new IllegalArgumentException(
         "Different dimensionality of objects\n  "
             + "first argument: "
             + mbr1.toString()
             + "\n  "
             + "second argument: "
             + mbr2.toString());
   double agg = 0.;
   for (int d = BitsUtil.nextSetBit(dimensions, 0);
       d >= 0;
       d = BitsUtil.nextSetBit(dimensions, d + 1)) {
     final double max1 = mbr1.getMax(d);
     final double min2 = mbr2.getMin(d);
     if (max1 < min2) {
       double v = min2 - max1;
       if (v > agg) {
         agg = v;
     } else {
       final double min1 = mbr1.getMin(d);
       final double max2 = mbr2.getMax(d);
       double v = min1 - max2;
       if (v > agg) {
         agg = v;
   return agg;
Пример #7
 /**
  * Select or deselect a column.
  *
  * @param cnum Column to select
  * @param set Value to set
  */
 protected void selectColumn(int cnum, boolean set) {
   if (set) {
     BitsUtil.setI(cols, cnum);
   } else {
     BitsUtil.clearI(cols, cnum);
Пример #8
 /**
  * Select or deselect a row.
  *
  * @param rnum Row to select
  * @param set Value to set
  */
 protected void selectRow(int rnum, boolean set) {
   if (set) {
     BitsUtil.setI(rows, rnum);
   } else {
     BitsUtil.clearI(rows, rnum);
Пример #9
 /**
  * Constructor.
  *
  * @param rows Row dimensionality.
  * @param cols Column dimensionality.
  */
 protected BiclusterCandidate(int rows, int cols) {
   // Row state: selection mask from BitsUtil.ones, inverted-row mask from
   // BitsUtil.zero, cardinality equal to the row count, one slot per row in rowM.
   this.rows = BitsUtil.ones(rows);
   this.irow = BitsUtil.zero(rows);
   this.rowcard = rows;
   this.rowM = new double[rows];
   // Column state, set up the same way (there is no inversion mask for columns).
   this.cols = BitsUtil.ones(cols);
   this.colcard = cols;
   this.colM = new double[cols];
 }
 public double norm(NumberVector obj) {
   double agg = 0.;
   for (int d = BitsUtil.nextSetBit(dimensions, 0);
       d >= 0;
       d = BitsUtil.nextSetBit(dimensions, d + 1)) {
     double v = Math.abs(obj.doubleValue(d));
     if (v > agg) {
       agg = v;
   return agg;
Пример #11
 /**
  * Visit a column of the matrix.
  *
  * @param mat Data matrix
  * @param col Column to visit
  * @param mode Operation mode
  * @param visitor Visitor function
  */
 protected void visitColumn(double[][] mat, int col, int mode, CellVisitor visitor) {
   boolean cselected = BitsUtil.get(cols, col);
   // For efficiency, we manually iterate over the rows and column bitmasks.
   // This saves repeated shifting needed by the manual bit access.
   for (int rpos = 0, rlpos = 0; rlpos < rows.length; ++rlpos) {
     long rlong = rows[rlpos];
     // Fast skip blocks of 64 masked values.
     if (mode == CellVisitor.SELECTED && rlong == 0L) {
       rpos += Long.SIZE;
     if (mode == CellVisitor.NOT_SELECTED && rlong == -1L) {
       rpos += Long.SIZE;
     for (int i = 0; i < Long.SIZE && rpos < rowM.length; ++i, ++rpos, rlong >>>= 1) {
       boolean rselected = ((rlong & 1L) == 1L);
       if (mode == CellVisitor.SELECTED && !rselected) {
       if (mode == CellVisitor.NOT_SELECTED && rselected) {
       boolean stop = visitor.visit(mat[rpos][col], rpos, col, rselected, cselected);
       if (stop) {
Пример #12
 /**
  * Visit a row of the data matrix.
  *
  * @param mat Data matrix
  * @param row Row to visit
  * @param mode Operation mode
  * @param visitor Visitor function
  */
 protected void visitRow(double[][] mat, int row, int mode, CellVisitor visitor) {
   boolean rselected = BitsUtil.get(rows, row);
   final double[] rowdata = mat[row];
   for (int cpos = 0, clpos = 0; clpos < cols.length; ++clpos) {
     long clong = cols[clpos];
     // Fast skip blocks of 64 masked values.
     if (mode == CellVisitor.SELECTED && clong == 0L) {
       cpos += Long.SIZE;
     if (mode == CellVisitor.NOT_SELECTED && clong == -1L) {
       cpos += Long.SIZE;
     for (int j = 0; j < Long.SIZE && cpos < colM.length; ++j, ++cpos, clong >>>= 1) {
       boolean cselected = ((clong & 1L) == 1L);
       if (mode == CellVisitor.SELECTED && !cselected) {
       if (mode == CellVisitor.NOT_SELECTED && cselected) {
       boolean stop = visitor.visit(rowdata[cpos], row, cpos, rselected, cselected);
       if (stop) {
Пример #13
  public long[] getVisibleDimensions2D() {
    final int dim = proj.getDimensionality();
    long[] actDim = BitsUtil.zero(dim);
    double[] vScale = new double[dim];
    for (int d = 0; d < dim; d++) {
      Arrays.fill(vScale, 0);
      vScale[d] = 1;
      double[] vRender = fastProjectScaledToRenderSpace(vScale);

      // TODO: Can't we do this by inspecting the projection matrix directly?
      if (vRender[0] > 0.0 || vRender[0] < 0.0 || vRender[1] != 0) {
        BitsUtil.setI(actDim, d);
    return actDim;
Пример #14
  /**
   * Main loop of OUTRES. Run for each object
   *
   * @param s start dimension
   * @param subspace Current subspace
   * @param id Current object ID
   * @param kernel Kernel
   * @return Score
   */
  public double outresScore(
      final int s, long[] subspace, DBIDRef id, KernelDensityEstimator kernel) {
    // Recursively extends `subspace` by one dimension at a time (starting at index s)
    // and multiplies the contributions of the visited subspaces into one score.
    // NOTE(review): this listing is truncated (closing braces are absent), so the
    // nesting below is inferred from indentation only and statements may be missing.
    double score = 1.0; // Initial score is 1.0
    final SubspaceEuclideanDistanceFunction df = new SubspaceEuclideanDistanceFunction(subspace);
    // NOTE(review): created once and never reset in the visible code, so densities
    // from earlier loop iterations would stay in the statistics - verify.
    MeanVariance meanv = new MeanVariance();

    for (int i = s; i < kernel.dim; i++) {
      // NOTE(review): the body of this `if` is not visible; presumably it skips
      // dimensions already contained in the subspace - confirm.
      if (BitsUtil.get(subspace, i)) { // TODO: needed? Or should we always start
        // with i=0?
      // Add dimension i for this iteration; it is removed again at the end of it.
      BitsUtil.setI(subspace, i);
      final double adjustedEps = kernel.adjustedEps(kernel.dim);
      // Query with a larger window, to also get neighbors of neighbors
      // Subspace euclidean is metric!
      final double range = adjustedEps * 2.;
      RangeQuery<V> rq = QueryUtil.getRangeQuery(kernel.relation, df, range);

      // neighc: candidates within the doubled radius; neigh: refined to adjustedEps.
      DoubleDBIDList neighc = rq.getRangeForDBID(id, range);
      DoubleDBIDList neigh = refineRange(neighc, adjustedEps);
      if (neigh.size() > 2) {
        // Relevance test
        if (relevantSubspace(subspace, neigh, kernel)) {
          final double density = kernel.subspaceDensity(subspace, neigh);
          // Compute mean and standard deviation for densities of neighbors.
          for (DoubleDBIDListIter neighbor = neigh.iter(); neighbor.valid(); neighbor.advance()) {
            DoubleDBIDList n2 = subsetNeighborhoodQuery(neighc, neighbor, df, adjustedEps, kernel);
            meanv.put(kernel.subspaceDensity(subspace, n2));
          // How far the object's density lies below the neighbors' mean density,
          // measured in units of two sample standard deviations.
          final double deviation = (meanv.getMean() - density) / (2. * meanv.getSampleStddev());
          // High deviation:
          if (deviation >= 1) {
            score *= (density / deviation);
          // Recursion
          score *= outresScore(i + 1, subspace, id, kernel);
      // Restore the shared bitmask before trying the next dimension.
      BitsUtil.clearI(subspace, i);
    return score;
Пример #15
  /**
   * Extracts {@code n} biclusters from the relation: each round runs multiple node
   * deletion, single node deletion and node addition on the candidate, masks the
   * matrix, and adds the resulting cluster to the result. Finally a noise cluster is
   * added if the noise set is not empty.
   *
   * <p>NOTE(review): this listing is truncated - closing braces are absent and the
   * three string expressions after {@code LOG.isVeryVerbose()} have lost the logging
   * call that consumed them.
   *
   * @return the clustering built from the extracted biclusters
   */
  public Clustering<BiclusterWithInversionsModel> biclustering() {
    double[][] mat = RelationUtil.relationAsMatrix(relation, rowIDs);

    BiclusterCandidate cand = new BiclusterCandidate(getRowDim(), getColDim());

    Clustering<BiclusterWithInversionsModel> result =
        new Clustering<>("Cheng-and-Church", "Cheng and Church Biclustering");
    // Starts out containing every object of the relation.
    // NOTE(review): no visible statement removes clustered objects from this set or
    // resets the candidate between rounds - presumably lost; confirm.
    ModifiableDBIDs noise = DBIDUtil.newHashSet(relation.getDBIDs());

    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Extracting Cluster", n, LOG) : null;
    for (int i = 0; i < n; i++) {
      multipleNodeDeletion(mat, cand);
      if (LOG.isVeryVerbose()) {
            "Residue after Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      singleNodeDeletion(mat, cand);
      if (LOG.isVeryVerbose()) {
            "Residue after Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      nodeAddition(mat, cand);
      if (LOG.isVeryVerbose()) {
            "Residue after Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      cand.maskMatrix(mat, dist);
      // The model records the selected columns and the inverted rows; the cluster
      // members are the selected rows.
      BiclusterWithInversionsModel model =
          new BiclusterWithInversionsModel(colsBitsetToIDs(cand.cols), rowsBitsetToIDs(cand.irow));
      final ArrayDBIDs cids = rowsBitsetToIDs(cand.rows);
      result.addToplevelCluster(new Cluster<>(cids, model));

      if (LOG.isVerbose()) {
        LOG.verbose("Score of bicluster " + (i + 1) + ": " + cand.residue + "\n");
        LOG.verbose("Number of rows: " + cand.rowcard + "\n");
        LOG.verbose("Number of columns: " + cand.colcard + "\n");
        // LOG.verbose("Total number of masked values: " + maskedVals.size() +
        // "\n");
    // Add a noise cluster, full-dimensional.
    if (!noise.isEmpty()) {
      long[] allcols = BitsUtil.ones(getColDim());
      BiclusterWithInversionsModel model =
          new BiclusterWithInversionsModel(colsBitsetToIDs(allcols), DBIDUtil.EMPTYDBIDS);
      result.addToplevelCluster(new Cluster<>(noise, true, model));
    return result;
Пример #16
    /**
     * Compute density in the given subspace.
     *
     * @param subspace Subspace
     * @param neighbors Neighbor distance list
     * @return Density
     */
    protected double subspaceDensity(long[] subspace, DoubleDBIDList neighbors) {
      final double bandwidth = optimalBandwidth(BitsUtil.cardinality(subspace));

      double density = 0;
      for (DoubleDBIDListIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) {
        double v = neighbor.doubleValue() / bandwidth;
        if (v < 1) {
          density += 1 - (v * v);

      return density / relation.size();
  public double distance(NumberVector v1, NumberVector v2) {
    if (v1.getDimensionality() != v2.getDimensionality()) {
      throw new IllegalArgumentException(
          "Different dimensionality of FeatureVectors\n  "
              + "first argument: "
              + v1
              + "\n  "
              + "second argument: "
              + v2);

    double agg = 0.;
    for (int d = BitsUtil.nextSetBit(dimensions, 0);
        d >= 0;
        d = BitsUtil.nextSetBit(dimensions, d + 1)) {
      double v = Math.abs(v1.doubleValue(d) - v2.doubleValue(d));
      if (v > agg) {
        agg = v;
    return agg;
Пример #18
 /**
  * Follows the triangle across one edge: marks the neighboring triangle {@code ab}
  * as used and, if its {@code r2} is below {@code alpha2}, recurses into that
  * triangle's two other edges, starting after the edge shared with triangle
  * {@code i}.
  *
  * <p>NOTE(review): this listing is truncated. The body of the "already used" check
  * is missing, and neither {@code cur} nor {@code b} is used directly in the visible
  * statements - presumably an early return and the statement that appends a point to
  * {@code cur} were lost; confirm against upstream.
  *
  * @param cur working list of polygon points
  * @param used bitmask of triangles already visited
  * @param i index of the triangle we are coming from
  * @param ab index of the neighboring triangle; negative values are not followed
  * @param b passed along with the edge - TODO confirm its exact meaning
  */
 private void processNeighbor(List<double[]> cur, long[] used, int i, int ab, int b) {
   if (ab >= 0) {
     if (BitsUtil.get(used, ab)) {
     BitsUtil.setI(used, ab);
     final SweepHullDelaunay2D.Triangle next = delaunay.get(ab);
     if (next.r2 < alpha2) {
       // Continue where we left off...
       // Find the edge of `next` that leads back to triangle i, then visit the
       // remaining two edges in rotation order.
       if (next.ab == i) {
         processNeighbor(cur, used, ab, next.bc, next.c);
         processNeighbor(cur, used, ab, next.ca, next.a);
       } else if (next.bc == i) {
         processNeighbor(cur, used, ab, next.ca, next.a);
         processNeighbor(cur, used, ab, next.ab, next.b);
       } else if (next.ca == i) {
         processNeighbor(cur, used, ab, next.ab, next.b);
         processNeighbor(cur, used, ab, next.bc, next.c);
Пример #19
 protected void invertRow(int rnum, boolean b) {
   BitsUtil.setI(irow, rnum);
Пример #20
  /**
   * Performs the DOC or FastDOC (as configured) algorithm on the given Database.
   *
   * <p>This will run exhaustively, i.e. run DOC until no clusters are found anymore / the database
   * size has shrunk below the threshold for minimum cluster size.
   *
   * @param database Database
   * @param relation Data relation
   * @return Clustering result
   */
  public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
    // Repeatedly asks runDOC / runFastDOC (depending on `heuristics`) for one cluster
    // while more than minClusterSize points remain, then reports the rest as noise.
    // NOTE(review): this listing is truncated (closing braces and several statements
    // are absent); the notes below mark where code appears to be missing.
    // Dimensionality of our set.
    final int d = RelationUtil.dimensionality(relation);

    // Get available DBIDs as a set we can remove items from.
    ArrayModifiableDBIDs S = DBIDUtil.newArray(relation.getDBIDs());

    // Precompute values as described in Figure 2.
    double r = Math.abs(Math.log(d + d) / Math.log(beta * .5));
    // Outer loop count.
    int n = (int) (2. / alpha);
    // Inner loop count.
    int m = (int) (Math.pow(2. / alpha, r) * Math.log(4));
    // In heuristic mode the inner loop count is capped at min(1000000, d * d).
    if (heuristics) {
      m = Math.min(m, Math.min(1000000, d * d));

    // Minimum size for a cluster for it to be accepted.
    int minClusterSize = (int) (alpha * S.size());

    // List of all clusters we found.
    Clustering<SubspaceModel> result = new Clustering<>("DOC Clusters", "DOC");

    // Inform the user about the number of actual clusters found so far.
    IndefiniteProgress cprogress =
        LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null;

    // To not only find a single cluster, we continue running until our set
    // of points is empty.
    while (S.size() > minClusterSize) {
      Cluster<SubspaceModel> C;
      // r is truncated to an int sample size for both variants.
      if (heuristics) {
        C = runFastDOC(database, relation, S, d, n, m, (int) r);
      } else {
        C = runDOC(database, relation, S, d, n, m, (int) r, minClusterSize);

      if (C == null) {
        // Stop trying if we couldn't find a cluster.
        // NOTE(review): the statement that leaves the loop is not visible here.
      // Found a cluster, remember it, remove its points from the set.
      // NOTE(review): the statement adding C to `result` is not visible here.

      // Remove all points of the cluster from the set and continue.
      // NOTE(review): the statement removing C's points from S is not visible here;
      // without it the loop condition could never change.

      if (cprogress != null) {
        cprogress.setProcessed(result.getAllClusters().size(), LOG);

    // Add the remainder as noise.
    if (S.size() > 0) {
      long[] alldims = BitsUtil.ones(d);
      // NOTE(review): the call that receives this new Cluster, and its leading
      // arguments, are not visible - the expression below is incomplete.
          new Cluster<>(
              new SubspaceModel(new Subspace(alldims), Centroid.make(relation, S).getArrayRef())));
    return result;
Пример #21
  /**
   * Performs a single run of DOC, finding a single cluster.
   *
   * @param database Database context
   * @param relation used to get actual values for DBIDs.
   * @param S The set of points we're working on.
   * @param d Dimensionality of the data set we're currently working on.
   * @param r Size of random samples.
   * @param m Number of inner iterations (per seed point).
   * @param n Number of outer iterations (seed points).
   * @param minClusterSize Minimum size a cluster must have to be accepted.
   * @return a cluster, if one is found, else <code>null</code>.
   */
  private Cluster<SubspaceModel> runDOC(
      Database database,
      Relation<V> relation,
      ArrayModifiableDBIDs S,
      final int d,
      int n,
      int m,
      int r,
      int minClusterSize) {
    // Tries n * m random samples; for each, determines the relevant dimensions,
    // queries the candidate cluster, and keeps the candidate with the highest
    // quality that has at least minClusterSize members.
    // NOTE(review): this listing is truncated (closing braces and several statements
    // are absent); the notes below mark where code appears to be missing.
    // Best cluster for the current run.
    DBIDs C = null;
    // Relevant attributes for the best cluster.
    long[] D = null;
    // Quality of the best cluster.
    double quality = Double.NEGATIVE_INFINITY;

    // Bounds for our cluster.
    // ModifiableHyperBoundingBox bounds = new ModifiableHyperBoundingBox(new
    // double[d], new double[d]);

    // Weights for distance (= rectangle query)
    // NOTE(review): df is built on a separate zero bitmask and no visible statement
    // hands the per-sample dimensions (nD) to it - presumably lost; confirm.
    SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(BitsUtil.zero(d));
    DistanceQuery<V> dq = database.getDistanceQuery(relation, df);
    RangeQuery<V> rq = database.getRangeQuery(dq);

    // Inform the user about the progress in the current iteration.
    // NOTE(review): the condition of this conditional expression is not visible.
    FiniteProgress iprogress =
            ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG)
            : null;

    Random random = rnd.getSingleThreadedRandom();
    DBIDArrayIter iter = S.iter();

    for (int i = 0; i < n; ++i) {
      // Pick a random seed point.
      // NOTE(review): the statement positioning `iter` on a random element of S is
      // not visible; `iter` is used as the query point further down.

      for (int j = 0; j < m; ++j) {
        // Choose a set of random points.
        DBIDs randomSet = DBIDUtil.randomSample(S, r, random);

        // Initialize cluster info.
        long[] nD = BitsUtil.zero(d);

        // Test each dimension and build bounding box.
        for (int k = 0; k < d; ++k) {
          if (dimensionIsRelevant(k, relation, randomSet)) {
            BitsUtil.setI(nD, k);
        // Only samples with at least one relevant dimension yield a candidate.
        if (BitsUtil.cardinality(nD) > 0) {
          // Get all points in the box.
          // TODO: add filtering capabilities into query API!
          DBIDs nC = DBIDUtil.intersection(S, rq.getRangeForDBID(iter, w));

          if (LOG.isDebuggingFiner()) {
            // NOTE(review): the logging call consuming this message is not visible.
                "Testing a cluster candidate, |C| = "
                    + nC.size()
                    + ", |D| = "
                    + BitsUtil.cardinality(nD));

          // Is the cluster large enough?
          if (nC.size() < minClusterSize) {
            // Too small.
            if (LOG.isDebuggingFiner()) {
              LOG.finer("... but it's too small.");
          } else {
            // Better cluster than before?
            double nQuality = computeClusterQuality(nC.size(), BitsUtil.cardinality(nD));
            if (nQuality > quality) {
              if (LOG.isDebuggingFiner()) {
                LOG.finer("... and it's the best so far: " + nQuality + " vs. " + quality);
              C = nC;
              D = nD;
              quality = nQuality;
            } else {
              if (LOG.isDebuggingFiner()) {
                LOG.finer("... but we already have a better one.");

    // null signals that no sufficiently large candidate was found.
    return (C != null) ? makeCluster(relation, C, D) : null;
Пример #22
  /**
   * Performs a single run of FastDOC, finding a single cluster.
   *
   * @param database Database context
   * @param relation used to get actual values for DBIDs.
   * @param S The set of points we're working on.
   * @param d Dimensionality of the data set we're currently working on.
   * @param r Size of random samples.
   * @param m Number of inner iterations (per seed point).
   * @param n Number of outer iterations (seed points).
   * @return a cluster, if one is found, else <code>null</code>.
   */
  private Cluster<SubspaceModel> runFastDOC(
      Database database, Relation<V> relation, ArrayModifiableDBIDs S, int d, int n, int m, int r) {
    // Tries up to n * m random samples, keeps the set of relevant dimensions with the
    // highest cardinality, and finally runs a single range query in that subspace to
    // obtain the cluster.
    // NOTE(review): this listing is truncated (closing braces and several statements
    // are absent); the notes below mark where code appears to be missing.
    // Relevant attributes of highest cardinality.
    long[] D = null;
    // The seed point for the best dimensions.
    DBIDVar dV = DBIDUtil.newVar();

    // Inform the user about the progress in the current iteration.
    // NOTE(review): the condition of this conditional expression is not visible.
    FiniteProgress iprogress =
            ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG)
            : null;

    Random random = rnd.getSingleThreadedRandom();

    DBIDArrayIter iter = S.iter();
    // NOTE(review): `break outer` below needs an `outer:` label on this loop; the
    // label is not visible in this listing.
    for (int i = 0; i < n; ++i) {
      // Pick a random seed point.
      // NOTE(review): the statement positioning `iter` on a random element of S is
      // not visible.

      for (int j = 0; j < m; ++j) {
        // Choose a set of random points.
        DBIDs randomSet = DBIDUtil.randomSample(S, r, random);

        // Initialize cluster info.
        long[] nD = BitsUtil.zero(d);

        // Test each dimension.
        for (int k = 0; k < d; ++k) {
          if (dimensionIsRelevant(k, relation, randomSet)) {
            BitsUtil.setI(nD, k);

        // Keep the dimension set with the most relevant dimensions seen so far.
        if (D == null || BitsUtil.cardinality(nD) > BitsUtil.cardinality(D)) {
          D = nD;
          // NOTE(review): no visible statement stores the current seed in dV, which
          // is the query point used after the loops - presumably lost; confirm.

          // Once d_zero relevant dimensions are reached, the search stops early and
          // the progress is marked as complete.
          if (BitsUtil.cardinality(D) >= d_zero) {
            if (iprogress != null) {
              iprogress.setProcessed(iprogress.getTotal(), LOG);
            break outer;

    // If no relevant dimensions were found, skip it.
    if (D == null || BitsUtil.cardinality(D) == 0) {
      return null;

    // Get all points in the box.
    SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(D);
    DistanceQuery<V> dq = database.getDistanceQuery(relation, df);
    RangeQuery<V> rq = database.getRangeQuery(dq, DatabaseQuery.HINT_SINGLE);

    // TODO: add filtering capabilities into query API!
    DBIDs C = DBIDUtil.intersection(S, rq.getRangeForDBID(dV, w));

    // If we have a non-empty cluster, return it.
    return (C.size() > 0) ? makeCluster(relation, C, D) : null;