/** * Subspace relevance test. * * @param subspace Subspace to test * @param neigh Neighbor list * @param kernel Kernel density estimator * @return relevance test result */ protected boolean relevantSubspace( long[] subspace, DoubleDBIDList neigh, KernelDensityEstimator kernel) { Relation<V> relation = kernel.relation; final double crit = K_S_CRITICAL001 / Math.sqrt(neigh.size()); for (int dim = BitsUtil.nextSetBit(subspace, 0); dim > 0; dim = BitsUtil.nextSetBit(subspace, dim + 1)) { // TODO: can we save this copy somehow? double[] data = new double[neigh.size()]; { int count = 0; for (DBIDIter neighbor = neigh.iter(); neighbor.valid(); neighbor.advance()) { V vector = relation.get(neighbor); data[count] = vector.doubleValue(dim); count++; } assert (count == neigh.size()); } Arrays.sort(data); final double norm = data[data.length - 1] - data[0]; final double min = data[0]; // Kolmogorow-Smirnow-Test against uniform distribution: for (int j = 1; j < data.length - 2; j++) { double delta = (j / (data.length - 1.)) - ((data[j] - min) / norm); if (Math.abs(delta) > crit) { return false; } } } return true; }
/**
 * Resets the values for the next cluster search: every row and column is selected again, the
 * cardinalities are set to the full sizes, and the {@code irow} bitmask is cleared.
 */
protected void reset() {
  final int numRows = rowM.length;
  final int numCols = colM.length;
  // Full row selection.
  rows = BitsUtil.ones(numRows);
  rowcard = numRows;
  // Full column selection.
  cols = BitsUtil.ones(numCols);
  colcard = numCols;
  // Clear the irow bits in place.
  BitsUtil.zeroI(irow);
}
/**
 * Main loop for OUTRES.
 *
 * <p>Computes one score per object of the relation, starting each object from the empty
 * subspace, and wraps the scores together with their observed minimum and maximum into the
 * result.
 *
 * @param relation Relation to process
 * @return Outlier detection result
 */
public OutlierResult run(Relation<V> relation) {
  final WritableDoubleDataStore scores =
      DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
  final DoubleMinMax observed = new DoubleMinMax();
  final KernelDensityEstimator kernel = new KernelDensityEstimator(relation);
  final long[] subspace = BitsUtil.zero(kernel.dim);

  FiniteProgress progress = null;
  if (LOG.isVerbose()) {
    progress = new FiniteProgress("OUTRES scores", relation.size(), LOG);
  }

  DBIDIter object = relation.iterDBIDs();
  while (object.valid()) {
    // The bitmask is shared between objects, so it is emptied before each use.
    BitsUtil.zeroI(subspace);
    final double score = outresScore(0, subspace, object, kernel);
    scores.putDouble(object, score);
    observed.put(score);
    LOG.incrementProcessed(progress);
    object.advance();
  }
  LOG.ensureCompleted(progress);

  OutlierScoreMeta meta =
      new InvertedOutlierScoreMeta(observed.getMin(), observed.getMax(), 0., 1., 1.);
  return new OutlierResult(
      meta,
      new MaterializedDoubleRelation("OUTRES", "outres-score", scores, relation.getDBIDs()));
}
/**
 * Minimum distance between a bounding box and a vector: the largest per-dimension gap between
 * the vector and the box over the selected dimensions, or 0 if there is no positive gap.
 *
 * @param mbr Bounding box
 * @param v Vector
 * @return the minimum distance
 * @throws IllegalArgumentException if the two arguments differ in dimensionality
 */
@Override
protected double minDistObject(SpatialComparable mbr, NumberVector v) {
  if (mbr.getDimensionality() != v.getDimensionality()) {
    throw new IllegalArgumentException(
        "Different dimensionality of objects\n "
            + "first argument: "
            + mbr.toString()
            + "\n "
            + "second argument: "
            + v.toString());
  }
  double largest = 0.;
  int d = BitsUtil.nextSetBit(dimensions, 0);
  while (d >= 0) {
    final double value = v.doubleValue(d);
    final double below = mbr.getMin(d) - value;
    // Either the value lies below the box, or we measure how far it lies above it.
    final double gap;
    if (below > 0.) {
      gap = below;
    } else {
      gap = value - mbr.getMax(d);
    }
    if (gap > largest) {
      largest = gap;
    }
    d = BitsUtil.nextSetBit(dimensions, d + 1);
  }
  return largest;
}
public List<Polygon> compute() { // Compute delaunay triangulation: delaunay = (new SweepHullDelaunay2D(points)).getDelaunay(); List<Polygon> polys = new ArrayList<>(); // Working data long[] used = BitsUtil.zero(delaunay.size()); List<double[]> cur = new ArrayList<>(); for (int i = 0 /* = used.nextClearBit(0) */; i < delaunay.size() && i >= 0; i = BitsUtil.nextClearBit(used, i + 1)) { if (!BitsUtil.get(used, i)) { BitsUtil.setI(used, i); SweepHullDelaunay2D.Triangle tri = delaunay.get(i); if (tri.r2 <= alpha2) { // Check neighbors processNeighbor(cur, used, i, tri.ab, tri.b); processNeighbor(cur, used, i, tri.bc, tri.c); processNeighbor(cur, used, i, tri.ca, tri.a); } if (cur.size() > 0) { polys.add(new Polygon(cur)); cur = new ArrayList<>(); } } } return polys; }
/**
 * Minimum distance between two bounding boxes: the largest per-dimension gap between them over
 * the selected dimensions, or 0 if there is no positive gap.
 *
 * @param mbr1 First bounding box
 * @param mbr2 Second bounding box
 * @return the minimum distance
 * @throws IllegalArgumentException if the two arguments differ in dimensionality
 */
@Override
public double minDist(SpatialComparable mbr1, SpatialComparable mbr2) {
  if (mbr1.getDimensionality() != mbr2.getDimensionality()) {
    throw new IllegalArgumentException(
        "Different dimensionality of objects\n "
            + "first argument: "
            + mbr1.toString()
            + "\n "
            + "second argument: "
            + mbr2.toString());
  }
  double largest = 0.;
  int d = BitsUtil.nextSetBit(dimensions, 0);
  while (d >= 0) {
    final double upper1 = mbr1.getMax(d);
    final double lower2 = mbr2.getMin(d);
    // First box entirely below the second one? Otherwise measure the opposite direction.
    final double gap = (upper1 < lower2) ? (lower2 - upper1) : (mbr1.getMin(d) - mbr2.getMax(d));
    if (gap > largest) {
      largest = gap;
    }
    d = BitsUtil.nextSetBit(dimensions, d + 1);
  }
  return largest;
}
/**
 * Select or deselect a column, adjusting the column cardinality by one.
 *
 * <p>The counter is changed unconditionally, i.e. without checking the previous state of the
 * bit.
 *
 * @param cnum Column to select
 * @param set Value to set
 */
protected void selectColumn(int cnum, boolean set) {
  if (!set) {
    BitsUtil.clearI(cols, cnum);
    --colcard;
    return;
  }
  BitsUtil.setI(cols, cnum);
  ++colcard;
}
/**
 * Select or deselect a row, adjusting the row cardinality by one.
 *
 * <p>The counter is changed unconditionally, i.e. without checking the previous state of the
 * bit.
 *
 * @param rnum Row to select
 * @param set Value to set
 */
protected void selectRow(int rnum, boolean set) {
  if (!set) {
    BitsUtil.clearI(rows, rnum);
    --rowcard;
    return;
  }
  BitsUtil.setI(rows, rnum);
  ++rowcard;
}
/**
 * Constructor. Starts with all rows and all columns selected and an empty {@code irow} bitmask.
 *
 * @param rows Row dimensionality.
 * @param cols Column dimensionality.
 */
protected BiclusterCandidate(int rows, int cols) {
  super();
  // Row state
  this.rowcard = rows;
  this.rowM = new double[rows];
  this.rows = BitsUtil.ones(rows);
  this.irow = BitsUtil.zero(rows);
  // Column state
  this.colcard = cols;
  this.colM = new double[cols];
  this.cols = BitsUtil.ones(cols);
}
/**
 * Norm of a vector: the largest absolute value among the selected dimensions (0 if none is
 * larger).
 *
 * @param obj Vector
 * @return the norm
 */
@Override
public double norm(NumberVector obj) {
  double largest = 0.;
  int d = BitsUtil.nextSetBit(dimensions, 0);
  while (d >= 0) {
    final double magnitude = Math.abs(obj.doubleValue(d));
    largest = (magnitude > largest) ? magnitude : largest;
    d = BitsUtil.nextSetBit(dimensions, d + 1);
  }
  return largest;
}
/** * Visit a column of the matrix. * * @param mat Data matrix * @param col Column to visit * @param mode Operation mode * @param visitor Visitor function */ protected void visitColumn(double[][] mat, int col, int mode, CellVisitor visitor) { boolean cselected = BitsUtil.get(cols, col); // For efficiency, we manually iterate over the rows and column bitmasks. // This saves repeated shifting needed by the manual bit access. for (int rpos = 0, rlpos = 0; rlpos < rows.length; ++rlpos) { long rlong = rows[rlpos]; // Fast skip blocks of 64 masked values. if (mode == CellVisitor.SELECTED && rlong == 0L) { rpos += Long.SIZE; continue; } if (mode == CellVisitor.NOT_SELECTED && rlong == -1L) { rpos += Long.SIZE; continue; } for (int i = 0; i < Long.SIZE && rpos < rowM.length; ++i, ++rpos, rlong >>>= 1) { boolean rselected = ((rlong & 1L) == 1L); if (mode == CellVisitor.SELECTED && !rselected) { continue; } if (mode == CellVisitor.NOT_SELECTED && rselected) { continue; } boolean stop = visitor.visit(mat[rpos][col], rpos, col, rselected, cselected); if (stop) { return; } } } }
/** * Visit a row of the data matrix. * * @param mat Data matrix * @param row Row to visit * @param visitor Visitor function */ protected void visitRow(double[][] mat, int row, int mode, CellVisitor visitor) { boolean rselected = BitsUtil.get(rows, row); final double[] rowdata = mat[row]; for (int cpos = 0, clpos = 0; clpos < cols.length; ++clpos) { long clong = cols[clpos]; // Fast skip blocks of 64 masked values. if (mode == CellVisitor.SELECTED && clong == 0L) { cpos += Long.SIZE; continue; } if (mode == CellVisitor.NOT_SELECTED && clong == -1L) { cpos += Long.SIZE; continue; } for (int j = 0; j < Long.SIZE && cpos < colM.length; ++j, ++cpos, clong >>>= 1) { boolean cselected = ((clong & 1L) == 1L); if (mode == CellVisitor.SELECTED && !cselected) { continue; } if (mode == CellVisitor.NOT_SELECTED && cselected) { continue; } boolean stop = visitor.visit(rowdata[cpos], row, cpos, rselected, cselected); if (stop) { return; } } } }
@Override public long[] getVisibleDimensions2D() { final int dim = proj.getDimensionality(); long[] actDim = BitsUtil.zero(dim); double[] vScale = new double[dim]; for (int d = 0; d < dim; d++) { Arrays.fill(vScale, 0); vScale[d] = 1; double[] vRender = fastProjectScaledToRenderSpace(vScale); // TODO: Can't we do this by inspecting the projection matrix directly? if (vRender[0] > 0.0 || vRender[0] < 0.0 || vRender[1] != 0) { BitsUtil.setI(actDim, d); } } return actDim; }
/** * Main loop of OUTRES. Run for each object * * @param s start dimension * @param subspace Current subspace * @param id Current object ID * @param kernel Kernel * @return Score */ public double outresScore( final int s, long[] subspace, DBIDRef id, KernelDensityEstimator kernel) { double score = 1.0; // Initial score is 1.0 final SubspaceEuclideanDistanceFunction df = new SubspaceEuclideanDistanceFunction(subspace); MeanVariance meanv = new MeanVariance(); for (int i = s; i < kernel.dim; i++) { if (BitsUtil.get(subspace, i)) { // TODO: needed? Or should we always start // with i=0? continue; } BitsUtil.setI(subspace, i); df.setSelectedDimensions(subspace); final double adjustedEps = kernel.adjustedEps(kernel.dim); // Query with a larger window, to also get neighbors of neighbors // Subspace euclidean is metric! final double range = adjustedEps * 2.; RangeQuery<V> rq = QueryUtil.getRangeQuery(kernel.relation, df, range); DoubleDBIDList neighc = rq.getRangeForDBID(id, range); DoubleDBIDList neigh = refineRange(neighc, adjustedEps); if (neigh.size() > 2) { // Relevance test if (relevantSubspace(subspace, neigh, kernel)) { final double density = kernel.subspaceDensity(subspace, neigh); // Compute mean and standard deviation for densities of neighbors. meanv.reset(); for (DoubleDBIDListIter neighbor = neigh.iter(); neighbor.valid(); neighbor.advance()) { DoubleDBIDList n2 = subsetNeighborhoodQuery(neighc, neighbor, df, adjustedEps, kernel); meanv.put(kernel.subspaceDensity(subspace, n2)); } final double deviation = (meanv.getMean() - density) / (2. * meanv.getSampleStddev()); // High deviation: if (deviation >= 1) { score *= (density / deviation); } // Recursion score *= outresScore(i + 1, subspace, id, kernel); } } BitsUtil.clearI(subspace, i); } return score; }
@Override public Clustering<BiclusterWithInversionsModel> biclustering() { double[][] mat = RelationUtil.relationAsMatrix(relation, rowIDs); BiclusterCandidate cand = new BiclusterCandidate(getRowDim(), getColDim()); Clustering<BiclusterWithInversionsModel> result = new Clustering<>("Cheng-and-Church", "Cheng and Church Biclustering"); ModifiableDBIDs noise = DBIDUtil.newHashSet(relation.getDBIDs()); FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Extracting Cluster", n, LOG) : null; for (int i = 0; i < n; i++) { cand.reset(); multipleNodeDeletion(mat, cand); if (LOG.isVeryVerbose()) { LOG.veryverbose( "Residue after Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); } singleNodeDeletion(mat, cand); if (LOG.isVeryVerbose()) { LOG.veryverbose( "Residue after Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); } nodeAddition(mat, cand); if (LOG.isVeryVerbose()) { LOG.veryverbose( "Residue after Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); } cand.maskMatrix(mat, dist); BiclusterWithInversionsModel model = new BiclusterWithInversionsModel(colsBitsetToIDs(cand.cols), rowsBitsetToIDs(cand.irow)); final ArrayDBIDs cids = rowsBitsetToIDs(cand.rows); noise.removeDBIDs(cids); result.addToplevelCluster(new Cluster<>(cids, model)); if (LOG.isVerbose()) { LOG.verbose("Score of bicluster " + (i + 1) + ": " + cand.residue + "\n"); LOG.verbose("Number of rows: " + cand.rowcard + "\n"); LOG.verbose("Number of columns: " + cand.colcard + "\n"); // LOG.verbose("Total number of masked values: " + maskedVals.size() + // "\n"); } LOG.incrementProcessed(prog); } // Add a noise cluster, full-dimensional. if (!noise.isEmpty()) { long[] allcols = BitsUtil.ones(getColDim()); BiclusterWithInversionsModel model = new BiclusterWithInversionsModel(colsBitsetToIDs(allcols), DBIDUtil.EMPTYDBIDS); result.addToplevelCluster(new Cluster<>(noise, true, model)); } LOG.ensureCompleted(prog); return result; }
/**
 * Compute density in the given subspace.
 *
 * <p>Each neighbor distance is divided by the bandwidth chosen for the subspace cardinality;
 * neighbors with a scaled distance below 1 contribute {@code 1 - v^2}. The sum is divided by the
 * relation size.
 *
 * @param subspace Subspace
 * @param neighbors Neighbor distance list
 * @return Density
 */
protected double subspaceDensity(long[] subspace, DoubleDBIDList neighbors) {
  final double bandwidth = optimalBandwidth(BitsUtil.cardinality(subspace));
  double sum = 0;
  DoubleDBIDListIter it = neighbors.iter();
  while (it.valid()) {
    final double scaled = it.doubleValue() / bandwidth;
    // Only neighbors inside the bandwidth contribute.
    if (scaled < 1) {
      sum += 1 - (scaled * scaled);
    }
    it.advance();
  }
  return sum / relation.size();
}
/**
 * Distance of two vectors: the largest absolute coordinate difference among the selected
 * dimensions (0 if none is larger).
 *
 * @param v1 First vector
 * @param v2 Second vector
 * @return the distance
 * @throws IllegalArgumentException if the two vectors differ in dimensionality
 */
@Override
public double distance(NumberVector v1, NumberVector v2) {
  if (v1.getDimensionality() != v2.getDimensionality()) {
    throw new IllegalArgumentException(
        "Different dimensionality of FeatureVectors\n "
            + "first argument: "
            + v1
            + "\n "
            + "second argument: "
            + v2);
  }
  double largest = 0.;
  int d = BitsUtil.nextSetBit(dimensions, 0);
  while (d >= 0) {
    final double diff = Math.abs(v1.doubleValue(d) - v2.doubleValue(d));
    largest = (diff > largest) ? diff : largest;
    d = BitsUtil.nextSetBit(dimensions, d + 1);
  }
  return largest;
}
private void processNeighbor(List<double[]> cur, long[] used, int i, int ab, int b) { if (ab >= 0) { if (BitsUtil.get(used, ab)) { return; } BitsUtil.setI(used, ab); final SweepHullDelaunay2D.Triangle next = delaunay.get(ab); if (next.r2 < alpha2) { // Continue where we left off... if (next.ab == i) { processNeighbor(cur, used, ab, next.bc, next.c); processNeighbor(cur, used, ab, next.ca, next.a); } else if (next.bc == i) { processNeighbor(cur, used, ab, next.ca, next.a); processNeighbor(cur, used, ab, next.ab, next.b); } else if (next.ca == i) { processNeighbor(cur, used, ab, next.ab, next.b); processNeighbor(cur, used, ab, next.bc, next.c); } return; } } cur.add(points.get(b)); }
/**
 * Mark or unmark a row in the {@code irow} bitmask.
 *
 * @param rnum Row to update
 * @param b {@code true} to set the row's bit, {@code false} to clear it
 */
protected void invertRow(int rnum, boolean b) {
  // Honor the flag, in line with selectRow/selectColumn. Previously the bit was set even when
  // b was false, so a row could never be unmarked through this method.
  if (b) {
    BitsUtil.setI(irow, rnum);
  } else {
    BitsUtil.clearI(irow, rnum);
  }
}
/**
 * Performs the DOC or FastDOC (as configured) algorithm on the given Database.
 *
 * <p>This will run exhaustively, i.e. run DOC until no clusters are found anymore / the database
 * size has shrunk below the threshold for minimum cluster size.
 *
 * @param database Database
 * @param relation Data relation
 * @return the clusters found, plus one noise cluster holding the remaining points (if any)
 */
public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
  // Dimensionality of our set.
  final int d = RelationUtil.dimensionality(relation);
  // Get available DBIDs as a set we can remove items from.
  ArrayModifiableDBIDs S = DBIDUtil.newArray(relation.getDBIDs());

  // Precompute values as described in Figure 2.
  double r = Math.abs(Math.log(d + d) / Math.log(beta * .5));
  // Outer loop count.
  int n = (int) (2. / alpha);
  // Inner loop count, capped when the heuristic variant is used.
  int m = (int) (Math.pow(2. / alpha, r) * Math.log(4));
  if (heuristics) {
    m = Math.min(m, Math.min(1000000, d * d));
  }
  // Minimum size for a cluster for it to be accepted.
  int minClusterSize = (int) (alpha * S.size());

  // List of all clusters we found.
  Clustering<SubspaceModel> result = new Clustering<>("DOC Clusters", "DOC");
  // Inform the user about the number of actual clusters found so far.
  IndefiniteProgress cprogress = null;
  if (LOG.isVerbose()) {
    cprogress = new IndefiniteProgress("Number of clusters", LOG);
  }

  // To not only find a single cluster, we continue running until our set of points is
  // too small to hold another one.
  while (S.size() > minClusterSize) {
    final Cluster<SubspaceModel> found =
        heuristics
            ? runFastDOC(database, relation, S, d, n, m, (int) r)
            : runDOC(database, relation, S, d, n, m, (int) r, minClusterSize);
    if (found == null) {
      // Stop trying if we couldn't find a cluster.
      break;
    }
    // Remember the cluster and remove its points from the working set.
    result.addToplevelCluster(found);
    S.removeDBIDs(found.getIDs());
    if (cprogress != null) {
      cprogress.setProcessed(result.getAllClusters().size(), LOG);
    }
  }

  // Add the remainder as noise.
  if (S.size() > 0) {
    long[] alldims = BitsUtil.ones(d);
    SubspaceModel noiseModel =
        new SubspaceModel(new Subspace(alldims), Centroid.make(relation, S).getArrayRef());
    result.addToplevelCluster(new Cluster<>(S, true, noiseModel));
  }
  LOG.setCompleted(cprogress);
  return result;
}
/** * Performs a single run of DOC, finding a single cluster. * * @param database Database context * @param relation used to get actual values for DBIDs. * @param S The set of points we're working on. * @param d Dimensionality of the data set we're currently working on. * @param r Size of random samples. * @param m Number of inner iterations (per seed point). * @param n Number of outer iterations (seed points). * @param minClusterSize Minimum size a cluster must have to be accepted. * @return a cluster, if one is found, else <code>null</code>. */ private Cluster<SubspaceModel> runDOC( Database database, Relation<V> relation, ArrayModifiableDBIDs S, final int d, int n, int m, int r, int minClusterSize) { // Best cluster for the current run. DBIDs C = null; // Relevant attributes for the best cluster. long[] D = null; // Quality of the best cluster. double quality = Double.NEGATIVE_INFINITY; // Bounds for our cluster. // ModifiableHyperBoundingBox bounds = new ModifiableHyperBoundingBox(new // double[d], new double[d]); // Weights for distance (= rectangle query) SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(BitsUtil.zero(d)); DistanceQuery<V> dq = database.getDistanceQuery(relation, df); RangeQuery<V> rq = database.getRangeQuery(dq); // Inform the user about the progress in the current iteration. FiniteProgress iprogress = LOG.isVerbose() ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG) : null; Random random = rnd.getSingleThreadedRandom(); DBIDArrayIter iter = S.iter(); for (int i = 0; i < n; ++i) { // Pick a random seed point. iter.seek(random.nextInt(S.size())); for (int j = 0; j < m; ++j) { // Choose a set of random points. DBIDs randomSet = DBIDUtil.randomSample(S, r, random); // Initialize cluster info. long[] nD = BitsUtil.zero(d); // Test each dimension and build bounding box. 
for (int k = 0; k < d; ++k) { if (dimensionIsRelevant(k, relation, randomSet)) { BitsUtil.setI(nD, k); } } if (BitsUtil.cardinality(nD) > 0) { // Get all points in the box. df.setSelectedDimensions(nD); // TODO: add filtering capabilities into query API! DBIDs nC = DBIDUtil.intersection(S, rq.getRangeForDBID(iter, w)); if (LOG.isDebuggingFiner()) { LOG.finer( "Testing a cluster candidate, |C| = " + nC.size() + ", |D| = " + BitsUtil.cardinality(nD)); } // Is the cluster large enough? if (nC.size() < minClusterSize) { // Too small. if (LOG.isDebuggingFiner()) { LOG.finer("... but it's too small."); } } else { // Better cluster than before? double nQuality = computeClusterQuality(nC.size(), BitsUtil.cardinality(nD)); if (nQuality > quality) { if (LOG.isDebuggingFiner()) { LOG.finer("... and it's the best so far: " + nQuality + " vs. " + quality); } C = nC; D = nD; quality = nQuality; } else { if (LOG.isDebuggingFiner()) { LOG.finer("... but we already have a better one."); } } } } LOG.incrementProcessed(iprogress); } } LOG.ensureCompleted(iprogress); return (C != null) ? makeCluster(relation, C, D) : null; }
/** * Performs a single run of FastDOC, finding a single cluster. * * @param database Database context * @param relation used to get actual values for DBIDs. * @param S The set of points we're working on. * @param d Dimensionality of the data set we're currently working on. * @param r Size of random samples. * @param m Number of inner iterations (per seed point). * @param n Number of outer iterations (seed points). * @return a cluster, if one is found, else <code>null</code>. */ private Cluster<SubspaceModel> runFastDOC( Database database, Relation<V> relation, ArrayModifiableDBIDs S, int d, int n, int m, int r) { // Relevant attributes of highest cardinality. long[] D = null; // The seed point for the best dimensions. DBIDVar dV = DBIDUtil.newVar(); // Inform the user about the progress in the current iteration. FiniteProgress iprogress = LOG.isVerbose() ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG) : null; Random random = rnd.getSingleThreadedRandom(); DBIDArrayIter iter = S.iter(); outer: for (int i = 0; i < n; ++i) { // Pick a random seed point. iter.seek(random.nextInt(S.size())); for (int j = 0; j < m; ++j) { // Choose a set of random points. DBIDs randomSet = DBIDUtil.randomSample(S, r, random); // Initialize cluster info. long[] nD = BitsUtil.zero(d); // Test each dimension. for (int k = 0; k < d; ++k) { if (dimensionIsRelevant(k, relation, randomSet)) { BitsUtil.setI(nD, k); } } if (D == null || BitsUtil.cardinality(nD) > BitsUtil.cardinality(D)) { D = nD; dV.set(iter); if (BitsUtil.cardinality(D) >= d_zero) { if (iprogress != null) { iprogress.setProcessed(iprogress.getTotal(), LOG); } break outer; } } LOG.incrementProcessed(iprogress); } } LOG.ensureCompleted(iprogress); // If no relevant dimensions were found, skip it. if (D == null || BitsUtil.cardinality(D) == 0) { return null; } // Get all points in the box. 
SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(D); DistanceQuery<V> dq = database.getDistanceQuery(relation, df); RangeQuery<V> rq = database.getRangeQuery(dq, DatabaseQuery.HINT_SINGLE); // TODO: add filtering capabilities into query API! DBIDs C = DBIDUtil.intersection(S, rq.getRangeForDBID(dV, w)); // If we have a non-empty cluster, return it. return (C.size() > 0) ? makeCluster(relation, C, D) : null; }