/** * Preprocessing step: determine the radii of interest for each point. * * @param ids IDs to process * @param rangeQuery Range query * @param interestingDistances Distances of interest */ protected void precomputeInterestingRadii( DBIDs ids, RangeQuery<O> rangeQuery, WritableDataStore<DoubleIntArrayList> interestingDistances) { FiniteProgress progressPreproc = LOG.isVerbose() ? new FiniteProgress("LOCI preprocessing", ids.size(), LOG) : null; for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) { DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(iditer, rmax); // build list of critical distances DoubleIntArrayList cdist = new DoubleIntArrayList(neighbors.size() << 1); { int i = 0; DoubleDBIDListIter ni = neighbors.iter(); while (ni.valid()) { final double curdist = ni.doubleValue(); ++i; ni.advance(); // Skip, if tied to the next object: if (ni.valid() && curdist == ni.doubleValue()) { continue; } cdist.append(curdist, i); // Scale radius, and reinsert if (alpha != 1.) { final double ri = curdist / alpha; if (ri <= rmax) { cdist.append(ri, Integer.MIN_VALUE); } } } } cdist.sort(); // fill the gaps to have fast lookups of number of neighbors at a given // distance. int lastk = 0; for (int i = 0, size = cdist.size(); i < size; i++) { final int k = cdist.getInt(i); if (k == Integer.MIN_VALUE) { cdist.setValue(i, lastk); } else { lastk = k; } } // TODO: shrink the list, removing duplicate radii? interestingDistances.put(iditer, cdist); LOG.incrementProcessed(progressPreproc); } LOG.ensureCompleted(progressPreproc); }
@Override public void run() { Database database = input.getDatabase(); Relation<O> relation = database.getRelation(distance.getInputTypeRestriction()); DistanceQuery<O> distanceQuery = database.getDistanceQuery(relation, distance); KNNQuery<O> knnQ = database.getKNNQuery(distanceQuery, DatabaseQuery.HINT_HEAVY_USE); // open file. try (RandomAccessFile file = new RandomAccessFile(out, "rw"); FileChannel channel = file.getChannel(); // and acquire a file write lock FileLock lock = channel.lock()) { // write magic header file.writeInt(KNN_CACHE_MAGIC); int bufsize = k * 12 * 2 + 10; // Initial size, enough for 2 kNN. ByteBuffer buffer = ByteBuffer.allocateDirect(bufsize); FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Computing kNN", relation.size(), LOG) : null; for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) { final KNNList nn = knnQ.getKNNForDBID(it, k); final int nnsize = nn.size(); // Grow the buffer when needed: if (nnsize * 12 + 10 > bufsize) { while (nnsize * 12 + 10 > bufsize) { bufsize <<= 1; } buffer = ByteBuffer.allocateDirect(bufsize); } buffer.clear(); ByteArrayUtil.writeUnsignedVarint(buffer, it.internalGetIndex()); ByteArrayUtil.writeUnsignedVarint(buffer, nnsize); int c = 0; for (DoubleDBIDListIter ni = nn.iter(); ni.valid(); ni.advance(), c++) { ByteArrayUtil.writeUnsignedVarint(buffer, ni.internalGetIndex()); buffer.putDouble(ni.doubleValue()); } if (c != nn.size()) { throw new AbortException("Sizes did not agree. Cache is invalid."); } buffer.flip(); channel.write(buffer); LOG.incrementProcessed(prog); } LOG.ensureCompleted(prog); lock.release(); } catch (IOException e) { LOG.exception(e); } // FIXME: close! }
/**
 * Estimate the density of a neighborhood in the given subspace.
 *
 * <p>Every neighbor distance is divided by the bandwidth obtained from
 * {@code optimalBandwidth} for the subspace's cardinality. A scaled distance
 * {@code v < 1} contributes {@code 1 - v * v}; other neighbors contribute
 * nothing. The sum is divided by the size of the relation.
 *
 * @param subspace Subspace
 * @param neighbors Neighbor distance list
 * @return Density
 */
protected double subspaceDensity(long[] subspace, DoubleDBIDList neighbors) {
  final int dims = BitsUtil.cardinality(subspace);
  final double h = optimalBandwidth(dims);
  double sum = 0;
  DoubleDBIDListIter it = neighbors.iter();
  while (it.valid()) {
    final double scaled = it.doubleValue() / h;
    // Only neighbors strictly inside the bandwidth contribute.
    if (scaled < 1) {
      sum += 1 - (scaled * scaled);
    }
    it.advance();
  }
  return sum / relation.size();
}
/**
 * Render the list as {@code kNNList[distance:id,distance:id,...]}, with the
 * entries in iteration order and no trailing comma.
 *
 * @return String representation
 */
@Override
public String toString() {
  final StringBuilder out = new StringBuilder("kNNList[");
  boolean needSeparator = false;
  for (DoubleDBIDListIter it = this.iter(); it.valid(); it.advance()) {
    // Separator goes in front of every entry but the first.
    if (needSeparator) {
      out.append(',');
    }
    needSeparator = true;
    out.append(it.doubleValue());
    out.append(':');
    out.append(DBIDUtil.toString(it));
  }
  return out.append(']').toString();
}
/** * Run the algorithm * * @param database Database to process * @param relation Relation to process * @return Outlier result */ public OutlierResult run(Database database, Relation<O> relation) { DistanceQuery<O> distFunc = database.getDistanceQuery(relation, getDistanceFunction()); RangeQuery<O> rangeQuery = database.getRangeQuery(distFunc); DBIDs ids = relation.getDBIDs(); // LOCI preprocessing step WritableDataStore<DoubleIntArrayList> interestingDistances = DataStoreUtil.makeStorage( relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_SORTED, DoubleIntArrayList.class); precomputeInterestingRadii(ids, rangeQuery, interestingDistances); // LOCI main step FiniteProgress progressLOCI = LOG.isVerbose() ? new FiniteProgress("LOCI scores", relation.size(), LOG) : null; WritableDoubleDataStore mdef_norm = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); WritableDoubleDataStore mdef_radius = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); DoubleMinMax minmax = new DoubleMinMax(); // Shared instance, to save allocations. MeanVariance mv_n_r_alpha = new MeanVariance(); for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) { final DoubleIntArrayList cdist = interestingDistances.get(iditer); final double maxdist = cdist.getDouble(cdist.size() - 1); final int maxneig = cdist.getInt(cdist.size() - 1); double maxmdefnorm = 0.0; double maxnormr = 0; if (maxneig >= nmin) { // Compute the largest neighborhood we will need. DoubleDBIDList maxneighbors = rangeQuery.getRangeForDBID(iditer, maxdist); // TODO: Ensure the result is sorted. This is currently implied. // For any critical distance, compute the normalized MDEF score. 
for (int i = 0, size = cdist.size(); i < size; i++) { // Only start when minimum size is fulfilled if (cdist.getInt(i) < nmin) { continue; } final double r = cdist.getDouble(i); final double alpha_r = alpha * r; // compute n(p_i, \alpha * r) from list (note: alpha_r is not cdist!) final int n_alphar = cdist.getInt(cdist.find(alpha_r)); // compute \hat{n}(p_i, r, \alpha) and the corresponding \simga_{MDEF} mv_n_r_alpha.reset(); for (DoubleDBIDListIter neighbor = maxneighbors.iter(); neighbor.valid(); neighbor.advance()) { // Stop at radius r if (neighbor.doubleValue() > r) { break; } DoubleIntArrayList cdist2 = interestingDistances.get(neighbor); int rn_alphar = cdist2.getInt(cdist2.find(alpha_r)); mv_n_r_alpha.put(rn_alphar); } // We only use the average and standard deviation final double nhat_r_alpha = mv_n_r_alpha.getMean(); final double sigma_nhat_r_alpha = mv_n_r_alpha.getNaiveStddev(); // Redundant divisions by nhat_r_alpha removed. final double mdef = nhat_r_alpha - n_alphar; final double sigmamdef = sigma_nhat_r_alpha; final double mdefnorm = mdef / sigmamdef; if (mdefnorm > maxmdefnorm) { maxmdefnorm = mdefnorm; maxnormr = r; } } } else { // FIXME: when nmin was not fulfilled - what is the proper value then? maxmdefnorm = Double.POSITIVE_INFINITY; maxnormr = maxdist; } mdef_norm.putDouble(iditer, maxmdefnorm); mdef_radius.putDouble(iditer, maxnormr); minmax.put(maxmdefnorm); LOG.incrementProcessed(progressLOCI); } LOG.ensureCompleted(progressLOCI); DoubleRelation scoreResult = new MaterializedDoubleRelation( "LOCI normalized MDEF", "loci-mdef-outlier", mdef_norm, relation.getDBIDs()); OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta( minmax.getMin(), minmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 0.0); OutlierResult result = new OutlierResult(scoreMeta, scoreResult); result.addChildResult( new MaterializedDoubleRelation( "LOCI MDEF Radius", "loci-critical-radius", mdef_radius, relation.getDBIDs())); return result; }