@Override
public T put(DBIDRef id, T value) {
  if (value == null) {
    return data.remove(DBIDUtil.deref(id));
  }
  return data.put(DBIDUtil.deref(id), value);
}
/**
 * Constructor.
 *
 * @param size Size
 * @param idmap ID map
 */
public ArrayDBIDStore(int size, DataStoreIDMap idmap) {
  super();
  this.data = DBIDUtil.newArray(size);
  // Initialize all slots with the invalid DBID as a placeholder.
  DBIDRef inv = DBIDUtil.invalid();
  for (int i = 0; i < size; i++) {
    data.add(inv);
  }
  this.idmap = idmap;
}
/**
 * Actual setter.
 *
 * @param id Database ID
 * @param index column index
 * @param value new value
 * @param <T> type
 * @return previous value
 */
@SuppressWarnings("unchecked")
protected <T> T set(DBIDRef id, int index, T value) {
  Object[] d = data.get(DBIDUtil.deref(id));
  if (d == null) {
    // Allocate the row on first write.
    d = new Object[rlen];
    data.put(DBIDUtil.deref(id), d);
  }
  T ret = (T) d[index];
  d[index] = value;
  return ret;
}
/**
 * Handles a DataStoreEvent with the specified type. If the current event type is not equal to
 * the specified type, the events accumulated up to now will be fired first.
 *
 * <p>The new event will be aggregated and fired on demand if {@link #accumulateDataStoreEvents}
 * is set, otherwise all registered <code>DataStoreListener</code>s will be notified immediately
 * that the content of the database has changed.
 *
 * @param objects the objects that have been changed, i.e. inserted, deleted or updated
 * @param type the event type
 */
private void fireObjectsChanged(DBIDs objects, Type type) {
  // Flush any accumulated events of a different type first.
  if (currentDataStoreEventType != null && !currentDataStoreEventType.equals(type)) {
    flushDataStoreEvents();
  }
  if (accumulateDataStoreEvents) {
    if (this.dataStoreObjects == null) {
      this.dataStoreObjects = DBIDUtil.newHashSet();
    }
    this.dataStoreObjects.addDBIDs(objects);
    currentDataStoreEventType = type;
    return;
  }
  // Execute immediately:
  DataStoreEvent e;
  switch (type) {
    case INSERT:
      e = DataStoreEvent.insertionEvent(objects);
      break;
    case REMOVE:
      e = DataStoreEvent.removalEvent(objects);
      break;
    case UPDATE:
      e = DataStoreEvent.updateEvent(objects);
      break;
    default:
      return;
  }
  for (int i = dataListenerList.size(); --i >= 0; ) {
    dataListenerList.get(i).contentChanged(e);
  }
}
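/*
 * The accumulation logic above coalesces consecutive events of the same type into one batch and
 * flushes whenever the type changes. A minimal, self-contained sketch of that pattern, with
 * hypothetical names (not part of this class):
 */
class EventBatcher {
  private final java.util.List<Integer> pending = new java.util.ArrayList<>();
  private String pendingType = null;

  void record(int objectId, String type,
      java.util.function.BiConsumer<String, java.util.List<Integer>> fire) {
    if (pendingType != null && !pendingType.equals(type)) {
      fire.accept(pendingType, new java.util.ArrayList<>(pending)); // flush the previous batch
      pending.clear();
    }
    pendingType = type;
    pending.add(objectId);
  }

  void flush(java.util.function.BiConsumer<String, java.util.List<Integer>> fire) {
    if (pendingType != null) {
      fire.accept(pendingType, new java.util.ArrayList<>(pending));
      pending.clear();
      pendingType = null;
    }
  }
}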
protected double[] computeWithinDistances(
    Relation<? extends NumberVector> rel, List<? extends Cluster<?>> clusters, int withinPairs) {
  double[] concordant = new double[withinPairs];
  int i = 0;
  for (Cluster<?> cluster : clusters) {
    if (cluster.size() <= 1 || cluster.isNoise()) {
      switch (noiseHandling) {
        case IGNORE_NOISE:
          continue;
        case TREAT_NOISE_AS_SINGLETONS:
          continue; // No concordant distances.
        case MERGE_NOISE:
          break; // Treat like a cluster below.
      }
    }
    for (DBIDIter it1 = cluster.getIDs().iter(); it1.valid(); it1.advance()) {
      NumberVector obj = rel.get(it1);
      for (DBIDIter it2 = cluster.getIDs().iter(); it2.valid(); it2.advance()) {
        if (DBIDUtil.compare(it1, it2) <= 0) {
          continue; // Count each unordered pair only once.
        }
        concordant[i++] = distanceFunction.distance(obj, rel.get(it2));
      }
    }
  }
  assert (concordant.length == i);
  Arrays.sort(concordant);
  return concordant;
}
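/*
 * The inner loops enumerate each unordered within-cluster pair exactly once: a pair is only
 * processed when compare(it1, it2) > 0, i.e. from the "larger" side. A plain-Java sketch of the
 * same counting scheme over an int array (illustration only):
 */
static int countUnorderedPairs(int[] ids) {
  int pairs = 0;
  for (int x : ids) {
    for (int y : ids) {
      if (x <= y) {
        continue; // skip self-pairs and one of the two orderings
      }
      pairs++;
    }
  }
  return pairs; // equals n * (n - 1) / 2 for n distinct ids
}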
/**
 * Performs the DBSCAN algorithm on the given database.
 *
 * @param relation Relation to process
 * @return Clustering result
 */
public Clustering<Model> run(Relation<O> relation) {
  final int size = relation.size();
  if (size < minpts) {
    // Everything is noise.
    Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
    result.addToplevelCluster(
        new Cluster<Model>(relation.getDBIDs(), true, ClusterModel.CLUSTER));
    return result;
  }
  RangeQuery<O> rangeQuery = QueryUtil.getRangeQuery(relation, getDistanceFunction());
  resultList = new ArrayList<>();
  noise = DBIDUtil.newHashSet();
  runDBSCAN(relation, rangeQuery);

  double averagen = ncounter / (double) relation.size();
  LOG.statistics(new DoubleStatistic(DBSCAN.class.getName() + ".average-neighbors", averagen));
  if (averagen < 1 + 0.1 * (minpts - 1)) {
    LOG.warning("There are very few neighbors found. Epsilon may be too small.");
  }
  if (averagen > 100 * minpts) {
    LOG.warning("There are very many neighbors found. Epsilon may be too large.");
  }

  Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
  for (ModifiableDBIDs res : resultList) {
    result.addToplevelCluster(new Cluster<Model>(res, ClusterModel.CLUSTER));
  }
  result.addToplevelCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER));
  return result;
}
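/*
 * The two sanity checks above translate into concrete thresholds on the average neighbor count:
 * e.g. for minpts = 5, the warnings trigger below 1 + 0.1 * 4 = 1.4 (epsilon likely too small)
 * or above 100 * 5 = 500 (epsilon likely too large). A quick way to inspect them:
 */
static void printNeighborWarningThresholds(int minpts) {
  double lower = 1 + 0.1 * (minpts - 1); // below this: epsilon may be too small
  double upper = 100. * minpts; // above this: epsilon may be too large
  System.out.println("warn below " + lower + ", warn above " + upper);
}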
/**
 * Utility method to create a subspace cluster from a list of DBIDs and the relevant attributes.
 *
 * @param relation the relation, used to compute the cluster centroid.
 * @param C the cluster points.
 * @param D the relevant dimensions.
 * @return an object representing the subspace cluster.
 */
private Cluster<SubspaceModel> makeCluster(Relation<V> relation, DBIDs C, long[] D) {
  DBIDs ids = DBIDUtil.newHashSet(C); // copy, also to lose distance values!
  Cluster<SubspaceModel> cluster = new Cluster<>(ids);
  cluster.setModel(
      new SubspaceModel(new Subspace(D), Centroid.make(relation, ids).getArrayRef()));
  return cluster;
}
/**
 * DBSCAN-function expandCluster.
 *
 * <p>Border objects become members of the first possible cluster.
 *
 * @param relation Database relation to run on
 * @param rangeQuery Range query to use
 * @param startObjectID potential seed of a new potential cluster
 * @param objprog the progress object for logging the current status
 * @param clusprog the progress object for logging the number of clusters
 */
protected void expandCluster(
    Relation<O> relation,
    RangeQuery<O> rangeQuery,
    DBIDRef startObjectID,
    FiniteProgress objprog,
    IndefiniteProgress clusprog) {
  DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(startObjectID, epsilon);
  ncounter += neighbors.size();

  // startObject is no core object
  if (neighbors.size() < minpts) {
    noise.add(startObjectID);
    processedIDs.add(startObjectID);
    if (objprog != null) {
      objprog.incrementProcessed(LOG);
    }
    return;
  }

  ModifiableDBIDs currentCluster = DBIDUtil.newArray();
  currentCluster.add(startObjectID);
  processedIDs.add(startObjectID);

  // Try to expand the cluster from the seed set.
  HashSetModifiableDBIDs seeds = DBIDUtil.newHashSet();
  processNeighbors(neighbors.iter(), currentCluster, seeds);

  DBIDVar o = DBIDUtil.newVar();
  while (!seeds.isEmpty()) {
    seeds.pop(o);
    neighbors = rangeQuery.getRangeForDBID(o, epsilon);
    ncounter += neighbors.size();
    if (neighbors.size() >= minpts) {
      processNeighbors(neighbors.iter(), currentCluster, seeds);
    }
    if (objprog != null) {
      objprog.incrementProcessed(LOG);
    }
  }
  resultList.add(currentCluster);
  if (clusprog != null) {
    clusprog.setProcessed(resultList.size(), LOG);
  }
}
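/*
 * The seed set drives a breadth-first style expansion: pop an unexplored point, fetch its
 * epsilon-neighborhood, and enqueue newly reached points only if the popped point is a core
 * point. A generic sketch of this frontier pattern with plain Java collections, assuming a
 * hypothetical neighborsOf function:
 */
static java.util.Set<Integer> expandFrom(
    int seed, int minpts, java.util.function.IntFunction<int[]> neighborsOf) {
  java.util.Set<Integer> cluster = new java.util.HashSet<>();
  java.util.ArrayDeque<Integer> frontier = new java.util.ArrayDeque<>();
  cluster.add(seed);
  frontier.add(seed);
  while (!frontier.isEmpty()) {
    int[] neighbors = neighborsOf.apply(frontier.poll());
    if (neighbors.length < minpts) {
      continue; // border point: keep it in the cluster, but do not expand from it
    }
    for (int n : neighbors) {
      if (cluster.add(n)) { // not seen before
        frontier.add(n);
      }
    }
  }
  return cluster;
}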
/**
 * Actual getter.
 *
 * @param id Database ID
 * @param index column index
 * @param <T> type
 * @return current value
 */
@SuppressWarnings("unchecked")
protected <T> T get(DBIDRef id, int index) {
  Object[] d = data.get(DBIDUtil.deref(id));
  if (d == null) {
    return null;
  }
  return (T) d[index];
}
@Override
public boolean contains(DBIDRef o) {
  for (DBIDIter iter = iter(); iter.valid(); iter.advance()) {
    if (DBIDUtil.equal(iter, o)) {
      return true;
    }
  }
  return false;
}
@Override
public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
  if (relation.size() <= 0) {
    return new Clustering<>("k-Means Clustering", "kmeans-clustering");
  }
  // Choose initial means
  if (LOG.isStatistics()) {
    LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
  }
  double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
  // Setup cluster assignment store
  List<ModifiableDBIDs> clusters = new ArrayList<>();
  for (int i = 0; i < k; i++) {
    clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
  }
  WritableIntegerDataStore assignment =
      DataStoreUtil.makeIntegerStorage(
          relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
  double[] varsum = new double[k];

  IndefiniteProgress prog =
      LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
  DoubleStatistic varstat =
      LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null;
  int iteration = 0;
  for (; maxiter <= 0 || iteration < maxiter; iteration++) {
    LOG.incrementProcessed(prog);
    boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum);
    logVarstat(varstat, varsum);
    // Stop if no cluster assignment changed.
    if (!changed) {
      break;
    }
    // Recompute means.
    means = means(clusters, means, relation);
  }
  LOG.setCompleted(prog);
  if (LOG.isStatistics()) {
    LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
  }

  // Wrap result
  Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
  for (int i = 0; i < clusters.size(); i++) {
    DBIDs ids = clusters.get(i);
    if (ids.size() == 0) {
      continue;
    }
    KMeansModel model = new KMeansModel(means[i], varsum[i]);
    result.addToplevelCluster(new Cluster<>(ids, model));
  }
  return result;
}
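/*
 * The loop above is the classic Lloyd iteration: assign each point to its nearest mean, then
 * recompute the means, until no assignment changes (or maxiter is hit). A minimal self-contained
 * sketch on 1-d double data (illustration only, not the class above):
 */
static double[] lloyd1d(double[] data, double[] means, int maxiter) {
  int[] assign = new int[data.length];
  for (int it = 0; it < maxiter; it++) {
    boolean changed = false;
    // Assignment step: nearest mean.
    for (int i = 0; i < data.length; i++) {
      int best = 0;
      for (int j = 1; j < means.length; j++) {
        if (Math.abs(data[i] - means[j]) < Math.abs(data[i] - means[best])) {
          best = j;
        }
      }
      changed |= assign[i] != best;
      assign[i] = best;
    }
    if (!changed) {
      break; // converged
    }
    // Update step: recompute each mean from its assigned points.
    double[] sum = new double[means.length];
    int[] cnt = new int[means.length];
    for (int i = 0; i < data.length; i++) {
      sum[assign[i]] += data[i];
      cnt[assign[i]]++;
    }
    for (int j = 0; j < means.length; j++) {
      if (cnt[j] > 0) {
        means[j] = sum[j] / cnt[j];
      }
    }
  }
  return means;
}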
@Override
public Clustering<BiclusterWithInversionsModel> biclustering() {
  double[][] mat = RelationUtil.relationAsMatrix(relation, rowIDs);

  BiclusterCandidate cand = new BiclusterCandidate(getRowDim(), getColDim());

  Clustering<BiclusterWithInversionsModel> result =
      new Clustering<>("Cheng-and-Church", "Cheng and Church Biclustering");
  ModifiableDBIDs noise = DBIDUtil.newHashSet(relation.getDBIDs());

  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Extracting Cluster", n, LOG) : null;
  for (int i = 0; i < n; i++) {
    cand.reset();
    multipleNodeDeletion(mat, cand);
    if (LOG.isVeryVerbose()) {
      LOG.veryverbose(
          "Residue after Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
    }
    singleNodeDeletion(mat, cand);
    if (LOG.isVeryVerbose()) {
      LOG.veryverbose(
          "Residue after Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
    }
    nodeAddition(mat, cand);
    if (LOG.isVeryVerbose()) {
      LOG.veryverbose(
          "Residue after Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
    }
    cand.maskMatrix(mat, dist);
    BiclusterWithInversionsModel model =
        new BiclusterWithInversionsModel(colsBitsetToIDs(cand.cols), rowsBitsetToIDs(cand.irow));
    final ArrayDBIDs cids = rowsBitsetToIDs(cand.rows);
    noise.removeDBIDs(cids);
    result.addToplevelCluster(new Cluster<>(cids, model));

    if (LOG.isVerbose()) {
      LOG.verbose("Score of bicluster " + (i + 1) + ": " + cand.residue + "\n");
      LOG.verbose("Number of rows: " + cand.rowcard + "\n");
      LOG.verbose("Number of columns: " + cand.colcard + "\n");
      // LOG.verbose("Total number of masked values: " + maskedVals.size() + "\n");
    }
    LOG.incrementProcessed(prog);
  }
  // Add a noise cluster, full-dimensional.
  if (!noise.isEmpty()) {
    long[] allcols = BitsUtil.ones(getColDim());
    BiclusterWithInversionsModel model =
        new BiclusterWithInversionsModel(colsBitsetToIDs(allcols), DBIDUtil.EMPTYDBIDS);
    result.addToplevelCluster(new Cluster<>(noise, true, model));
  }
  LOG.ensureCompleted(prog);
  return result;
}
@Override
public void initialize() {
  super.initialize();
  List<MkAppEntry> objs = new ArrayList<>(relation.size());
  for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
    DBID id = DBIDUtil.deref(iter);
    final O object = relation.get(id);
    objs.add(createNewLeafEntry(id, object, Double.NaN));
  }
  insertAll(objs);
}
private DBIDs mergeJoin(DBIDs first, DBIDs second) {
  // Merge join requires sorted inputs; hash sets have no defined order.
  assert (!(first instanceof HashSetDBIDs));
  assert (!(second instanceof HashSetDBIDs));
  ArrayModifiableDBIDs ids = DBIDUtil.newArray();
  DBIDIter i1 = first.iter(), i2 = second.iter();
  while (i1.valid() && i2.valid()) {
    int c = DBIDUtil.compare(i1, i2);
    if (c < 0) {
      i1.advance();
    } else if (c > 0) {
      i2.advance();
    } else {
      ids.add(i1);
      i1.advance();
      i2.advance();
    }
  }
  return ids;
}
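/*
 * The same two-pointer merge works on any pair of sorted sequences: whichever side holds the
 * smaller key advances, and equal keys are emitted once. A plain int[] version for illustration:
 */
static int[] sortedIntersection(int[] a, int[] b) {
  int[] out = new int[Math.min(a.length, b.length)];
  int i = 0, j = 0, n = 0;
  while (i < a.length && j < b.length) {
    if (a[i] < b[j]) {
      i++;
    } else if (a[i] > b[j]) {
      j++;
    } else {
      out[n++] = a[i];
      i++;
      j++;
    }
  }
  return java.util.Arrays.copyOf(out, n);
}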
/**
 * Computes for each object the distance to one reference point (a one-dimensional
 * representation of the data set).
 *
 * @param refPoint Reference point feature vector
 * @param database database to work on
 * @param distFunc Distance function to use
 * @return list containing, for each database object, the distance to the reference point and
 *     the object id
 */
protected DoubleDBIDList computeDistanceVector(
    NumberVector refPoint,
    Relation<? extends NumberVector> database,
    PrimitiveDistanceQuery<? super NumberVector> distFunc) {
  ModifiableDoubleDBIDList referenceDists = DBIDUtil.newDistanceDBIDList(database.size());
  for (DBIDIter iditer = database.iterDBIDs(); iditer.valid(); iditer.advance()) {
    referenceDists.add(distFunc.distance(iditer, refPoint), iditer);
  }
  referenceDists.sort();
  return referenceDists;
}
/**
 * Refine a range query.
 *
 * @param neighc Original result
 * @param adjustedEps New epsilon
 * @return refined list
 */
private DoubleDBIDList refineRange(DoubleDBIDList neighc, double adjustedEps) {
  ModifiableDoubleDBIDList n = DBIDUtil.newDistanceDBIDList(neighc.size());
  // We don't have a guarantee for this list to be sorted
  for (DoubleDBIDListIter neighbor = neighc.iter(); neighbor.valid(); neighbor.advance()) {
    DoubleDBIDPair p = neighbor.getPair();
    double dist = p.doubleValue();
    if (dist <= adjustedEps) {
      n.add(dist, p);
    }
  }
  return n;
}
@Override
public int binarySearch(DBIDRef key) {
  // On a consecutive ID range, the position can be computed directly.
  int keyid = DBIDUtil.asInteger(key);
  if (keyid < start) {
    return -1; // Not found; insertion position 0, encoded as -(0 + 1).
  }
  final int off = keyid - start;
  if (off < len) {
    return off;
  }
  return -(len + 1); // Not found; insertion position would be len.
}
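/*
 * The return convention mirrors java.util.Arrays.binarySearch: a non-negative value is the
 * offset of the key, a negative value encodes -(insertionPoint) - 1. For example:
 */
static void binarySearchConventionDemo() {
  int[] range = {100, 101, 102, 103}; // like an ID range starting at 100, length 4
  System.out.println(java.util.Arrays.binarySearch(range, 102)); // 2: found at offset 2
  System.out.println(java.util.Arrays.binarySearch(range, 99)); // -1: would insert at 0
  System.out.println(java.util.Arrays.binarySearch(range, 200)); // -5: would insert at 4
}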
@Override
public String toString() {
  StringBuilder buf = new StringBuilder();
  buf.append("kNNList[");
  for (DoubleDBIDListIter iter = this.iter(); iter.valid(); ) {
    buf.append(iter.doubleValue()).append(':').append(DBIDUtil.toString(iter));
    iter.advance();
    if (iter.valid()) {
      buf.append(',');
    }
  }
  buf.append(']');
  return buf.toString();
}
/**
 * Handles a DataStoreEvent with the specified type. If the current event type is not equal to
 * the specified type, the events accumulated up to now will be fired first.
 *
 * <p>The new event will be aggregated and fired on demand if {@link #accumulateDataStoreEvents}
 * is set, otherwise all registered <code>DataStoreListener</code>s will be notified immediately
 * that the content of the database has changed.
 *
 * @param object the object that has been changed, i.e. inserted, deleted or updated
 * @param type the event type
 */
private void fireObjectChanged(DBIDRef object, Type type) {
  // Flush any accumulated events of a different type first.
  if (currentDataStoreEventType != null && !currentDataStoreEventType.equals(type)) {
    flushDataStoreEvents();
  }
  if (this.dataStoreObjects == null) {
    this.dataStoreObjects = DBIDUtil.newHashSet();
  }
  this.dataStoreObjects.add(object);
  currentDataStoreEventType = type;

  if (!accumulateDataStoreEvents) {
    flushDataStoreEvents();
  }
}
/**
 * Refine neighbors within a subset.
 *
 * @param neighc Neighbor candidates
 * @param dbid Query object
 * @param df distance function
 * @param adjustedEps Epsilon range
 * @param kernel Kernel
 * @return Neighbors of neighbor object
 */
private DoubleDBIDList subsetNeighborhoodQuery(
    DoubleDBIDList neighc,
    DBIDRef dbid,
    PrimitiveDistanceFunction<? super V> df,
    double adjustedEps,
    KernelDensityEstimator kernel) {
  ModifiableDoubleDBIDList n = DBIDUtil.newDistanceDBIDList(neighc.size());
  V query = kernel.relation.get(dbid);
  for (DoubleDBIDListIter neighbor = neighc.iter(); neighbor.valid(); neighbor.advance()) {
    DoubleDBIDPair p = neighbor.getPair();
    double dist = df.distance(query, kernel.relation.get(p));
    if (dist <= adjustedEps) {
      n.add(dist, p);
    }
  }
  return n;
}
/**
 * Compute the intersection size.
 *
 * @param neighbors1 SORTED neighbor ids of first
 * @param neighbors2 SORTED neighbor ids of second
 * @return Intersection size
 */
protected static int countSharedNeighbors(DBIDs neighbors1, DBIDs neighbors2) {
  int intersection = 0;
  DBIDIter iter1 = neighbors1.iter();
  DBIDIter iter2 = neighbors2.iter();
  while (iter1.valid() && iter2.valid()) {
    final int comp = DBIDUtil.compare(iter1, iter2);
    if (comp == 0) {
      intersection++;
      iter1.advance();
      iter2.advance();
    } else if (comp < 0) {
      iter1.advance();
    } else { // iter2 < iter1
      iter2.advance();
    }
  }
  return intersection;
}
/**
 * Inserts the specified objects into this index. If a bulk load mode is implemented, the
 * objects are inserted in one bulk.
 *
 * @param ids the objects to be inserted
 */
@Override
public void insertAll(DBIDs ids) {
  if (ids.isEmpty()) {
    return; // Nothing to insert; a single object must still be inserted below.
  }
  if (canBulkLoad()) {
    // Build the leaf entries and load them in one bulk.
    List<SpatialEntry> leafs = new ArrayList<>(ids.size());
    for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
      leafs.add(createNewLeafEntry(iter));
    }
    bulkLoad(leafs);
  } else {
    for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
      insert(DBIDUtil.deref(iter));
    }
  }
  doExtraIntegrityChecks();
}
private DBIDs[] buildIndex(Relation<BitVector> relation, int dim, int minsupp) {
  ArrayModifiableDBIDs[] idx = new ArrayModifiableDBIDs[dim];
  for (int i = 0; i < dim; i++) {
    idx[i] = DBIDUtil.newArray();
  }
  for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
    SparseFeatureVector<?> bv = relation.get(iter);
    // TODO: only count those which satisfy minlength?
    for (int it = bv.iter(); bv.iterValid(it); it = bv.iterAdvance(it)) {
      idx[bv.iterDim(it)].add(iter);
    }
  }
  // Forget non-frequent 1-itemsets.
  for (int i = 0; i < dim; i++) {
    if (idx[i].size() < minsupp) {
      idx[i] = null;
    } else {
      idx[i].sort();
    }
  }
  return idx;
}
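/*
 * Because each per-item DBID list is sorted, the support of a 2-itemset {a, b} is simply the
 * size of the intersection of idx[a] and idx[b], which the mergeJoin method above computes in
 * linear time. A hypothetical usage sketch:
 */
// DBIDs txWithA = idx[a], txWithB = idx[b]; // both sorted, both frequent 1-itemsets
// int support = mergeJoin(txWithA, txWithB).size();
// if (support >= minsupp) { /* then {a, b} is a frequent 2-itemset */ }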
/**
 * Run the ODIN algorithm.
 *
 * @param database Database to run on.
 * @param relation Relation to process.
 * @return ODIN outlier result.
 */
public OutlierResult run(Database database, Relation<O> relation) {
  // Get the query functions:
  DistanceQuery<O> dq = database.getDistanceQuery(relation, getDistanceFunction());
  KNNQuery<O> knnq = database.getKNNQuery(dq, k);

  // Get the objects to process, and a data storage for counting and output:
  DBIDs ids = relation.getDBIDs();
  WritableDoubleDataStore scores =
      DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_DB, 0.);
  // The query point itself is part of the kNN result, hence k - 1.
  double inc = 1. / (k - 1);
  double min = Double.POSITIVE_INFINITY, max = 0.0;
  // Process all objects
  for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
    // Find the nearest neighbors (using an index, if available!)
    DBIDs neighbors = knnq.getKNNForDBID(iter, k);
    // For each neighbor, except ourselves, increase the in-degree:
    for (DBIDIter nei = neighbors.iter(); nei.valid(); nei.advance()) {
      if (DBIDUtil.equal(iter, nei)) {
        continue;
      }
      final double value = scores.doubleValue(nei) + inc;
      if (value < min) {
        min = value;
      }
      if (value > max) {
        max = value;
      }
      scores.put(nei, value);
    }
  }

  // Wrap the result and add metadata.
  OutlierScoreMeta meta = new InvertedOutlierScoreMeta(min, max, 0., inc * (ids.size() - 1), 1);
  DoubleRelation rel = new MaterializedDoubleRelation("ODIN In-Degree", "odin", scores, ids);
  return new OutlierResult(meta, rel);
}
public Result run(Database database, Relation<O> rel) {
  DistanceQuery<O> dq = rel.getDistanceQuery(getDistanceFunction());
  int size = rel.size();
  long pairs = (size * (long) size) >> 1; // ~ n * (n - 1) / 2; close enough for a sampling rate
  final long ssize = sampling <= 1 ? (long) Math.ceil(sampling * pairs) : (long) sampling;
  if (ssize > Integer.MAX_VALUE) {
    throw new AbortException("Sampling size too large.");
  }
  final int qsize = quantile <= 0 ? 1 : (int) Math.ceil(quantile * ssize);

  DoubleMaxHeap heap = new DoubleMaxHeap(qsize);
  ArrayDBIDs ids = DBIDUtil.ensureArray(rel.getDBIDs());
  DBIDArrayIter i1 = ids.iter(), i2 = ids.iter();
  Random r = rand.getSingleThreadedRandom();

  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Sampling", (int) ssize, LOG) : null;
  for (long i = 0; i < ssize; i++) {
    // Sample a pair (x, y) with y < x, so self-distances are never drawn.
    int x = r.nextInt(size - 1) + 1, y = r.nextInt(x);
    double dist = dq.distance(i1.seek(x), i2.seek(y));
    // Skip NaN, and/or zeros.
    if (dist != dist || (nozeros && dist < Double.MIN_NORMAL)) {
      continue;
    }
    heap.add(dist, qsize);
    LOG.incrementProcessed(prog);
  }
  LOG.statistics(new DoubleStatistic(PREFIX + ".quantile", quantile));
  LOG.statistics(new LongStatistic(PREFIX + ".samplesize", ssize));
  LOG.statistics(new DoubleStatistic(PREFIX + ".distance", heap.peek()));
  LOG.ensureCompleted(prog);
  Collection<String> header = Arrays.asList(new String[] {"Distance"});
  Collection<Vector> data = Arrays.asList(new Vector[] {new Vector(heap.peek())});
  return new CollectionResult<Vector>("Distances sample", "distance-sample", data, header);
}
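/*
 * The bounded max-heap is what turns the sample into a quantile estimate: keep only the qsize
 * smallest distances seen, so the heap's root (the largest of those) is the estimated quantile.
 * A sketch of the same idea with java.util.PriorityQueue reversed to act as a max-heap:
 */
static double quantileViaBoundedHeap(double[] samples, double quantile) {
  int qsize = Math.max(1, (int) Math.ceil(quantile * samples.length));
  java.util.PriorityQueue<Double> heap =
      new java.util.PriorityQueue<>(java.util.Collections.reverseOrder());
  for (double d : samples) {
    if (heap.size() < qsize) {
      heap.add(d);
    } else if (d < heap.peek()) {
      heap.poll(); // drop the current largest of the retained values
      heap.add(d);
    }
  }
  return heap.peek(); // the qsize-th smallest sample
}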
/**
 * Run the DBSCAN algorithm.
 *
 * @param relation Data relation
 * @param rangeQuery Range query class
 */
protected void runDBSCAN(Relation<O> relation, RangeQuery<O> rangeQuery) {
  final int size = relation.size();
  FiniteProgress objprog =
      LOG.isVerbose() ? new FiniteProgress("Processing objects", size, LOG) : null;
  IndefiniteProgress clusprog =
      LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null;

  processedIDs = DBIDUtil.newHashSet(size);
  for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
    if (!processedIDs.contains(iditer)) {
      expandCluster(relation, rangeQuery, iditer, objprog, clusprog);
    }
    if (objprog != null && clusprog != null) {
      objprog.setProcessed(processedIDs.size(), LOG);
      clusprog.setProcessed(resultList.size(), LOG);
    }
    if (processedIDs.size() == size) {
      break;
    }
  }
  // Finish progress logging
  LOG.ensureCompleted(objprog);
  LOG.setCompleted(clusprog);
}
@Override
public T get(DBIDRef id) {
  return data.get(DBIDUtil.deref(id));
}
/**
 * Performs a single run of FastDOC, finding a single cluster.
 *
 * @param database Database context
 * @param relation used to get actual values for DBIDs.
 * @param S The set of points we're working on.
 * @param d Dimensionality of the data set we're currently working on.
 * @param n Number of outer iterations (seed points).
 * @param m Number of inner iterations (per seed point).
 * @param r Size of random samples.
 * @return a cluster, if one is found, else <code>null</code>.
 */
private Cluster<SubspaceModel> runFastDOC(
    Database database, Relation<V> relation, ArrayModifiableDBIDs S, int d, int n, int m, int r) {
  // Relevant attributes of highest cardinality.
  long[] D = null;
  // The seed point for the best dimensions.
  DBIDVar dV = DBIDUtil.newVar();

  // Inform the user about the progress in the current iteration.
  FiniteProgress iprogress =
      LOG.isVerbose()
          ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG)
          : null;

  Random random = rnd.getSingleThreadedRandom();
  DBIDArrayIter iter = S.iter();

  outer:
  for (int i = 0; i < n; ++i) {
    // Pick a random seed point.
    iter.seek(random.nextInt(S.size()));

    for (int j = 0; j < m; ++j) {
      // Choose a set of random points.
      DBIDs randomSet = DBIDUtil.randomSample(S, r, random);

      // Initialize cluster info.
      long[] nD = BitsUtil.zero(d);

      // Test each dimension.
      for (int k = 0; k < d; ++k) {
        if (dimensionIsRelevant(k, relation, randomSet)) {
          BitsUtil.setI(nD, k);
        }
      }

      if (D == null || BitsUtil.cardinality(nD) > BitsUtil.cardinality(D)) {
        D = nD;
        dV.set(iter);

        if (BitsUtil.cardinality(D) >= d_zero) {
          if (iprogress != null) {
            iprogress.setProcessed(iprogress.getTotal(), LOG);
          }
          break outer;
        }
      }
      LOG.incrementProcessed(iprogress);
    }
  }
  LOG.ensureCompleted(iprogress);

  // If no relevant dimensions were found, skip it.
  if (D == null || BitsUtil.cardinality(D) == 0) {
    return null;
  }

  // Get all points in the box.
  SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(D);
  DistanceQuery<V> dq = database.getDistanceQuery(relation, df);
  RangeQuery<V> rq = database.getRangeQuery(dq, DatabaseQuery.HINT_SINGLE);

  // TODO: add filtering capabilities into query API!
  DBIDs C = DBIDUtil.intersection(S, rq.getRangeForDBID(dV, w));

  // If we have a non-empty cluster, return it.
  return (C.size() > 0) ? makeCluster(relation, C, D) : null;
}
/**
 * Run the algorithm.
 *
 * @param database Database to use
 * @param relation Relation to use
 * @return Result
 */
public OutlierResult run(Database database, Relation<?> relation) {
  WritableDoubleDataStore scores =
      DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
  DoubleMinMax minmax = new DoubleMinMax();

  try (InputStream in = FileUtil.tryGzipInput(new FileInputStream(file)); //
      TokenizedReader reader = CSVReaderFormat.DEFAULT_FORMAT.makeReader()) {
    Tokenizer tokenizer = reader.getTokenizer();
    CharSequence buf = reader.getBuffer();
    Matcher mi = idpattern.matcher(buf), ms = scorepattern.matcher(buf);
    reader.reset(in);
    while (reader.nextLineExceptComments()) {
      Integer id = null;
      double score = Double.NaN;
      for ( /* initialized by nextLineExceptComments */ ; tokenizer.valid(); tokenizer.advance()) {
        mi.region(tokenizer.getStart(), tokenizer.getEnd());
        ms.region(tokenizer.getStart(), tokenizer.getEnd());
        final boolean mif = mi.find();
        final boolean msf = ms.find();
        if (mif && msf) {
          throw new AbortException(
              "ID pattern and score pattern both match value: " + tokenizer.getSubstring());
        }
        if (mif) {
          if (id != null) {
            throw new AbortException(
                "ID pattern matched twice: previous value "
                    + id
                    + " second value: "
                    + tokenizer.getSubstring());
          }
          id = Integer.parseInt(buf.subSequence(mi.end(), tokenizer.getEnd()).toString());
        }
        if (msf) {
          if (!Double.isNaN(score)) {
            throw new AbortException(
                "Score pattern matched twice: previous value "
                    + score
                    + " second value: "
                    + tokenizer.getSubstring());
          }
          score = ParseUtil.parseDouble(buf, ms.end(), tokenizer.getEnd());
        }
      }
      if (id != null && !Double.isNaN(score)) {
        scores.putDouble(DBIDUtil.importInteger(id), score);
        minmax.put(score);
      } else if (id == null && Double.isNaN(score)) {
        LOG.warning(
            "Line matched neither the ID pattern, the score pattern, nor a comment: "
                + reader.getLineNumber());
      } else {
        throw new AbortException(
            "Line matched only the ID or only the score pattern: " + reader.getLineNumber());
      }
    }
  } catch (IOException e) {
    throw new AbortException(
        "Could not load outlier scores: " + e.getMessage() + " when loading " + file, e);
  }

  OutlierScoreMeta meta =
      inverted
          ? new InvertedOutlierScoreMeta(minmax.getMin(), minmax.getMax())
          : new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax());
  DoubleRelation scoresult =
      new MaterializedDoubleRelation(
          "External Outlier", "external-outlier", scores, relation.getDBIDs());
  OutlierResult or = new OutlierResult(meta, scoresult);

  // Apply scaling
  if (scaling instanceof OutlierScalingFunction) {
    ((OutlierScalingFunction) scaling).prepare(or);
  }
  DoubleMinMax mm = new DoubleMinMax();
  for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
    double val = scoresult.doubleValue(iditer);
    val = scaling.getScaled(val);
    scores.putDouble(iditer, val);
    mm.put(val);
  }
  meta = new BasicOutlierScoreMeta(mm.getMin(), mm.getMax());
  or = new OutlierResult(meta, scoresult);
  return or;
}
/**
 * Run the algorithm.
 *
 * @param db Database
 * @param relation Relation
 * @return Clustering hierarchy
 */
public PointerHierarchyRepresentationResult run(Database db, Relation<O> relation) {
  DistanceQuery<O> dq = db.getDistanceQuery(relation, getDistanceFunction());
  ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
  final int size = ids.size();
  if (size > 0x10000) {
    throw new AbortException(
        "This implementation does not scale to data sets larger than "
            + 0x10000
            + " instances (~17 GB RAM), which results in an integer overflow.");
  }
  if (Linkage.SINGLE.equals(linkage)) {
    LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!");
  }

  // Compute the initial (lower triangular) distance matrix.
  double[] scratch = new double[triangleSize(size)];
  DBIDArrayIter ix = ids.iter(), iy = ids.iter(), ij = ids.iter();
  // Position counter - must agree with computeOffset!
  int pos = 0;
  boolean square =
      Linkage.WARD.equals(linkage)
          && !(SquaredEuclideanDistanceFunction.class.isInstance(getDistanceFunction()));
  for (int x = 0; ix.valid(); x++, ix.advance()) {
    iy.seek(0);
    for (int y = 0; y < x; y++, iy.advance()) {
      scratch[pos] = dq.distance(ix, iy);
      // Ward uses variances -- i.e. squared values
      if (square) {
        scratch[pos] *= scratch[pos];
      }
      pos++;
    }
  }

  // Initialize space for result:
  WritableDBIDDataStore parent =
      DataStoreUtil.makeDBIDStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
  WritableDoubleDataStore height =
      DataStoreUtil.makeDoubleStorage(
          ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
  WritableIntegerDataStore csize =
      DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
  for (DBIDIter it = ids.iter(); it.valid(); it.advance()) {
    parent.put(it, it);
    height.put(it, Double.POSITIVE_INFINITY);
    csize.put(it, 1);
  }

  // Repeat until everything merged, except the desired number of clusters:
  FiniteProgress prog =
      LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", size - 1, LOG) : null;
  for (int i = 1; i < size; i++) {
    double min = Double.POSITIVE_INFINITY;
    int minx = -1, miny = -1;
    for (ix.seek(0); ix.valid(); ix.advance()) {
      if (height.doubleValue(ix) < Double.POSITIVE_INFINITY) {
        continue; // Already merged away.
      }
      final int xbase = triangleSize(ix.getOffset());
      for (iy.seek(0); iy.getOffset() < ix.getOffset(); iy.advance()) {
        if (height.doubleValue(iy) < Double.POSITIVE_INFINITY) {
          continue;
        }
        final int idx = xbase + iy.getOffset();
        if (scratch[idx] <= min) {
          min = scratch[idx];
          minx = ix.getOffset();
          miny = iy.getOffset();
        }
      }
    }
    assert (minx >= 0 && miny >= 0);
    // Avoid allocating memory, by reusing existing iterators:
    ix.seek(minx);
    iy.seek(miny);
    // Perform merge in data structure: x -> y
    // Since y < x, prefer keeping y, dropping x.
    int sizex = csize.intValue(ix), sizey = csize.intValue(iy);
    height.put(ix, min);
    parent.put(ix, iy);
    csize.put(iy, sizex + sizey);

    // Update distance matrix. Note: miny < minx
    final int xbase = triangleSize(minx), ybase = triangleSize(miny);
    // Write to (y, j), with j < y
    for (ij.seek(0); ij.getOffset() < miny; ij.advance()) {
      if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) {
        continue;
      }
      final int sizej = csize.intValue(ij);
      scratch[ybase + ij.getOffset()] =
          linkage.combine(
              sizex, scratch[xbase + ij.getOffset()],
              sizey, scratch[ybase + ij.getOffset()],
              sizej, min);
    }
    // Write to (j, y), with y < j < x
    for (ij.seek(miny + 1); ij.getOffset() < minx; ij.advance()) {
      if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) {
        continue;
      }
      final int jbase = triangleSize(ij.getOffset());
      final int sizej = csize.intValue(ij);
      scratch[jbase + miny] =
          linkage.combine(
              sizex, scratch[xbase + ij.getOffset()],
              sizey, scratch[jbase + miny],
              sizej, min);
    }
    // Write to (j, y), with y < x < j
    for (ij.seek(minx + 1); ij.valid(); ij.advance()) {
      if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) {
        continue;
      }
      final int jbase = triangleSize(ij.getOffset());
      final int sizej = csize.intValue(ij);
      scratch[jbase + miny] =
          linkage.combine(sizex, scratch[jbase + minx], sizey, scratch[jbase + miny], sizej, min);
    }
    LOG.incrementProcessed(prog);
  }
  LOG.ensureCompleted(prog);

  return new PointerHierarchyRepresentationResult(ids, parent, height);
}
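/*
 * The scratch array packs the strict lower triangle of the distance matrix row by row. Assuming
 * triangleSize(x) = x * (x - 1) / 2, which is consistent with how it is used above, the distance
 * between the objects at offsets x and y (with y < x) lives at index triangleSize(x) + y:
 */
static int triangleSize(int x) {
  return (x * (x - 1)) >>> 1; // total entries in rows 0 .. x-1 of the strict lower triangle
}

static int lowerTriangleIndex(int x, int y) {
  assert y < x;
  return triangleSize(x) + y; // e.g. (x=3, y=1) -> 3*2/2 + 1 = 4
}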