Example #1
 @Override
 public T put(DBIDRef id, T value) {
   if (value == null) {
     return data.remove(DBIDUtil.deref(id));
   }
   return data.put(DBIDUtil.deref(id), value);
 }
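In this store, a null value doubles as deletion: put(id, null) removes the mapping rather than storing null. A minimal usage sketch of that contract (hypothetical: store is an instance of this class with T = String, id a valid DBIDRef; not part of the original listing):

 store.put(id, "label");             // insert; returns the previous value (here: null)
 String prev = store.put(id, null);  // null value: the entry is removed instead
 assert "label".equals(prev) && store.get(id) == null;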
Example #2
 /**
  * Constructor.
  *
  * @param size Size
  * @param idmap ID map
  */
 public ArrayDBIDStore(int size, DataStoreIDMap idmap) {
   super();
   this.data = DBIDUtil.newArray(size);
   // Initialize
   DBIDRef inv = DBIDUtil.invalid();
   for (int i = 0; i < size; i++) {
     data.add(inv);
   }
   this.idmap = idmap;
 }
Example #3
 /**
  * Actual setter.
  *
  * @param id Database ID
  * @param index column index
  * @param value new value
  * @param <T> type
  * @return previous value
  */
 @SuppressWarnings("unchecked")
 protected <T> T set(DBIDRef id, int index, T value) {
   Object[] d = data.get(DBIDUtil.deref(id));
   if (d == null) {
     d = new Object[rlen];
     data.put(DBIDUtil.deref(id), d);
   }
   T ret = (T) d[index];
   d[index] = value;
   return ret;
 }
Example #4
  /**
   * Handles a DataStoreEvent with the specified type. If the current event type is not equal to the
   * specified type, the events accumulated up to now will be fired first.
   *
   * <p>The new event will be aggregated and fired on demand if {@link #accumulateDataStoreEvents}
   * is set, otherwise all registered <code>DataStoreListener</code> will be notified immediately
   * that the content of the database has been changed.
   *
    * @param objects the objects that have been changed, i.e. inserted, deleted or updated
    * @param type the type of change (insertion, removal, or update)
    */
  private void fireObjectsChanged(DBIDs objects, Type type) {
    // flush first
    if (currentDataStoreEventType != null && !currentDataStoreEventType.equals(type)) {
      flushDataStoreEvents();
    }
    if (accumulateDataStoreEvents) {
      if (this.dataStoreObjects == null) {
        this.dataStoreObjects = DBIDUtil.newHashSet();
      }
      this.dataStoreObjects.addDBIDs(objects);
      currentDataStoreEventType = type;
      return;
    }
    // Execute immediately:
    DataStoreEvent e;
    switch (type) {
      case INSERT:
        e = DataStoreEvent.insertionEvent(objects);
        break;
      case REMOVE:
        e = DataStoreEvent.removalEvent(objects);
        break;
      case UPDATE:
        e = DataStoreEvent.updateEvent(objects);
        break;
      default:
        return;
    }

    for (int i = dataListenerList.size(); --i >= 0; ) {
      dataListenerList.get(i).contentChanged(e);
    }
  }
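A sketch of how the accumulation logic above plays out, as a call sequence (the DBIDs arguments are hypothetical; this is not part of the original listing):

   // With accumulateDataStoreEvents == true:
   fireObjectsChanged(inserted1, Type.INSERT); // buffered as a pending INSERT event
   fireObjectsChanged(inserted2, Type.INSERT); // merged into the same pending event
   fireObjectsChanged(removed, Type.REMOVE);   // type change: pending INSERT is flushed first
   flushDataStoreEvents();                     // finally emits the buffered REMOVE event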
Example #5
  protected double[] computeWithinDistances(
      Relation<? extends NumberVector> rel, List<? extends Cluster<?>> clusters, int withinPairs) {
    double[] concordant = new double[withinPairs];
    int i = 0;
    for (Cluster<?> cluster : clusters) {
      if (cluster.size() <= 1 || cluster.isNoise()) {
        switch (noiseHandling) {
          case IGNORE_NOISE:
            continue;
          case TREAT_NOISE_AS_SINGLETONS:
            continue; // No concordant distances.
          case MERGE_NOISE:
            break; // Treat like a cluster below.
        }
      }

      for (DBIDIter it1 = cluster.getIDs().iter(); it1.valid(); it1.advance()) {
        NumberVector obj = rel.get(it1);
        for (DBIDIter it2 = cluster.getIDs().iter(); it2.valid(); it2.advance()) {
          if (DBIDUtil.compare(it1, it2) <= 0) {
            continue;
          }
          concordant[i++] = distanceFunction.distance(obj, rel.get(it2));
        }
      }
    }
    assert (concordant.length == i);
    Arrays.sort(concordant);
    return concordant;
  }
Example #6
  /** Performs the DBSCAN algorithm on the given database. */
  public Clustering<Model> run(Relation<O> relation) {
    final int size = relation.size();
    if (size < minpts) {
      Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
      result.addToplevelCluster(
          new Cluster<Model>(relation.getDBIDs(), true, ClusterModel.CLUSTER));
      return result;
    }

    RangeQuery<O> rangeQuery = QueryUtil.getRangeQuery(relation, getDistanceFunction());
    resultList = new ArrayList<>();
    noise = DBIDUtil.newHashSet();
    runDBSCAN(relation, rangeQuery);

    double averagen = ncounter / (double) relation.size();
    LOG.statistics(new DoubleStatistic(DBSCAN.class.getName() + ".average-neighbors", averagen));
    if (averagen < 1 + 0.1 * (minpts - 1)) {
      LOG.warning("There are very few neighbors found. Epsilon may be too small.");
    }
    if (averagen > 100 * minpts) {
      LOG.warning("There are very many neighbors found. Epsilon may be too large.");
    }

    Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
    for (ModifiableDBIDs res : resultList) {
      result.addToplevelCluster(new Cluster<Model>(res, ClusterModel.CLUSTER));
    }
    result.addToplevelCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER));
    return result;
  }
Example #7
 /**
  * Utility method to create a subspace cluster from a list of DBIDs and the relevant attributes.
  *
  * @param relation the relation, used to compute the cluster centroid.
  * @param C the cluster points.
  * @param D the relevant dimensions.
  * @return an object representing the subspace cluster.
  */
 private Cluster<SubspaceModel> makeCluster(Relation<V> relation, DBIDs C, long[] D) {
   DBIDs ids = DBIDUtil.newHashSet(C); // copy, also to lose distance values!
   Cluster<SubspaceModel> cluster = new Cluster<>(ids);
   cluster.setModel(
       new SubspaceModel(new Subspace(D), Centroid.make(relation, ids).getArrayRef()));
   return cluster;
 }
Example #8
  /**
   * DBSCAN-function expandCluster.
   *
   * <p>Border-Objects become members of the first possible cluster.
   *
   * @param relation Database relation to run on
   * @param rangeQuery Range query to use
   * @param startObjectID potential seed of a new potential cluster
    * @param objprog the progress object for logging the current status
    * @param clusprog the progress object for logging the number of clusters
    */
  protected void expandCluster(
      Relation<O> relation,
      RangeQuery<O> rangeQuery,
      DBIDRef startObjectID,
      FiniteProgress objprog,
      IndefiniteProgress clusprog) {
    DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(startObjectID, epsilon);
    ncounter += neighbors.size();

     // startObject is not a core object
    if (neighbors.size() < minpts) {
      noise.add(startObjectID);
      processedIDs.add(startObjectID);
      if (objprog != null) {
        objprog.incrementProcessed(LOG);
      }
      return;
    }

    ModifiableDBIDs currentCluster = DBIDUtil.newArray();
    currentCluster.add(startObjectID);
    processedIDs.add(startObjectID);

    // try to expand the cluster
    HashSetModifiableDBIDs seeds = DBIDUtil.newHashSet();
    processNeighbors(neighbors.iter(), currentCluster, seeds);

    DBIDVar o = DBIDUtil.newVar();
    while (!seeds.isEmpty()) {
      seeds.pop(o);
      neighbors = rangeQuery.getRangeForDBID(o, epsilon);
      ncounter += neighbors.size();

      if (neighbors.size() >= minpts) {
        processNeighbors(neighbors.iter(), currentCluster, seeds);
      }

      if (objprog != null) {
        objprog.incrementProcessed(LOG);
      }
    }
    resultList.add(currentCluster);
    if (clusprog != null) {
      clusprog.setProcessed(resultList.size(), LOG);
    }
  }
Example #9
 /**
  * Actual getter.
  *
  * @param id Database ID
  * @param index column index
  * @param <T> type
  * @return current value
  */
 @SuppressWarnings("unchecked")
 protected <T> T get(DBIDRef id, int index) {
   Object[] d = data.get(DBIDUtil.deref(id));
   if (d == null) {
     return null;
   }
   return (T) d[index];
 }
 @Override
 public boolean contains(DBIDRef o) {
   for (DBIDIter iter = iter(); iter.valid(); iter.advance()) {
     if (DBIDUtil.equal(iter, o)) {
       return true;
     }
   }
   return false;
 }
Example #11
  @Override
  public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
    if (relation.size() <= 0) {
      return new Clustering<>("k-Means Clustering", "kmeans-clustering");
    }
    // Choose initial means
    if (LOG.isStatistics()) {
      LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
    // Setup cluster assignment store
    List<ModifiableDBIDs> clusters = new ArrayList<>();
    for (int i = 0; i < k; i++) {
      clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
    }
    WritableIntegerDataStore assignment =
        DataStoreUtil.makeIntegerStorage(
            relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
    double[] varsum = new double[k];

    IndefiniteProgress prog =
        LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
    DoubleStatistic varstat =
        LOG.isStatistics()
            ? new DoubleStatistic(this.getClass().getName() + ".variance-sum")
            : null;
    int iteration = 0;
    for (; maxiter <= 0 || iteration < maxiter; iteration++) {
      LOG.incrementProcessed(prog);
      boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum);
      logVarstat(varstat, varsum);
      // Stop if no cluster assignment changed.
      if (!changed) {
        break;
      }
      // Recompute means.
      means = means(clusters, means, relation);
    }
    LOG.setCompleted(prog);
    if (LOG.isStatistics()) {
      LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
    }

    // Wrap result
    Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
    for (int i = 0; i < clusters.size(); i++) {
      DBIDs ids = clusters.get(i);
      if (ids.size() == 0) {
        continue;
      }
      KMeansModel model = new KMeansModel(means[i], varsum[i]);
      result.addToplevelCluster(new Cluster<>(ids, model));
    }
    return result;
  }
Example #12
  @Override
  public Clustering<BiclusterWithInversionsModel> biclustering() {
    double[][] mat = RelationUtil.relationAsMatrix(relation, rowIDs);

    BiclusterCandidate cand = new BiclusterCandidate(getRowDim(), getColDim());

    Clustering<BiclusterWithInversionsModel> result =
        new Clustering<>("Cheng-and-Church", "Cheng and Church Biclustering");
    ModifiableDBIDs noise = DBIDUtil.newHashSet(relation.getDBIDs());

    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Extracting Cluster", n, LOG) : null;
    for (int i = 0; i < n; i++) {
      cand.reset();
      multipleNodeDeletion(mat, cand);
      if (LOG.isVeryVerbose()) {
        LOG.veryverbose(
            "Residue after Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
      singleNodeDeletion(mat, cand);
      if (LOG.isVeryVerbose()) {
        LOG.veryverbose(
            "Residue after Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
      nodeAddition(mat, cand);
      if (LOG.isVeryVerbose()) {
        LOG.veryverbose(
            "Residue after Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
      cand.maskMatrix(mat, dist);
      BiclusterWithInversionsModel model =
          new BiclusterWithInversionsModel(colsBitsetToIDs(cand.cols), rowsBitsetToIDs(cand.irow));
      final ArrayDBIDs cids = rowsBitsetToIDs(cand.rows);
      noise.removeDBIDs(cids);
      result.addToplevelCluster(new Cluster<>(cids, model));

      if (LOG.isVerbose()) {
        LOG.verbose("Score of bicluster " + (i + 1) + ": " + cand.residue + "\n");
        LOG.verbose("Number of rows: " + cand.rowcard + "\n");
        LOG.verbose("Number of columns: " + cand.colcard + "\n");
        // LOG.verbose("Total number of masked values: " + maskedVals.size() +
        // "\n");
      }
      LOG.incrementProcessed(prog);
    }
    // Add a noise cluster, full-dimensional.
    if (!noise.isEmpty()) {
      long[] allcols = BitsUtil.ones(getColDim());
      BiclusterWithInversionsModel model =
          new BiclusterWithInversionsModel(colsBitsetToIDs(allcols), DBIDUtil.EMPTYDBIDS);
      result.addToplevelCluster(new Cluster<>(noise, true, model));
    }
    LOG.ensureCompleted(prog);
    return result;
  }
Example #13
 @Override
 public void initialize() {
   super.initialize();
   List<MkAppEntry> objs = new ArrayList<>(relation.size());
   for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
     DBID id = DBIDUtil.deref(iter);
     final O object = relation.get(id);
     objs.add(createNewLeafEntry(id, object, Double.NaN));
   }
   insertAll(objs);
 }
Example #14
  private DBIDs mergeJoin(DBIDs first, DBIDs second) {
    assert (!(first instanceof HashSetDBIDs));
    assert (!(second instanceof HashSetDBIDs));
    ArrayModifiableDBIDs ids = DBIDUtil.newArray();

    DBIDIter i1 = first.iter(), i2 = second.iter();
    while (i1.valid() && i2.valid()) {
      int c = DBIDUtil.compare(i1, i2);
      if (c < 0) {
        i1.advance();
      } else if (c > 0) {
        i2.advance();
      } else {
        ids.add(i1);
        i1.advance();
        i2.advance();
      }
    }
    return ids;
  }
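mergeJoin computes a sorted intersection in a single linear pass; both inputs must already be sorted arrays (the asserts reject hash sets). A minimal usage sketch under the assumption that ids are created via DBIDUtil.importInteger (how the surrounding code obtains its ids is not shown here):

   ArrayModifiableDBIDs a = DBIDUtil.newArray(), b = DBIDUtil.newArray();
   for (int i : new int[] {1, 3, 5, 7}) { a.add(DBIDUtil.importInteger(i)); }
   for (int i : new int[] {3, 4, 5}) { b.add(DBIDUtil.importInteger(i)); }
   a.sort();
   b.sort();
   DBIDs common = mergeJoin(a, b); // contains exactly the ids 3 and 5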
Example #15
 /**
  * Computes for each object the distance to one reference point (a one-dimensional
  * representation of the data set).
  *
  * @param refPoint Reference Point Feature Vector
  * @param database database to work on
  * @param distFunc Distance function to use
  * @return array containing the distance to one reference point for each database object and the
  *     object id
  */
 protected DoubleDBIDList computeDistanceVector(
     NumberVector refPoint,
     Relation<? extends NumberVector> database,
     PrimitiveDistanceQuery<? super NumberVector> distFunc) {
   ModifiableDoubleDBIDList referenceDists = DBIDUtil.newDistanceDBIDList(database.size());
   for (DBIDIter iditer = database.iterDBIDs(); iditer.valid(); iditer.advance()) {
     referenceDists.add(distFunc.distance(iditer, refPoint), iditer);
   }
   referenceDists.sort();
   return referenceDists;
 }
Example #16
 /**
  * Refine a range query.
  *
  * @param neighc Original result
  * @param adjustedEps New epsilon
  * @return refined list
  */
 private DoubleDBIDList refineRange(DoubleDBIDList neighc, double adjustedEps) {
   ModifiableDoubleDBIDList n = DBIDUtil.newDistanceDBIDList(neighc.size());
   // We don't have a guarantee for this list to be sorted
   for (DoubleDBIDListIter neighbor = neighc.iter(); neighbor.valid(); neighbor.advance()) {
     DoubleDBIDPair p = neighbor.getPair();
     double dist = p.doubleValue();
     if (dist <= adjustedEps) {
       n.add(dist, p);
     }
   }
   return n;
 }
Example #17
 @Override
 public int binarySearch(DBIDRef key) {
   int keyid = DBIDUtil.asInteger(key);
   if (keyid < start) {
     return -1;
   }
   final int off = keyid - start;
   if (off < len) {
     return off;
   }
   return -(len + 1);
 }
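The return convention mirrors java.util.Arrays.binarySearch: a hit yields the offset within the range, a miss yields -(insertion point) - 1. A worked example, assuming a hypothetical range object of this class with start = 100 and len = 5 (ids 100..104):

  assert range.binarySearch(DBIDUtil.importInteger(103)) == 3;  // hit: offset 3
  assert range.binarySearch(DBIDUtil.importInteger(99)) == -1;  // below start: -(0) - 1
  assert range.binarySearch(DBIDUtil.importInteger(200)) == -6; // past the end: -(5) - 1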
 @Override
 public String toString() {
   StringBuilder buf = new StringBuilder();
   buf.append("kNNList[");
   for (DoubleDBIDListIter iter = this.iter(); iter.valid(); ) {
     buf.append(iter.doubleValue()).append(':').append(DBIDUtil.toString(iter));
     iter.advance();
     if (iter.valid()) {
       buf.append(',');
     }
   }
   buf.append(']');
   return buf.toString();
 }
Example #19
  /**
   * Handles a DataStoreEvent with the specified type. If the current event type is not equal to the
   * specified type, the events accumulated up to now will be fired first.
   *
   * <p>The new event will be aggregated and fired on demand if {@link #accumulateDataStoreEvents}
   * is set, otherwise all registered <code>DataStoreListener</code> will be notified immediately
   * that the content of the database has been changed.
   *
    * @param object the object that has been changed, i.e. inserted, deleted or updated
    * @param type the type of change (insertion, removal, or update)
    */
  private void fireObjectChanged(DBIDRef object, Type type) {
    // flush first
    if (currentDataStoreEventType != null && !currentDataStoreEventType.equals(type)) {
      flushDataStoreEvents();
    }
    if (this.dataStoreObjects == null) {
      this.dataStoreObjects = DBIDUtil.newHashSet();
    }
    this.dataStoreObjects.add(object);
    currentDataStoreEventType = type;

    if (!accumulateDataStoreEvents) {
      flushDataStoreEvents();
    }
  }
Example #20
 /**
  * Refine neighbors within a subset.
  *
  * @param neighc Neighbor candidates
  * @param dbid Query object
  * @param df distance function
  * @param adjustedEps Epsilon range
  * @param kernel Kernel
  * @return Neighbors of neighbor object
  */
 private DoubleDBIDList subsetNeighborhoodQuery(
     DoubleDBIDList neighc,
     DBIDRef dbid,
     PrimitiveDistanceFunction<? super V> df,
     double adjustedEps,
     KernelDensityEstimator kernel) {
   ModifiableDoubleDBIDList n = DBIDUtil.newDistanceDBIDList(neighc.size());
   V query = kernel.relation.get(dbid);
   for (DoubleDBIDListIter neighbor = neighc.iter(); neighbor.valid(); neighbor.advance()) {
     DoubleDBIDPair p = neighbor.getPair();
     double dist = df.distance(query, kernel.relation.get(p));
     if (dist <= adjustedEps) {
       n.add(dist, p);
     }
   }
   return n;
 }
 /**
  * Compute the intersection size.
  *
  * @param neighbors1 SORTED neighbor ids of first
  * @param neighbors2 SORTED neighbor ids of second
  * @return Intersection size
  */
 protected static int countSharedNeighbors(DBIDs neighbors1, DBIDs neighbors2) {
   int intersection = 0;
   DBIDIter iter1 = neighbors1.iter();
   DBIDIter iter2 = neighbors2.iter();
   while (iter1.valid() && iter2.valid()) {
     final int comp = DBIDUtil.compare(iter1, iter2);
     if (comp == 0) {
       intersection++;
       iter1.advance();
       iter2.advance();
     } else if (comp < 0) {
       iter1.advance();
     } else // iter2 < iter1
     {
       iter2.advance();
     }
   }
   return intersection;
 }
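countSharedNeighbors is the counting twin of the sorted-merge intersection in Example #14: the same two-pointer walk, but it only tallies matches. A hypothetical sketch (again assuming ids via DBIDUtil.importInteger):

  ArrayModifiableDBIDs n1 = DBIDUtil.newArray(), n2 = DBIDUtil.newArray();
  for (int i : new int[] {2, 4, 6, 8}) { n1.add(DBIDUtil.importInteger(i)); }
  for (int i : new int[] {4, 8, 10}) { n2.add(DBIDUtil.importInteger(i)); }
  n1.sort();
  n2.sort();
  assert countSharedNeighbors(n1, n2) == 2; // shared ids: 4 and 8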
Example #22
  /**
   * Inserts the specified objects into this index. If a bulk load mode is implemented, the objects
   * are inserted in one bulk.
   *
   * @param ids the objects to be inserted
   */
  @Override
  public void insertAll(DBIDs ids) {
    if (ids.isEmpty() || (ids.size() == 1)) {
      return;
    }

     // Bulk-load if supported; otherwise insert one object at a time.
    if (canBulkLoad()) {
      List<SpatialEntry> leafs = new ArrayList<>(ids.size());
      for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
        leafs.add(createNewLeafEntry(iter));
      }
      bulkLoad(leafs);
    } else {
      for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
        insert(DBIDUtil.deref(iter));
      }
    }

    doExtraIntegrityChecks();
  }
Example #23
 private DBIDs[] buildIndex(Relation<BitVector> relation, int dim, int minsupp) {
   ArrayModifiableDBIDs[] idx = new ArrayModifiableDBIDs[dim];
   for (int i = 0; i < dim; i++) {
     idx[i] = DBIDUtil.newArray();
   }
   for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
     SparseFeatureVector<?> bv = relation.get(iter);
     // TODO: only count those which satisfy minlength?
     for (int it = bv.iter(); bv.iterValid(it); it = bv.iterAdvance(it)) {
       idx[bv.iterDim(it)].add(iter);
     }
   }
   // Forget non-frequent 1-itemsets.
   for (int i = 0; i < dim; i++) {
     if (idx[i].size() < minsupp) {
       idx[i] = null;
     } else {
       idx[i].sort();
     }
   }
   return idx;
 }
Example #24
  /**
   * Run the ODIN algorithm
   *
   * @param database Database to run on.
   * @param relation Relation to process.
   * @return ODIN outlier result.
   */
  public OutlierResult run(Database database, Relation<O> relation) {
    // Get the query functions:
    DistanceQuery<O> dq = database.getDistanceQuery(relation, getDistanceFunction());
    KNNQuery<O> knnq = database.getKNNQuery(dq, k);

    // Get the objects to process, and a data storage for counting and output:
    DBIDs ids = relation.getDBIDs();
    WritableDoubleDataStore scores =
        DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_DB, 0.);

    double inc = 1. / (k - 1);
    double min = Double.POSITIVE_INFINITY, max = 0.0;
    // Process all objects
    for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
      // Find the nearest neighbors (using an index, if available!)
      DBIDs neighbors = knnq.getKNNForDBID(iter, k);
      // For each neighbor, except ourselves, increase the in-degree:
      for (DBIDIter nei = neighbors.iter(); nei.valid(); nei.advance()) {
        if (DBIDUtil.equal(iter, nei)) {
          continue;
        }
        final double value = scores.doubleValue(nei) + inc;
        if (value < min) {
          min = value;
        }
        if (value > max) {
          max = value;
        }
        scores.put(nei, value);
      }
    }

    // Wrap the result and add metadata.
    OutlierScoreMeta meta = new InvertedOutlierScoreMeta(min, max, 0., inc * (ids.size() - 1), 1);
    DoubleRelation rel = new MaterializedDoubleRelation("ODIN In-Degree", "odin", scores, ids);
    return new OutlierResult(meta, rel);
  }
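The score scale follows directly from inc = 1 / (k - 1): every object distributes that weight to its k - 1 nearest neighbors (itself excluded), so an object's score is its kNN in-degree divided by k - 1. A worked check of the arithmetic (comment only):

   // With k = 5, inc = 1/4 = 0.25:
   //   referenced by 8 other kNN lists -> score 8 * 0.25 = 2.0
   //   referenced by the average ~4    -> score ~1.0 (in- and out-degree balance)
   //   referenced by nobody            -> score 0.0, i.e. an ODIN outlier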
Example #25
  public Result run(Database database, Relation<O> rel) {
    DistanceQuery<O> dq = rel.getDistanceQuery(getDistanceFunction());
    int size = rel.size();
    long pairs = (size * (long) size) >> 1;

    final long ssize = sampling <= 1 ? (long) Math.ceil(sampling * pairs) : (long) sampling;
    if (ssize > Integer.MAX_VALUE) {
      throw new AbortException("Sampling size too large.");
    }
    final int qsize = quantile <= 0 ? 1 : (int) Math.ceil(quantile * ssize);

    DoubleMaxHeap heap = new DoubleMaxHeap(qsize);

    ArrayDBIDs ids = DBIDUtil.ensureArray(rel.getDBIDs());
    DBIDArrayIter i1 = ids.iter(), i2 = ids.iter();
    Random r = rand.getSingleThreadedRandom();

    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Sampling", (int) ssize, LOG) : null;
    for (long i = 0; i < ssize; i++) {
      int x = r.nextInt(size - 1) + 1, y = r.nextInt(x);
      double dist = dq.distance(i1.seek(x), i2.seek(y));
      // Skip NaN, and/or zeros.
      if (dist != dist || (nozeros && dist < Double.MIN_NORMAL)) {
        continue;
      }
      heap.add(dist, qsize);
      LOG.incrementProcessed(prog);
    }

    LOG.statistics(new DoubleStatistic(PREFIX + ".quantile", quantile));
    LOG.statistics(new LongStatistic(PREFIX + ".samplesize", ssize));
    LOG.statistics(new DoubleStatistic(PREFIX + ".distance", heap.peek()));
    LOG.ensureCompleted(prog);
    Collection<String> header = Arrays.asList(new String[] {"Distance"});
    Collection<Vector> data = Arrays.asList(new Vector[] {new Vector(heap.peek())});
    return new CollectionResult<Vector>("Distances sample", "distance-sample", data, header);
  }
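The bounded max-heap keeps the qsize smallest sampled distances, so heap.peek() ends up at the requested quantile. A worked pass through the parameter arithmetic (comment only; the numbers are illustrative):

   // size = 10000 -> pairs = (10000 * 10000L) >> 1 = 50,000,000
   // sampling = 0.01 (a rate, since <= 1) -> ssize = ceil(0.01 * 5e7) = 500,000
   // quantile = 0.1 -> qsize = ceil(0.1 * 500,000) = 50,000
   // heap.add(dist, qsize) evicts the largest element once the heap is full, so
   // heap.peek() is roughly the 10% distance quantile of the sampled pairs.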
Example #26
  /**
   * Run the DBSCAN algorithm
   *
   * @param relation Data relation
   * @param rangeQuery Range query class
   */
  protected void runDBSCAN(Relation<O> relation, RangeQuery<O> rangeQuery) {
    final int size = relation.size();
    FiniteProgress objprog =
        LOG.isVerbose() ? new FiniteProgress("Processing objects", size, LOG) : null;
    IndefiniteProgress clusprog =
        LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null;

    processedIDs = DBIDUtil.newHashSet(size);
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      if (!processedIDs.contains(iditer)) {
        expandCluster(relation, rangeQuery, iditer, objprog, clusprog);
      }
      if (objprog != null && clusprog != null) {
        objprog.setProcessed(processedIDs.size(), LOG);
        clusprog.setProcessed(resultList.size(), LOG);
      }
      if (processedIDs.size() == size) {
        break;
      }
    }
    // Finish progress logging
    LOG.ensureCompleted(objprog);
    LOG.setCompleted(clusprog);
  }
Example #27
 @Override
 public T get(DBIDRef id) {
   return data.get(DBIDUtil.deref(id));
 }
Example #28
  /**
   * Performs a single run of FastDOC, finding a single cluster.
   *
   * @param database Database context
   * @param relation used to get actual values for DBIDs.
   * @param S The set of points we're working on.
    * @param d Dimensionality of the data set we're currently working on.
    * @param n Number of outer iterations (seed points).
    * @param m Number of inner iterations (per seed point).
    * @param r Size of random samples.
   * @return a cluster, if one is found, else <code>null</code>.
   */
  private Cluster<SubspaceModel> runFastDOC(
      Database database, Relation<V> relation, ArrayModifiableDBIDs S, int d, int n, int m, int r) {
    // Relevant attributes of highest cardinality.
    long[] D = null;
    // The seed point for the best dimensions.
    DBIDVar dV = DBIDUtil.newVar();

    // Inform the user about the progress in the current iteration.
    FiniteProgress iprogress =
        LOG.isVerbose()
            ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG)
            : null;

    Random random = rnd.getSingleThreadedRandom();

    DBIDArrayIter iter = S.iter();
    outer:
    for (int i = 0; i < n; ++i) {
      // Pick a random seed point.
      iter.seek(random.nextInt(S.size()));

      for (int j = 0; j < m; ++j) {
        // Choose a set of random points.
        DBIDs randomSet = DBIDUtil.randomSample(S, r, random);

        // Initialize cluster info.
        long[] nD = BitsUtil.zero(d);

        // Test each dimension.
        for (int k = 0; k < d; ++k) {
          if (dimensionIsRelevant(k, relation, randomSet)) {
            BitsUtil.setI(nD, k);
          }
        }

        if (D == null || BitsUtil.cardinality(nD) > BitsUtil.cardinality(D)) {
          D = nD;
          dV.set(iter);

          if (BitsUtil.cardinality(D) >= d_zero) {
            if (iprogress != null) {
              iprogress.setProcessed(iprogress.getTotal(), LOG);
            }
            break outer;
          }
        }
        LOG.incrementProcessed(iprogress);
      }
    }
    LOG.ensureCompleted(iprogress);

    // If no relevant dimensions were found, skip it.
    if (D == null || BitsUtil.cardinality(D) == 0) {
      return null;
    }

    // Get all points in the box.
    SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(D);
    DistanceQuery<V> dq = database.getDistanceQuery(relation, df);
    RangeQuery<V> rq = database.getRangeQuery(dq, DatabaseQuery.HINT_SINGLE);

    // TODO: add filtering capabilities into query API!
    DBIDs C = DBIDUtil.intersection(S, rq.getRangeForDBID(dV, w));

    // If we have a non-empty cluster, return it.
    return (C.size() > 0) ? makeCluster(relation, C, D) : null;
  }
Example #29
  /**
   * Run the algorithm.
   *
   * @param database Database to use
   * @param relation Relation to use
   * @return Result
   */
  public OutlierResult run(Database database, Relation<?> relation) {
    WritableDoubleDataStore scores =
        DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);

    DoubleMinMax minmax = new DoubleMinMax();

    try (InputStream in = FileUtil.tryGzipInput(new FileInputStream(file)); //
        TokenizedReader reader = CSVReaderFormat.DEFAULT_FORMAT.makeReader()) {
      Tokenizer tokenizer = reader.getTokenizer();
      CharSequence buf = reader.getBuffer();
      Matcher mi = idpattern.matcher(buf), ms = scorepattern.matcher(buf);
      reader.reset(in);
      while (reader.nextLineExceptComments()) {
        Integer id = null;
        double score = Double.NaN;
        for (/* initialized by nextLineExceptComments */; tokenizer.valid(); tokenizer.advance()) {
          mi.region(tokenizer.getStart(), tokenizer.getEnd());
          ms.region(tokenizer.getStart(), tokenizer.getEnd());
          final boolean mif = mi.find();
          final boolean msf = ms.find();
          if (mif && msf) {
            throw new AbortException(
                "ID pattern and score pattern both match value: " + tokenizer.getSubstring());
          }
          if (mif) {
            if (id != null) {
              throw new AbortException(
                  "ID pattern matched twice: previous value "
                      + id
                      + " second value: "
                      + tokenizer.getSubstring());
            }
            id = Integer.parseInt(buf.subSequence(mi.end(), tokenizer.getEnd()).toString());
          }
          if (msf) {
            if (!Double.isNaN(score)) {
              throw new AbortException(
                  "Score pattern matched twice: previous value "
                      + score
                      + " second value: "
                      + tokenizer.getSubstring());
            }
            score = ParseUtil.parseDouble(buf, ms.end(), tokenizer.getEnd());
          }
        }
        if (id != null && !Double.isNaN(score)) {
          scores.putDouble(DBIDUtil.importInteger(id), score);
          minmax.put(score);
        } else if (id == null && Double.isNaN(score)) {
          LOG.warning(
              "Line did not match either ID nor score nor comment: " + reader.getLineNumber());
        } else {
          throw new AbortException(
              "Line matched only ID or only SCORE patterns: " + reader.getLineNumber());
        }
      }
    } catch (IOException e) {
      throw new AbortException(
          "Could not load outlier scores: " + e.getMessage() + " when loading " + file, e);
    }

    OutlierScoreMeta meta;
    if (inverted) {
      meta = new InvertedOutlierScoreMeta(minmax.getMin(), minmax.getMax());
    } else {
      meta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax());
    }
    DoubleRelation scoresult =
        new MaterializedDoubleRelation(
            "External Outlier", "external-outlier", scores, relation.getDBIDs());
    OutlierResult or = new OutlierResult(meta, scoresult);

    // Apply scaling
    if (scaling instanceof OutlierScalingFunction) {
      ((OutlierScalingFunction) scaling).prepare(or);
    }
    DoubleMinMax mm = new DoubleMinMax();
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      double val = scoresult.doubleValue(iditer);
      val = scaling.getScaled(val);
      scores.putDouble(iditer, val);
      mm.put(val);
    }
    meta = new BasicOutlierScoreMeta(mm.getMin(), mm.getMax());
    or = new OutlierResult(meta, scoresult);
    return or;
  }
  /**
   * Run the algorithm
   *
   * @param db Database
   * @param relation Relation
   * @return Clustering hierarchy
   */
  public PointerHierarchyRepresentationResult run(Database db, Relation<O> relation) {
    DistanceQuery<O> dq = db.getDistanceQuery(relation, getDistanceFunction());
    ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
    final int size = ids.size();

    if (size > 0x10000) {
      throw new AbortException(
          "This implementation does not scale to data sets larger than "
              + 0x10000
              + " instances (~17 GB RAM), which results in an integer overflow.");
    }
    if (Linkage.SINGLE.equals(linkage)) {
      LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!");
    }

    // Compute the initial (lower triangular) distance matrix.
    double[] scratch = new double[triangleSize(size)];
    DBIDArrayIter ix = ids.iter(), iy = ids.iter(), ij = ids.iter();
    // Position counter - must agree with computeOffset!
    int pos = 0;
    boolean square =
        Linkage.WARD.equals(linkage)
            && !(SquaredEuclideanDistanceFunction.class.isInstance(getDistanceFunction()));
    for (int x = 0; ix.valid(); x++, ix.advance()) {
      iy.seek(0);
      for (int y = 0; y < x; y++, iy.advance()) {
        scratch[pos] = dq.distance(ix, iy);
        // Ward uses variances -- i.e. squared values
        if (square) {
          scratch[pos] *= scratch[pos];
        }
        pos++;
      }
    }
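    // Layout sketch (assumption: triangleSize(x) = x * (x - 1) / 2, the usual
    // row-wise lower-triangle indexing; the helper itself is not in this listing):
    //   x = 1 -> scratch[0]    = d(1,0)
    //   x = 2 -> scratch[1..2] = d(2,0), d(2,1)
    //   x = 3 -> scratch[3..5] = d(3,0), d(3,1), d(3,2)
    // i.e. d(x,y) with y < x lives at scratch[triangleSize(x) + y].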

    // Initialize space for result:
    WritableDBIDDataStore parent =
        DataStoreUtil.makeDBIDStorage(
            ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
    WritableDoubleDataStore height =
        DataStoreUtil.makeDoubleStorage(
            ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
    WritableIntegerDataStore csize =
        DataStoreUtil.makeIntegerStorage(
            ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
    for (DBIDIter it = ids.iter(); it.valid(); it.advance()) {
      parent.put(it, it);
      height.put(it, Double.POSITIVE_INFINITY);
      csize.put(it, 1);
    }

    // Repeat until everything merged, except the desired number of clusters:
    FiniteProgress prog =
        LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", size - 1, LOG) : null;
    for (int i = 1; i < size; i++) {
      double min = Double.POSITIVE_INFINITY;
      int minx = -1, miny = -1;
      for (ix.seek(0); ix.valid(); ix.advance()) {
        if (height.doubleValue(ix) < Double.POSITIVE_INFINITY) {
          continue;
        }
        final int xbase = triangleSize(ix.getOffset());
        for (iy.seek(0); iy.getOffset() < ix.getOffset(); iy.advance()) {
          if (height.doubleValue(iy) < Double.POSITIVE_INFINITY) {
            continue;
          }
          final int idx = xbase + iy.getOffset();
          if (scratch[idx] <= min) {
            min = scratch[idx];
            minx = ix.getOffset();
            miny = iy.getOffset();
          }
        }
      }
      assert (minx >= 0 && miny >= 0);
      // Avoid allocating memory, by reusing existing iterators:
      ix.seek(minx);
      iy.seek(miny);
      // Perform merge in data structure: x -> y
      // Since y < x, prefer keeping y, dropping x.
      int sizex = csize.intValue(ix), sizey = csize.intValue(iy);
      height.put(ix, min);
      parent.put(ix, iy);
      csize.put(iy, sizex + sizey);

      // Update distance matrix. Note: miny < minx
      final int xbase = triangleSize(minx), ybase = triangleSize(miny);
      // Write to (y, j), with j < y
      for (ij.seek(0); ij.getOffset() < miny; ij.advance()) {
        if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) {
          continue;
        }
        final int sizej = csize.intValue(ij);
        scratch[ybase + ij.getOffset()] =
            linkage.combine(
                sizex,
                scratch[xbase + ij.getOffset()],
                sizey,
                scratch[ybase + ij.getOffset()],
                sizej,
                min);
      }
      // Write to (j, y), with y < j < x
      for (ij.seek(miny + 1); ij.getOffset() < minx; ij.advance()) {
        if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) {
          continue;
        }
        final int jbase = triangleSize(ij.getOffset());
        final int sizej = csize.intValue(ij);
        scratch[jbase + miny] =
            linkage.combine(
                sizex, scratch[xbase + ij.getOffset()], sizey, scratch[jbase + miny], sizej, min);
      }
      // Write to (j, y), with y < x < j
      for (ij.seek(minx + 1); ij.valid(); ij.advance()) {
        if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) {
          continue;
        }
        final int jbase = triangleSize(ij.getOffset());
        final int sizej = csize.intValue(ij);
        scratch[jbase + miny] =
            linkage.combine(sizex, scratch[jbase + minx], sizey, scratch[jbase + miny], sizej, min);
      }
      LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);

    return new PointerHierarchyRepresentationResult(ids, parent, height);
  }