Example #1
0
  /**
   * Main loop for OUTRES
   *
   * @param relation Relation to process
   * @return Outlier detection result
   */
  public OutlierResult run(Relation<V> relation) {
    WritableDoubleDataStore ranks =
        DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
    DoubleMinMax minmax = new DoubleMinMax();

    KernelDensityEstimator kernel = new KernelDensityEstimator(relation);
    long[] subspace = BitsUtil.zero(kernel.dim);

    FiniteProgress progress =
        LOG.isVerbose() ? new FiniteProgress("OUTRES scores", relation.size(), LOG) : null;

    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      BitsUtil.zeroI(subspace);
      double score = outresScore(0, subspace, iditer, kernel);
      ranks.putDouble(iditer, score);
      minmax.put(score);
      LOG.incrementProcessed(progress);
    }
    LOG.ensureCompleted(progress);

    OutlierScoreMeta meta =
        new InvertedOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0., 1., 1.);
    OutlierResult outresResult =
        new OutlierResult(
            meta,
            new MaterializedDoubleRelation("OUTRES", "outres-score", ranks, relation.getDBIDs()));
    return outresResult;
  }
Example #2
0
  /**
   * Subspace relevance test.
   *
   * @param subspace Subspace to test
   * @param neigh Neighbor list
   * @param kernel Kernel density estimator
   * @return relevance test result
   */
  protected boolean relevantSubspace(
      long[] subspace, DoubleDBIDList neigh, KernelDensityEstimator kernel) {
    Relation<V> relation = kernel.relation;
    final double crit = K_S_CRITICAL001 / Math.sqrt(neigh.size());

    for (int dim = BitsUtil.nextSetBit(subspace, 0);
        dim > 0;
        dim = BitsUtil.nextSetBit(subspace, dim + 1)) {
      // TODO: can we save this copy somehow?
      double[] data = new double[neigh.size()];
      {
        int count = 0;
        for (DBIDIter neighbor = neigh.iter(); neighbor.valid(); neighbor.advance()) {
          V vector = relation.get(neighbor);
          data[count] = vector.doubleValue(dim);
          count++;
        }
        assert (count == neigh.size());
      }
      Arrays.sort(data);

      final double norm = data[data.length - 1] - data[0];
      final double min = data[0];

      // Kolmogorow-Smirnow-Test against uniform distribution:
      for (int j = 1; j < data.length - 2; j++) {
        double delta = (j / (data.length - 1.)) - ((data[j] - min) / norm);
        if (Math.abs(delta) > crit) {
          return false;
        }
      }
    }
    return true;
  }
Example #3
0
 @Override
 public ProjectedIndex<O, O> instantiate(Relation<O> relation) {
   if (!proj.getInputDataTypeInformation()
       .isAssignableFromType(relation.getDataTypeInformation())) {
     return null;
   }
   proj.initialize(relation.getDataTypeInformation());
   final Relation<O> view;
   if (materialize) {
     DBIDs ids = relation.getDBIDs();
     WritableDataStore<O> content =
         DataStoreUtil.makeStorage(
             ids,
             DataStoreFactory.HINT_DB,
             proj.getOutputDataTypeInformation().getRestrictionClass());
     for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
       content.put(iter, proj.project(relation.get(iter)));
     }
     view =
         new MaterializedRelation<>(
             "ECEF Projection",
             "ecef-projection",
             proj.getOutputDataTypeInformation(),
             content,
             ids);
   } else {
     view = new ProjectedView<>(relation, proj);
   }
   Index inneri = inner.instantiate(view);
   if (inneri == null) {
     return null;
   }
   return new LngLatAsECEFIndex<>(relation, proj, view, inneri, norefine);
 }
  @Override
  public void redraw() {
    setupCSS(svgp);
    final StyleLibrary style = context.getStyleResult().getStyleLibrary();
    double dotsize = style.getLineWidth(StyleLibrary.PLOT);

    for (DBIDIter id = sample.getSample().iter(); id.valid(); id.advance()) {
      double[] v = proj.fastProjectDataToRenderSpace(rel.get(id));
      if (v[0] != v[0] || v[1] != v[1]) {
        continue; // NaN!
      }
      Element tooltip = makeTooltip(id, v[0], v[1], dotsize);
      SVGUtil.addCSSClass(tooltip, TOOLTIP_HIDDEN);

      // sensitive area.
      Element area = svgp.svgRect(v[0] - dotsize, v[1] - dotsize, 2 * dotsize, 2 * dotsize);
      SVGUtil.addCSSClass(area, TOOLTIP_AREA);

      EventTarget targ = (EventTarget) area;
      targ.addEventListener(SVGConstants.SVG_MOUSEOVER_EVENT_TYPE, hoverer, false);
      targ.addEventListener(SVGConstants.SVG_MOUSEOUT_EVENT_TYPE, hoverer, false);
      targ.addEventListener(SVGConstants.SVG_CLICK_EVENT_TYPE, hoverer, false);

      // NOTE: do not change the sequence in which these are inserted!
      layer.appendChild(area);
      layer.appendChild(tooltip);
    }
  }
 @Override
 public boolean contains(DBIDRef o) {
   for (DBIDIter iter = iter(); iter.valid(); iter.advance()) {
     if (DBIDUtil.equal(iter, o)) {
       return true;
     }
   }
   return false;
 }
  @Override
  public void run() {
    Database database = input.getDatabase();
    Relation<O> relation = database.getRelation(distance.getInputTypeRestriction());
    DistanceQuery<O> distanceQuery = database.getDistanceQuery(relation, distance);
    KNNQuery<O> knnQ = database.getKNNQuery(distanceQuery, DatabaseQuery.HINT_HEAVY_USE);

    // open file.
    try (RandomAccessFile file = new RandomAccessFile(out, "rw");
        FileChannel channel = file.getChannel();
        // and acquire a file write lock
        FileLock lock = channel.lock()) {
      // write magic header
      file.writeInt(KNN_CACHE_MAGIC);

      int bufsize = k * 12 * 2 + 10; // Initial size, enough for 2 kNN.
      ByteBuffer buffer = ByteBuffer.allocateDirect(bufsize);

      FiniteProgress prog =
          LOG.isVerbose() ? new FiniteProgress("Computing kNN", relation.size(), LOG) : null;

      for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
        final KNNList nn = knnQ.getKNNForDBID(it, k);
        final int nnsize = nn.size();

        // Grow the buffer when needed:
        if (nnsize * 12 + 10 > bufsize) {
          while (nnsize * 12 + 10 > bufsize) {
            bufsize <<= 1;
          }
          buffer = ByteBuffer.allocateDirect(bufsize);
        }

        buffer.clear();
        ByteArrayUtil.writeUnsignedVarint(buffer, it.internalGetIndex());
        ByteArrayUtil.writeUnsignedVarint(buffer, nnsize);
        int c = 0;
        for (DoubleDBIDListIter ni = nn.iter(); ni.valid(); ni.advance(), c++) {
          ByteArrayUtil.writeUnsignedVarint(buffer, ni.internalGetIndex());
          buffer.putDouble(ni.doubleValue());
        }
        if (c != nn.size()) {
          throw new AbortException("Sizes did not agree. Cache is invalid.");
        }

        buffer.flip();
        channel.write(buffer);
        LOG.incrementProcessed(prog);
      }
      LOG.ensureCompleted(prog);
      lock.release();
    } catch (IOException e) {
      LOG.exception(e);
    }
    // FIXME: close!
  }
 /**
  * Computes for each object the distance to one reference point. (one dimensional representation
  * of the data set)
  *
  * @param refPoint Reference Point Feature Vector
  * @param database database to work on
  * @param distFunc Distance function to use
  * @return array containing the distance to one reference point for each database object and the
  *     object id
  */
 protected DoubleDBIDList computeDistanceVector(
     NumberVector refPoint,
     Relation<? extends NumberVector> database,
     PrimitiveDistanceQuery<? super NumberVector> distFunc) {
   ModifiableDoubleDBIDList referenceDists = DBIDUtil.newDistanceDBIDList(database.size());
   for (DBIDIter iditer = database.iterDBIDs(); iditer.valid(); iditer.advance()) {
     referenceDists.add(distFunc.distance(iditer, refPoint), iditer);
   }
   referenceDists.sort();
   return referenceDists;
 }
Example #8
0
 /**
  * Process a single core point.
  *
  * @param neighbor Iterator over neighbors
  * @param currentCluster Current cluster
  * @param seeds Seed set
  */
 private void processNeighbors(
     DBIDIter neighbor, ModifiableDBIDs currentCluster, HashSetModifiableDBIDs seeds) {
   for (; neighbor.valid(); neighbor.advance()) {
     if (processedIDs.add(neighbor)) {
       seeds.add(neighbor);
     } else if (!noise.remove(neighbor)) {
       continue;
     }
     currentCluster.add(neighbor);
   }
 }
Example #9
0
 @Override
 public void initialize() {
   super.initialize();
   List<MkAppEntry> objs = new ArrayList<>(relation.size());
   for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
     DBID id = DBIDUtil.deref(iter);
     final O object = relation.get(id);
     objs.add(createNewLeafEntry(id, object, Double.NaN));
   }
   insertAll(objs);
 }
Example #10
0
 /**
  * Utility method to test if a given dimension is relevant as determined via a set of reference
  * points (i.e. if the variance along the attribute is lower than the threshold).
  *
  * @param dimension the dimension to test.
  * @param relation used to get actual values for DBIDs.
  * @param points the points to test.
  * @return <code>true</code> if the dimension is relevant.
  */
 private boolean dimensionIsRelevant(int dimension, Relation<V> relation, DBIDs points) {
   double min = Double.POSITIVE_INFINITY, max = Double.NEGATIVE_INFINITY;
   for (DBIDIter iter = points.iter(); iter.valid(); iter.advance()) {
     double xV = relation.get(iter).doubleValue(dimension);
     min = (xV < min) ? xV : min;
     max = (xV > max) ? xV : max;
     if (max - min > w) {
       return false;
     }
   }
   return true;
 }
Example #11
0
File: LOCI.java Project: fjfd/elki
  /**
   * Preprocessing step: determine the radii of interest for each point.
   *
   * @param ids IDs to process
   * @param rangeQuery Range query
   * @param interestingDistances Distances of interest
   */
  protected void precomputeInterestingRadii(
      DBIDs ids,
      RangeQuery<O> rangeQuery,
      WritableDataStore<DoubleIntArrayList> interestingDistances) {
    FiniteProgress progressPreproc =
        LOG.isVerbose() ? new FiniteProgress("LOCI preprocessing", ids.size(), LOG) : null;
    for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
      DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(iditer, rmax);
      // build list of critical distances
      DoubleIntArrayList cdist = new DoubleIntArrayList(neighbors.size() << 1);
      {
        int i = 0;
        DoubleDBIDListIter ni = neighbors.iter();
        while (ni.valid()) {
          final double curdist = ni.doubleValue();
          ++i;
          ni.advance();
          // Skip, if tied to the next object:
          if (ni.valid() && curdist == ni.doubleValue()) {
            continue;
          }
          cdist.append(curdist, i);
          // Scale radius, and reinsert
          if (alpha != 1.) {
            final double ri = curdist / alpha;
            if (ri <= rmax) {
              cdist.append(ri, Integer.MIN_VALUE);
            }
          }
        }
      }
      cdist.sort();

      // fill the gaps to have fast lookups of number of neighbors at a given
      // distance.
      int lastk = 0;
      for (int i = 0, size = cdist.size(); i < size; i++) {
        final int k = cdist.getInt(i);
        if (k == Integer.MIN_VALUE) {
          cdist.setValue(i, lastk);
        } else {
          lastk = k;
        }
      }
      // TODO: shrink the list, removing duplicate radii?

      interestingDistances.put(iditer, cdist);
      LOG.incrementProcessed(progressPreproc);
    }
    LOG.ensureCompleted(progressPreproc);
  }
Example #12
0
  /**
   * Process a database
   *
   * @param database Database to process
   * @param relation Relation to process
   * @return Histogram of ranking qualities
   */
  public HistogramResult<DoubleVector> run(Database database, Relation<O> relation) {
    final DistanceQuery<O> distanceQuery =
        database.getDistanceQuery(relation, getDistanceFunction());
    final KNNQuery<O> knnQuery = database.getKNNQuery(distanceQuery, relation.size());

    if (LOG.isVerbose()) {
      LOG.verbose("Preprocessing clusters...");
    }
    // Cluster by labels
    Collection<Cluster<Model>> split =
        (new ByLabelOrAllInOneClustering()).run(database).getAllClusters();

    DoubleStaticHistogram hist = new DoubleStaticHistogram(numbins, 0.0, 1.0);

    if (LOG.isVerbose()) {
      LOG.verbose("Processing points...");
    }
    FiniteProgress progress =
        LOG.isVerbose()
            ? new FiniteProgress("Computing ROC AUC values", relation.size(), LOG)
            : null;

    MeanVariance mv = new MeanVariance();
    // sort neighbors
    for (Cluster<?> clus : split) {
      for (DBIDIter iter = clus.getIDs().iter(); iter.valid(); iter.advance()) {
        KNNList knn = knnQuery.getKNNForDBID(iter, relation.size());
        double result = new ROCEvaluation().evaluate(clus, knn);

        mv.put(result);
        hist.increment(result, 1. / relation.size());

        LOG.incrementProcessed(progress);
      }
    }
    LOG.ensureCompleted(progress);

    // Transform Histogram into a Double Vector array.
    Collection<DoubleVector> res = new ArrayList<>(relation.size());
    for (DoubleStaticHistogram.Iter iter = hist.iter(); iter.valid(); iter.advance()) {
      DoubleVector row = new DoubleVector(new double[] {iter.getCenter(), iter.getValue()});
      res.add(row);
    }
    HistogramResult<DoubleVector> result =
        new HistogramResult<>("Ranking Quality Histogram", "ranking-histogram", res);
    result.addHeader("Mean: " + mv.getMean() + " Variance: " + mv.getSampleVariance());
    return result;
  }
Example #13
0
    @Override
    public void fullRedraw() {
      setupCanvas();
      final StyleLibrary style = context.getStyleLibrary();
      CSSClass css = new CSSClass(svgp, POLYS);
      // TODO: separate fill and line colors?
      css.setStatement(
          SVGConstants.CSS_STROKE_WIDTH_PROPERTY, style.getLineWidth(StyleLibrary.POLYGONS));
      css.setStatement(SVGConstants.CSS_STROKE_PROPERTY, style.getColor(StyleLibrary.POLYGONS));
      css.setStatement(SVGConstants.CSS_FILL_PROPERTY, SVGConstants.CSS_NONE_VALUE);
      svgp.addCSSClassOrLogError(css);
      svgp.updateStyleElement();

      // draw data
      for (DBIDIter iditer = rep.iterDBIDs(); iditer.valid(); iditer.advance()) {
        try {
          PolygonsObject poly = rep.get(iditer);
          if (poly == null) {
            continue;
          }
          SVGPath path = new SVGPath();
          for (Polygon ppoly : poly.getPolygons()) {
            Vector first = ppoly.get(0);
            double[] f = proj.fastProjectDataToRenderSpace(first.getArrayRef());
            path.moveTo(f[0], f[1]);
            for (ArrayListIter<Vector> it = ppoly.iter(); it.valid(); it.advance()) {
              if (it.getOffset() == 0) {
                continue;
              }
              double[] p = proj.fastProjectDataToRenderSpace(it.get().getArrayRef());
              path.drawTo(p[0], p[1]);
            }
            // close path.
            path.drawTo(f[0], f[1]);
          }
          Element e = path.makeElement(svgp);
          SVGUtil.addCSSClass(e, POLYS);
          layer.appendChild(e);
        } catch (ObjectNotFoundException e) {
          // ignore.
        }
      }
    }
  protected double[] computeWithinDistances(
      Relation<? extends NumberVector> rel, List<? extends Cluster<?>> clusters, int withinPairs) {
    double[] concordant = new double[withinPairs];
    int i = 0;
    for (Cluster<?> cluster : clusters) {
      if (cluster.size() <= 1 || cluster.isNoise()) {
        switch (noiseHandling) {
          case IGNORE_NOISE:
            continue;
          case TREAT_NOISE_AS_SINGLETONS:
            continue; // No concordant distances.
          case MERGE_NOISE:
            break; // Treat like a cluster below.
        }
      }

      for (DBIDIter it1 = cluster.getIDs().iter(); it1.valid(); it1.advance()) {
        NumberVector obj = rel.get(it1);
        for (DBIDIter it2 = cluster.getIDs().iter(); it2.valid(); it2.advance()) {
          if (DBIDUtil.compare(it1, it2) <= 0) {
            continue;
          }
          concordant[i++] = distanceFunction.distance(obj, rel.get(it2));
        }
      }
    }
    assert (concordant.length == i);
    Arrays.sort(concordant);
    return concordant;
  }
Example #15
0
 private DBIDs[] buildIndex(Relation<BitVector> relation, int dim, int minsupp) {
   ArrayModifiableDBIDs[] idx = new ArrayModifiableDBIDs[dim];
   for (int i = 0; i < dim; i++) {
     idx[i] = DBIDUtil.newArray();
   }
   for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
     SparseFeatureVector<?> bv = relation.get(iter);
     // TODO: only count those which satisfy minlength?
     for (int it = bv.iter(); bv.iterValid(it); it = bv.iterAdvance(it)) {
       idx[bv.iterDim(it)].add(iter);
     }
   }
   // Forget non-frequent 1-itemsets.
   for (int i = 0; i < dim; i++) {
     if (idx[i].size() < minsupp) {
       idx[i] = null;
     } else {
       idx[i].sort();
     }
   }
   return idx;
 }
Example #16
0
  /**
   * Run the DBSCAN algorithm
   *
   * @param relation Data relation
   * @param rangeQuery Range query class
   */
  protected void runDBSCAN(Relation<O> relation, RangeQuery<O> rangeQuery) {
    final int size = relation.size();
    FiniteProgress objprog =
        LOG.isVerbose() ? new FiniteProgress("Processing objects", size, LOG) : null;
    IndefiniteProgress clusprog =
        LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null;

    processedIDs = DBIDUtil.newHashSet(size);
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      if (!processedIDs.contains(iditer)) {
        expandCluster(relation, rangeQuery, iditer, objprog, clusprog);
      }
      if (objprog != null && clusprog != null) {
        objprog.setProcessed(processedIDs.size(), LOG);
        clusprog.setProcessed(resultList.size(), LOG);
      }
      if (processedIDs.size() == size) {
        break;
      }
    }
    // Finish progress logging
    LOG.ensureCompleted(objprog);
    LOG.setCompleted(clusprog);
  }
 /**
  * Compute the intersection size.
  *
  * @param neighbors1 SORTED neighbor ids of first
  * @param neighbors2 SORTED neighbor ids of second
  * @return Intersection size
  */
 protected static int countSharedNeighbors(DBIDs neighbors1, DBIDs neighbors2) {
   int intersection = 0;
   DBIDIter iter1 = neighbors1.iter();
   DBIDIter iter2 = neighbors2.iter();
   while (iter1.valid() && iter2.valid()) {
     final int comp = DBIDUtil.compare(iter1, iter2);
     if (comp == 0) {
       intersection++;
       iter1.advance();
       iter2.advance();
     } else if (comp < 0) {
       iter1.advance();
     } else // iter2 < iter1
     {
       iter2.advance();
     }
   }
   return intersection;
 }
  /**
   * Run the algorithm on the given relation.
   *
   * @param database Database
   * @param relation Relation to process
   * @return Outlier result
   */
  public OutlierResult run(Database database, Relation<? extends NumberVector> relation) {
    @SuppressWarnings("unchecked")
    PrimitiveDistanceQuery<? super NumberVector> distq =
        (PrimitiveDistanceQuery<? super NumberVector>)
            database.getDistanceQuery(relation, distanceFunction);
    Collection<? extends NumberVector> refPoints = refp.getReferencePoints(relation);
    if (refPoints.size() < 1) {
      throw new AbortException("Cannot compute ROS without reference points!");
    }

    DBIDs ids = relation.getDBIDs();
    if (k >= ids.size()) {
      throw new AbortException("k must not be chosen larger than the database size!");
    }
    // storage of distance/score values.
    WritableDoubleDataStore rbod_score =
        DataStoreUtil.makeDoubleStorage(
            ids, DataStoreFactory.HINT_STATIC | DataStoreFactory.HINT_HOT, Double.NaN);

    // Compute density estimation:
    for (NumberVector refPoint : refPoints) {
      DoubleDBIDList referenceDists = computeDistanceVector(refPoint, relation, distq);
      updateDensities(rbod_score, referenceDists);
    }
    // compute maximum density
    DoubleMinMax mm = new DoubleMinMax();
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      mm.put(rbod_score.doubleValue(iditer));
    }
    // compute ROS
    double scale = mm.getMax() > 0. ? 1. / mm.getMax() : 1.;
    mm.reset(); // Reuse
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      double score = 1 - (rbod_score.doubleValue(iditer) * scale);
      mm.put(score);
      rbod_score.putDouble(iditer, score);
    }

    DoubleRelation scoreResult =
        new MaterializedDoubleRelation(
            "Reference-points Outlier Scores",
            "reference-outlier",
            rbod_score,
            relation.getDBIDs());
    OutlierScoreMeta scoreMeta = new BasicOutlierScoreMeta(mm.getMin(), mm.getMax(), 0., 1., 0.);
    OutlierResult result = new OutlierResult(scoreMeta, scoreResult);
    // adds reference points to the result. header information for the
    // visualizer to find the reference points in the result
    result.addChildResult(
        new ReferencePointsResult<>("Reference points", "reference-points", refPoints));
    return result;
  }
Example #19
0
  private DBIDs mergeJoin(DBIDs first, DBIDs second) {
    assert (!(first instanceof HashSetDBIDs));
    assert (!(second instanceof HashSetDBIDs));
    ArrayModifiableDBIDs ids = DBIDUtil.newArray();

    DBIDIter i1 = first.iter(), i2 = second.iter();
    while (i1.valid() && i2.valid()) {
      int c = DBIDUtil.compare(i1, i2);
      if (c < 0) {
        i1.advance();
      } else if (c > 0) {
        i2.advance();
      } else {
        ids.add(i1);
        i1.advance();
        i2.advance();
      }
    }
    return ids;
  }
Example #20
0
  /**
   * Run the algorithm
   *
   * @param relation Data relation
   * @return Outlier result
   */
  public OutlierResult run(Relation<V> relation) {
    DoubleMinMax mm = new DoubleMinMax();
    // resulting scores
    WritableDoubleDataStore oscores =
        DataStoreUtil.makeDoubleStorage(
            relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT);

    // Compute mean and covariance Matrix
    CovarianceMatrix temp = CovarianceMatrix.make(relation);
    double[] mean = temp.getMeanVector(relation).toArray();
    // debugFine(mean.toString());
    Matrix covarianceMatrix = temp.destroyToNaiveMatrix();
    // debugFine(covarianceMatrix.toString());
    Matrix covarianceTransposed =
        covarianceMatrix.cheatToAvoidSingularity(SINGULARITY_CHEAT).inverse();

    // Normalization factors for Gaussian PDF
    final double fakt =
        (1.0
            / (Math.sqrt(
                MathUtil.powi(MathUtil.TWOPI, RelationUtil.dimensionality(relation))
                    * covarianceMatrix.det())));

    // for each object compute Mahalanobis distance
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      double[] x = minusEquals(relation.get(iditer).toArray(), mean);
      // Gaussian PDF
      final double mDist = transposeTimesTimes(x, covarianceTransposed, x);
      final double prob = fakt * Math.exp(-mDist * .5);

      mm.put(prob);
      oscores.putDouble(iditer, prob);
    }

    final OutlierScoreMeta meta;
    if (invert) {
      double max = mm.getMax() != 0 ? mm.getMax() : 1.;
      for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
        oscores.putDouble(iditer, (max - oscores.doubleValue(iditer)) / max);
      }
      meta = new BasicOutlierScoreMeta(0.0, 1.0);
    } else {
      meta = new InvertedOutlierScoreMeta(mm.getMin(), mm.getMax(), 0.0, Double.POSITIVE_INFINITY);
    }
    DoubleRelation res =
        new MaterializedDoubleRelation(
            "Gaussian Model Outlier Score", "gaussian-model-outlier", oscores, relation.getDBIDs());
    return new OutlierResult(meta, res);
  }
Example #21
0
  /**
   * Inserts the specified objects into this index. If a bulk load mode is implemented, the objects
   * are inserted in one bulk.
   *
   * @param ids the objects to be inserted
   */
  @Override
  public void insertAll(DBIDs ids) {
    if (ids.isEmpty() || (ids.size() == 1)) {
      return;
    }

    // Make an example leaf
    if (canBulkLoad()) {
      List<SpatialEntry> leafs = new ArrayList<>(ids.size());
      for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
        leafs.add(createNewLeafEntry(iter));
      }
      bulkLoad(leafs);
    } else {
      for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
        insert(DBIDUtil.deref(iter));
      }
    }

    doExtraIntegrityChecks();
  }
Example #22
0
  /**
   * Run the ODIN algorithm
   *
   * @param database Database to run on.
   * @param relation Relation to process.
   * @return ODIN outlier result.
   */
  public OutlierResult run(Database database, Relation<O> relation) {
    // Get the query functions:
    DistanceQuery<O> dq = database.getDistanceQuery(relation, getDistanceFunction());
    KNNQuery<O> knnq = database.getKNNQuery(dq, k);

    // Get the objects to process, and a data storage for counting and output:
    DBIDs ids = relation.getDBIDs();
    WritableDoubleDataStore scores =
        DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_DB, 0.);

    double inc = 1. / (k - 1);
    double min = Double.POSITIVE_INFINITY, max = 0.0;
    // Process all objects
    for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
      // Find the nearest neighbors (using an index, if available!)
      DBIDs neighbors = knnq.getKNNForDBID(iter, k);
      // For each neighbor, except ourselves, increase the in-degree:
      for (DBIDIter nei = neighbors.iter(); nei.valid(); nei.advance()) {
        if (DBIDUtil.equal(iter, nei)) {
          continue;
        }
        final double value = scores.doubleValue(nei) + inc;
        if (value < min) {
          min = value;
        }
        if (value > max) {
          max = value;
        }
        scores.put(nei, value);
      }
    }

    // Wrap the result and add metadata.
    OutlierScoreMeta meta = new InvertedOutlierScoreMeta(min, max, 0., inc * (ids.size() - 1), 1);
    DoubleRelation rel = new MaterializedDoubleRelation("ODIN In-Degree", "odin", scores, ids);
    return new OutlierResult(meta, rel);
  }
Example #23
0
File: SOF.java Project: 4sp1r3/elki
  /**
   * The main run method
   *
   * @param database Database to use (actually unused)
   * @param spatial Relation for neighborhood
   * @param relation Attributes to evaluate
   * @return Outlier result
   */
  public OutlierResult run(Database database, Relation<N> spatial, Relation<O> relation) {
    final NeighborSetPredicate npred =
        getNeighborSetPredicateFactory().instantiate(database, spatial);
    DistanceQuery<O> distFunc = getNonSpatialDistanceFunction().instantiate(relation);

    WritableDoubleDataStore lrds =
        DataStoreUtil.makeDoubleStorage(
            relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT);
    WritableDoubleDataStore lofs =
        DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
    DoubleMinMax lofminmax = new DoubleMinMax();

    // Compute densities
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      DBIDs neighbors = npred.getNeighborDBIDs(iditer);
      double avg = 0;
      for (DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) {
        avg += distFunc.distance(iditer, iter);
      }
      double lrd = 1 / (avg / neighbors.size());
      if (Double.isNaN(lrd)) {
        lrd = 0;
      }
      lrds.putDouble(iditer, lrd);
    }

    // Compute density quotients
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      DBIDs neighbors = npred.getNeighborDBIDs(iditer);
      double avg = 0;
      for (DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) {
        avg += lrds.doubleValue(iter);
      }
      final double lrd = (avg / neighbors.size()) / lrds.doubleValue(iditer);
      if (!Double.isNaN(lrd)) {
        lofs.putDouble(iditer, lrd);
        lofminmax.put(lrd);
      } else {
        lofs.putDouble(iditer, 0.0);
      }
    }

    // Build result representation.
    DoubleRelation scoreResult =
        new MaterializedDoubleRelation(
            "Spatial Outlier Factor", "sof-outlier", lofs, relation.getDBIDs());
    OutlierScoreMeta scoreMeta =
        new QuotientOutlierScoreMeta(
            lofminmax.getMin(), lofminmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 1.0);
    OutlierResult or = new OutlierResult(scoreMeta, scoreResult);
    or.addChildResult(npred);
    return or;
  }
Example #24
0
File: LOCI.java Project: fjfd/elki
  /**
   * Run the algorithm
   *
   * @param database Database to process
   * @param relation Relation to process
   * @return Outlier result
   */
  public OutlierResult run(Database database, Relation<O> relation) {
    DistanceQuery<O> distFunc = database.getDistanceQuery(relation, getDistanceFunction());
    RangeQuery<O> rangeQuery = database.getRangeQuery(distFunc);
    DBIDs ids = relation.getDBIDs();

    // LOCI preprocessing step
    WritableDataStore<DoubleIntArrayList> interestingDistances =
        DataStoreUtil.makeStorage(
            relation.getDBIDs(),
            DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_SORTED,
            DoubleIntArrayList.class);
    precomputeInterestingRadii(ids, rangeQuery, interestingDistances);
    // LOCI main step
    FiniteProgress progressLOCI =
        LOG.isVerbose() ? new FiniteProgress("LOCI scores", relation.size(), LOG) : null;
    WritableDoubleDataStore mdef_norm =
        DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
    WritableDoubleDataStore mdef_radius =
        DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
    DoubleMinMax minmax = new DoubleMinMax();

    // Shared instance, to save allocations.
    MeanVariance mv_n_r_alpha = new MeanVariance();

    for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
      final DoubleIntArrayList cdist = interestingDistances.get(iditer);
      final double maxdist = cdist.getDouble(cdist.size() - 1);
      final int maxneig = cdist.getInt(cdist.size() - 1);

      double maxmdefnorm = 0.0;
      double maxnormr = 0;
      if (maxneig >= nmin) {
        // Compute the largest neighborhood we will need.
        DoubleDBIDList maxneighbors = rangeQuery.getRangeForDBID(iditer, maxdist);
        // TODO: Ensure the result is sorted. This is currently implied.

        // For any critical distance, compute the normalized MDEF score.
        for (int i = 0, size = cdist.size(); i < size; i++) {
          // Only start when minimum size is fulfilled
          if (cdist.getInt(i) < nmin) {
            continue;
          }
          final double r = cdist.getDouble(i);
          final double alpha_r = alpha * r;
          // compute n(p_i, \alpha * r) from list (note: alpha_r is not cdist!)
          final int n_alphar = cdist.getInt(cdist.find(alpha_r));
          // compute \hat{n}(p_i, r, \alpha) and the corresponding \simga_{MDEF}
          mv_n_r_alpha.reset();
          for (DoubleDBIDListIter neighbor = maxneighbors.iter();
              neighbor.valid();
              neighbor.advance()) {
            // Stop at radius r
            if (neighbor.doubleValue() > r) {
              break;
            }
            DoubleIntArrayList cdist2 = interestingDistances.get(neighbor);
            int rn_alphar = cdist2.getInt(cdist2.find(alpha_r));
            mv_n_r_alpha.put(rn_alphar);
          }
          // We only use the average and standard deviation
          final double nhat_r_alpha = mv_n_r_alpha.getMean();
          final double sigma_nhat_r_alpha = mv_n_r_alpha.getNaiveStddev();

          // Redundant divisions by nhat_r_alpha removed.
          final double mdef = nhat_r_alpha - n_alphar;
          final double sigmamdef = sigma_nhat_r_alpha;
          final double mdefnorm = mdef / sigmamdef;

          if (mdefnorm > maxmdefnorm) {
            maxmdefnorm = mdefnorm;
            maxnormr = r;
          }
        }
      } else {
        // FIXME: when nmin was not fulfilled - what is the proper value then?
        maxmdefnorm = Double.POSITIVE_INFINITY;
        maxnormr = maxdist;
      }
      mdef_norm.putDouble(iditer, maxmdefnorm);
      mdef_radius.putDouble(iditer, maxnormr);
      minmax.put(maxmdefnorm);
      LOG.incrementProcessed(progressLOCI);
    }
    LOG.ensureCompleted(progressLOCI);
    DoubleRelation scoreResult =
        new MaterializedDoubleRelation(
            "LOCI normalized MDEF", "loci-mdef-outlier", mdef_norm, relation.getDBIDs());
    OutlierScoreMeta scoreMeta =
        new QuotientOutlierScoreMeta(
            minmax.getMin(), minmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 0.0);
    OutlierResult result = new OutlierResult(scoreMeta, scoreResult);
    result.addChildResult(
        new MaterializedDoubleRelation(
            "LOCI MDEF Radius", "loci-critical-radius", mdef_radius, relation.getDBIDs()));
    return result;
  }
  /**
   * Evaluate a single clustering.
   *
   * @param db Database
   * @param rel Data relation
   * @param c Clustering
   * @return Mean simplified silhouette
   */
  public double evaluateClustering(
      Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
    List<? extends Cluster<?>> clusters = c.getAllClusters();
    NumberVector[] centroids = new NumberVector[clusters.size()];
    int ignorednoise = centroids(rel, clusters, centroids, noiseOption);

    MeanVariance mssil = new MeanVariance();

    Iterator<? extends Cluster<?>> ci = clusters.iterator();
    for (int i = 0; ci.hasNext(); i++) {
      Cluster<?> cluster = ci.next();
      if (cluster.size() <= 1) {
        // As suggested in Rousseeuw, we use 0 for singletons.
        mssil.put(0., cluster.size());
        continue;
      }
      if (cluster.isNoise()) {
        switch (noiseOption) {
          case IGNORE_NOISE:
            continue; // Ignore elements
          case TREAT_NOISE_AS_SINGLETONS:
            // As suggested in Rousseeuw, we use 0 for singletons.
            mssil.put(0., cluster.size());
            continue;
          case MERGE_NOISE:
            break; // Treat as cluster below
        }
      }

      // Cluster center:
      final NumberVector center = centroids[i];
      assert (center != null);
      for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
        NumberVector obj = rel.get(it);
        // a: Distance to own centroid
        double a = distance.distance(center, obj);

        // b: Distance to other clusters centroids:
        double min = Double.POSITIVE_INFINITY;
        Iterator<? extends Cluster<?>> cj = clusters.iterator();
        for (int j = 0; cj.hasNext(); j++) {
          Cluster<?> ocluster = cj.next();
          if (i == j) {
            continue;
          }
          NumberVector other = centroids[j];
          if (other == null) { // Noise!
            switch (noiseOption) {
              case IGNORE_NOISE:
                continue;
              case TREAT_NOISE_AS_SINGLETONS:
                // Treat each object like a centroid!
                for (DBIDIter it2 = ocluster.getIDs().iter(); it2.valid(); it2.advance()) {
                  double dist = distance.distance(rel.get(it2), obj);
                  min = dist < min ? dist : min;
                }
                continue;
              case MERGE_NOISE:
                break; // Treat as cluster below, but should not be reachable.
            }
          }
          // Clusters: use centroid.
          double dist = distance.distance(other, obj);
          min = dist < min ? dist : min;
        }

        // One 'real' cluster only?
        min = min < Double.POSITIVE_INFINITY ? min : a;
        mssil.put((min - a) / (min > a ? min : a));
      }
    }

    double penalty = 1.;
    // Only if {@link NoiseHandling#IGNORE_NOISE}:
    if (penalize && ignorednoise > 0) {
      penalty = (rel.size() - ignorednoise) / (double) rel.size();
    }
    final double meanssil = penalty * mssil.getMean();
    final double stdssil = penalty * mssil.getSampleStddev();
    if (LOG.isStatistics()) {
      LOG.statistics(
          new StringStatistic(
              key + ".simplified-silhouette.noise-handling", noiseOption.toString()));
      if (ignorednoise > 0) {
        LOG.statistics(new LongStatistic(key + ".simplified-silhouette.ignored", ignorednoise));
      }
      LOG.statistics(new DoubleStatistic(key + ".simplified-silhouette.mean", meanssil));
      LOG.statistics(new DoubleStatistic(key + ".simplified-silhouette.stddev", stdssil));
    }

    EvaluationResult ev =
        EvaluationResult.findOrCreate(
            db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
    g.addMeasure(
        "Simp. Silhouette +-" + FormatUtil.NF2.format(stdssil), meanssil, -1., 1., 0., false);
    db.getHierarchy().resultChanged(ev);
    return meanssil;
  }
  /**
   * Run the algorithm.
   *
   * @param database Database to use
   * @param relation Relation to use
   * @return Result
   */
  public OutlierResult run(Database database, Relation<?> relation) {
    WritableDoubleDataStore scores =
        DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);

    DoubleMinMax minmax = new DoubleMinMax();

    try (InputStream in = FileUtil.tryGzipInput(new FileInputStream(file)); //
        TokenizedReader reader = CSVReaderFormat.DEFAULT_FORMAT.makeReader()) {
      Tokenizer tokenizer = reader.getTokenizer();
      CharSequence buf = reader.getBuffer();
      Matcher mi = idpattern.matcher(buf), ms = scorepattern.matcher(buf);
      reader.reset(in);
      while (reader.nextLineExceptComments()) {
        Integer id = null;
        double score = Double.NaN;
        for (
        /* initialized by nextLineExceptComments */ ; tokenizer.valid(); tokenizer.advance()) {
          mi.region(tokenizer.getStart(), tokenizer.getEnd());
          ms.region(tokenizer.getStart(), tokenizer.getEnd());
          final boolean mif = mi.find();
          final boolean msf = ms.find();
          if (mif && msf) {
            throw new AbortException(
                "ID pattern and score pattern both match value: " + tokenizer.getSubstring());
          }
          if (mif) {
            if (id != null) {
              throw new AbortException(
                  "ID pattern matched twice: previous value "
                      + id
                      + " second value: "
                      + tokenizer.getSubstring());
            }
            id = Integer.parseInt(buf.subSequence(mi.end(), tokenizer.getEnd()).toString());
          }
          if (msf) {
            if (!Double.isNaN(score)) {
              throw new AbortException(
                  "Score pattern matched twice: previous value "
                      + score
                      + " second value: "
                      + tokenizer.getSubstring());
            }
            score = ParseUtil.parseDouble(buf, ms.end(), tokenizer.getEnd());
          }
        }
        if (id != null && !Double.isNaN(score)) {
          scores.putDouble(DBIDUtil.importInteger(id), score);
          minmax.put(score);
        } else if (id == null && Double.isNaN(score)) {
          LOG.warning(
              "Line did not match either ID nor score nor comment: " + reader.getLineNumber());
        } else {
          throw new AbortException(
              "Line matched only ID or only SCORE patterns: " + reader.getLineNumber());
        }
      }
    } catch (IOException e) {
      throw new AbortException(
          "Could not load outlier scores: " + e.getMessage() + " when loading " + file, e);
    }

    OutlierScoreMeta meta;
    if (inverted) {
      meta = new InvertedOutlierScoreMeta(minmax.getMin(), minmax.getMax());
    } else {
      meta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax());
    }
    DoubleRelation scoresult =
        new MaterializedDoubleRelation(
            "External Outlier", "external-outlier", scores, relation.getDBIDs());
    OutlierResult or = new OutlierResult(meta, scoresult);

    // Apply scaling
    if (scaling instanceof OutlierScalingFunction) {
      ((OutlierScalingFunction) scaling).prepare(or);
    }
    DoubleMinMax mm = new DoubleMinMax();
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      double val = scoresult.doubleValue(iditer);
      val = scaling.getScaled(val);
      scores.putDouble(iditer, val);
      mm.put(val);
    }
    meta = new BasicOutlierScoreMeta(mm.getMin(), mm.getMax());
    or = new OutlierResult(meta, scoresult);
    return or;
  }
  /**
   * Run the algorithm
   *
   * @param db Database
   * @param relation Relation
   * @return Clustering hierarchy
   */
  public PointerHierarchyRepresentationResult run(Database db, Relation<O> relation) {
    DistanceQuery<O> dq = db.getDistanceQuery(relation, getDistanceFunction());
    ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
    final int size = ids.size();

    if (size > 0x10000) {
      throw new AbortException(
          "This implementation does not scale to data sets larger than "
              + 0x10000
              + " instances (~17 GB RAM), which results in an integer overflow.");
    }
    if (Linkage.SINGLE.equals(linkage)) {
      LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!");
    }

    // Compute the initial (lower triangular) distance matrix.
    double[] scratch = new double[triangleSize(size)];
    DBIDArrayIter ix = ids.iter(), iy = ids.iter(), ij = ids.iter();
    // Position counter - must agree with computeOffset!
    int pos = 0;
    boolean square =
        Linkage.WARD.equals(linkage)
            && !(SquaredEuclideanDistanceFunction.class.isInstance(getDistanceFunction()));
    for (int x = 0; ix.valid(); x++, ix.advance()) {
      iy.seek(0);
      for (int y = 0; y < x; y++, iy.advance()) {
        scratch[pos] = dq.distance(ix, iy);
        // Ward uses variances -- i.e. squared values
        if (square) {
          scratch[pos] *= scratch[pos];
        }
        pos++;
      }
    }

    // Initialize space for result:
    WritableDBIDDataStore parent =
        DataStoreUtil.makeDBIDStorage(
            ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
    WritableDoubleDataStore height =
        DataStoreUtil.makeDoubleStorage(
            ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
    WritableIntegerDataStore csize =
        DataStoreUtil.makeIntegerStorage(
            ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
    for (DBIDIter it = ids.iter(); it.valid(); it.advance()) {
      parent.put(it, it);
      height.put(it, Double.POSITIVE_INFINITY);
      csize.put(it, 1);
    }

    // Repeat until everything merged, except the desired number of clusters:
    FiniteProgress prog =
        LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", size - 1, LOG) : null;
    for (int i = 1; i < size; i++) {
      double min = Double.POSITIVE_INFINITY;
      int minx = -1, miny = -1;
      for (ix.seek(0); ix.valid(); ix.advance()) {
        if (height.doubleValue(ix) < Double.POSITIVE_INFINITY) {
          continue;
        }
        final int xbase = triangleSize(ix.getOffset());
        for (iy.seek(0); iy.getOffset() < ix.getOffset(); iy.advance()) {
          if (height.doubleValue(iy) < Double.POSITIVE_INFINITY) {
            continue;
          }
          final int idx = xbase + iy.getOffset();
          if (scratch[idx] <= min) {
            min = scratch[idx];
            minx = ix.getOffset();
            miny = iy.getOffset();
          }
        }
      }
      assert (minx >= 0 && miny >= 0);
      // Avoid allocating memory, by reusing existing iterators:
      ix.seek(minx);
      iy.seek(miny);
      // Perform merge in data structure: x -> y
      // Since y < x, prefer keeping y, dropping x.
      int sizex = csize.intValue(ix), sizey = csize.intValue(iy);
      height.put(ix, min);
      parent.put(ix, iy);
      csize.put(iy, sizex + sizey);

      // Update distance matrix. Note: miny < minx
      final int xbase = triangleSize(minx), ybase = triangleSize(miny);
      // Write to (y, j), with j < y
      for (ij.seek(0); ij.getOffset() < miny; ij.advance()) {
        if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) {
          continue;
        }
        final int sizej = csize.intValue(ij);
        scratch[ybase + ij.getOffset()] =
            linkage.combine(
                sizex,
                scratch[xbase + ij.getOffset()],
                sizey,
                scratch[ybase + ij.getOffset()],
                sizej,
                min);
      }
      // Write to (j, y), with y < j < x
      for (ij.seek(miny + 1); ij.getOffset() < minx; ij.advance()) {
        if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) {
          continue;
        }
        final int jbase = triangleSize(ij.getOffset());
        final int sizej = csize.intValue(ij);
        scratch[jbase + miny] =
            linkage.combine(
                sizex, scratch[xbase + ij.getOffset()], sizey, scratch[jbase + miny], sizej, min);
      }
      // Write to (j, y), with y < x < j
      for (ij.seek(minx + 1); ij.valid(); ij.advance()) {
        if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) {
          continue;
        }
        final int jbase = triangleSize(ij.getOffset());
        final int sizej = csize.intValue(ij);
        scratch[jbase + miny] =
            linkage.combine(sizex, scratch[jbase + minx], sizey, scratch[jbase + miny], sizej, min);
      }
      LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);

    return new PointerHierarchyRepresentationResult(ids, parent, height);
  }
Example #28
0
  /**
   * Evaluate a single clustering.
   *
   * @param db Database
   * @param rel Data relation
   * @param c Clustering
   * @return C-Index
   */
  public double evaluateClustering(
      Database db, Relation<? extends O> rel, DistanceQuery<O> dq, Clustering<?> c) {
    List<? extends Cluster<?>> clusters = c.getAllClusters();

    // theta is the sum, w the number of within group distances
    double theta = 0;
    int w = 0;
    int ignorednoise = 0;
    int isize = clusters.size() <= 1 ? rel.size() : rel.size() / (clusters.size() - 1);
    DoubleArray pairDists = new DoubleArray(isize);

    for (int i = 0; i < clusters.size(); i++) {
      Cluster<?> cluster = clusters.get(i);
      if (cluster.size() <= 1 || cluster.isNoise()) {
        switch (noiseOption) {
          case IGNORE_NOISE:
            ignorednoise += cluster.size();
            continue; // Ignore
          case TREAT_NOISE_AS_SINGLETONS:
            continue; // No within-cluster distances!
          case MERGE_NOISE:
            break; // Treat like a cluster
        }
      }
      for (DBIDIter it1 = cluster.getIDs().iter(); it1.valid(); it1.advance()) {
        O obj = rel.get(it1);
        // Compare object to every cluster, but only once
        for (int j = i; j < clusters.size(); j++) {
          Cluster<?> ocluster = clusters.get(j);
          if (ocluster.size() <= 1 || ocluster.isNoise()) {
            switch (noiseOption) {
              case IGNORE_NOISE:
                continue; // Ignore this cluster.
              case TREAT_NOISE_AS_SINGLETONS:
              case MERGE_NOISE:
                break; // Treat like a cluster
            }
          }
          for (DBIDIter it2 = ocluster.getIDs().iter(); it2.valid(); it2.advance()) {
            if (DBIDUtil.compare(it1, it2) <= 0) { // Only once.
              continue;
            }
            double dist = dq.distance(obj, rel.get(it2));
            pairDists.add(dist);
            if (ocluster == cluster) { // Within-cluster distances.
              theta += dist;
              w++;
            }
          }
        }
      }
    }

    // Simulate best and worst cases:
    pairDists.sort();
    double min = 0, max = 0;
    for (int i = 0, j = pairDists.size() - 1; i < w; i++, j--) {
      min += pairDists.get(i);
      max += pairDists.get(j);
    }

    double cIndex = (max > min) ? (theta - min) / (max - min) : 0.;

    if (LOG.isStatistics()) {
      LOG.statistics(new StringStatistic(key + ".c-index.noise-handling", noiseOption.toString()));
      if (ignorednoise > 0) {
        LOG.statistics(new LongStatistic(key + ".c-index.ignored", ignorednoise));
      }
      LOG.statistics(new DoubleStatistic(key + ".c-index", cIndex));
    }

    EvaluationResult ev =
        EvaluationResult.findOrCreate(
            db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
    g.addMeasure("C-Index", cIndex, 0., 1., 0., true);
    db.getHierarchy().resultChanged(ev);
    return cIndex;
  }
  /**
   * Evaluate a single clustering.
   *
   * @param db Database
   * @param rel Data relation
   * @param c Clustering
   * @return Gamma index
   */
  public double evaluateClustering(
      Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
    List<? extends Cluster<?>> clusters = c.getAllClusters();

    int ignorednoise = 0, withinPairs = 0;
    for (Cluster<?> cluster : clusters) {
      if ((cluster.size() <= 1 || cluster.isNoise())) {
        switch (noiseHandling) {
          case IGNORE_NOISE:
            ignorednoise += cluster.size();
            continue;
          case TREAT_NOISE_AS_SINGLETONS:
            continue; // No concordant distances.
          case MERGE_NOISE:
            break; // Treat like a cluster below.
        }
      }
      withinPairs += (cluster.size() * (cluster.size() - 1)) >>> 1;
      if (withinPairs < 0) {
        throw new AbortException(
            "Integer overflow - clusters too large to compute pairwise distances.");
      }
    }
    // Materialize within-cluster distances (sorted):
    double[] withinDistances = computeWithinDistances(rel, clusters, withinPairs);
    int[] withinTies = new int[withinDistances.length];
    // Count ties within
    countTies(withinDistances, withinTies);

    long concordantPairs = 0, discordantPairs = 0, betweenPairs = 0;

    // Step two, compute discordant distances:
    for (int i = 0; i < clusters.size(); i++) {
      Cluster<?> ocluster1 = clusters.get(i);
      if ((ocluster1.size() <= 1 || ocluster1.isNoise()) //
          && noiseHandling.equals(NoiseHandling.IGNORE_NOISE)) {
        continue;
      }
      for (int j = i + 1; j < clusters.size(); j++) {
        Cluster<?> ocluster2 = clusters.get(j);
        if ((ocluster2.size() <= 1 || ocluster2.isNoise()) //
            && noiseHandling.equals(NoiseHandling.IGNORE_NOISE)) {
          continue;
        }
        betweenPairs += ocluster1.size() * ocluster2.size();
        for (DBIDIter oit1 = ocluster1.getIDs().iter(); oit1.valid(); oit1.advance()) {
          NumberVector obj = rel.get(oit1);
          for (DBIDIter oit2 = ocluster2.getIDs().iter(); oit2.valid(); oit2.advance()) {
            double dist = distanceFunction.distance(obj, rel.get(oit2));
            int p = Arrays.binarySearch(withinDistances, dist);
            if (p >= 0) { // Tied distances:
              while (p > 0 && withinDistances[p - 1] >= dist) {
                --p;
              }
              concordantPairs += p;
              discordantPairs += withinDistances.length - p - withinTies[p];
              continue;
            }
            p = -p - 1;
            concordantPairs += p;
            discordantPairs += withinDistances.length - p;
          }
        }
      }
    }

    // Total number of pairs possible:
    final long t = ((rel.size() - ignorednoise) * (long) (rel.size() - ignorednoise - 1)) >>> 1;
    final long tt = (t * (t - 1)) >>> 1;

    final double gamma =
        (concordantPairs - discordantPairs) / (double) (concordantPairs + discordantPairs);
    final double tau =
        computeTau(concordantPairs, discordantPairs, tt, withinDistances.length, betweenPairs);

    if (LOG.isStatistics()) {
      LOG.statistics(new StringStatistic(key + ".pbm.noise-handling", noiseHandling.toString()));
      if (ignorednoise > 0) {
        LOG.statistics(new LongStatistic(key + ".pbm.ignored", ignorednoise));
      }
      LOG.statistics(new DoubleStatistic(key + ".gamma", gamma));
      LOG.statistics(new DoubleStatistic(key + ".tau", tau));
    }

    EvaluationResult ev =
        EvaluationResult.findOrCreate(
            db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup g = ev.findOrCreateGroup("Concordance-based Evaluation");
    g.addMeasure("Gamma", gamma, -1., 1., 0., false);
    g.addMeasure("Tau", tau, -1., +1., 0., false);
    db.getHierarchy().resultChanged(ev);
    return gamma;
  }
Example #30
0
 @Override
 public void deleteAll(DBIDs ids) {
   for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
     delete(iter);
   }
 }