Example #1
  /** Performs the DBSCAN algorithm on the given database. */
  public Clustering<Model> run(Relation<O> relation) {
    final int size = relation.size();
    if (size < minpts) {
      Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
      result.addToplevelCluster(
          new Cluster<Model>(relation.getDBIDs(), true, ClusterModel.CLUSTER));
      return result;
    }

    RangeQuery<O> rangeQuery = QueryUtil.getRangeQuery(relation, getDistanceFunction());
    resultList = new ArrayList<>();
    noise = DBIDUtil.newHashSet();
    runDBSCAN(relation, rangeQuery);

    double averagen = ncounter / (double) relation.size();
    LOG.statistics(new DoubleStatistic(DBSCAN.class.getName() + ".average-neighbors", averagen));
    if (averagen < 1 + 0.1 * (minpts - 1)) {
      LOG.warning("There are very few neighbors found. Epsilon may be too small.");
    }
    if (averagen > 100 * minpts) {
      LOG.warning("There are very many neighbors found. Epsilon may be too large.");
    }

    Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
    for (ModifiableDBIDs res : resultList) {
      result.addToplevelCluster(new Cluster<Model>(res, ClusterModel.CLUSTER));
    }
    result.addToplevelCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER));
    return result;
  }
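
The two warnings above encode a simple sanity check: with a well-chosen epsilon, the average neighbor count should exceed roughly 1 + 0.1 * (minpts - 1), but stay well below 100 * minpts. A minimal, self-contained sketch of that heuristic (plain JDK, hypothetical inputs; not part of the ELKI API):

/** Sketch of the epsilon sanity heuristic used in the DBSCAN run method above. */
public class EpsilonSanityCheck {
  /** Returns a warning string, or null if the average neighbor count looks plausible. */
  static String checkAverageNeighbors(double averageNeighbors, int minpts) {
    if (averageNeighbors < 1 + 0.1 * (minpts - 1)) {
      return "Very few neighbors found. Epsilon may be too small.";
    }
    if (averageNeighbors > 100 * minpts) {
      return "Very many neighbors found. Epsilon may be too large.";
    }
    return null;
  }

  public static void main(String[] args) {
    // E.g. 6000 accumulated range-query results over 5000 points = 1.2 neighbors on average.
    double averagen = 6000 / (double) 5000;
    System.out.println(checkAverageNeighbors(averagen, 5)); // too few for minpts = 5
  }
}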
Example #2
  /**
   * Main loop for OUTRES
   *
   * @param relation Relation to process
   * @return Outlier detection result
   */
  public OutlierResult run(Relation<V> relation) {
    WritableDoubleDataStore ranks =
        DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
    DoubleMinMax minmax = new DoubleMinMax();

    KernelDensityEstimator kernel = new KernelDensityEstimator(relation);
    long[] subspace = BitsUtil.zero(kernel.dim);

    FiniteProgress progress =
        LOG.isVerbose() ? new FiniteProgress("OUTRES scores", relation.size(), LOG) : null;

    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      BitsUtil.zeroI(subspace);
      double score = outresScore(0, subspace, iditer, kernel);
      ranks.putDouble(iditer, score);
      minmax.put(score);
      LOG.incrementProcessed(progress);
    }
    LOG.ensureCompleted(progress);

    OutlierScoreMeta meta =
        new InvertedOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0., 1., 1.);
    OutlierResult outresResult =
        new OutlierResult(
            meta,
            new MaterializedDoubleRelation("OUTRES", "outres-score", ranks, relation.getDBIDs()));
    return outresResult;
  }
Example #3
  /**
   * Runs the DBSCAN algorithm on the specified partition of the database in the given subspace. If
   * parameter {@code ids} is {@code null}, DBSCAN is applied to the whole database.
   *
   * @param relation the database holding the objects to run DBSCAN on
   * @param ids the IDs of the database defining the partition to run DBSCAN on; if this parameter
   *     is {@code null}, DBSCAN is applied to the whole database
   * @param subspace the subspace to run DBSCAN on
   * @return the clustering result of the DBSCAN run
   */
  private List<Cluster<Model>> runDBSCAN(Relation<V> relation, DBIDs ids, Subspace subspace) {
    // distance function
    distanceFunction.setSelectedDimensions(subspace.getDimensions());

    ProxyDatabase proxy;
    if (ids == null) {
      // TODO: in this case, we might want to use an index - the proxy below
      // will prevent this!
      ids = relation.getDBIDs();
    }

    proxy = new ProxyDatabase(ids, relation);

    DBSCAN<V> dbscan = new DBSCAN<>(distanceFunction, epsilon, minpts);
    // run DBSCAN
    if (LOG.isVerbose()) {
      LOG.verbose("\nRun DBSCAN on subspace " + subspace.dimensonsToString());
    }
    Clustering<Model> dbsres = dbscan.run(proxy);

    // separate cluster and noise
    List<Cluster<Model>> clusterAndNoise = dbsres.getAllClusters();
    List<Cluster<Model>> clusters = new ArrayList<>();
    for (Cluster<Model> c : clusterAndNoise) {
      if (!c.isNoise()) {
        clusters.add(c);
      }
    }
    return clusters;
  }
Example #4
 protected void autoEvaluateClusterings(ResultHierarchy hier, Result newResult) {
   Collection<Clustering<?>> clusterings =
       ResultUtil.filterResults(hier, newResult, Clustering.class);
   if (LOG.isDebugging()) {
     LOG.debug("Number of new clustering results: " + clusterings.size());
   }
   for (Iterator<Clustering<?>> c = clusterings.iterator(); c.hasNext(); ) {
     Clustering<?> test = c.next();
     if ("allinone-clustering".equals(test.getShortName())) {
       c.remove();
     } else if ("allinnoise-clustering".equals(test.getShortName())) {
       c.remove();
     } else if ("bylabel-clustering".equals(test.getShortName())) {
       c.remove();
     } else if ("bymodel-clustering".equals(test.getShortName())) {
       c.remove();
     }
   }
   if (clusterings.size() > 0) {
     try {
       new EvaluateClustering(new ByLabelClustering(), false, true)
           .processNewResult(hier, newResult);
     } catch (NoSupportedDataTypeException e) {
       // Pass - the data probably did not have labels.
     }
   }
 }
Example #5
  @Override
  public void run() {
    Database database = input.getDatabase();
    Relation<O> relation = database.getRelation(distance.getInputTypeRestriction());
    DistanceQuery<O> distanceQuery = database.getDistanceQuery(relation, distance);
    KNNQuery<O> knnQ = database.getKNNQuery(distanceQuery, DatabaseQuery.HINT_HEAVY_USE);

    // open file.
    try (RandomAccessFile file = new RandomAccessFile(out, "rw");
        FileChannel channel = file.getChannel();
        // and acquire a file write lock
        FileLock lock = channel.lock()) {
      // write magic header
      file.writeInt(KNN_CACHE_MAGIC);

      int bufsize = k * 12 * 2 + 10; // Initial size, enough for 2 kNN.
      ByteBuffer buffer = ByteBuffer.allocateDirect(bufsize);

      FiniteProgress prog =
          LOG.isVerbose() ? new FiniteProgress("Computing kNN", relation.size(), LOG) : null;

      for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
        final KNNList nn = knnQ.getKNNForDBID(it, k);
        final int nnsize = nn.size();

        // Grow the buffer when needed:
        if (nnsize * 12 + 10 > bufsize) {
          while (nnsize * 12 + 10 > bufsize) {
            bufsize <<= 1;
          }
          buffer = ByteBuffer.allocateDirect(bufsize);
        }

        buffer.clear();
        ByteArrayUtil.writeUnsignedVarint(buffer, it.internalGetIndex());
        ByteArrayUtil.writeUnsignedVarint(buffer, nnsize);
        int c = 0;
        for (DoubleDBIDListIter ni = nn.iter(); ni.valid(); ni.advance(), c++) {
          ByteArrayUtil.writeUnsignedVarint(buffer, ni.internalGetIndex());
          buffer.putDouble(ni.doubleValue());
        }
        if (c != nn.size()) {
          throw new AbortException("Sizes did not agree. Cache is invalid.");
        }

        buffer.flip();
        channel.write(buffer);
        LOG.incrementProcessed(prog);
      }
      LOG.ensureCompleted(prog);
      lock.release();
    } catch (IOException e) {
      LOG.exception(e);
    }
    // Note: file, channel, and lock are closed by the try-with-resources block above.
  }
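
The cache written above has a simple layout: a 4-byte magic int, then per object a varint object ID, a varint neighbor count, and per neighbor a varint neighbor ID plus an 8-byte big-endian double distance. A JDK-only reader sketch under those assumptions (the decoder below assumes the standard 7-bits-per-byte unsigned varint encoding with a continuation bit, which is what ByteArrayUtil.writeUnsignedVarint presumably emits):

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;

/** Sketch: read back the kNN cache format written above (assumed layout). */
public class KnnCacheReader {
  /** Decode an unsigned varint: 7 bits per byte, high bit set on continuation bytes. */
  static long readUnsignedVarint(ByteBuffer buf) {
    long value = 0;
    int shift = 0;
    byte b;
    do {
      b = buf.get();
      value |= (long) (b & 0x7F) << shift;
      shift += 7;
    } while ((b & 0x80) != 0);
    return value;
  }

  public static void main(String[] args) throws IOException {
    try (FileChannel channel = FileChannel.open(Paths.get(args[0]), StandardOpenOption.READ)) {
      ByteBuffer buf = ByteBuffer.allocate((int) channel.size());
      while (buf.hasRemaining() && channel.read(buf) >= 0) {
        // keep reading until the file is fully buffered
      }
      buf.flip();
      System.out.println("magic=0x" + Integer.toHexString(buf.getInt()));
      while (buf.hasRemaining()) {
        long id = readUnsignedVarint(buf);
        long nnsize = readUnsignedVarint(buf);
        StringBuilder line = new StringBuilder("object ").append(id).append(':');
        for (long i = 0; i < nnsize; i++) {
          long nid = readUnsignedVarint(buf);
          double dist = buf.getDouble(); // 8 bytes, big-endian, as putDouble wrote it
          line.append(" (").append(nid).append(", ").append(dist).append(')');
        }
        System.out.println(line);
      }
    }
  }
}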
Example #6
 @Override
 public void processNewResult(ResultHierarchy hier, Result newResult) {
   // We may just have added this result.
   if (newResult instanceof Clustering && isReferenceResult((Clustering<?>) newResult)) {
     return;
   }
   Database db = ResultUtil.findDatabase(hier);
   List<Clustering<?>> crs = ResultUtil.getClusteringResults(newResult);
   if (crs == null || crs.size() < 1) {
     return;
   }
   // Compute the reference clustering
   Clustering<?> refc = null;
   // Try to find an existing reference clustering (globally)
   {
     Collection<Clustering<?>> cs = ResultUtil.filterResults(hier, db, Clustering.class);
     for (Clustering<?> test : cs) {
       if (isReferenceResult(test)) {
         refc = test;
         break;
       }
     }
   }
   // Try to find an existing reference clustering (locally)
   if (refc == null) {
     Collection<Clustering<?>> cs = ResultUtil.filterResults(hier, newResult, Clustering.class);
     for (Clustering<?> test : cs) {
       if (isReferenceResult(test)) {
         refc = test;
         break;
       }
     }
   }
   if (refc == null) {
     LOG.debug("Generating a new reference clustering.");
     Result refres = referencealg.run(db);
     List<Clustering<?>> refcrs = ResultUtil.getClusteringResults(refres);
     if (refcrs.size() == 0) {
       LOG.warning("Reference algorithm did not return a clustering result!");
       return;
     }
     if (refcrs.size() > 1) {
       LOG.warning("Reference algorithm returned more than one result!");
     }
     refc = refcrs.get(0);
   } else {
     LOG.debug("Using existing clustering: " + refc.getLongName() + " " + refc.getShortName());
   }
   for (Clustering<?> c : crs) {
     if (c == refc) {
       continue;
     }
     evaluteResult(db, c, refc);
   }
 }
Example #7
  /**
   * Run the Eclat algorithm
   *
   * @param db Database to process
   * @param relation Bit vector relation
   * @return Frequent patterns found
   */
  public FrequentItemsetsResult run(Database db, final Relation<BitVector> relation) {
    // TODO: implement with resizable arrays, to not need dim.
    final int dim = RelationUtil.dimensionality(relation);
    final VectorFieldTypeInformation<BitVector> meta = RelationUtil.assumeVectorField(relation);
    // Compute absolute minsupport
    final int minsupp = getMinimumSupport(relation.size());

    LOG.verbose("Build 1-dimensional transaction lists.");
    Duration ctime = LOG.newDuration(STAT + "eclat.transposition.time").begin();
    DBIDs[] idx = buildIndex(relation, dim, minsupp);
    LOG.statistics(ctime.end());

    FiniteProgress prog =
        LOG.isVerbose() ? new FiniteProgress("Building frequent itemsets", idx.length, LOG) : null;
    Duration etime = LOG.newDuration(STAT + "eclat.extraction.time").begin();
    final List<Itemset> solution = new ArrayList<>();
    for (int i = 0; i < idx.length; i++) {
      LOG.incrementProcessed(prog);
      extractItemsets(idx, i, minsupp, solution);
    }
    LOG.ensureCompleted(prog);
    Collections.sort(solution);
    LOG.statistics(etime.end());

    LOG.statistics(new LongStatistic(STAT + "frequent-itemsets", solution.size()));
    return new FrequentItemsetsResult("Eclat", "eclat", solution, meta);
  }
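
The extractItemsets step is not shown here; Eclat's core operation is intersecting the sorted transaction-ID lists of two itemsets, since the length of the intersection is the support of their union. A minimal JDK-only sketch of that operation (hypothetical helper, not the ELKI API):

import java.util.Arrays;

/** Sketch: Eclat-style transaction-ID list intersection on sorted int arrays. */
public class TidlistIntersection {
  /** Intersect two sorted transaction-ID lists. */
  static int[] intersect(int[] a, int[] b) {
    int[] out = new int[Math.min(a.length, b.length)];
    int i = 0, j = 0, n = 0;
    while (i < a.length && j < b.length) {
      if (a[i] < b[j]) {
        i++;
      } else if (a[i] > b[j]) {
        j++;
      } else {
        out[n++] = a[i];
        i++;
        j++;
      }
    }
    return Arrays.copyOf(out, n);
  }

  public static void main(String[] args) {
    int[] tidsA = {1, 3, 4, 7, 9}; // transactions containing item A
    int[] tidsB = {2, 3, 7, 8, 9}; // transactions containing item B
    int[] tidsAB = intersect(tidsA, tidsB); // support({A,B}) = tidsAB.length
    System.out.println(Arrays.toString(tidsAB) + " support=" + tidsAB.length);
    // {A,B} is frequent iff tidsAB.length >= minsupp; recurse on tidsAB to grow the itemset.
  }
}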
Example #8
File: LOCI.java Project: fjfd/elki
  /**
   * Preprocessing step: determine the radii of interest for each point.
   *
   * @param ids IDs to process
   * @param rangeQuery Range query
   * @param interestingDistances Distances of interest
   */
  protected void precomputeInterestingRadii(
      DBIDs ids,
      RangeQuery<O> rangeQuery,
      WritableDataStore<DoubleIntArrayList> interestingDistances) {
    FiniteProgress progressPreproc =
        LOG.isVerbose() ? new FiniteProgress("LOCI preprocessing", ids.size(), LOG) : null;
    for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
      DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(iditer, rmax);
      // build list of critical distances
      DoubleIntArrayList cdist = new DoubleIntArrayList(neighbors.size() << 1);
      {
        int i = 0;
        DoubleDBIDListIter ni = neighbors.iter();
        while (ni.valid()) {
          final double curdist = ni.doubleValue();
          ++i;
          ni.advance();
          // Skip, if tied to the next object:
          if (ni.valid() && curdist == ni.doubleValue()) {
            continue;
          }
          cdist.append(curdist, i);
          // Scale radius, and reinsert
          if (alpha != 1.) {
            final double ri = curdist / alpha;
            if (ri <= rmax) {
              cdist.append(ri, Integer.MIN_VALUE);
            }
          }
        }
      }
      cdist.sort();

      // fill the gaps to have fast lookups of number of neighbors at a given
      // distance.
      int lastk = 0;
      for (int i = 0, size = cdist.size(); i < size; i++) {
        final int k = cdist.getInt(i);
        if (k == Integer.MIN_VALUE) {
          cdist.setValue(i, lastk);
        } else {
          lastk = k;
        }
      }
      // TODO: shrink the list, removing duplicate radii?

      interestingDistances.put(iditer, cdist);
      LOG.incrementProcessed(progressPreproc);
    }
    LOG.ensureCompleted(progressPreproc);
  }
Example #9
 @Override
 public void checkRange(DBIDRange range) {
   final int size = max + 1 - min;
   if (size < range.size()) {
     LOG.warning(
         "Distance matrix has size " + size + ", but the DBID range has size " + range.size() + ".");
   }
 }
Example #10
  @Override
  public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
    if (relation.size() <= 0) {
      return new Clustering<>("k-Means Clustering", "kmeans-clustering");
    }
    // Choose initial means
    if (LOG.isStatistics()) {
      LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
    // Setup cluster assignment store
    List<ModifiableDBIDs> clusters = new ArrayList<>();
    for (int i = 0; i < k; i++) {
      clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
    }
    WritableIntegerDataStore assignment =
        DataStoreUtil.makeIntegerStorage(
            relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
    double[] varsum = new double[k];

    IndefiniteProgress prog =
        LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
    DoubleStatistic varstat =
        LOG.isStatistics()
            ? new DoubleStatistic(this.getClass().getName() + ".variance-sum")
            : null;
    int iteration = 0;
    for (; maxiter <= 0 || iteration < maxiter; iteration++) {
      LOG.incrementProcessed(prog);
      boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum);
      logVarstat(varstat, varsum);
      // Stop if no cluster assignment changed.
      if (!changed) {
        break;
      }
      // Recompute means.
      means = means(clusters, means, relation);
    }
    LOG.setCompleted(prog);
    if (LOG.isStatistics()) {
      LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
    }

    // Wrap result
    Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
    for (int i = 0; i < clusters.size(); i++) {
      DBIDs ids = clusters.get(i);
      if (ids.size() == 0) {
        continue;
      }
      KMeansModel model = new KMeansModel(means[i], varsum[i]);
      result.addToplevelCluster(new Cluster<>(ids, model));
    }
    return result;
  }
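
The assignToNearestCluster helper called in the loop above is not part of this snippet. A minimal JDK-only sketch of what such a step computes, assuming squared Euclidean distance and plain arrays instead of ELKI data stores:

/** Sketch: assign each point to its nearest mean; returns true if any assignment changed. */
public class NearestClusterAssignment {
  static boolean assign(double[][] points, double[][] means, int[] assignment, double[] varsum) {
    java.util.Arrays.fill(varsum, 0.);
    boolean changed = false;
    for (int p = 0; p < points.length; p++) {
      int best = 0;
      double bestDist = Double.POSITIVE_INFINITY;
      for (int c = 0; c < means.length; c++) {
        double d = 0;
        for (int dim = 0; dim < points[p].length; dim++) {
          double diff = points[p][dim] - means[c][dim];
          d += diff * diff;
        }
        if (d < bestDist) {
          bestDist = d;
          best = c;
        }
      }
      varsum[best] += bestDist; // accumulate the per-cluster variance contribution
      if (assignment[p] != best) {
        assignment[p] = best;
        changed = true;
      }
    }
    return changed;
  }

  public static void main(String[] args) {
    double[][] pts = {{0, 0}, {1, 0}, {9, 9}};
    double[][] means = {{0, 0}, {10, 10}};
    int[] assign = {-1, -1, -1};
    double[] varsum = new double[2];
    System.out.println(assign(pts, means, assign, varsum)); // true: assignments changed
    System.out.println(java.util.Arrays.toString(assign));  // [0, 0, 1]
  }
}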
Example #11
  /**
   * Generates {@code d+1}-dimensional subspace candidates from the specified {@code d}-dimensional
   * subspaces.
   *
   * @param subspaces the {@code d}-dimensional subspaces
   * @return the {@code d+1}-dimensional subspace candidates
   */
  private List<Subspace> generateSubspaceCandidates(List<Subspace> subspaces) {
    List<Subspace> candidates = new ArrayList<>();

    if (subspaces.isEmpty()) {
      return candidates;
    }

    // Generate (d+1)-dimensional candidate subspaces
    int d = subspaces.get(0).dimensionality();

    StringBuilder msgFine = new StringBuilder("\n");
    if (LOG.isDebuggingFiner()) {
      msgFine.append("subspaces ").append(subspaces).append('\n');
    }

    for (int i = 0; i < subspaces.size(); i++) {
      Subspace s1 = subspaces.get(i);
      for (int j = i + 1; j < subspaces.size(); j++) {
        Subspace s2 = subspaces.get(j);
        Subspace candidate = s1.join(s2);

        if (candidate != null) {
          if (LOG.isDebuggingFiner()) {
            msgFine.append("candidate: ").append(candidate.dimensonsToString()).append('\n');
          }
          // prune irrelevant candidate subspaces
          List<Subspace> lowerSubspaces = lowerSubspaces(candidate);
          if (LOG.isDebuggingFiner()) {
            msgFine.append("lowerSubspaces: ").append(lowerSubspaces).append('\n');
          }
          boolean irrelevantCandidate = false;
          for (Subspace s : lowerSubspaces) {
            if (!subspaces.contains(s)) {
              irrelevantCandidate = true;
              break;
            }
          }
          if (!irrelevantCandidate) {
            candidates.add(candidate);
          }
        }
      }
    }

    if (LOG.isDebuggingFiner()) {
      LOG.debugFiner(msgFine.toString());
    }
    if (LOG.isDebugging()) {
      StringBuilder msg = new StringBuilder();
      msg.append(d + 1).append("-dimensional candidate subspaces: ");
      for (Subspace candidate : candidates) {
        msg.append(candidate.dimensonsToString()).append(' ');
      }
      LOG.debug(msg.toString());
    }

    return candidates;
  }
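
The pruning logic above applies the downward-closure property: a (d+1)-dimensional candidate can only be relevant if every d-dimensional subspace of it was kept. A minimal sketch of the same check using plain integer sets (hypothetical, no ELKI Subspace API):

import java.util.HashSet;
import java.util.List;
import java.util.Set;

/** Sketch: downward-closure prune for subspace candidates as integer dimension sets. */
public class CandidatePrune {
  /** Keep a candidate only if every subset with one dimension removed is known. */
  static boolean allSubsetsKnown(Set<Integer> candidate, Set<Set<Integer>> known) {
    for (Integer dim : candidate) {
      Set<Integer> lower = new HashSet<>(candidate);
      lower.remove(dim);
      if (!known.contains(lower)) {
        return false; // irrelevant candidate: some lower subspace is missing
      }
    }
    return true;
  }

  public static void main(String[] args) {
    Set<Set<Integer>> known = new HashSet<>(List.of(Set.of(0, 1), Set.of(1, 2)));
    System.out.println(allSubsetsKnown(Set.of(0, 1, 2), known)); // false: {0, 2} is missing
  }
}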
Example #12
 @Override
 protected void prepareComplete() {
   StringBuilder buf = LOG.isDebuggingFine() ? new StringBuilder() : null;
   scalingreferencevalues = new double[dimensionality];
   randomPerAttribute = new Random[dimensionality];
   if (scalingreference == ScalingReference.STDDEV) {
     if (buf != null) {
       buf.append("Standard deviation per attribute: ");
     }
     for (int d = 0; d < dimensionality; d++) {
       scalingreferencevalues[d] = mvs[d].getSampleStddev() * percentage;
       if (scalingreferencevalues[d] == 0 || Double.isNaN(scalingreferencevalues[d])) {
         scalingreferencevalues[d] = percentage;
       }
       randomPerAttribute[d] = new Random(RANDOM.nextLong());
       if (buf != null) {
         buf.append(" ").append(d).append(": ").append(scalingreferencevalues[d] / percentage);
       }
     }
   } else if (scalingreference == ScalingReference.MINMAX
       && minima.length == 0
       && maxima.length == 0) {
     if (buf != null) {
       buf.append("extension per attribute: ");
     }
     for (int d = 0; d < dimensionality; d++) {
       scalingreferencevalues[d] = (mvs[d].getMax() - mvs[d].getMin()) * percentage;
       if (scalingreferencevalues[d] == 0 || Double.isNaN(scalingreferencevalues[d])) {
         scalingreferencevalues[d] = percentage;
       }
       randomPerAttribute[d] = new Random(RANDOM.nextLong());
       if (buf != null) {
         buf.append(" ").append(d).append(": ").append(scalingreferencevalues[d] / percentage);
       }
     }
   }
   mvs = null;
   if (buf != null) {
     LOG.debugFine(buf.toString());
   }
 }
Example #13
  /** Runs the wrapper with the specified arguments. */
  @Override
  public void run() throws UnableToComplyException {
    MultipleObjectsBundle data = generator.loadData();
    if (LOG.isVerbose()) {
      LOG.verbose("Writing output ...");
    }
    try {
      if (outputFile.exists() && LOG.isVerbose()) {
        LOG.verbose(
            "The file " + outputFile + " already exists, the generator result will be APPENDED.");
      }

      try (OutputStreamWriter outStream = new FileWriter(outputFile, true)) {
        writeClusters(outStream, data);
      }
    } catch (IOException e) {
      // Also covers FileNotFoundException, which is a subclass of IOException.
      throw new UnableToComplyException(e);
    }
    if (LOG.isVerbose()) {
      LOG.verbose("Done.");
    }
  }
Example #14
  /**
   * Run the DBSCAN algorithm
   *
   * @param relation Data relation
   * @param rangeQuery Range query class
   */
  protected void runDBSCAN(Relation<O> relation, RangeQuery<O> rangeQuery) {
    final int size = relation.size();
    FiniteProgress objprog =
        LOG.isVerbose() ? new FiniteProgress("Processing objects", size, LOG) : null;
    IndefiniteProgress clusprog =
        LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null;

    processedIDs = DBIDUtil.newHashSet(size);
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      if (!processedIDs.contains(iditer)) {
        expandCluster(relation, rangeQuery, iditer, objprog, clusprog);
      }
      if (objprog != null && clusprog != null) {
        objprog.setProcessed(processedIDs.size(), LOG);
        clusprog.setProcessed(resultList.size(), LOG);
      }
      if (processedIDs.size() == size) {
        break;
      }
    }
    // Finish progress logging
    LOG.ensureCompleted(objprog);
    LOG.setCompleted(clusprog);
  }
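
The expandCluster call above performs the actual region growing and is not shown. A minimal JDK-only sketch of the standard DBSCAN expansion logic, with the eps-neighborhood supplied as a function (the caller should first verify that the seed is a core point, otherwise it is noise):

import java.util.ArrayDeque;
import java.util.BitSet;
import java.util.List;
import java.util.function.IntFunction;

/** Sketch: standard DBSCAN cluster expansion from a seed point. */
public class ExpandCluster {
  /** Grow a cluster from seed; neighbors.apply(p) returns the eps-neighborhood of p. */
  static BitSet expand(int seed, int minpts, IntFunction<List<Integer>> neighbors,
      BitSet processed) {
    BitSet cluster = new BitSet();
    ArrayDeque<Integer> queue = new ArrayDeque<>();
    queue.add(seed);
    while (!queue.isEmpty()) {
      int p = queue.poll();
      if (processed.get(p)) {
        continue;
      }
      processed.set(p);
      cluster.set(p);
      List<Integer> nn = neighbors.apply(p);
      if (nn.size() >= minpts) { // p is a core point: its neighborhood joins the cluster
        for (int q : nn) {
          if (!processed.get(q)) {
            queue.add(q);
          }
        }
      } // border points are added to the cluster, but not expanded from
    }
    return cluster;
  }

  public static void main(String[] args) {
    // Toy adjacency: 0-1-2 are mutually eps-close, 3 is isolated.
    List<List<Integer>> adj = List.of(
        List.of(0, 1, 2), List.of(0, 1, 2), List.of(0, 1, 2), List.of(3));
    System.out.println(expand(0, 3, adj::get, new BitSet())); // {0, 1, 2}
  }
}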
Example #15
  /**
   * Process a database
   *
   * @param database Database to process
   * @param relation Relation to process
   * @return Histogram of ranking qualities
   */
  public HistogramResult<DoubleVector> run(Database database, Relation<O> relation) {
    final DistanceQuery<O> distanceQuery =
        database.getDistanceQuery(relation, getDistanceFunction());
    final KNNQuery<O> knnQuery = database.getKNNQuery(distanceQuery, relation.size());

    if (LOG.isVerbose()) {
      LOG.verbose("Preprocessing clusters...");
    }
    // Cluster by labels
    Collection<Cluster<Model>> split =
        (new ByLabelOrAllInOneClustering()).run(database).getAllClusters();

    DoubleStaticHistogram hist = new DoubleStaticHistogram(numbins, 0.0, 1.0);

    if (LOG.isVerbose()) {
      LOG.verbose("Processing points...");
    }
    FiniteProgress progress =
        LOG.isVerbose()
            ? new FiniteProgress("Computing ROC AUC values", relation.size(), LOG)
            : null;

    MeanVariance mv = new MeanVariance();
    // sort neighbors
    for (Cluster<?> clus : split) {
      for (DBIDIter iter = clus.getIDs().iter(); iter.valid(); iter.advance()) {
        KNNList knn = knnQuery.getKNNForDBID(iter, relation.size());
        double result = new ROCEvaluation().evaluate(clus, knn);

        mv.put(result);
        hist.increment(result, 1. / relation.size());

        LOG.incrementProcessed(progress);
      }
    }
    LOG.ensureCompleted(progress);

    // Transform Histogram into a Double Vector array.
    Collection<DoubleVector> res = new ArrayList<>(relation.size());
    for (DoubleStaticHistogram.Iter iter = hist.iter(); iter.valid(); iter.advance()) {
      DoubleVector row = new DoubleVector(new double[] {iter.getCenter(), iter.getValue()});
      res.add(row);
    }
    HistogramResult<DoubleVector> result =
        new HistogramResult<>("Ranking Quality Histogram", "ranking-histogram", res);
    result.addHeader("Mean: " + mv.getMean() + " Variance: " + mv.getSampleVariance());
    return result;
  }
Example #16
    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      DoubleParameter epsilonP =
          new DoubleParameter(EPSILON_ID) //
              .addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
      if (config.grab(epsilonP)) {
        epsilon = epsilonP.getValue();
      }

      IntParameter minptsP =
          new IntParameter(MINPTS_ID) //
              .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
      if (config.grab(minptsP)) {
        minpts = minptsP.getValue();
        if (minpts <= 2) {
          LOG.warning(
              "DBSCAN with minPts <= 2 is equivalent to single-link clustering at a single height. Consider using larger values of minPts.");
        }
      }
    }
Example #17
  private void loadCache(DistanceParser parser, File matrixfile) throws IOException {
    cache =
        new TLongFloatHashMap(
            Constants.DEFAULT_CAPACITY,
            Constants.DEFAULT_LOAD_FACTOR,
            -1L,
            Float.POSITIVE_INFINITY);
    min = Integer.MAX_VALUE;
    max = Integer.MIN_VALUE;
    // Use try-with-resources, so the (possibly gzipped) input stream is closed reliably.
    try (InputStream in =
        new BufferedInputStream(FileUtil.tryGzipInput(new FileInputStream(matrixfile)))) {
      parser.parse(
          in,
          new DistanceCacheWriter() {
            @Override
            public void put(int id1, int id2, double distance) {
              if (id1 < id2) {
                min = id1 < min ? id1 : min;
                max = id2 > max ? id2 : max;
              } else {
                min = id2 < min ? id2 : min;
                max = id1 > max ? id1 : max;
              }
              cache.put(makeKey(id1, id2), (float) distance);
            }

            @Override
            public boolean containsKey(int id1, int id2) {
              return cache.containsKey(makeKey(id1, id2));
            }
          });
    }
    if (min != 0 && LOG.isVerbose()) {
      LOG.verbose(
          "Distance matrix is supposed to be 0-indexed. Choosing offset "
              + min
              + " to compensate.");
    }
  }
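
makeKey is referenced but not shown; by construction it must map an unordered pair of non-negative int IDs to a single long. A plausible sketch under that assumption (hypothetical; the actual ELKI implementation may differ):

/** Sketch: pack an unordered pair of non-negative int IDs into a symmetric long key. */
public class PairKey {
  static long makeKey(int id1, int id2) {
    // Order the pair, so (a, b) and (b, a) produce the same key.
    int lo = Math.min(id1, id2), hi = Math.max(id1, id2);
    return (((long) hi) << 32) | (lo & 0xFFFFFFFFL);
  }

  public static void main(String[] args) {
    System.out.println(makeKey(3, 7) == makeKey(7, 3)); // true
  }
}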
Example #18
  public Result run(Database database, Relation<O> rel) {
    DistanceQuery<O> dq = rel.getDistanceQuery(getDistanceFunction());
    int size = rel.size();
    long pairs = (size * (long) size) >> 1;

    final long ssize = sampling <= 1 ? (long) Math.ceil(sampling * pairs) : (long) sampling;
    if (ssize > Integer.MAX_VALUE) {
      throw new AbortException("Sampling size too large.");
    }
    final int qsize = quantile <= 0 ? 1 : (int) Math.ceil(quantile * ssize);

    DoubleMaxHeap heap = new DoubleMaxHeap(qsize);

    ArrayDBIDs ids = DBIDUtil.ensureArray(rel.getDBIDs());
    DBIDArrayIter i1 = ids.iter(), i2 = ids.iter();
    Random r = rand.getSingleThreadedRandom();

    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Sampling", (int) ssize, LOG) : null;
    for (long i = 0; i < ssize; i++) {
      int x = r.nextInt(size - 1) + 1, y = r.nextInt(x);
      double dist = dq.distance(i1.seek(x), i2.seek(y));
      // Skip NaN, and/or zeros.
      if (dist != dist || (nozeros && dist < Double.MIN_NORMAL)) {
        continue;
      }
      heap.add(dist, qsize);
      LOG.incrementProcessed(prog);
    }

    LOG.statistics(new DoubleStatistic(PREFIX + ".quantile", quantile));
    LOG.statistics(new LongStatistic(PREFIX + ".samplesize", ssize));
    LOG.statistics(new DoubleStatistic(PREFIX + ".distance", heap.peek()));
    LOG.ensureCompleted(prog);
    Collection<String> header = Arrays.asList(new String[] {"Distance"});
    Collection<Vector> data = Arrays.asList(new Vector[] {new Vector(heap.peek())});
    return new CollectionResult<Vector>("Distances sample", "distance-sample", data, header);
  }
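
The bounded DoubleMaxHeap above keeps only the qsize smallest sampled distances, so its maximum is the requested quantile. A minimal JDK-only sketch of the same idea, using a PriorityQueue in reverse order as a max-heap:

import java.util.Collections;
import java.util.PriorityQueue;

/** Sketch: estimate the q-th smallest value of a stream by keeping the q smallest in a max-heap. */
public class StreamingQuantile {
  public static void main(String[] args) {
    double[] sample = {0.9, 0.2, 0.5, 0.1, 0.7, 0.3};
    int qsize = 3; // e.g. quantile * sample size, as in the code above
    PriorityQueue<Double> heap = new PriorityQueue<>(Collections.reverseOrder());
    for (double dist : sample) {
      if (heap.size() < qsize) {
        heap.add(dist);
      } else if (dist < heap.peek()) {
        heap.poll(); // evict the current maximum of the kept set
        heap.add(dist);
      }
    }
    System.out.println(heap.peek()); // 0.3: the 3rd-smallest distance
  }
}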
Example #19
/**
 * A filter to perturb the values by adding micro-noise.
 *
 * <p>The added noise is generated, attribute-wise, by a Gaussian with mean=0 and a specified
 * standard deviation or by a uniform distribution with a specified range. The standard deviation or
 * the range can be scaled, attribute-wise, to a given percentage of the original standard deviation
 * in the data distribution (assuming a Gaussian distribution there), or to a percentage of the
 * extension in each attribute ({@code maximumValue - minimumValue}).
 *
 * <p>This filter has a potentially wide use but has been implemented for the following publication:
 *
 * <p>Reference:
 *
 * <p>A. Zimek, R. J. G. B. Campello, J. Sander:<br>
 * Data Perturbation for Outlier Detection Ensembles.<br>
 * In: Proc. 26th International Conference on Scientific and Statistical Database Management
 * (SSDBM), Aalborg, Denmark, 2014.
 *
 * @author Arthur Zimek
 */
@Title("Data Perturbation for Outlier Detection Ensembles")
@Description(
    "A filter to perturb a dataset on read by an additive noise component, implemented for use in an outlier ensemble (this reference).")
@Reference(
    authors = "A. Zimek, R. J. G. B. Campello, J. Sander", //
    title = "Data Perturbation for Outlier Detection Ensembles", //
    booktitle =
        "Proc. 26th International Conference on Scientific and Statistical Database Management (SSDBM), Aalborg, Denmark, 2014", //
    url = "http://dx.doi.org/10.1145/2618243.2618257")
public class PerturbationFilter<V extends NumberVector>
    extends AbstractVectorConversionFilter<V, V> {
  /** Class logger */
  private static final Logging LOG = Logging.getLogger(PerturbationFilter.class);

  /**
   * Scaling reference options.
   *
   * @author Arthur Zimek
   * @apiviz.exclude
   */
  public static enum ScalingReference {
    UNITCUBE,
    STDDEV,
    MINMAX
  }

  /**
   * Nature of the noise distribution.
   *
   * @author Arthur Zimek
   * @apiviz.exclude
   */
  public static enum NoiseDistribution {
    GAUSSIAN,
    UNIFORM
  }

  /** Which reference to use for scaling the noise. */
  private ScalingReference scalingreference;

  /** Nature of the noise distribution. */
  private NoiseDistribution noisedistribution;

  /** Random object to generate the attribute-wise seeds for the noise. */
  private final Random RANDOM;

  /**
   * Percentage of the variance of the random noise generation, given the variance of the
   * corresponding attribute in the data.
   */
  private double percentage;

  /** Temporary storage used during initialization. */
  private MeanVarianceMinMax[] mvs = null;

  /** Stores the scaling reference in each dimension. */
  private double[] scalingreferencevalues = new double[0];

  /** The random objects to generate noise distributions independently for each attribute. */
  private Random[] randomPerAttribute = null;

  /** Stores the maximum in each dimension. */
  private double[] maxima;

  /** Stores the minimum in each dimension. */
  private double[] minima;

  /** Stores the dimensionality from the preprocessing. */
  private int dimensionality = 0;

  /**
   * Constructor.
   *
   * @param seed Seed value, may be {@code null} for a random seed.
   * @param percentage Relative amount of jitter to add
   * @param scalingreference Scaling reference
   * @param minima Preset minimum values. May be {@code null}.
   * @param maxima Preset maximum values. May be {@code null}.
   * @param noisedistribution Nature of the noise distribution.
   */
  public PerturbationFilter(
      Long seed,
      double percentage,
      ScalingReference scalingreference,
      double[] minima,
      double[] maxima,
      NoiseDistribution noisedistribution) {
    super();
    this.percentage = percentage;
    this.scalingreference = scalingreference;
    this.minima = minima;
    this.maxima = maxima;
    this.noisedistribution = noisedistribution;
    this.RANDOM = (seed == null) ? new Random() : new Random(seed);
  }

  @Override
  protected boolean prepareStart(SimpleTypeInformation<V> in) {
    if (scalingreference == ScalingReference.MINMAX && minima.length != 0 && maxima.length != 0) {
      dimensionality = minima.length;
      scalingreferencevalues = new double[dimensionality];
      randomPerAttribute = new Random[dimensionality];
      for (int d = 0; d < dimensionality; d++) {
        scalingreferencevalues[d] = (maxima[d] - minima[d]) * percentage;
        if (scalingreferencevalues[d] == 0 || Double.isNaN(scalingreferencevalues[d])) {
          scalingreferencevalues[d] = percentage;
        }
        randomPerAttribute[d] = new Random(RANDOM.nextLong());
      }
      return false;
    }
    if (scalingreference == ScalingReference.UNITCUBE) {
      return false;
    }
    return (scalingreferencevalues.length == 0);
  }

  @Override
  protected void prepareProcessInstance(V featureVector) {
    // First object? Then init. (We didn't have a dimensionality before!)
    if (mvs == null) {
      dimensionality = featureVector.getDimensionality();
      mvs = MeanVarianceMinMax.newArray(dimensionality);
    }
    for (int d = 0; d < featureVector.getDimensionality(); d++) {
      mvs[d].put(featureVector.doubleValue(d));
    }
  }

  @Override
  protected void prepareComplete() {
    StringBuilder buf = LOG.isDebuggingFine() ? new StringBuilder() : null;
    scalingreferencevalues = new double[dimensionality];
    randomPerAttribute = new Random[dimensionality];
    if (scalingreference == ScalingReference.STDDEV) {
      if (buf != null) {
        buf.append("Standard deviation per attribute: ");
      }
      for (int d = 0; d < dimensionality; d++) {
        scalingreferencevalues[d] = mvs[d].getSampleStddev() * percentage;
        if (scalingreferencevalues[d] == 0 || Double.isNaN(scalingreferencevalues[d])) {
          scalingreferencevalues[d] = percentage;
        }
        randomPerAttribute[d] = new Random(RANDOM.nextLong());
        if (buf != null) {
          buf.append(" ").append(d).append(": ").append(scalingreferencevalues[d] / percentage);
        }
      }
    } else if (scalingreference == ScalingReference.MINMAX
        && minima.length == 0
        && maxima.length == 0) {
      if (buf != null) {
        buf.append("extension per attribute: ");
      }
      for (int d = 0; d < dimensionality; d++) {
        scalingreferencevalues[d] = (mvs[d].getMax() - mvs[d].getMin()) * percentage;
        if (scalingreferencevalues[d] == 0 || Double.isNaN(scalingreferencevalues[d])) {
          scalingreferencevalues[d] = percentage;
        }
        randomPerAttribute[d] = new Random(RANDOM.nextLong());
        if (buf != null) {
          buf.append(" ").append(d).append(": ").append(scalingreferencevalues[d] / percentage);
        }
      }
    }
    mvs = null;
    if (buf != null) {
      LOG.debugFine(buf.toString());
    }
  }

  @Override
  protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
    return TypeUtil.NUMBER_VECTOR_FIELD;
  }

  @Override
  protected V filterSingleObject(V featureVector) {
    if (scalingreference == ScalingReference.UNITCUBE && dimensionality == 0) {
      dimensionality = featureVector.getDimensionality();
      scalingreferencevalues = new double[dimensionality];
      randomPerAttribute = new Random[dimensionality];
      for (int d = 0; d < dimensionality; d++) {
        scalingreferencevalues[d] = percentage;
        randomPerAttribute[d] = new Random(RANDOM.nextLong());
      }
    }
    if (scalingreferencevalues.length != featureVector.getDimensionality()) {
      throw new IllegalArgumentException(
          "FeatureVectors and given Minima/Maxima differ in length.");
    }
    double[] values = new double[featureVector.getDimensionality()];
    for (int d = 0; d < featureVector.getDimensionality(); d++) {
      if (this.noisedistribution.equals(NoiseDistribution.GAUSSIAN)) {
        values[d] =
            featureVector.doubleValue(d)
                + randomPerAttribute[d].nextGaussian() * scalingreferencevalues[d];
      } else if (this.noisedistribution.equals(NoiseDistribution.UNIFORM)) {
        values[d] =
            featureVector.doubleValue(d)
                + randomPerAttribute[d].nextDouble() * scalingreferencevalues[d];
      }
    }
    return factory.newNumberVector(values);
  }

  @Override
  protected SimpleTypeInformation<? super V> convertedType(SimpleTypeInformation<V> in) {
    initializeOutputType(in);
    return in;
  }

  @Override
  protected Logging getLogger() {
    return LOG;
  }

  /**
   * Parameterization class.
   *
   * @author Arthur Zimek
   * @apiviz.exclude
   */
  public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
    /** Parameter for minimum. */
    public static final OptionID MINIMA_ID =
        new OptionID(
            "perturbationfilter.min",
            "Only used, if "
                + ScalingReference.MINMAX
                + " is set as scaling reference: a comma separated concatenation of the minimum values in each dimension assumed as a reference. If no value is specified, the minimum value of the attribute range in this dimension will be taken.");

    /** Parameter for maximum. */
    public static final OptionID MAXIMA_ID =
        new OptionID(
            "perturbationfilter.max",
            "Only used, if "
                + ScalingReference.MINMAX
                + " is set as scaling reference: a comma separated concatenation of the maximum values in each dimension assumed as a reference. If no value is specified, the maximum value of the attribute range in this dimension will be taken.");

    /** Stores the maximum in each dimension. */
    private double[] maxima = new double[0];

    /** Stores the minimum in each dimension. */
    private double[] minima = new double[0];

    /**
     * Optional parameter to specify a seed for random Gaussian noise generation. If unused, system
     * time is used as seed.
     *
     * <p>Key: {@code -perturbationfilter.seed}
     */
    public static final OptionID SEED_ID =
        new OptionID("perturbationfilter.seed", "Seed for random noise generation.");

    /**
     * Seed for random noise generation. If {@code null}, the system time is used as seed.
     */
    protected Long seed = null;

    /**
     * Optional parameter to specify a percentage of the standard deviation of the random Gaussian
     * noise generation, given the standard deviation of the corresponding attribute in the original
     * data distribution (assuming a Gaussian there).
     *
     * <p>Key: {@code -perturbationfilter.percentage}
     *
     * <p>Default: <code>0.01</code>
     *
     * <p>Constraint: 0 &lt; percentage &le; 1
     */
    public static final OptionID PERCENTAGE_ID =
        new OptionID(
            "perturbationfilter.percentage",
            "Percentage of the standard deviation of the random Gaussian noise generation per attribute, given the standard deviation of the corresponding attribute in the original data distribution (assuming a Gaussian distribution there).");

    /**
     * Parameter for selecting scaling reference.
     *
     * <p>Key: {@code -perturbationfilter.scalingreference}
     *
     * <p>Default: <code>ScalingReference.UNITCUBE</code>
     */
    public static final OptionID SCALINGREFERENCE_ID =
        new OptionID(
            "perturbationfilter.scalingreference",
            "The reference for scaling the Gaussian noise. Default is "
                + ScalingReference.UNITCUBE
                + ", parameter "
                + PERCENTAGE_ID.getName()
                + " will then directly define the standard deviation of all noise Gaussians. For options "
                + ScalingReference.STDDEV
                + " and "
                + ScalingReference.MINMAX
                + ", the percentage of the attributewise standard deviation or extension, respectively, will define the attributewise standard deviation of the noise Gaussians.");

    /**
     * Parameter for selecting the noise distribution.
     *
     * <p>Key: {@code -perturbationfilter.noisedistribution}
     *
     * <p>Default: <code>NoiseDistribution.UNIFORM</code>
     */
    public static final OptionID NOISEDISTRIBUTION_ID =
        new OptionID(
            "perturbationfilter.noisedistribution",
            "The nature of the noise distribution, default is " + NoiseDistribution.UNIFORM);

    /**
     * Percentage of the variance of the random Gaussian noise generation or of the range of the
     * uniform distribution, given the variance of the corresponding attribute in the data.
     */
    protected double percentage;

    /** The option which reference to use for scaling the noise. */
    protected ScalingReference scalingreference;

    /** The option which nature of noise distribution to choose. */
    protected NoiseDistribution noisedistribution;

    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      EnumParameter<ScalingReference> scalingReferenceP =
          new EnumParameter<>(
              SCALINGREFERENCE_ID, ScalingReference.class, ScalingReference.UNITCUBE);
      if (config.grab(scalingReferenceP)) {
        scalingreference = scalingReferenceP.getValue();
      }
      EnumParameter<NoiseDistribution> noisedistributionP =
          new EnumParameter<>(
              NOISEDISTRIBUTION_ID, NoiseDistribution.class, NoiseDistribution.UNIFORM);
      if (config.grab(noisedistributionP)) {
        noisedistribution = noisedistributionP.getValue();
      }
      DoubleParameter percentageP = new DoubleParameter(PERCENTAGE_ID, .01);
      percentageP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
      percentageP.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE);
      if (config.grab(percentageP)) {
        percentage = percentageP.getValue();
      }
      LongParameter seedP = new LongParameter(SEED_ID);
      seedP.setOptional(true);
      if (config.grab(seedP)) {
        seed = seedP.getValue();
      }
      DoubleListParameter minimaP = new DoubleListParameter(MINIMA_ID);
      minimaP.setOptional(true);
      if (config.grab(minimaP)) {
        minima = minimaP.getValue().clone();
      }
      DoubleListParameter maximaP = new DoubleListParameter(MAXIMA_ID);
      maximaP.setOptional(true);
      if (config.grab(maximaP)) {
        maxima = maximaP.getValue().clone();
      }

      config.checkConstraint(new AllOrNoneMustBeSetGlobalConstraint(minimaP, maximaP));
      config.checkConstraint(new EqualSizeGlobalConstraint(minimaP, maximaP));
    }

    @Override
    protected PerturbationFilter<V> makeInstance() {
      return new PerturbationFilter<>(
          seed, percentage, scalingreference, minima, maxima, noisedistribution);
    }
  }
}
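
Reduced to its core, the filter perturbs each attribute by value + noise * scalingreference, where scalingreference is the percentage times the attribute's standard deviation (or extension). A minimal JDK-only sketch of that perturbation, with hypothetical inputs:

import java.util.Arrays;
import java.util.Random;

/** Sketch: attribute-wise Gaussian micro-noise scaled to a percentage of the attribute stddev. */
public class MicroNoise {
  public static void main(String[] args) {
    double[] vector = {1.0, 10.0, 100.0};
    double[] stddev = {0.5, 2.0, 30.0}; // per-attribute standard deviation of the data
    double percentage = 0.01;           // noise stddev = 1% of the data stddev
    Random seeder = new Random(42L);    // derive one generator per attribute, as in the filter
    Random[] rnd = new Random[vector.length];
    for (int d = 0; d < rnd.length; d++) {
      rnd[d] = new Random(seeder.nextLong());
    }
    double[] perturbed = new double[vector.length];
    for (int d = 0; d < vector.length; d++) {
      perturbed[d] = vector[d] + rnd[d].nextGaussian() * stddev[d] * percentage;
    }
    System.out.println(Arrays.toString(perturbed));
  }
}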
Example #20
File: SOF.java Project: 4sp1r3/elki
/**
 * The Spatial Outlier Factor (SOF) is a spatial {@link
 * de.lmu.ifi.dbs.elki.algorithm.outlier.lof.LOF LOF} variation.
 *
 * <p>Since the "reachability distance" of LOF cannot be used canonically in the bichromatic case,
 * this part of LOF is dropped and the exact distance is used instead.
 *
 * <p>Huang, T., Qin, X.<br>
 * Detecting outliers in spatial database.<br>
 * In: Proc. 3rd International Conference on Image and Graphics, Hong Kong, China.
 *
 * <p>This is a LOF variation simplified with reachDist(o,p) == dist(o,p).
 *
 * @author Ahmed Hettab
 * @since 0.4.0
 * @param <N> Neighborhood object type
 * @param <O> Attribute object type
 */
@Title("Spatial Outlier Factor")
@Reference(
    authors = "Huang, T., Qin, X.",
    title = "Detecting outliers in spatial database",
    booktitle = "Proc. 3rd International Conference on Image and Graphics",
    url = "http://dx.doi.org/10.1109/ICIG.2004.53")
public class SOF<N, O> extends AbstractDistanceBasedSpatialOutlier<N, O> {
  /** The logger for this class. */
  private static final Logging LOG = Logging.getLogger(SOF.class);

  /**
   * Constructor.
   *
   * @param npred Neighborhood predicate
   * @param nonSpatialDistanceFunction Distance function on non-spatial attributes
   */
  public SOF(
      NeighborSetPredicate.Factory<N> npred,
      PrimitiveDistanceFunction<O> nonSpatialDistanceFunction) {
    super(npred, nonSpatialDistanceFunction);
  }

  @Override
  protected Logging getLogger() {
    return LOG;
  }

  /**
   * The main run method
   *
   * @param database Database to use (actually unused)
   * @param spatial Relation for neighborhood
   * @param relation Attributes to evaluate
   * @return Outlier result
   */
  public OutlierResult run(Database database, Relation<N> spatial, Relation<O> relation) {
    final NeighborSetPredicate npred =
        getNeighborSetPredicateFactory().instantiate(database, spatial);
    DistanceQuery<O> distFunc = getNonSpatialDistanceFunction().instantiate(relation);

    WritableDoubleDataStore lrds =
        DataStoreUtil.makeDoubleStorage(
            relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT);
    WritableDoubleDataStore lofs =
        DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
    DoubleMinMax lofminmax = new DoubleMinMax();

    // Compute densities
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      DBIDs neighbors = npred.getNeighborDBIDs(iditer);
      double avg = 0;
      for (DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) {
        avg += distFunc.distance(iditer, iter);
      }
      double lrd = 1 / (avg / neighbors.size());
      if (Double.isNaN(lrd)) {
        lrd = 0;
      }
      lrds.putDouble(iditer, lrd);
    }

    // Compute density quotients
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      DBIDs neighbors = npred.getNeighborDBIDs(iditer);
      double avg = 0;
      for (DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) {
        avg += lrds.doubleValue(iter);
      }
      final double lrd = (avg / neighbors.size()) / lrds.doubleValue(iditer);
      if (!Double.isNaN(lrd)) {
        lofs.putDouble(iditer, lrd);
        lofminmax.put(lrd);
      } else {
        lofs.putDouble(iditer, 0.0);
      }
    }

    // Build result representation.
    DoubleRelation scoreResult =
        new MaterializedDoubleRelation(
            "Spatial Outlier Factor", "sof-outlier", lofs, relation.getDBIDs());
    OutlierScoreMeta scoreMeta =
        new QuotientOutlierScoreMeta(
            lofminmax.getMin(), lofminmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 1.0);
    OutlierResult or = new OutlierResult(scoreMeta, scoreResult);
    or.addChildResult(npred);
    return or;
  }

  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return TypeUtil.array(
        getNeighborSetPredicateFactory().getInputTypeRestriction(), TypeUtil.NUMBER_VECTOR_FIELD);
  }

  /**
   * Parameterization class
   *
   * @author Ahmed Hettab
   * @apiviz.exclude
   * @param <N> Neighborhood type
   * @param <O> Attribute object type
   */
  public static class Parameterizer<N, O>
      extends AbstractDistanceBasedSpatialOutlier.Parameterizer<N, O> {
    @Override
    protected SOF<N, O> makeInstance() {
      return new SOF<>(npredf, distanceFunction);
    }
  }
}
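
Condensed to its math, SOF first computes a density lrd(o) as the inverse mean distance of o to its spatial neighbors, then scores each object by the mean neighbor density divided by its own density. A minimal JDK-only sketch on a toy distance matrix with fixed neighborhoods (hypothetical inputs):

/** Sketch: SOF scores from a distance matrix and fixed neighbor lists. */
public class SofSketch {
  public static void main(String[] args) {
    double[][] dist = {
      {0.0, 1.0, 1.0, 5.0},
      {1.0, 0.0, 1.0, 5.0},
      {1.0, 1.0, 0.0, 5.0},
      {5.0, 5.0, 5.0, 0.0},
    };
    int[][] neighbors = {{1, 2}, {0, 2}, {0, 1}, {0, 1, 2}};
    int n = dist.length;
    double[] lrd = new double[n];
    for (int i = 0; i < n; i++) { // density: inverse mean distance to the neighbors
      double avg = 0;
      for (int j : neighbors[i]) {
        avg += dist[i][j];
      }
      lrd[i] = neighbors[i].length / avg;
    }
    for (int i = 0; i < n; i++) { // quotient: mean neighbor density vs. own density
      double avg = 0;
      for (int j : neighbors[i]) {
        avg += lrd[j];
      }
      double sof = (avg / neighbors[i].length) / lrd[i];
      System.out.println("SOF(" + i + ") = " + sof); // point 3 scores 5.0, the rest 1.0
    }
  }
}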
Example #21
  /**
   * Algorithm 3 of Cheng and Church.
   *
   * <p>Try to re-add rows or columns that decrease the overall score.
   *
   * <p>Also try adding inverted rows.
   *
   * @param mat Data matrix
   * @param cand Bicluster candidate
   */
  private void nodeAddition(final double[][] mat, final BiclusterCandidate cand) {
    cand.updateRowAndColumnMeans(mat, true);
    cand.computeMeanSquaredDeviation(mat);
    while (true) {
      // We need this to be final + mutable
      final boolean[] added = new boolean[] {false, false};

      // Step 2: add columns
      cand.visitRow(
          mat,
          0,
          CellVisitor.NOT_SELECTED,
          new CellVisitor() {
            @Override
            public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
              assert (!selcol);
              if (cand.computeColResidue(mat, col) <= cand.residue) {
                cand.selectColumn(col, true);
                added[0] = true;
              }
              return false;
            }
          });

      // Step 3: recompute values
      if (added[0]) {
        cand.updateRowAndColumnMeans(mat, true);
        cand.computeMeanSquaredDeviation(mat);
      }

      // Step 4: try adding rows.
      cand.visitColumn(
          mat,
          0,
          CellVisitor.NOT_SELECTED,
          new CellVisitor() {
            @Override
            public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
              assert (!selrow);
              if (cand.computeRowResidue(mat, row, false) <= cand.residue) {
                cand.selectRow(row, true);
                added[1] = true;
              }
              return false;
            }
          });

      // Step 5: try adding inverted rows.
      if (useinverted) {
        cand.visitColumn(
            mat,
            0,
            CellVisitor.NOT_SELECTED,
            new CellVisitor() {
              @Override
              public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
                assert (!selrow);
                if (cand.computeRowResidue(mat, row, true) <= cand.residue) {
                  cand.selectRow(row, true);
                  cand.invertRow(row, true);
                  added[1] = true;
                }
                return false;
              }
            });
      }
      if (added[1]) {
        cand.updateRowAndColumnMeans(mat, true);
        cand.computeMeanSquaredDeviation(mat);
        if (LOG.isDebuggingFine()) {
          LOG.debugFine(
              "Residue in Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
        }
      }
      if (!added[0] && !added[1]) {
        break;
      }
    }
  }
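
Both node addition and node deletion are driven by the mean squared residue H(I, J): the mean over all selected cells of (value - rowMean - colMean + overallMean)^2. A minimal JDK-only sketch computing it for a full matrix, without the bitmask bookkeeping:

/** Sketch: Cheng and Church mean squared residue H(I, J) over a full matrix. */
public class MeanSquaredResidue {
  static double residue(double[][] mat) {
    int rows = mat.length, cols = mat[0].length;
    double[] rowM = new double[rows], colM = new double[cols];
    double allM = 0;
    for (int i = 0; i < rows; i++) { // accumulate row, column, and overall means
      for (int j = 0; j < cols; j++) {
        rowM[i] += mat[i][j] / cols;
        colM[j] += mat[i][j] / rows;
        allM += mat[i][j] / (rows * cols);
      }
    }
    double h = 0;
    for (int i = 0; i < rows; i++) {
      for (int j = 0; j < cols; j++) {
        double r = mat[i][j] - rowM[i] - colM[j] + allM;
        h += r * r / (rows * cols);
      }
    }
    return h;
  }

  public static void main(String[] args) {
    // A perfectly additive bicluster has residue 0.
    System.out.println(residue(new double[][] {{1, 2}, {3, 4}})); // ~0.0
  }
}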
Example #22
  /**
   * Algorithm 2 of Cheng and Church.
   *
   * <p>Remove all rows and columns that reduce the residue by alpha.
   *
   * <p>Inverted rows are not supported in this method.
   *
   * @param mat Data matrix
   * @param cand Bicluster candidate
   */
  private void multipleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) {
    cand.updateRowAndColumnMeans(mat, false);
    cand.computeMeanSquaredDeviation(mat);

    // Note: assumes that cand.residue = H(I,J)
    while (cand.residue > delta) {
      final boolean[] modified = {false, false};

      // Step 2: remove rows above threshold
      if (cand.rowcard > MIN_ROW_REMOVE_THRESHOLD) {
        final double alphaResidue = alpha * cand.residue;
        cand.visitColumn(
            mat,
            0,
            CellVisitor.SELECTED,
            new CellVisitor() {
              @Override
              public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
                assert (selrow);
                if (cand.computeRowResidue(mat, row, false) > alphaResidue) {
                  cand.selectRow(row, false);
                  modified[0] = true;
                }
                return (cand.rowcard > MIN_ROW_REMOVE_THRESHOLD);
              }
            });

        // Step 3: update residue
        if (modified[0]) {
          cand.updateRowAndColumnMeans(mat, false);
          cand.computeMeanSquaredDeviation(mat);
        }
      }

      // Step 4: remove columns above threshold
      if (cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD) {
        final double alphaResidue = alpha * cand.residue;
        cand.visitRow(
            mat,
            0,
            CellVisitor.SELECTED,
            new CellVisitor() {
              @Override
              public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
                assert (selcol);
                if (cand.computeColResidue(mat, col) > alphaResidue) {
                  cand.selectColumn(col, false);
                  modified[1] = true;
                }
                return (cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD);
              }
            });
        if (modified[1]) {
          cand.updateRowAndColumnMeans(mat, false);
          cand.computeMeanSquaredDeviation(mat);
        }
      }

      if (LOG.isDebuggingFine()) {
        LOG.debugFine(
            "Residue in Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
      // Step 5: if nothing has been removed, try removing single nodes.
      if (!modified[0] && !modified[1]) {
        break;
        // Will be executed next in main loop, as per algorithm 4.
        // singleNodeDeletion();
      }
    }
  }
Example #23
/**
 * Perform Cheng and Church biclustering.
 *
 * <p>Reference: <br>
 * Y. Cheng and G. M. Church. Biclustering of expression data. In Proceedings of the 8th
 * International Conference on Intelligent Systems for Molecular Biology (ISMB), San Diego, CA,
 * 2000.
 *
 * @author Erich Schubert
 * @param <V> Vector type.
 */
@Reference(
    authors = "Y. Cheng, G. M. Church",
    title = "Biclustering of expression data",
    booktitle =
        "Proc. 8th International Conference on Intelligent Systems for Molecular Biology (ISMB)")
public class ChengAndChurch<V extends NumberVector>
    extends AbstractBiclustering<V, BiclusterWithInversionsModel> {
  /** The logger for this class. */
  private static final Logging LOG = Logging.getLogger(ChengAndChurch.class);

  /**
   * The minimum number of columns that the database must have so that a removal of columns is
   * performed in {@link #multipleNodeDeletion}.
   *
   * <p>Multiple-column deletion only starts when more than 100 columns are in the data matrix.
   */
  private static final int MIN_COLUMN_REMOVE_THRESHOLD = 100;

  /**
   * The minimum number of rows that the database must have so that a removal of rows is performed
   * in {@link #multipleNodeDeletion}.
   *
   * <p>Multiple-row deletion only starts when more than 100 rows are in the data matrix.
   *
   * <p>The value is set to 100, as this is not really described in the paper.
   */
  private static final int MIN_ROW_REMOVE_THRESHOLD = 100;

  /** Threshold for the score. */
  private double delta;

  /**
   * The parameter for multiple node deletion.
   *
   * <p>It is used to magnify the {@link #delta} value in the {@link #multipleNodeDeletion} method.
   */
  private double alpha;

  /** Number of biclusters to be found. */
  private int n;

  /** Allow inversion of rows in the last phase. */
  private boolean useinverted = true;

  /** Distribution to sample random replacement values from. */
  private Distribution dist;

  /**
   * Constructor.
   *
   * @param delta Delta parameter: desired quality
   * @param alpha Alpha parameter: controls switching to single node deletion approach
   * @param n Number of clusters to detect
   * @param dist Distribution of random values to insert
   */
  public ChengAndChurch(double delta, double alpha, int n, Distribution dist) {
    super();
    this.delta = delta;
    this.alpha = alpha;
    this.n = n;
    this.dist = dist;
  }

  /**
   * Visitor pattern for processing cells.
   *
   * @author Erich Schubert
   * @apiviz.exclude
   */
  public static interface CellVisitor {
    /** Different modes of operation. */
    int ALL = 0, SELECTED = 1, NOT_SELECTED = 2;

    /**
     * Visit a cell.
     *
     * @param val Value
     * @param row Row Number
     * @param col Column number
     * @param selrow Boolean, whether row is selected
     * @param selcol Boolean, whether column is selected
     * @return Stop flag, return {@code true} to stop visiting
     */
    public boolean visit(double val, int row, int col, boolean selrow, boolean selcol);
  }

  /**
   * Bicluster candidate.
   *
   * @author Erich Schubert
   * @apiviz.exclude
   */
  protected static class BiclusterCandidate {
    /** Cardinalities. */
    int rowcard, colcard;

    /** Means. */
    double[] rowM, colM;

    /** Row and column bitmasks. */
    long[] rows, irow, cols;

    /** Mean of the current bicluster. */
    double allM;

    /** The current bicluster score (mean squared residue). */
    double residue;

    /**
     * Constructor.
     *
     * @param rows Row dimensionality.
     * @param cols Column dimensionality.
     */
    protected BiclusterCandidate(int rows, int cols) {
      super();
      this.rows = BitsUtil.ones(rows);
      this.irow = BitsUtil.zero(rows);
      this.rowcard = rows;
      this.rowM = new double[rows];
      this.cols = BitsUtil.ones(cols);
      this.colcard = cols;
      this.colM = new double[cols];
    }

    /** Resets the values for the next cluster search. */
    protected void reset() {
      rows = BitsUtil.ones(rowM.length);
      rowcard = rowM.length;
      cols = BitsUtil.ones(colM.length);
      colcard = colM.length;
      BitsUtil.zeroI(irow);
    }

    /**
     * Visit all selected cells in the data matrix.
     *
     * @param mat Data matrix
     * @param mode Operation mode
     * @param visitor Visitor function
     */
    protected void visitAll(double[][] mat, int mode, CellVisitor visitor) {
      // For efficiency, we manually iterate over the rows and column bitmasks.
      // This saves repeated shifting needed by the manual bit access.
      for (int rpos = 0, rlpos = 0; rlpos < rows.length; ++rlpos) {
        long rlong = rows[rlpos];
        // Fast skip blocks of 64 masked values.
        if ((mode == CellVisitor.SELECTED && rlong == 0L)
            || (mode == CellVisitor.NOT_SELECTED && rlong == -1L)) {
          rpos += Long.SIZE;
          continue;
        }
        for (int i = 0; i < Long.SIZE && rpos < rowM.length; ++i, ++rpos, rlong >>>= 1) {
          boolean rselected = ((rlong & 1L) == 1L);
          if ((mode == CellVisitor.SELECTED && !rselected)
              || (mode == CellVisitor.NOT_SELECTED && rselected)) {
            continue;
          }
          for (int cpos = 0, clpos = 0; clpos < cols.length; ++clpos) {
            long clong = cols[clpos];
            if ((mode == CellVisitor.SELECTED && clong == 0L)
                || (mode == CellVisitor.NOT_SELECTED && clong == -1L)) {
              cpos += Long.SIZE;
              continue;
            }
            for (int j = 0; j < Long.SIZE && cpos < colM.length; ++j, ++cpos, clong >>>= 1) {
              boolean cselected = ((clong & 1L) == 1L);
              if ((mode == CellVisitor.SELECTED && !cselected)
                  || (mode == CellVisitor.NOT_SELECTED && cselected)) {
                continue;
              }
              boolean stop = visitor.visit(mat[rpos][cpos], rpos, cpos, rselected, cselected);
              if (stop) {
                return;
              }
            }
          }
        }
      }
    }

    /**
     * Visit a column of the matrix.
     *
     * @param mat Data matrix
     * @param col Column to visit
     * @param mode Operation mode
     * @param visitor Visitor function
     */
    protected void visitColumn(double[][] mat, int col, int mode, CellVisitor visitor) {
      boolean cselected = BitsUtil.get(cols, col);
      // For efficiency, we manually iterate over the rows and column bitmasks.
      // This avoids the repeated shifting and masking that per-bit access would require.
      for (int rpos = 0, rlpos = 0; rlpos < rows.length; ++rlpos) {
        long rlong = rows[rlpos];
        // Fast skip blocks of 64 masked values.
        if (mode == CellVisitor.SELECTED && rlong == 0L) {
          rpos += Long.SIZE;
          continue;
        }
        if (mode == CellVisitor.NOT_SELECTED && rlong == -1L) {
          rpos += Long.SIZE;
          continue;
        }
        for (int i = 0; i < Long.SIZE && rpos < rowM.length; ++i, ++rpos, rlong >>>= 1) {
          boolean rselected = ((rlong & 1L) == 1L);
          if (mode == CellVisitor.SELECTED && !rselected) {
            continue;
          }
          if (mode == CellVisitor.NOT_SELECTED && rselected) {
            continue;
          }
          boolean stop = visitor.visit(mat[rpos][col], rpos, col, rselected, cselected);
          if (stop) {
            return;
          }
        }
      }
    }

    /**
     * Visit a row of the data matrix.
     *
     * @param mat Data matrix
     * @param row Row to visit
     * @param visitor Visitor function
     */
    protected void visitRow(double[][] mat, int row, int mode, CellVisitor visitor) {
      boolean rselected = BitsUtil.get(rows, row);
      final double[] rowdata = mat[row];
      for (int cpos = 0, clpos = 0; clpos < cols.length; ++clpos) {
        long clong = cols[clpos];
        // Fast skip blocks of 64 masked values.
        if (mode == CellVisitor.SELECTED && clong == 0L) {
          cpos += Long.SIZE;
          continue;
        }
        if (mode == CellVisitor.NOT_SELECTED && clong == -1L) {
          cpos += Long.SIZE;
          continue;
        }
        for (int j = 0; j < Long.SIZE && cpos < colM.length; ++j, ++cpos, clong >>>= 1) {
          boolean cselected = ((clong & 1L) == 1L);
          if (mode == CellVisitor.SELECTED && !cselected) {
            continue;
          }
          if (mode == CellVisitor.NOT_SELECTED && cselected) {
            continue;
          }
          boolean stop = visitor.visit(rowdata[cpos], row, cpos, rselected, cselected);
          if (stop) {
            return;
          }
        }
      }
    }

    /** Visitor for updating the means. */
    private final CellVisitor MEANVISITOR =
        new CellVisitor() {
          @Override
          public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
            if (selcol) {
              rowM[row] += val;
            }
            if (selrow) {
              colM[col] += val;
            }
            if (selcol && selrow) {
              allM += val;
            }
            return false;
          }
        };

    /**
     * Update the row means and column means.
     *
     * @param mat Data matrix
     * @param all Flag, to update all
     * @return overall mean
     */
    protected double updateRowAndColumnMeans(final double[][] mat, boolean all) {
      final int mode = all ? CellVisitor.ALL : CellVisitor.SELECTED;
      Arrays.fill(rowM, 0.);
      Arrays.fill(colM, 0.);
      allM = 0.;
      visitAll(mat, mode, MEANVISITOR);
      visitColumn(
          mat,
          0,
          mode,
          new CellVisitor() {
            @Override
            public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
              rowM[row] /= colcard;
              return false;
            }
          });
      visitRow(
          mat,
          0,
          mode,
          new CellVisitor() {
            @Override
            public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
              colM[col] /= rowcard;
              return false;
            }
          });
      allM /= colcard * rowcard;
      return allM;
    }

    /**
     * Compute the mean squared residue H(I,J): the average of
     * (a_ij - a_iJ - a_Ij + a_IJ)^2 over all selected cells, where a_iJ, a_Ij,
     * and a_IJ denote the row, column, and overall means.
     *
     * @param mat Data matrix
     * @return mean squared residue
     */
    protected double computeMeanSquaredDeviation(final double[][] mat) {
      final Mean msr = new Mean();
      visitAll(
          mat,
          CellVisitor.SELECTED,
          new CellVisitor() {
            @Override
            public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
              assert (selrow && selcol);
              double v = val - rowM[row] - colM[col] + allM;
              msr.put(v * v);
              return false;
            }
          });
      residue = msr.getMean();
      return residue;
    }
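
    // Example: for the 2x2 submatrix {{1,2},{2,3}} the residue is 0, because the
    // rows differ only by a constant shift (a perfect bicluster); perturbing any
    // single cell makes the residue positive.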

    /**
     * Computes the <b>mean row residue</b> of the given <code>row</code>.
     *
     * @param mat Data matrix
     * @param row The row whose residue should be computed.
     * @param rowinverted Indicates if the row should be considered inverted.
     * @return The row residue of the given <code>row</code>.
     */
    protected double computeRowResidue(final double[][] mat, int row, final boolean rowinverted) {
      final Mean rowResidue = new Mean();
      visitRow(
          mat,
          row,
          CellVisitor.SELECTED,
          new CellVisitor() {
            @Override
            public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
              assert (selcol);
              final double rowMean = rowM[row];
              final double colMean = colM[col];
              double v = ((!rowinverted) ? (val - rowMean) : (rowMean - val)) - colMean + allM;
              rowResidue.put(v * v);
              return false;
            }
          });
      return rowResidue.getMean();
    }

    /**
     * Computes the <b>mean column residue</b> of the given <code>col</code>.
     *
     * @param mat Data matrix
     * @param col The column whose residue should be computed.
     * @return The column residue of the given <code>col</code>.
     */
    protected double computeColResidue(final double[][] mat, final int col) {
      final double bias = colM[col] - allM;
      final Mean colResidue = new Mean();
      visitColumn(
          mat,
          col,
          CellVisitor.SELECTED,
          new CellVisitor() {
            @Override
            public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
              assert (selrow);
              final double rowMean = rowM[row];
              double v = val - rowMean - bias;
              colResidue.put(v * v);
              return false;
            }
          });
      return colResidue.getMean();
    }

    /**
     * Updates the mask with replacement values for all data in the given rows and columns.
     *
     * @param mat Mask to update.
     * @param replacement Distribution to sample replacement values from.
     */
    protected void maskMatrix(final double[][] mat, final Distribution replacement) {
      visitAll(
          mat,
          CellVisitor.SELECTED,
          new CellVisitor() {
            @Override
            public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
              assert (selrow && selcol);
              mat[row][col] = replacement.nextRandom();
              return false;
            }
          });
    }

    /**
     * Select or deselect a column.
     *
     * @param cnum Column to select
     * @param set Value to set
     */
    protected void selectColumn(int cnum, boolean set) {
      if (set) {
        BitsUtil.setI(cols, cnum);
        colcard++;
      } else {
        BitsUtil.clearI(cols, cnum);
        colcard--;
      }
    }

    /**
     * Select or deselect a row.
     *
     * @param rnum Row to select
     * @param set Value to set
     */
    protected void selectRow(int rnum, boolean set) {
      if (set) {
        BitsUtil.setI(rows, rnum);
        rowcard++;
      } else {
        BitsUtil.clearI(rows, rnum);
        rowcard--;
      }
    }

    /**
     * Mark a row as inverted.
     *
     * @param rnum Row to mark as inverted
     * @param b Value to set; only setting to {@code true} is supported here
     */
    protected void invertRow(int rnum, boolean b) {
      BitsUtil.setI(irow, rnum);
    }
  }

  @Override
  public Clustering<BiclusterWithInversionsModel> biclustering() {
    double[][] mat = RelationUtil.relationAsMatrix(relation, rowIDs);

    BiclusterCandidate cand = new BiclusterCandidate(getRowDim(), getColDim());

    Clustering<BiclusterWithInversionsModel> result =
        new Clustering<>("Cheng-and-Church", "Cheng and Church Biclustering");
    ModifiableDBIDs noise = DBIDUtil.newHashSet(relation.getDBIDs());

    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Extracting Cluster", n, LOG) : null;
    for (int i = 0; i < n; i++) {
      cand.reset();
      multipleNodeDeletion(mat, cand);
      if (LOG.isVeryVerbose()) {
        LOG.veryverbose(
            "Residue after Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
      singleNodeDeletion(mat, cand);
      if (LOG.isVeryVerbose()) {
        LOG.veryverbose(
            "Residue after Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
      nodeAddition(mat, cand);
      if (LOG.isVeryVerbose()) {
        LOG.veryverbose(
            "Residue after Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
      cand.maskMatrix(mat, dist);
      BiclusterWithInversionsModel model =
          new BiclusterWithInversionsModel(colsBitsetToIDs(cand.cols), rowsBitsetToIDs(cand.irow));
      final ArrayDBIDs cids = rowsBitsetToIDs(cand.rows);
      noise.removeDBIDs(cids);
      result.addToplevelCluster(new Cluster<>(cids, model));

      if (LOG.isVerbose()) {
        LOG.verbose("Score of bicluster " + (i + 1) + ": " + cand.residue + "\n");
        LOG.verbose("Number of rows: " + cand.rowcard + "\n");
        LOG.verbose("Number of columns: " + cand.colcard + "\n");
        // LOG.verbose("Total number of masked values: " + maskedVals.size() +
        // "\n");
      }
      LOG.incrementProcessed(prog);
    }
    // Add a noise cluster, full-dimensional.
    if (!noise.isEmpty()) {
      long[] allcols = BitsUtil.ones(getColDim());
      BiclusterWithInversionsModel model =
          new BiclusterWithInversionsModel(colsBitsetToIDs(allcols), DBIDUtil.EMPTYDBIDS);
      result.addToplevelCluster(new Cluster<>(noise, true, model));
    }
    LOG.ensureCompleted(prog);
    return result;
  }

  /**
   * Algorithm 1 of Cheng and Church:
   *
   * <p>Remove single rows or columns.
   *
   * <p>Inverted rows are not supported in this method.
   *
   * @param mat Data matrix
   * @param cand Bicluster candidate
   */
  private void singleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) {
    // Assume that cand.residue is up to date!
    while (cand.residue > delta && (cand.colcard > 2 || cand.rowcard > 2)) {
      // Store current maximum. Need final mutable, so use arrays.
      final double[] max = {Double.NEGATIVE_INFINITY};
      final int[] best = {-1, -1};

      // Test rows
      if (cand.rowcard > 2) {
        cand.visitColumn(
            mat,
            0,
            CellVisitor.SELECTED,
            new CellVisitor() {
              @Override
              public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
                assert (selrow);
                double rowResidue = cand.computeRowResidue(mat, row, false);
                if (max[0] < rowResidue) {
                  max[0] = rowResidue;
                  best[0] = row;
                }
                return false;
              }
            });
      }

      // Test columns:
      if (cand.colcard > 2) {
        cand.visitRow(
            mat,
            0,
            CellVisitor.SELECTED,
            new CellVisitor() {
              @Override
              public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
                assert (selcol);
                double colResidue = cand.computeColResidue(mat, col);
                if (max[0] < colResidue) {
                  max[0] = colResidue;
                  best[1] = col;
                }
                return false;
              }
            });
      }

      if (best[1] >= 0) { // then override bestrow!
        cand.selectColumn(best[1], false);
      } else {
        assert (best[0] >= 0);
        cand.selectRow(best[0], false);
      }
      // TODO: incremental update could be much faster?
      cand.updateRowAndColumnMeans(mat, false);
      cand.computeMeanSquaredDeviation(mat);
      if (LOG.isDebuggingFine()) {
        LOG.debugFine(
            "Residue in Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
    }
  }

  /**
   * Algorithm 2 of Cheng and Church.
   *
   * <p>Remove all rows and columns that reduce the residue by alpha.
   *
   * <p>Inverted rows are not supported in this method.
   *
   * @param mat Data matrix
   * @param cand Bicluster candidate
   */
  private void multipleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) {
    cand.updateRowAndColumnMeans(mat, false);
    cand.computeMeanSquaredDeviation(mat);

    // Note: assumes that cand.residue = H(I,J)
    while (cand.residue > delta) {
      final boolean[] modified = {false, false};

      // Step 2: remove rows above threshold
      if (cand.rowcard > MIN_ROW_REMOVE_THRESHOLD) {
        final double alphaResidue = alpha * cand.residue;
        cand.visitColumn(
            mat,
            0,
            CellVisitor.SELECTED,
            new CellVisitor() {
              @Override
              public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
                assert (selrow);
                if (cand.computeRowResidue(mat, row, false) > alphaResidue) {
                  cand.selectRow(row, false);
                  modified[0] = true;
                }
                return (cand.rowcard > MIN_ROW_REMOVE_THRESHOLD);
              }
            });

        // Step 3: update residue
        if (modified[0]) {
          cand.updateRowAndColumnMeans(mat, false);
          cand.computeMeanSquaredDeviation(mat);
        }
      }

      // Step 4: remove columns above threshold
      if (cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD) {
        final double alphaResidue = alpha * cand.residue;
        cand.visitRow(
            mat,
            0,
            CellVisitor.SELECTED,
            new CellVisitor() {
              @Override
              public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
                assert (selcol);
                if (cand.computeColResidue(mat, col) > alphaResidue) {
                  cand.selectColumn(col, false);
                  modified[1] = true;
                }
                return (cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD);
              }
            });
        if (modified[1]) {
          cand.updateRowAndColumnMeans(mat, false);
          cand.computeMeanSquaredDeviation(mat);
        }
      }

      if (LOG.isDebuggingFine()) {
        LOG.debugFine(
            "Residue in Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
      // Step 5: if nothing has been removed, try removing single nodes.
      if (!modified[0] && !modified[1]) {
        break;
        // Will be executed next in main loop, as per algorithm 4.
        // singleNodeDeletion();
      }
    }
  }

  /**
   * Algorithm 3 of Cheng and Church.
   *
   * <p>Try to re-add rows or columns that decrease the overall score.
   *
   * <p>Also try adding inverted rows.
   *
   * @param mat Data matrix
   * @param cand Bicluster candidate
   */
  private void nodeAddition(final double[][] mat, final BiclusterCandidate cand) {
    cand.updateRowAndColumnMeans(mat, true);
    cand.computeMeanSquaredDeviation(mat);
    while (true) {
      // We need this to be final + mutable
      final boolean[] added = new boolean[] {false, false};

      // Step 2: add columns
      cand.visitRow(
          mat,
          0,
          CellVisitor.NOT_SELECTED,
          new CellVisitor() {
            @Override
            public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
              assert (!selcol);
              if (cand.computeColResidue(mat, col) <= cand.residue) {
                cand.selectColumn(col, true);
                added[0] = true;
              }
              return false;
            }
          });

      // Step 3: recompute values
      if (added[0]) {
        cand.updateRowAndColumnMeans(mat, true);
        cand.computeMeanSquaredDeviation(mat);
      }

      // Step 4: try adding rows.
      cand.visitColumn(
          mat,
          0,
          CellVisitor.NOT_SELECTED,
          new CellVisitor() {
            @Override
            public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
              assert (!selrow);
              if (cand.computeRowResidue(mat, row, false) <= cand.residue) {
                cand.selectRow(row, true);
                added[1] = true;
              }
              return false;
            }
          });

      // Step 5: try adding inverted rows.
      if (useinverted) {
        cand.visitColumn(
            mat,
            0,
            CellVisitor.NOT_SELECTED,
            new CellVisitor() {
              @Override
              public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
                assert (!selrow);
                if (cand.computeRowResidue(mat, row, true) <= cand.residue) {
                  cand.selectRow(row, true);
                  cand.invertRow(row, true);
                  added[1] = true;
                }
                return false;
              }
            });
      }
      if (added[1]) {
        cand.updateRowAndColumnMeans(mat, true);
        cand.computeMeanSquaredDeviation(mat);
        if (LOG.isDebuggingFine()) {
          LOG.debugFine(
              "Residue in Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
        }
      }
      if (!added[0] && !added[1]) {
        break;
      }
    }
  }

  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
  }

  @Override
  protected Logging getLogger() {
    return LOG;
  }

  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   * @apiviz.exclude
   * @param <V> Vector type
   */
  public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
    /** Parameter to specify the distribution of replacement values when masking a cluster. */
    public static final OptionID DIST_ID =
        new OptionID(
            "chengandchurch.replacement",
            "Distribution of replacement values when masking found clusters.");

    /**
     * Threshold value to determine the maximal acceptable score (mean squared residue) of a
     * bicluster.
     *
     * <p>Key: {@code -chengandchurch.delta}
     */
    public static final OptionID DELTA_ID =
        new OptionID(
            "chengandchurch.delta",
            "Threshold value to determine the maximal acceptable score (mean squared residue) of a bicluster.");

    /**
     * Parameter for multiple node deletion to accelerate the algorithm. (&gt;= 1)
     *
     * <p>Key: {@code -chengandchurch.alpha}
     */
    public static final OptionID ALPHA_ID =
        new OptionID(
            "chengandchurch.alpha",
            "Parameter for multiple node deletion to accelerate the algorithm.");

    /**
     * Number of biclusters to be found.
     *
     * <p>Default value: 1
     *
     * <p>Key: {@code -chengandchurch.n}
     */
    public static final OptionID N_ID =
        new OptionID("chengandchurch.n", "The number of biclusters to be found.");

    /** Threshold for the score ({@link #DELTA_ID}). */
    private double delta;

    /**
     * The parameter for multiple node deletion.
     *
     * <p>It is used to magnify the {@link #delta} value in the {@link
     * ChengAndChurch#multipleNodeDeletion} method.
     */
    private double alpha;

    /** Number of biclusters to be found. */
    private int n;

    /** Distribution of replacement values. */
    private Distribution dist;

    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      DoubleParameter deltaP = new DoubleParameter(DELTA_ID);
      deltaP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
      if (config.grab(deltaP)) {
        delta = deltaP.doubleValue();
      }

      IntParameter nP = new IntParameter(N_ID, 1);
      nP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
      if (config.grab(nP)) {
        n = nP.intValue();
      }

      DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 1.);
      alphaP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_DOUBLE);
      if (config.grab(alphaP)) {
        alpha = alphaP.doubleValue();
      }

      ObjectParameter<Distribution> distP =
          new ObjectParameter<>(DIST_ID, Distribution.class, UniformDistribution.class);
      if (config.grab(distP)) {
        dist = distP.instantiateClass(config);
      }
    }

    @Override
    protected ChengAndChurch<V> makeInstance() {
      return new ChengAndChurch<>(delta, alpha, n, dist);
    }
  }
}
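
The mean squared residue H(I,J) that drives all three phases above can also be computed directly, without the visitor machinery. A minimal self-contained sketch in plain Java (the class and method names are illustrative, not part of ELKI):

public class MsrSketch {
  /** Mean squared residue of the submatrix given by the selected rows/columns. */
  static double msr(double[][] mat, boolean[] rows, boolean[] cols) {
    int nr = 0, nc = 0;
    for (boolean r : rows) { if (r) { nr++; } }
    for (boolean c : cols) { if (c) { nc++; } }
    double[] rowM = new double[mat.length];
    double[] colM = new double[mat[0].length];
    double allM = 0.;
    for (int i = 0; i < mat.length; i++) {
      if (!rows[i]) { continue; }
      for (int j = 0; j < mat[i].length; j++) {
        if (!cols[j]) { continue; }
        rowM[i] += mat[i][j];
        colM[j] += mat[i][j];
        allM += mat[i][j];
      }
    }
    for (int i = 0; i < rowM.length; i++) { rowM[i] /= nc; }
    for (int j = 0; j < colM.length; j++) { colM[j] /= nr; }
    allM /= nr * (double) nc;
    // Average the squared residue (a_ij - a_iJ - a_Ij + a_IJ)^2 over selected cells.
    double sum = 0.;
    for (int i = 0; i < mat.length; i++) {
      if (!rows[i]) { continue; }
      for (int j = 0; j < mat[i].length; j++) {
        if (!cols[j]) { continue; }
        double v = mat[i][j] - rowM[i] - colM[j] + allM;
        sum += v * v;
      }
    }
    return sum / (nr * (double) nc);
  }

  public static void main(String[] args) {
    double[][] mat = { { 1, 2, 3 }, { 2, 3, 4 }, { 5, 1, 9 } };
    boolean[] rows = { true, true, false };
    boolean[] cols = { true, true, true };
    // Rows 0 and 1 differ by a constant shift, so the residue is 0.
    System.out.println(msr(mat, rows, cols));
  }
}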
Example #26
0
/**
 * The standard k-means algorithm, using Lloyd-style bulk iterations.
 *
 * <p>Reference:<br>
 * S. Lloyd<br>
 * Least squares quantization in PCM<br>
 * IEEE Transactions on Information Theory 28 (2)<br>
 * previously published as Bell Telephone Laboratories Paper
 *
 * @author Arthur Zimek
 * @apiviz.landmark
 * @apiviz.has KMeansModel
 * @param <V> vector datatype
 */
@Title("K-Means")
@Description("Finds a least-squared partitioning into k clusters.")
@Reference(
    authors = "S. Lloyd", //
    title = "Least squares quantization in PCM", //
    booktitle = "IEEE Transactions on Information Theory 28 (2): 129–137.", //
    url = "http://dx.doi.org/10.1109/TIT.1982.1056489")
public class KMeansLloyd<V extends NumberVector> extends AbstractKMeans<V, KMeansModel> {
  /** The logger for this class. */
  private static final Logging LOG = Logging.getLogger(KMeansLloyd.class);

  /** Key for statistics logging. */
  private static final String KEY = KMeansLloyd.class.getName();

  /**
   * Constructor.
   *
   * @param distanceFunction distance function
   * @param k k parameter
   * @param maxiter Maxiter parameter
   * @param initializer Initialization method
   */
  public KMeansLloyd(
      NumberVectorDistanceFunction<? super V> distanceFunction,
      int k,
      int maxiter,
      KMeansInitialization<? super V> initializer) {
    super(distanceFunction, k, maxiter, initializer);
  }

  @Override
  public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
    if (relation.size() <= 0) {
      return new Clustering<>("k-Means Clustering", "kmeans-clustering");
    }
    // Choose initial means
    if (LOG.isStatistics()) {
      LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
    // Setup cluster assignment store
    List<ModifiableDBIDs> clusters = new ArrayList<>();
    for (int i = 0; i < k; i++) {
      clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
    }
    WritableIntegerDataStore assignment =
        DataStoreUtil.makeIntegerStorage(
            relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
    double[] varsum = new double[k];

    IndefiniteProgress prog =
        LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
    DoubleStatistic varstat =
        LOG.isStatistics()
            ? new DoubleStatistic(this.getClass().getName() + ".variance-sum")
            : null;
    int iteration = 0;
    for (; maxiter <= 0 || iteration < maxiter; iteration++) {
      LOG.incrementProcessed(prog);
      boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum);
      logVarstat(varstat, varsum);
      // Stop if no cluster assignment changed.
      if (!changed) {
        break;
      }
      // Recompute means.
      means = means(clusters, means, relation);
    }
    LOG.setCompleted(prog);
    if (LOG.isStatistics()) {
      LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
    }

    // Wrap result
    Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
    for (int i = 0; i < clusters.size(); i++) {
      DBIDs ids = clusters.get(i);
      if (ids.size() == 0) {
        continue;
      }
      KMeansModel model = new KMeansModel(means[i], varsum[i]);
      result.addToplevelCluster(new Cluster<>(ids, model));
    }
    return result;
  }

  @Override
  protected Logging getLogger() {
    return LOG;
  }

  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   * @apiviz.exclude
   */
  public static class Parameterizer<V extends NumberVector>
      extends AbstractKMeans.Parameterizer<V> {
    @Override
    protected Logging getLogger() {
      return LOG;
    }

    @Override
    protected KMeansLloyd<V> makeInstance() {
      return new KMeansLloyd<>(distanceFunction, k, maxiter, initializer);
    }
  }
}
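
The Lloyd iteration at the heart of run() above — assign each point to its nearest mean, recompute the means, stop once no assignment changes — fits in a few lines of plain Java. A self-contained sketch (illustrative names, not the ELKI API):

public class LloydStepSketch {
  /**
   * One Lloyd iteration: assign points to the nearest mean (squared Euclidean
   * distance), then recompute the means. Returns true if any assignment changed.
   */
  static boolean lloydStep(double[][] pts, double[][] means, int[] assign) {
    boolean changed = false;
    // Assignment step.
    for (int i = 0; i < pts.length; i++) {
      int best = 0;
      double bestd = Double.POSITIVE_INFINITY;
      for (int c = 0; c < means.length; c++) {
        double d = 0.;
        for (int j = 0; j < pts[i].length; j++) {
          final double diff = pts[i][j] - means[c][j];
          d += diff * diff;
        }
        if (d < bestd) {
          bestd = d;
          best = c;
        }
      }
      if (assign[i] != best) {
        assign[i] = best;
        changed = true;
      }
    }
    // Update step: recompute each non-empty cluster's mean.
    double[][] sums = new double[means.length][means[0].length];
    int[] count = new int[means.length];
    for (int i = 0; i < pts.length; i++) {
      count[assign[i]]++;
      for (int j = 0; j < pts[i].length; j++) {
        sums[assign[i]][j] += pts[i][j];
      }
    }
    for (int c = 0; c < means.length; c++) {
      if (count[c] == 0) {
        continue; // Keep the previous mean for an empty cluster.
      }
      for (int j = 0; j < means[c].length; j++) {
        means[c][j] = sums[c][j] / count[c];
      }
    }
    return changed;
  }
}

Looping lloydStep until it returns false (or a maxiter cap is reached) mirrors the main loop of run(); empty clusters, which run() skips when wrapping the result, simply keep their previous mean here.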
Example #27
0
File: LOCI.java Project: fjfd/elki
/**
 * Fast Outlier Detection Using the "Local Correlation Integral".
 *
 * <p>Exact implementation only, not aLOCI. See {@link ALOCI}.
 *
 * <p>Outlier detection using multiple epsilon neighborhoods.
 *
 * <p>This implementation has O(n<sup>3</sup> log n) runtime complexity!
 *
 * <p>Based on: S. Papadimitriou, H. Kitagawa, P. B. Gibbons and C. Faloutsos: LOCI: Fast Outlier
 * Detection Using the Local Correlation Integral. In: Proc. 19th IEEE Int. Conf. on Data
 * Engineering (ICDE '03), Bangalore, India, 2003.
 *
 * @author Erich Schubert
 * @apiviz.has RangeQuery
 * @param <O> Object type
 */
@Title("LOCI: Fast Outlier Detection Using the Local Correlation Integral")
@Description("Algorithm to compute outliers based on the Local Correlation Integral")
@Reference(
    authors = "S. Papadimitriou, H. Kitagawa, P. B. Gibbons, C. Faloutsos",
    title = "LOCI: Fast Outlier Detection Using the Local Correlation Integral",
    booktitle = "Proc. 19th IEEE Int. Conf. on Data Engineering (ICDE '03), Bangalore, India, 2003",
    url = "http://dx.doi.org/10.1109/ICDE.2003.1260802")
@Alias({"de.lmu.ifi.dbs.elki.algorithm.outlier.LOCI"})
public class LOCI<O> extends AbstractDistanceBasedAlgorithm<O, OutlierResult>
    implements OutlierAlgorithm {
  /** The logger for this class. */
  private static final Logging LOG = Logging.getLogger(LOCI.class);

  /**
   * Parameter to specify the maximum radius of the neighborhood to be considered, must be suitable
   * to the distance function specified.
   */
  public static final OptionID RMAX_ID =
      new OptionID("loci.rmax", "The maximum radius of the neighborhood to be considered.");

  /** Parameter to specify the minimum neighborhood size */
  public static final OptionID NMIN_ID =
      new OptionID("loci.nmin", "Minimum neighborhood size to be considered.");

  /** Parameter to specify the averaging neighborhood scaling. */
  public static final OptionID ALPHA_ID =
      new OptionID("loci.alpha", "Scaling factor for averaging neighborhood");

  /** Holds the value of {@link #RMAX_ID}. */
  private double rmax;

  /** Holds the value of {@link #NMIN_ID}. */
  private int nmin;

  /** Holds the value of {@link #ALPHA_ID}. */
  private double alpha;

  /**
   * Constructor.
   *
   * @param distanceFunction Distance function
   * @param rmax Maximum radius
   * @param nmin Minimum neighborhood size
   * @param alpha Alpha value
   */
  public LOCI(DistanceFunction<? super O> distanceFunction, double rmax, int nmin, double alpha) {
    super(distanceFunction);
    this.rmax = rmax;
    this.nmin = nmin;
    this.alpha = alpha;
  }

  /**
   * Run the algorithm
   *
   * @param database Database to process
   * @param relation Relation to process
   * @return Outlier result
   */
  public OutlierResult run(Database database, Relation<O> relation) {
    DistanceQuery<O> distFunc = database.getDistanceQuery(relation, getDistanceFunction());
    RangeQuery<O> rangeQuery = database.getRangeQuery(distFunc);
    DBIDs ids = relation.getDBIDs();

    // LOCI preprocessing step
    WritableDataStore<DoubleIntArrayList> interestingDistances =
        DataStoreUtil.makeStorage(
            relation.getDBIDs(),
            DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_SORTED,
            DoubleIntArrayList.class);
    precomputeInterestingRadii(ids, rangeQuery, interestingDistances);
    // LOCI main step
    FiniteProgress progressLOCI =
        LOG.isVerbose() ? new FiniteProgress("LOCI scores", relation.size(), LOG) : null;
    WritableDoubleDataStore mdef_norm =
        DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
    WritableDoubleDataStore mdef_radius =
        DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
    DoubleMinMax minmax = new DoubleMinMax();

    // Shared instance, to save allocations.
    MeanVariance mv_n_r_alpha = new MeanVariance();

    for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
      final DoubleIntArrayList cdist = interestingDistances.get(iditer);
      final double maxdist = cdist.getDouble(cdist.size() - 1);
      final int maxneig = cdist.getInt(cdist.size() - 1);

      double maxmdefnorm = 0.0;
      double maxnormr = 0;
      if (maxneig >= nmin) {
        // Compute the largest neighborhood we will need.
        DoubleDBIDList maxneighbors = rangeQuery.getRangeForDBID(iditer, maxdist);
        // TODO: Ensure the result is sorted. This is currently implied.

        // For any critical distance, compute the normalized MDEF score.
        for (int i = 0, size = cdist.size(); i < size; i++) {
          // Only start when minimum size is fulfilled
          if (cdist.getInt(i) < nmin) {
            continue;
          }
          final double r = cdist.getDouble(i);
          final double alpha_r = alpha * r;
          // compute n(p_i, \alpha * r) from list (note: alpha_r is not cdist!)
          final int n_alphar = cdist.getInt(cdist.find(alpha_r));
          // compute \hat{n}(p_i, r, \alpha) and the corresponding \sigma_{MDEF}
          mv_n_r_alpha.reset();
          for (DoubleDBIDListIter neighbor = maxneighbors.iter();
              neighbor.valid();
              neighbor.advance()) {
            // Stop at radius r
            if (neighbor.doubleValue() > r) {
              break;
            }
            DoubleIntArrayList cdist2 = interestingDistances.get(neighbor);
            int rn_alphar = cdist2.getInt(cdist2.find(alpha_r));
            mv_n_r_alpha.put(rn_alphar);
          }
          // We only use the average and standard deviation
          final double nhat_r_alpha = mv_n_r_alpha.getMean();
          final double sigma_nhat_r_alpha = mv_n_r_alpha.getNaiveStddev();

          // Redundant divisions by nhat_r_alpha removed.
          final double mdef = nhat_r_alpha - n_alphar;
          final double sigmamdef = sigma_nhat_r_alpha;
          final double mdefnorm = mdef / sigmamdef;

          if (mdefnorm > maxmdefnorm) {
            maxmdefnorm = mdefnorm;
            maxnormr = r;
          }
        }
      } else {
        // FIXME: when nmin was not fulfilled - what is the proper value then?
        maxmdefnorm = Double.POSITIVE_INFINITY;
        maxnormr = maxdist;
      }
      mdef_norm.putDouble(iditer, maxmdefnorm);
      mdef_radius.putDouble(iditer, maxnormr);
      minmax.put(maxmdefnorm);
      LOG.incrementProcessed(progressLOCI);
    }
    LOG.ensureCompleted(progressLOCI);
    DoubleRelation scoreResult =
        new MaterializedDoubleRelation(
            "LOCI normalized MDEF", "loci-mdef-outlier", mdef_norm, relation.getDBIDs());
    OutlierScoreMeta scoreMeta =
        new QuotientOutlierScoreMeta(
            minmax.getMin(), minmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 0.0);
    OutlierResult result = new OutlierResult(scoreMeta, scoreResult);
    result.addChildResult(
        new MaterializedDoubleRelation(
            "LOCI MDEF Radius", "loci-critical-radius", mdef_radius, relation.getDBIDs()));
    return result;
  }

  /**
   * Preprocessing step: determine the radii of interest for each point.
   *
   * @param ids IDs to process
   * @param rangeQuery Range query
   * @param interestingDistances Distances of interest
   */
  protected void precomputeInterestingRadii(
      DBIDs ids,
      RangeQuery<O> rangeQuery,
      WritableDataStore<DoubleIntArrayList> interestingDistances) {
    FiniteProgress progressPreproc =
        LOG.isVerbose() ? new FiniteProgress("LOCI preprocessing", ids.size(), LOG) : null;
    for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
      DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(iditer, rmax);
      // build list of critical distances
      DoubleIntArrayList cdist = new DoubleIntArrayList(neighbors.size() << 1);
      {
        int i = 0;
        DoubleDBIDListIter ni = neighbors.iter();
        while (ni.valid()) {
          final double curdist = ni.doubleValue();
          ++i;
          ni.advance();
          // Skip, if tied to the next object:
          if (ni.valid() && curdist == ni.doubleValue()) {
            continue;
          }
          cdist.append(curdist, i);
          // Scale radius, and reinsert
          if (alpha != 1.) {
            final double ri = curdist / alpha;
            if (ri <= rmax) {
              cdist.append(ri, Integer.MIN_VALUE);
            }
          }
        }
      }
      cdist.sort();

      // fill the gaps to have fast lookups of number of neighbors at a given
      // distance.
      int lastk = 0;
      for (int i = 0, size = cdist.size(); i < size; i++) {
        final int k = cdist.getInt(i);
        if (k == Integer.MIN_VALUE) {
          cdist.setValue(i, lastk);
        } else {
          lastk = k;
        }
      }
      // TODO: shrink the list, removing duplicate radii?

      interestingDistances.put(iditer, cdist);
      LOG.incrementProcessed(progressPreproc);
    }
    LOG.ensureCompleted(progressPreproc);
  }

  /**
   * Array of double-int values.
   *
   * @author Erich Schubert
   * @apiviz.exclude
   */
  protected static class DoubleIntArrayList {
    /** Double keys */
    double[] keys;

    /** Integer values */
    int[] vals;

    /** Used size */
    int size = 0;

    /**
     * Constructor.
     *
     * @param alloc Initial allocation.
     */
    public DoubleIntArrayList(int alloc) {
      keys = new double[alloc];
      vals = new int[alloc];
      size = 0;
    }

    /**
     * Collection size.
     *
     * @return Size
     */
    public int size() {
      return size;
    }

    /**
     * Get the key at the given position.
     *
     * @param i Position
     * @return Key
     */
    public double getDouble(int i) {
      return keys[i];
    }

    /**
     * Get the value at the given position.
     *
     * @param i Position
     * @return Value
     */
    public int getInt(int i) {
      return vals[i];
    }

    /**
     * Get the value at the given position.
     *
     * @param i Position
     * @param val New value
     */
    public void setValue(int i, int val) {
      vals[i] = val;
    }

    /**
     * Append a key-value pair.
     *
     * @param key Key to append
     * @param val Value to append.
     */
    public void append(double key, int val) {
      if (size == keys.length) {
        keys = Arrays.copyOf(keys, size << 1);
        vals = Arrays.copyOf(vals, size << 1);
      }
      keys[size] = key;
      vals[size] = val;
      ++size;
    }

    /**
     * Find the last position with a smaller or equal key.
     *
     * @param search Key
     * @return Position
     */
    public int find(final double search) {
      int a = 0, b = size - 1;
      while (a <= b) {
        final int mid = (a + b) >>> 1;
        final double cur = keys[mid];
        if (cur > search) {
          b = mid - 1;
        } else { // less or equal!
          a = mid + 1;
        }
      }
      return b;
    }
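
    // Example: with keys {1.0, 2.0, 4.0}, find(3.0) returns 1 (position of the
    // last key <= 3.0), and find(0.5) returns -1, since no key qualifies.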

    /** Sort the array list. */
    public void sort() {
      DoubleIntegerArrayQuickSort.sort(keys, vals, size);
    }
  }

  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return TypeUtil.array(getDistanceFunction().getInputTypeRestriction());
  }

  @Override
  protected Logging getLogger() {
    return LOG;
  }

  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   * @apiviz.exclude
   * @param <O> Object type
   */
  public static class Parameterizer<O> extends AbstractDistanceBasedAlgorithm.Parameterizer<O> {
    protected double rmax;

    protected int nmin = 0;

    protected double alpha = 0.5;

    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      final DoubleParameter rmaxP = new DoubleParameter(RMAX_ID);
      if (config.grab(rmaxP)) {
        rmax = rmaxP.doubleValue();
      }

      final IntParameter nminP = new IntParameter(NMIN_ID, 20);
      if (config.grab(nminP)) {
        nmin = nminP.intValue();
      }

      final DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 0.5);
      if (config.grab(alphaP)) {
        alpha = alphaP.getValue();
      }
    }

    @Override
    protected LOCI<O> makeInstance() {
      return new LOCI<>(distanceFunction, rmax, nmin, alpha);
    }
  }
}
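
The score assigned in run() above is easiest to see in isolation: MDEF compares a point's own alpha-radius neighbor count against the average over its r-neighborhood, normalized by that neighborhood's standard deviation. A minimal sketch under illustrative names (not the ELKI API):

public class MdefSketch {
  /**
   * Normalized MDEF at one radius: (nhat - n) / sigma, where nhat and sigma are
   * the mean and naive standard deviation of the alpha-radius neighbor counts
   * over the r-neighborhood, and n is the point's own alpha-radius count.
   */
  static double normalizedMdef(int nAlphaR, int[] neighborCounts) {
    double mean = 0.;
    for (int c : neighborCounts) {
      mean += c;
    }
    mean /= neighborCounts.length;
    double var = 0.;
    for (int c : neighborCounts) {
      var += (c - mean) * (c - mean);
    }
    double sigma = Math.sqrt(var / neighborCounts.length);
    return (mean - nAlphaR) / sigma;
  }
}

run() keeps, per point, the maximum of this value over all critical radii; the original LOCI paper suggests flagging points whose normalized score exceeds roughly 3.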
Example #28
0
/**
 * Implementation of the SUBCLU algorithm, an algorithm to detect arbitrarily shaped and positioned
 * clusters in subspaces. SUBCLU delivers for each subspace the same clusters DBSCAN would have
 * found, when applied to this subspace separately.
 *
 * <p>Reference: <br>
 * K. Kailing, H.-P. Kriegel, P. Kröger:<br>
 * Density connected Subspace Clustering for High Dimensional Data<br>
 * In Proc. SIAM Int. Conf. on Data Mining (SDM'04), Lake Buena Vista, FL, 2004.
 *
 * @author Elke Achtert
 * @apiviz.uses DBSCAN
 * @apiviz.uses DimensionSelectingSubspaceDistanceFunction
 * @apiviz.has SubspaceModel
 * @param <V> the type of FeatureVector handled by this Algorithm
 */
@Title("SUBCLU: Density connected Subspace Clustering")
@Description(
    "Algorithm to detect arbitrarily shaped and positioned clusters in subspaces. " //
        + "SUBCLU delivers for each subspace the same clusters DBSCAN would have found, " //
        + "when applied to this subspace seperately.")
@Reference(
    authors = "K. Kailing, H.-P. Kriegel, P. Kröger", //
    title = "Density connected Subspace Clustering for High Dimensional Data", //
    booktitle = "Proc. SIAM Int. Conf. on Data Mining (SDM'04), Lake Buena Vista, FL, 2004", //
    url = "http://www.siam.org/meetings/sdm04/proceedings/sdm04_023.pdf")
public class SUBCLU<V extends NumberVector> extends AbstractAlgorithm<Clustering<SubspaceModel>>
    implements SubspaceClusteringAlgorithm<SubspaceModel> {
  /** The logger for this class. */
  private static final Logging LOG = Logging.getLogger(SUBCLU.class);

  /**
   * The distance function to determine the distance between database objects.
   *
   * <p>Default value: {@link SubspaceEuclideanDistanceFunction}
   *
   * <p>Key: {@code -subclu.distancefunction}
   */
  public static final OptionID DISTANCE_FUNCTION_ID =
      new OptionID(
          "subclu.distancefunction",
          "Distance function to determine the distance between database objects.");

  /**
   * Parameter to specify the maximum radius of the neighborhood to be considered, must be suitable
   * to {@link DimensionSelectingSubspaceDistanceFunction}.
   *
   * <p>Key: {@code -subclu.epsilon}
   */
  public static final OptionID EPSILON_ID =
      new OptionID("subclu.epsilon", "The maximum radius of the neighborhood to be considered.");

  /**
   * Parameter to specify the threshold for minimum number of points in the epsilon-neighborhood of
   * a point, must be an integer greater than 0.
   *
   * <p>Key: {@code -subclu.minpts}
   */
  public static final OptionID MINPTS_ID =
      new OptionID(
          "subclu.minpts",
          "Threshold for minimum number of points in the epsilon-neighborhood of a point.");

  /** Holds the instance of the distance function specified by {@link #DISTANCE_FUNCTION_ID}. */
  private DimensionSelectingSubspaceDistanceFunction<V> distanceFunction;

  /** Holds the value of {@link #EPSILON_ID}. */
  private double epsilon;

  /** Holds the value of {@link #MINPTS_ID}. */
  private int minpts;

  /** Holds the result. */
  private Clustering<SubspaceModel> result;

  /**
   * Constructor.
   *
   * @param distanceFunction Distance function
   * @param epsilon Epsilon value
   * @param minpts Minpts value
   */
  public SUBCLU(
      DimensionSelectingSubspaceDistanceFunction<V> distanceFunction, double epsilon, int minpts) {
    super();
    this.distanceFunction = distanceFunction;
    this.epsilon = epsilon;
    this.minpts = minpts;
  }

  /**
   * Performs the SUBCLU algorithm on the given database.
   *
   * @param relation Relation to process
   * @return Clustering result
   */
  public Clustering<SubspaceModel> run(Relation<V> relation) {
    final int dimensionality = RelationUtil.dimensionality(relation);

    StepProgress stepprog = LOG.isVerbose() ? new StepProgress(dimensionality) : null;

    // Generate all 1-dimensional clusters
    LOG.beginStep(stepprog, 1, "Generate all 1-dimensional clusters.");

    // mapping of dimensionality to set of subspaces
    HashMap<Integer, List<Subspace>> subspaceMap = new HashMap<>();

    // list of 1-dimensional subspaces containing clusters
    List<Subspace> s_1 = new ArrayList<>();
    subspaceMap.put(0, s_1);

    // mapping of subspaces to list of clusters
    TreeMap<Subspace, List<Cluster<Model>>> clusterMap =
        new TreeMap<>(new Subspace.DimensionComparator());

    for (int d = 0; d < dimensionality; d++) {
      Subspace currentSubspace = new Subspace(d);
      List<Cluster<Model>> clusters = runDBSCAN(relation, null, currentSubspace);

      if (LOG.isDebuggingFiner()) {
        StringBuilder msg = new StringBuilder();
        msg.append('\n')
            .append(clusters.size())
            .append(" clusters in subspace ")
            .append(currentSubspace.dimensonsToString())
            .append(": \n");
        for (Cluster<Model> cluster : clusters) {
          msg.append("      " + cluster.getIDs() + "\n");
        }
        LOG.debugFiner(msg.toString());
      }

      if (!clusters.isEmpty()) {
        s_1.add(currentSubspace);
        clusterMap.put(currentSubspace, clusters);
      }
    }

    // Generate (d+1)-dimensional clusters from d-dimensional clusters
    for (int d = 0; d < dimensionality - 1; d++) {
      if (stepprog != null) {
        stepprog.beginStep(
            d + 2,
            "Generate "
                + (d + 2)
                + "-dimensional clusters from "
                + (d + 1)
                + "-dimensional clusters.",
            LOG);
      }

      List<Subspace> subspaces = subspaceMap.get(d);
      if (subspaces == null || subspaces.isEmpty()) {
        if (stepprog != null) {
          for (int dim = d + 1; dim < dimensionality - 1; dim++) {
            stepprog.beginStep(
                dim + 2,
                "Generation of"
                    + (dim + 2)
                    + "-dimensional clusters not applicable, because no more "
                    + (d + 2)
                    + "-dimensional subspaces found.",
                LOG);
          }
        }
        break;
      }

      List<Subspace> candidates = generateSubspaceCandidates(subspaces);
      List<Subspace> s_d = new ArrayList<>();

      for (Subspace candidate : candidates) {
        Subspace bestSubspace = bestSubspace(subspaces, candidate, clusterMap);
        if (LOG.isDebuggingFine()) {
          LOG.debugFine(
              "best subspace of "
                  + candidate.dimensonsToString()
                  + ": "
                  + bestSubspace.dimensonsToString());
        }

        List<Cluster<Model>> bestSubspaceClusters = clusterMap.get(bestSubspace);
        List<Cluster<Model>> clusters = new ArrayList<>();
        for (Cluster<Model> cluster : bestSubspaceClusters) {
          List<Cluster<Model>> candidateClusters = runDBSCAN(relation, cluster.getIDs(), candidate);
          if (!candidateClusters.isEmpty()) {
            clusters.addAll(candidateClusters);
          }
        }

        if (LOG.isDebuggingFine()) {
          StringBuilder msg = new StringBuilder();
          msg.append(clusters.size() + " cluster(s) in subspace " + candidate + ": \n");
          for (Cluster<Model> c : clusters) {
            msg.append("      " + c.getIDs() + "\n");
          }
          LOG.debugFine(msg.toString());
        }

        if (!clusters.isEmpty()) {
          s_d.add(candidate);
          clusterMap.put(candidate, clusters);
        }
      }

      if (!s_d.isEmpty()) {
        subspaceMap.put(d + 1, s_d);
      }
    }

    // build result
    int numClusters = 1;
    result = new Clustering<>("SUBCLU clustering", "subclu-clustering");
    for (Subspace subspace : clusterMap.descendingKeySet()) {
      List<Cluster<Model>> clusters = clusterMap.get(subspace);
      for (Cluster<Model> cluster : clusters) {
        Cluster<SubspaceModel> newCluster = new Cluster<>(cluster.getIDs());
        newCluster.setModel(new SubspaceModel(subspace, Centroid.make(relation, cluster.getIDs())));
        newCluster.setName("cluster_" + numClusters++);
        result.addToplevelCluster(newCluster);
      }
    }

    LOG.setCompleted(stepprog);
    return result;
  }

  /**
   * Returns the result of the algorithm.
   *
   * @return the result of the algorithm
   */
  public Clustering<SubspaceModel> getResult() {
    return result;
  }

  /**
   * Runs the DBSCAN algorithm on the specified partition of the database in the given subspace. If
   * parameter {@code ids} is null DBSCAN will be applied to the whole database.
   *
   * @param relation the database holding the objects to run DBSCAN on
   * @param ids the IDs of the database defining the partition to run DBSCAN on - if this parameter
   *     is null DBSCAN will be applied to the whole database
   * @param subspace the subspace to run DBSCAN on
   * @return the clustering result of the DBSCAN run
   */
  private List<Cluster<Model>> runDBSCAN(Relation<V> relation, DBIDs ids, Subspace subspace) {
    // distance function
    distanceFunction.setSelectedDimensions(subspace.getDimensions());

    ProxyDatabase proxy;
    if (ids == null) {
      // TODO: in this case, we might want to use an index - the proxy below
      // will prevent this!
      ids = relation.getDBIDs();
    }

    proxy = new ProxyDatabase(ids, relation);

    DBSCAN<V> dbscan = new DBSCAN<>(distanceFunction, epsilon, minpts);
    // run DBSCAN
    if (LOG.isVerbose()) {
      LOG.verbose("\nRun DBSCAN on subspace " + subspace.dimensonsToString());
    }
    Clustering<Model> dbsres = dbscan.run(proxy);

    // separate cluster and noise
    List<Cluster<Model>> clusterAndNoise = dbsres.getAllClusters();
    List<Cluster<Model>> clusters = new ArrayList<>();
    for (Cluster<Model> c : clusterAndNoise) {
      if (!c.isNoise()) {
        clusters.add(c);
      }
    }
    return clusters;
  }

  /**
   * Generates {@code d+1}-dimensional subspace candidates from the specified {@code d}-dimensional
   * subspaces.
   *
   * @param subspaces the {@code d}-dimensional subspaces
   * @return the {@code d+1}-dimensional subspace candidates
   */
  private List<Subspace> generateSubspaceCandidates(List<Subspace> subspaces) {
    List<Subspace> candidates = new ArrayList<>();

    if (subspaces.isEmpty()) {
      return candidates;
    }

    // Generate (d+1)-dimensional candidate subspaces
    int d = subspaces.get(0).dimensionality();

    StringBuilder msgFine = new StringBuilder("\n");
    if (LOG.isDebuggingFiner()) {
      msgFine.append("subspaces ").append(subspaces).append('\n');
    }

    for (int i = 0; i < subspaces.size(); i++) {
      Subspace s1 = subspaces.get(i);
      for (int j = i + 1; j < subspaces.size(); j++) {
        Subspace s2 = subspaces.get(j);
        Subspace candidate = s1.join(s2);

        if (candidate != null) {
          if (LOG.isDebuggingFiner()) {
            msgFine.append("candidate: ").append(candidate.dimensonsToString()).append('\n');
          }
          // prune irrelevant candidate subspaces
          List<Subspace> lowerSubspaces = lowerSubspaces(candidate);
          if (LOG.isDebuggingFiner()) {
            msgFine.append("lowerSubspaces: ").append(lowerSubspaces).append('\n');
          }
          boolean irrelevantCandidate = false;
          for (Subspace s : lowerSubspaces) {
            if (!subspaces.contains(s)) {
              irrelevantCandidate = true;
              break;
            }
          }
          if (!irrelevantCandidate) {
            candidates.add(candidate);
          }
        }
      }
    }

    if (LOG.isDebuggingFiner()) {
      LOG.debugFiner(msgFine.toString());
    }
    if (LOG.isDebugging()) {
      StringBuilder msg = new StringBuilder();
      msg.append(d + 1).append("-dimensional candidate subspaces: ");
      for (Subspace candidate : candidates) {
        msg.append(candidate.dimensonsToString()).append(' ');
      }
      LOG.debug(msg.toString());
    }

    return candidates;
  }

  /**
   * Returns the list of all {@code (d-1)}-dimensional subspaces of the specified {@code
   * d}-dimensional subspace.
   *
   * @param subspace the {@code d}-dimensional subspace
   * @return a list of all {@code (d-1)}-dimensional subspaces
   */
  private List<Subspace> lowerSubspaces(Subspace subspace) {
    int dimensionality = subspace.dimensionality();
    if (dimensionality <= 1) {
      return null;
    }

    // order result according to the dimensions
    List<Subspace> result = new ArrayList<>();
    long[] dimensions = subspace.getDimensions();
    for (int dim = BitsUtil.nextSetBit(dimensions, 0);
        dim >= 0;
        dim = BitsUtil.nextSetBit(dimensions, dim + 1)) {
      long[] newDimensions = dimensions.clone();
      BitsUtil.clearI(newDimensions, dim);
      result.add(new Subspace(newDimensions));
    }

    return result;
  }

  /**
   * Determines the {@code d}-dimensional subspace of the {@code (d+1)} -dimensional candidate with
   * minimal number of objects in the cluster.
   *
   * @param subspaces the list of {@code d}-dimensional subspaces containing clusters
   * @param candidate the {@code (d+1)}-dimensional candidate subspace
   * @param clusterMap the mapping of subspaces to clusters
   * @return the {@code d}-dimensional subspace of the {@code (d+1)} -dimensional candidate with
   *     minimal number of objects in the cluster
   */
  private Subspace bestSubspace(
      List<Subspace> subspaces,
      Subspace candidate,
      TreeMap<Subspace, List<Cluster<Model>>> clusterMap) {
    Subspace bestSubspace = null;

    for (Subspace subspace : subspaces) {
      int min = Integer.MAX_VALUE;

      if (subspace.isSubspace(candidate)) {
        List<Cluster<Model>> clusters = clusterMap.get(subspace);
        for (Cluster<Model> cluster : clusters) {
          int clusterSize = cluster.size();
          if (clusterSize < min) {
            min = clusterSize;
            bestSubspace = subspace;
          }
        }
      }
    }

    return bestSubspace;
  }

  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
  }

  @Override
  protected Logging getLogger() {
    return LOG;
  }

  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   * @apiviz.exclude
   */
  public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
    protected int minpts = 0;

    protected double epsilon;

    protected DimensionSelectingSubspaceDistanceFunction<V> distance = null;

    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      ObjectParameter<DimensionSelectingSubspaceDistanceFunction<V>> param =
          new ObjectParameter<>(
              DISTANCE_FUNCTION_ID,
              DimensionSelectingSubspaceDistanceFunction.class,
              SubspaceEuclideanDistanceFunction.class);
      if (config.grab(param)) {
        distance = param.instantiateClass(config);
      }

      DoubleParameter epsilonP = new DoubleParameter(EPSILON_ID);
      if (config.grab(epsilonP)) {
        epsilon = epsilonP.getValue();
      }

      IntParameter minptsP = new IntParameter(MINPTS_ID);
      minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
      if (config.grab(minptsP)) {
        minpts = minptsP.getValue();
      }
    }

    @Override
    protected SUBCLU<V> makeInstance() {
      return new SUBCLU<>(distance, epsilon, minpts);
    }
  }
}
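
A minimal construction sketch for the SUBCLU class above (not part of the original
file): it assumes the usual ELKI helpers ListParameterization and
ClassGenericsUtil, a prepared Relation<DoubleVector> named relation, and the
EPSILON_ID/MINPTS_ID OptionIDs declared on SUBCLU; the parameter values are
placeholders, not recommendations.

  // Sketch: build SUBCLU through its Parameterizer (values are illustrative).
  ListParameterization params = new ListParameterization();
  params.addParameter(SUBCLU.EPSILON_ID, 0.001);
  params.addParameter(SUBCLU.MINPTS_ID, 100);
  SUBCLU<DoubleVector> subclu = ClassGenericsUtil.parameterizeOrAbort(SUBCLU.class, params);
  Clustering<SubspaceModel> result = subclu.run(relation);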
Example #29
0
File: LOCI.java Project: fjfd/elki
  /**
   * Run the algorithm
   *
   * @param database Database to process
   * @param relation Relation to process
   * @return Outlier result
   */
  public OutlierResult run(Database database, Relation<O> relation) {
    DistanceQuery<O> distFunc = database.getDistanceQuery(relation, getDistanceFunction());
    RangeQuery<O> rangeQuery = database.getRangeQuery(distFunc);
    DBIDs ids = relation.getDBIDs();

    // LOCI preprocessing step
    WritableDataStore<DoubleIntArrayList> interestingDistances =
        DataStoreUtil.makeStorage(
            relation.getDBIDs(),
            DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_SORTED,
            DoubleIntArrayList.class);
    precomputeInterestingRadii(ids, rangeQuery, interestingDistances);
    // LOCI main step
    FiniteProgress progressLOCI =
        LOG.isVerbose() ? new FiniteProgress("LOCI scores", relation.size(), LOG) : null;
    WritableDoubleDataStore mdef_norm =
        DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
    WritableDoubleDataStore mdef_radius =
        DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
    DoubleMinMax minmax = new DoubleMinMax();

    // Shared instance, to save allocations.
    MeanVariance mv_n_r_alpha = new MeanVariance();

    for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
      final DoubleIntArrayList cdist = interestingDistances.get(iditer);
      final double maxdist = cdist.getDouble(cdist.size() - 1);
      final int maxneig = cdist.getInt(cdist.size() - 1);

      double maxmdefnorm = 0.0;
      double maxnormr = 0;
      if (maxneig >= nmin) {
        // Compute the largest neighborhood we will need.
        DoubleDBIDList maxneighbors = rangeQuery.getRangeForDBID(iditer, maxdist);
        // TODO: Ensure the result is sorted. This is currently implied.

        // For any critical distance, compute the normalized MDEF score.
        for (int i = 0, size = cdist.size(); i < size; i++) {
          // Only start when minimum size is fulfilled
          if (cdist.getInt(i) < nmin) {
            continue;
          }
          final double r = cdist.getDouble(i);
          final double alpha_r = alpha * r;
          // compute n(p_i, \alpha * r) from list (note: alpha_r is not cdist!)
          final int n_alphar = cdist.getInt(cdist.find(alpha_r));
          // compute \hat{n}(p_i, r, \alpha) and the corresponding \sigma_{MDEF}
          mv_n_r_alpha.reset();
          for (DoubleDBIDListIter neighbor = maxneighbors.iter();
              neighbor.valid();
              neighbor.advance()) {
            // Stop at radius r
            if (neighbor.doubleValue() > r) {
              break;
            }
            DoubleIntArrayList cdist2 = interestingDistances.get(neighbor);
            int rn_alphar = cdist2.getInt(cdist2.find(alpha_r));
            mv_n_r_alpha.put(rn_alphar);
          }
          // We only use the average and standard deviation
          final double nhat_r_alpha = mv_n_r_alpha.getMean();
          final double sigma_nhat_r_alpha = mv_n_r_alpha.getNaiveStddev();

          // Redundant divisions by nhat_r_alpha removed.
          final double mdef = nhat_r_alpha - n_alphar;
          final double sigmamdef = sigma_nhat_r_alpha;
          final double mdefnorm = mdef / sigmamdef;

          if (mdefnorm > maxmdefnorm) {
            maxmdefnorm = mdefnorm;
            maxnormr = r;
          }
        }
      } else {
        // FIXME: when nmin was not fulfilled - what is the proper value then?
        maxmdefnorm = Double.POSITIVE_INFINITY;
        maxnormr = maxdist;
      }
      mdef_norm.putDouble(iditer, maxmdefnorm);
      mdef_radius.putDouble(iditer, maxnormr);
      minmax.put(maxmdefnorm);
      LOG.incrementProcessed(progressLOCI);
    }
    LOG.ensureCompleted(progressLOCI);
    DoubleRelation scoreResult =
        new MaterializedDoubleRelation(
            "LOCI normalized MDEF", "loci-mdef-outlier", mdef_norm, relation.getDBIDs());
    OutlierScoreMeta scoreMeta =
        new QuotientOutlierScoreMeta(
            minmax.getMin(), minmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 0.0);
    OutlierResult result = new OutlierResult(scoreMeta, scoreResult);
    result.addChildResult(
        new MaterializedDoubleRelation(
            "LOCI MDEF Radius", "loci-critical-radius", mdef_radius, relation.getDBIDs()));
    return result;
  }
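
A note on the quantity this loop maximizes (not part of the original file): with
the shared factor 1/\hat{n} cancelled, as the "Redundant divisions" comment above
indicates, the normalized MDEF score reduces to

  \frac{MDEF(p, r, \alpha)}{\sigma_{MDEF}(p, r, \alpha)}
    = \frac{\hat{n}(p, r, \alpha) - n(p, \alpha r)}{\sigma_{\hat{n}}(p, r, \alpha)}

which is exactly the mdef / sigmamdef ratio computed above, maximized over all
interesting radii r whose neighborhoods contain at least nmin objects.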
Example #30
0
/**
 * Density-Based Clustering of Applications with Noise (DBSCAN), an algorithm to find
 * density-connected sets in a database.
 *
 * <p>Reference: <br>
 * M. Ester, H.-P. Kriegel, J. Sander, X. Xu<br>
 * A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise<br>
 * In Proc. 2nd Int. Conf. on Knowledge Discovery and Data Mining (KDD '96), Portland, OR, 1996.
 *
 * @author Arthur Zimek
 * @param <O> the type of Object the algorithm is applied to
 */
@Title("DBSCAN: Density-Based Clustering of Applications with Noise")
@Description(
    "Algorithm to find density-connected sets in a database based on the parameters 'minpts' and 'epsilon' (specifying a volume). "
        + "These two parameters determine a density threshold for clustering.")
@Reference(
    authors = "M. Ester, H.-P. Kriegel, J. Sander, X. Xu", //
    title =
        "A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise", //
    booktitle =
        "Proc. 2nd Int. Conf. on Knowledge Discovery and Data Mining (KDD '96), Portland, OR, 1996", //
    url = "http://www.aaai.org/Papers/KDD/1996/KDD96-037")
public class DBSCAN<O> extends AbstractDistanceBasedAlgorithm<O, Clustering<Model>>
    implements ClusteringAlgorithm<Clustering<Model>> {
  /** The logger for this class. */
  private static final Logging LOG = Logging.getLogger(DBSCAN.class);

  /** Holds the epsilon radius threshold. */
  protected double epsilon;

  /** Holds the minimum cluster size. */
  protected int minpts;

  /** Holds a list of clusters found. */
  protected List<ModifiableDBIDs> resultList;

  /** Holds a set of noise. */
  protected ModifiableDBIDs noise;

  /** Holds a set of processed ids. */
  protected ModifiableDBIDs processedIDs;

  /** Number of neighbors. */
  protected long ncounter;

  /**
   * Constructor with parameters.
   *
   * @param distanceFunction Distance function
   * @param epsilon Epsilon value
   * @param minpts Minpts parameter
   */
  public DBSCAN(DistanceFunction<? super O> distanceFunction, double epsilon, int minpts) {
    super(distanceFunction);
    this.epsilon = epsilon;
    this.minpts = minpts;
  }

  /** Performs the DBSCAN algorithm on the given database. */
  public Clustering<Model> run(Relation<O> relation) {
    final int size = relation.size();
    if (size < minpts) {
      Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
      result.addToplevelCluster(
          new Cluster<Model>(relation.getDBIDs(), true, ClusterModel.CLUSTER));
      return result;
    }

    RangeQuery<O> rangeQuery = QueryUtil.getRangeQuery(relation, getDistanceFunction());
    resultList = new ArrayList<>();
    noise = DBIDUtil.newHashSet();
    runDBSCAN(relation, rangeQuery);

    double averagen = ncounter / (double) relation.size();
    LOG.statistics(new DoubleStatistic(DBSCAN.class.getName() + ".average-neighbors", averagen));
    if (averagen < 1 + 0.1 * (minpts - 1)) {
      LOG.warning("There are very few neighbors found. Epsilon may be too small.");
    }
    if (averagen > 100 * minpts) {
      LOG.warning("There are very many neighbors found. Epsilon may be too large.");
    }

    Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
    for (ModifiableDBIDs res : resultList) {
      result.addToplevelCluster(new Cluster<Model>(res, ClusterModel.CLUSTER));
    }
    result.addToplevelCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER));
    return result;
  }

  /**
   * Run the DBSCAN algorithm
   *
   * @param relation Data relation
   * @param rangeQuery Range query class
   */
  protected void runDBSCAN(Relation<O> relation, RangeQuery<O> rangeQuery) {
    final int size = relation.size();
    FiniteProgress objprog =
        LOG.isVerbose() ? new FiniteProgress("Processing objects", size, LOG) : null;
    IndefiniteProgress clusprog =
        LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null;

    processedIDs = DBIDUtil.newHashSet(size);
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      if (!processedIDs.contains(iditer)) {
        expandCluster(relation, rangeQuery, iditer, objprog, clusprog);
      }
      if (objprog != null && clusprog != null) {
        objprog.setProcessed(processedIDs.size(), LOG);
        clusprog.setProcessed(resultList.size(), LOG);
      }
      if (processedIDs.size() == size) {
        break;
      }
    }
    // Finish progress logging
    LOG.ensureCompleted(objprog);
    LOG.setCompleted(clusprog);
  }

  /**
   * DBSCAN-function expandCluster.
   *
   * <p>Border-Objects become members of the first possible cluster.
   *
   * @param relation Database relation to run on
   * @param rangeQuery Range query to use
   * @param startObjectID potential seed of a new cluster
   * @param objprog the progress object for logging the current status
   * @param clusprog the progress object for logging the number of clusters
   */
  protected void expandCluster(
      Relation<O> relation,
      RangeQuery<O> rangeQuery,
      DBIDRef startObjectID,
      FiniteProgress objprog,
      IndefiniteProgress clusprog) {
    DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(startObjectID, epsilon);
    ncounter += neighbors.size();

    // startObject is no core-object
    if (neighbors.size() < minpts) {
      noise.add(startObjectID);
      processedIDs.add(startObjectID);
      if (objprog != null) {
        objprog.incrementProcessed(LOG);
      }
      return;
    }

    ModifiableDBIDs currentCluster = DBIDUtil.newArray();
    currentCluster.add(startObjectID);
    processedIDs.add(startObjectID);

    // try to expand the cluster
    HashSetModifiableDBIDs seeds = DBIDUtil.newHashSet();
    processNeighbors(neighbors.iter(), currentCluster, seeds);

    DBIDVar o = DBIDUtil.newVar();
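    // Breadth-first expansion: pop a seed, fetch its epsilon-neighborhood, and,
    // if the popped point is itself a core point, enqueue its unseen neighbors.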
    while (!seeds.isEmpty()) {
      seeds.pop(o);
      neighbors = rangeQuery.getRangeForDBID(o, epsilon);
      ncounter += neighbors.size();

      if (neighbors.size() >= minpts) {
        processNeighbors(neighbors.iter(), currentCluster, seeds);
      }

      if (objprog != null) {
        objprog.incrementProcessed(LOG);
      }
    }
    resultList.add(currentCluster);
    if (clusprog != null) {
      clusprog.setProcessed(resultList.size(), LOG);
    }
  }

  /**
   * Process a single core point.
   *
   * @param neighbor Iterator over neighbors
   * @param currentCluster Current cluster
   * @param seeds Seed set
   */
  private void processNeighbors(
      DBIDIter neighbor, ModifiableDBIDs currentCluster, HashSetModifiableDBIDs seeds) {
    for (; neighbor.valid(); neighbor.advance()) {
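      // add() returns true only for IDs not yet processed: those become new seeds.
      // Previously processed IDs join the cluster only if reclaimed from noise.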
      if (processedIDs.add(neighbor)) {
        seeds.add(neighbor);
      } else if (!noise.remove(neighbor)) {
        continue;
      }
      currentCluster.add(neighbor);
    }
  }

  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return TypeUtil.array(getDistanceFunction().getInputTypeRestriction());
  }

  @Override
  protected Logging getLogger() {
    return LOG;
  }

  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   * @apiviz.exclude
   */
  public static class Parameterizer<O> extends AbstractDistanceBasedAlgorithm.Parameterizer<O> {
    /**
     * Parameter to specify the maximum radius of the neighborhood to be considered, must be
     * suitable to the distance function specified.
     */
    public static final OptionID EPSILON_ID =
        new OptionID("dbscan.epsilon", "The maximum radius of the neighborhood to be considered.");

    /**
     * Parameter to specify the threshold for minimum number of points in the epsilon-neighborhood
     * of a point, must be an integer greater than 0.
     */
    public static final OptionID MINPTS_ID =
        new OptionID(
            "dbscan.minpts",
            "Threshold for minimum number of points in the epsilon-neighborhood of a point. The suggested value is '2 * dim - 1'.");

    /** Holds the epsilon radius threshold. */
    protected double epsilon;

    /** Holds the minimum cluster size. */
    protected int minpts;

    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      DoubleParameter epsilonP =
          new DoubleParameter(EPSILON_ID) //
              .addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
      if (config.grab(epsilonP)) {
        epsilon = epsilonP.getValue();
      }

      IntParameter minptsP =
          new IntParameter(MINPTS_ID) //
              .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
      if (config.grab(minptsP)) {
        minpts = minptsP.getValue();
        if (minpts <= 2) {
          LOG.warning(
              "DBSCAN with minPts <= 2 is equivalent to single-link clustering at a single height. Consider using larger values of minPts.");
        }
      }
    }

    @Override
    protected DBSCAN<O> makeInstance() {
      return new DBSCAN<>(distanceFunction, epsilon, minpts);
    }
  }
}
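
To close, a minimal usage sketch for the DBSCAN class above (not part of the
original file): it assumes an already-initialized ELKI Database named db, and the
epsilon and minpts values are placeholders.

  // Sketch: run DBSCAN on a vector relation (assumes db is a loaded Database).
  Relation<DoubleVector> rel = db.getRelation(TypeUtil.DOUBLE_VECTOR_FIELD);
  DBSCAN<DoubleVector> dbscan = new DBSCAN<>(EuclideanDistanceFunction.STATIC, 0.5, 5);
  Clustering<Model> clustering = dbscan.run(rel);
  for (Cluster<Model> c : clustering.getAllClusters()) {
    System.out.println((c.isNoise() ? "noise" : "cluster") + ": " + c.size() + " objects");
  }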