/**
 * Performs the DBSCAN algorithm on the given database.
 *
 * @param relation Relation to process
 * @return Clustering result
 */
public Clustering<Model> run(Relation<O> relation) {
  final int size = relation.size();
  if (size < minpts) {
    // Degenerate case: fewer objects than minpts, so no core points exist and everything is noise.
    Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
    result.addToplevelCluster(new Cluster<Model>(relation.getDBIDs(), true, ClusterModel.CLUSTER));
    return result;
  }

  RangeQuery<O> rangeQuery = QueryUtil.getRangeQuery(relation, getDistanceFunction());
  resultList = new ArrayList<>();
  noise = DBIDUtil.newHashSet();
  runDBSCAN(relation, rangeQuery);

  // Sanity-check the epsilon parameter via the average neighborhood size.
  double averagen = ncounter / (double) relation.size();
  LOG.statistics(new DoubleStatistic(DBSCAN.class.getName() + ".average-neighbors", averagen));
  if (averagen < 1 + 0.1 * (minpts - 1)) {
    LOG.warning("There are very few neighbors found. Epsilon may be too small.");
  }
  if (averagen > 100 * minpts) {
    LOG.warning("There are very many neighbors found. Epsilon may be too large.");
  }

  Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
  for (ModifiableDBIDs res : resultList) {
    result.addToplevelCluster(new Cluster<Model>(res, ClusterModel.CLUSTER));
  }
  result.addToplevelCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER));
  return result;
}
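/*
 * Usage sketch (editorial addition, not part of the original class): invoking DBSCAN from
 * client code, assuming ELKI 0.7-style APIs. The constructor argument order
 * (distance function, epsilon, minPts), the EuclideanDistanceFunction.STATIC instance, and
 * the parameter values are assumptions and may differ between ELKI versions.
 */
private static Clustering<Model> exampleRunDBSCAN(Database database) {
  // Hypothetical parameter choices; epsilon and minPts must be tuned to the data at hand.
  DBSCAN<DoubleVector> dbscan = new DBSCAN<>(EuclideanDistanceFunction.STATIC, 0.04, 20);
  // run(Database) is inherited from AbstractAlgorithm and dispatches to run(Relation) above.
  return dbscan.run(database);
}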
@Override
public void processNewResult(ResultHierarchy hier, Result newResult) {
  // We may just have added this result.
  if (newResult instanceof Clustering && isReferenceResult((Clustering<?>) newResult)) {
    return;
  }
  Database db = ResultUtil.findDatabase(hier);
  List<Clustering<?>> crs = ResultUtil.getClusteringResults(newResult);
  if (crs == null || crs.size() < 1) {
    return;
  }
  // Compute the reference clustering
  Clustering<?> refc = null;
  // Try to find an existing reference clustering (globally)
  {
    Collection<Clustering<?>> cs = ResultUtil.filterResults(hier, db, Clustering.class);
    for (Clustering<?> test : cs) {
      if (isReferenceResult(test)) {
        refc = test;
        break;
      }
    }
  }
  // Try to find an existing reference clustering (locally)
  if (refc == null) {
    Collection<Clustering<?>> cs = ResultUtil.filterResults(hier, newResult, Clustering.class);
    for (Clustering<?> test : cs) {
      if (isReferenceResult(test)) {
        refc = test;
        break;
      }
    }
  }
  if (refc == null) {
    LOG.debug("Generating a new reference clustering.");
    Result refres = referencealg.run(db);
    List<Clustering<?>> refcrs = ResultUtil.getClusteringResults(refres);
    if (refcrs.size() == 0) {
      LOG.warning("Reference algorithm did not return a clustering result!");
      return;
    }
    if (refcrs.size() > 1) {
      LOG.warning("Reference algorithm returned more than one result!");
    }
    refc = refcrs.get(0);
  } else {
    LOG.debug("Using existing clustering: " + refc.getLongName() + " " + refc.getShortName());
  }
  for (Clustering<?> c : crs) {
    if (c == refc) {
      continue;
    }
    evaluteResult(db, c, refc);
  }
}
@Override
public void checkRange(DBIDRange range) {
  final int size = max + 1 - min;
  if (size < range.size()) {
    LOG.warning("Distance matrix has size " + size + " but range has size " + range.size() + ".");
  }
}
protected void autoEvaluateClusterings(ResultHierarchy hier, Result newResult) {
  Collection<Clustering<?>> clusterings =
      ResultUtil.filterResults(hier, newResult, Clustering.class);
  if (LOG.isDebugging()) {
    LOG.debug("Number of new clustering results: " + clusterings.size());
  }
  // Drop trivial and reference clusterings; they should not be evaluated themselves.
  for (Iterator<Clustering<?>> c = clusterings.iterator(); c.hasNext(); ) {
    Clustering<?> test = c.next();
    if ("allinone-clustering".equals(test.getShortName())) {
      c.remove();
    } else if ("allinnoise-clustering".equals(test.getShortName())) {
      c.remove();
    } else if ("bylabel-clustering".equals(test.getShortName())) {
      c.remove();
    } else if ("bymodel-clustering".equals(test.getShortName())) {
      c.remove();
    }
  }
  if (clusterings.size() > 0) {
    try {
      new EvaluateClustering(new ByLabelClustering(), false, true)
          .processNewResult(hier, newResult);
    } catch (NoSupportedDataTypeException e) {
      // Pass - the data probably did not have labels.
    }
  }
}
@Override
protected void makeOptions(Parameterization config) {
  super.makeOptions(config);
  DoubleParameter epsilonP = new DoubleParameter(EPSILON_ID) //
      .addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
  if (config.grab(epsilonP)) {
    epsilon = epsilonP.getValue();
  }
  IntParameter minptsP = new IntParameter(MINPTS_ID) //
      .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
  if (config.grab(minptsP)) {
    minpts = minptsP.getValue();
    if (minpts <= 2) {
      LOG.warning("DBSCAN with minPts <= 2 is equivalent to single-link clustering at a single height. Consider using larger values of minPts.");
    }
  }
}
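/*
 * Usage sketch (editorial addition): filling in the two options above programmatically via
 * ELKI's parameterization API. This assumes ELKI 0.7-style helper classes
 * (ListParameterization, ClassGenericsUtil) and that EPSILON_ID and MINPTS_ID are the public
 * option IDs of DBSCAN's Parameterizer; the names, locations, and values are assumptions.
 */
private static DBSCAN<DoubleVector> exampleParameterizeDBSCAN() {
  ListParameterization params = new ListParameterization();
  params.addParameter(DBSCAN.Parameterizer.EPSILON_ID, 0.04); // hypothetical epsilon
  params.addParameter(DBSCAN.Parameterizer.MINPTS_ID, 20); // hypothetical minPts
  // Instantiation invokes makeOptions(config) above to grab and validate the parameters.
  return ClassGenericsUtil.parameterizeOrAbort(DBSCAN.class, params);
}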
/**
 * Run the algorithm.
 *
 * @param database Database to use
 * @param relation Relation to use
 * @return Result
 */
public OutlierResult run(Database database, Relation<?> relation) {
  WritableDoubleDataStore scores =
      DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
  DoubleMinMax minmax = new DoubleMinMax();

  try (InputStream in = FileUtil.tryGzipInput(new FileInputStream(file)); //
      TokenizedReader reader = CSVReaderFormat.DEFAULT_FORMAT.makeReader()) {
    Tokenizer tokenizer = reader.getTokenizer();
    CharSequence buf = reader.getBuffer();
    Matcher mi = idpattern.matcher(buf), ms = scorepattern.matcher(buf);
    reader.reset(in);
    while (reader.nextLineExceptComments()) {
      Integer id = null;
      double score = Double.NaN;
      for ( /* initialized by nextLineExceptComments */ ; tokenizer.valid(); tokenizer.advance()) {
        mi.region(tokenizer.getStart(), tokenizer.getEnd());
        ms.region(tokenizer.getStart(), tokenizer.getEnd());
        final boolean mif = mi.find();
        final boolean msf = ms.find();
        if (mif && msf) {
          throw new AbortException(
              "ID pattern and score pattern both match value: " + tokenizer.getSubstring());
        }
        if (mif) {
          if (id != null) {
            throw new AbortException("ID pattern matched twice: previous value " + id
                + " second value: " + tokenizer.getSubstring());
          }
          id = Integer.parseInt(buf.subSequence(mi.end(), tokenizer.getEnd()).toString());
        }
        if (msf) {
          if (!Double.isNaN(score)) {
            throw new AbortException("Score pattern matched twice: previous value " + score
                + " second value: " + tokenizer.getSubstring());
          }
          score = ParseUtil.parseDouble(buf, ms.end(), tokenizer.getEnd());
        }
      }
      if (id != null && !Double.isNaN(score)) {
        scores.putDouble(DBIDUtil.importInteger(id), score);
        minmax.put(score);
      } else if (id == null && Double.isNaN(score)) {
        LOG.warning("Line matched neither the ID pattern, the score pattern, nor a comment: "
            + reader.getLineNumber());
      } else {
        throw new AbortException("Line matched only the ID pattern or only the score pattern: "
            + reader.getLineNumber());
      }
    }
  } catch (IOException e) {
    throw new AbortException(
        "Could not load outlier scores: " + e.getMessage() + " when loading " + file, e);
  }

  OutlierScoreMeta meta;
  if (inverted) {
    meta = new InvertedOutlierScoreMeta(minmax.getMin(), minmax.getMax());
  } else {
    meta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax());
  }
  DoubleRelation scoresult = new MaterializedDoubleRelation(
      "External Outlier", "external-outlier", scores, relation.getDBIDs());
  OutlierResult or = new OutlierResult(meta, scoresult);

  // Apply scaling
  if (scaling instanceof OutlierScalingFunction) {
    ((OutlierScalingFunction) scaling).prepare(or);
  }
  DoubleMinMax mm = new DoubleMinMax();
  for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
    double val = scoresult.doubleValue(iditer);
    val = scaling.getScaled(val);
    scores.putDouble(iditer, val);
    mm.put(val);
  }
  meta = new BasicOutlierScoreMeta(mm.getMin(), mm.getMax());
  or = new OutlierResult(meta, scoresult);
  return or;
}
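/*
 * Illustration (editorial addition): the tokenizer loop above expects each token produced by
 * the reader to start with either the ID pattern or the score pattern, and parses the rest of
 * the token as the value. The pattern and the sample token below are hypothetical, not taken
 * from the original class; only java.util.regex is used here.
 */
private static void examplePatternMatching() {
  java.util.regex.Pattern scorepattern = java.util.regex.Pattern.compile("^score=");
  String token = "score=1.89"; // one token of a hypothetical input line such as "ID=42 score=1.89"
  java.util.regex.Matcher ms = scorepattern.matcher(token);
  if (ms.find()) {
    // As in the method above: parse everything after the end of the pattern match as the value.
    double score = Double.parseDouble(token.substring(ms.end()));
    System.out.println("parsed score: " + score);
  }
}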
protected void autoEvaluateOutliers(ResultHierarchy hier, Result newResult) {
  Collection<OutlierResult> outliers =
      ResultUtil.filterResults(hier, newResult, OutlierResult.class);
  if (LOG.isDebugging()) {
    LOG.debug("Number of new outlier results: " + outliers.size());
  }
  if (outliers.size() > 0) {
    Database db = ResultUtil.findDatabase(hier);
    ResultUtil.ensureClusteringResult(db, db);
    Collection<Clustering<?>> clusterings = ResultUtil.filterResults(hier, db, Clustering.class);
    if (clusterings.size() == 0) {
      LOG.warning("Could not find a clustering result, even after running 'ensureClusteringResult'?!?");
      return;
    }
    Clustering<?> basec = clusterings.iterator().next();
    // Find the minority class label
    int min = Integer.MAX_VALUE;
    int total = 0;
    String label = null;
    if (basec.getAllClusters().size() > 1) {
      for (Cluster<?> c : basec.getAllClusters()) {
        final int csize = c.getIDs().size();
        total += csize;
        if (csize < min) {
          min = csize;
          label = c.getName();
        }
      }
    }
    if (label == null) {
      LOG.warning("Could not evaluate outlier results, as I could not find a minority label.");
      return;
    }
    if (min == 1) {
      LOG.warning("The minority class label had a single object. Try using 'ClassLabelFilter' to identify the class label column.");
    }
    if (min > 0.05 * total) {
      LOG.warning("The minority class I discovered (labeled '" + label + "') has "
          + (min * 100. / total) + "% of objects. Outlier classes should be more rare!");
    }
    LOG.verbose("Evaluating using minority class: " + label);
    Pattern pat = Pattern.compile("^" + Pattern.quote(label) + "$");
    // Evaluate rankings
    new OutlierRankingEvaluation(pat).processNewResult(hier, newResult);
    // Compute the ROC curve
    new OutlierROCCurve(pat).processNewResult(hier, newResult);
    // Compute precision at k
    new OutlierPrecisionAtKCurve(pat, min << 1).processNewResult(hier, newResult);
    // Compute the precision-recall curve
    new OutlierPrecisionRecallCurve(pat).processNewResult(hier, newResult);
    // Compute the outlier score histogram
    new ComputeOutlierHistogram(pat, 50, new LinearScaling(), false)
        .processNewResult(hier, newResult);
  }
}
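/*
 * Illustration (editorial addition): the pattern built above matches the minority class label
 * exactly, with any regex metacharacters in the label escaped by Pattern.quote. The label
 * value below is hypothetical.
 */
private static void exampleMinorityLabelPattern() {
  String label = "outlier(5%)"; // hypothetical class label containing regex metacharacters
  Pattern pat = Pattern.compile("^" + Pattern.quote(label) + "$");
  System.out.println(pat.matcher("outlier(5%)").matches()); // true: exact match only
  System.out.println(pat.matcher("not an outlier(5%)").matches()); // false
}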