Пример #1
0
  /** Performs the DBSCAN algorithm on the given database. */
  public Clustering<Model> run(Relation<O> relation) {
    final int size = relation.size();
    if (size < minpts) {
      Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
      result.addToplevelCluster(
          new Cluster<Model>(relation.getDBIDs(), true, ClusterModel.CLUSTER));
      return result;
    }

    RangeQuery<O> rangeQuery = QueryUtil.getRangeQuery(relation, getDistanceFunction());
    resultList = new ArrayList<>();
    noise = DBIDUtil.newHashSet();
    runDBSCAN(relation, rangeQuery);

    double averagen = ncounter / (double) relation.size();
    LOG.statistics(new DoubleStatistic(DBSCAN.class.getName() + ".average-neighbors", averagen));
    if (averagen < 1 + 0.1 * (minpts - 1)) {
      LOG.warning("There are very few neighbors found. Epsilon may be too small.");
    }
    if (averagen > 100 * minpts) {
      LOG.warning("There are very many neighbors found. Epsilon may be too large.");
    }

    Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
    for (ModifiableDBIDs res : resultList) {
      result.addToplevelCluster(new Cluster<Model>(res, ClusterModel.CLUSTER));
    }
    result.addToplevelCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER));
    return result;
  }
Пример #2
0
 @Override
 public void processNewResult(ResultHierarchy hier, Result newResult) {
   // We may just have added this result.
   if (newResult instanceof Clustering && isReferenceResult((Clustering<?>) newResult)) {
     return;
   }
   Database db = ResultUtil.findDatabase(hier);
   List<Clustering<?>> crs = ResultUtil.getClusteringResults(newResult);
   if (crs == null || crs.size() < 1) {
     return;
   }
   // Compute the reference clustering
   Clustering<?> refc = null;
   // Try to find an existing reference clustering (globally)
   {
     Collection<Clustering<?>> cs = ResultUtil.filterResults(hier, db, Clustering.class);
     for (Clustering<?> test : cs) {
       if (isReferenceResult(test)) {
         refc = test;
         break;
       }
     }
   }
   // Try to find an existing reference clustering (locally)
   if (refc == null) {
     Collection<Clustering<?>> cs = ResultUtil.filterResults(hier, newResult, Clustering.class);
     for (Clustering<?> test : cs) {
       if (isReferenceResult(test)) {
         refc = test;
         break;
       }
     }
   }
   if (refc == null) {
     LOG.debug("Generating a new reference clustering.");
     Result refres = referencealg.run(db);
     List<Clustering<?>> refcrs = ResultUtil.getClusteringResults(refres);
     if (refcrs.size() == 0) {
       LOG.warning("Reference algorithm did not return a clustering result!");
       return;
     }
     if (refcrs.size() > 1) {
       LOG.warning("Reference algorithm returned more than one result!");
     }
     refc = refcrs.get(0);
   } else {
     LOG.debug("Using existing clustering: " + refc.getLongName() + " " + refc.getShortName());
   }
   for (Clustering<?> c : crs) {
     if (c == refc) {
       continue;
     }
     evaluteResult(db, c, refc);
   }
 }
 @Override
 public void checkRange(DBIDRange range) {
   final int size = max + 1 - min;
   if (size < range.size()) {
     LOG.warning("Distance matrix has size " + size + " but range has size: " + range.size());
   }
 }
Пример #4
0
 protected void autoEvaluateClusterings(ResultHierarchy hier, Result newResult) {
   Collection<Clustering<?>> clusterings =
       ResultUtil.filterResults(hier, newResult, Clustering.class);
   if (LOG.isDebugging()) {
     LOG.warning("Number of new clustering results: " + clusterings.size());
   }
   for (Iterator<Clustering<?>> c = clusterings.iterator(); c.hasNext(); ) {
     Clustering<?> test = c.next();
     if ("allinone-clustering".equals(test.getShortName())) {
       c.remove();
     } else if ("allinnoise-clustering".equals(test.getShortName())) {
       c.remove();
     } else if ("bylabel-clustering".equals(test.getShortName())) {
       c.remove();
     } else if ("bymodel-clustering".equals(test.getShortName())) {
       c.remove();
     }
   }
   if (clusterings.size() > 0) {
     try {
       new EvaluateClustering(new ByLabelClustering(), false, true)
           .processNewResult(hier, newResult);
     } catch (NoSupportedDataTypeException e) {
       // Pass - the data probably did not have labels.
     }
   }
 }
Пример #5
0
    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      DoubleParameter epsilonP =
          new DoubleParameter(EPSILON_ID) //
              .addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
      if (config.grab(epsilonP)) {
        epsilon = epsilonP.getValue();
      }

      IntParameter minptsP =
          new IntParameter(MINPTS_ID) //
              .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
      if (config.grab(minptsP)) {
        minpts = minptsP.getValue();
        if (minpts <= 2) {
          LOG.warning(
              "DBSCAN with minPts <= 2 is equivalent to single-link clustering at a single height. Consider using larger values of minPts.");
        }
      }
    }
Пример #6
0
  /**
   * Run the algorithm.
   *
   * @param database Database to use
   * @param relation Relation to use
   * @return Result
   */
  public OutlierResult run(Database database, Relation<?> relation) {
    WritableDoubleDataStore scores =
        DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);

    DoubleMinMax minmax = new DoubleMinMax();

    try (InputStream in = FileUtil.tryGzipInput(new FileInputStream(file)); //
        TokenizedReader reader = CSVReaderFormat.DEFAULT_FORMAT.makeReader()) {
      Tokenizer tokenizer = reader.getTokenizer();
      CharSequence buf = reader.getBuffer();
      Matcher mi = idpattern.matcher(buf), ms = scorepattern.matcher(buf);
      reader.reset(in);
      while (reader.nextLineExceptComments()) {
        Integer id = null;
        double score = Double.NaN;
        for (
        /* initialized by nextLineExceptComments */ ; tokenizer.valid(); tokenizer.advance()) {
          mi.region(tokenizer.getStart(), tokenizer.getEnd());
          ms.region(tokenizer.getStart(), tokenizer.getEnd());
          final boolean mif = mi.find();
          final boolean msf = ms.find();
          if (mif && msf) {
            throw new AbortException(
                "ID pattern and score pattern both match value: " + tokenizer.getSubstring());
          }
          if (mif) {
            if (id != null) {
              throw new AbortException(
                  "ID pattern matched twice: previous value "
                      + id
                      + " second value: "
                      + tokenizer.getSubstring());
            }
            id = Integer.parseInt(buf.subSequence(mi.end(), tokenizer.getEnd()).toString());
          }
          if (msf) {
            if (!Double.isNaN(score)) {
              throw new AbortException(
                  "Score pattern matched twice: previous value "
                      + score
                      + " second value: "
                      + tokenizer.getSubstring());
            }
            score = ParseUtil.parseDouble(buf, ms.end(), tokenizer.getEnd());
          }
        }
        if (id != null && !Double.isNaN(score)) {
          scores.putDouble(DBIDUtil.importInteger(id), score);
          minmax.put(score);
        } else if (id == null && Double.isNaN(score)) {
          LOG.warning(
              "Line did not match either ID nor score nor comment: " + reader.getLineNumber());
        } else {
          throw new AbortException(
              "Line matched only ID or only SCORE patterns: " + reader.getLineNumber());
        }
      }
    } catch (IOException e) {
      throw new AbortException(
          "Could not load outlier scores: " + e.getMessage() + " when loading " + file, e);
    }

    OutlierScoreMeta meta;
    if (inverted) {
      meta = new InvertedOutlierScoreMeta(minmax.getMin(), minmax.getMax());
    } else {
      meta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax());
    }
    DoubleRelation scoresult =
        new MaterializedDoubleRelation(
            "External Outlier", "external-outlier", scores, relation.getDBIDs());
    OutlierResult or = new OutlierResult(meta, scoresult);

    // Apply scaling
    if (scaling instanceof OutlierScalingFunction) {
      ((OutlierScalingFunction) scaling).prepare(or);
    }
    DoubleMinMax mm = new DoubleMinMax();
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      double val = scoresult.doubleValue(iditer);
      val = scaling.getScaled(val);
      scores.putDouble(iditer, val);
      mm.put(val);
    }
    meta = new BasicOutlierScoreMeta(mm.getMin(), mm.getMax());
    or = new OutlierResult(meta, scoresult);
    return or;
  }
Пример #7
0
 protected void autoEvaluateOutliers(ResultHierarchy hier, Result newResult) {
   Collection<OutlierResult> outliers =
       ResultUtil.filterResults(hier, newResult, OutlierResult.class);
   if (LOG.isDebugging()) {
     LOG.debug("Number of new outlier results: " + outliers.size());
   }
   if (outliers.size() > 0) {
     Database db = ResultUtil.findDatabase(hier);
     ResultUtil.ensureClusteringResult(db, db);
     Collection<Clustering<?>> clusterings = ResultUtil.filterResults(hier, db, Clustering.class);
     if (clusterings.size() == 0) {
       LOG.warning(
           "Could not find a clustering result, even after running 'ensureClusteringResult'?!?");
       return;
     }
     Clustering<?> basec = clusterings.iterator().next();
     // Find minority class label
     int min = Integer.MAX_VALUE;
     int total = 0;
     String label = null;
     if (basec.getAllClusters().size() > 1) {
       for (Cluster<?> c : basec.getAllClusters()) {
         final int csize = c.getIDs().size();
         total += csize;
         if (csize < min) {
           min = csize;
           label = c.getName();
         }
       }
     }
     if (label == null) {
       LOG.warning("Could not evaluate outlier results, as I could not find a minority label.");
       return;
     }
     if (min == 1) {
       LOG.warning(
           "The minority class label had a single object. Try using 'ClassLabelFilter' to identify the class label column.");
     }
     if (min > 0.05 * total) {
       LOG.warning(
           "The minority class I discovered (labeled '"
               + label
               + "') has "
               + (min * 100. / total)
               + "% of objects. Outlier classes should be more rare!");
     }
     LOG.verbose("Evaluating using minority class: " + label);
     Pattern pat = Pattern.compile("^" + Pattern.quote(label) + "$");
     // Evaluate rankings.
     new OutlierRankingEvaluation(pat).processNewResult(hier, newResult);
     // Compute ROC curve
     new OutlierROCCurve(pat).processNewResult(hier, newResult);
     // Compute Precision at k
     new OutlierPrecisionAtKCurve(pat, min << 1).processNewResult(hier, newResult);
     // Compute ROC curve
     new OutlierPrecisionRecallCurve(pat).processNewResult(hier, newResult);
     // Compute outlier histogram
     new ComputeOutlierHistogram(pat, 50, new LinearScaling(), false)
         .processNewResult(hier, newResult);
   }
 }