예제 #1
0
  /**
   * Run the algorithm.
   *
   * @param database Database to use
   * @param relation Relation to use
   * @return Result
   */
  public OutlierResult run(Database database, Relation<?> relation) {
    WritableDoubleDataStore scores =
        DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);

    DoubleMinMax minmax = new DoubleMinMax();

    try (InputStream in = FileUtil.tryGzipInput(new FileInputStream(file)); //
        TokenizedReader reader = CSVReaderFormat.DEFAULT_FORMAT.makeReader()) {
      Tokenizer tokenizer = reader.getTokenizer();
      CharSequence buf = reader.getBuffer();
      Matcher mi = idpattern.matcher(buf), ms = scorepattern.matcher(buf);
      reader.reset(in);
      while (reader.nextLineExceptComments()) {
        Integer id = null;
        double score = Double.NaN;
        for (
        /* initialized by nextLineExceptComments */ ; tokenizer.valid(); tokenizer.advance()) {
          mi.region(tokenizer.getStart(), tokenizer.getEnd());
          ms.region(tokenizer.getStart(), tokenizer.getEnd());
          final boolean mif = mi.find();
          final boolean msf = ms.find();
          if (mif && msf) {
            throw new AbortException(
                "ID pattern and score pattern both match value: " + tokenizer.getSubstring());
          }
          if (mif) {
            if (id != null) {
              throw new AbortException(
                  "ID pattern matched twice: previous value "
                      + id
                      + " second value: "
                      + tokenizer.getSubstring());
            }
            id = Integer.parseInt(buf.subSequence(mi.end(), tokenizer.getEnd()).toString());
          }
          if (msf) {
            if (!Double.isNaN(score)) {
              throw new AbortException(
                  "Score pattern matched twice: previous value "
                      + score
                      + " second value: "
                      + tokenizer.getSubstring());
            }
            score = ParseUtil.parseDouble(buf, ms.end(), tokenizer.getEnd());
          }
        }
        if (id != null && !Double.isNaN(score)) {
          scores.putDouble(DBIDUtil.importInteger(id), score);
          minmax.put(score);
        } else if (id == null && Double.isNaN(score)) {
          LOG.warning(
              "Line did not match either ID nor score nor comment: " + reader.getLineNumber());
        } else {
          throw new AbortException(
              "Line matched only ID or only SCORE patterns: " + reader.getLineNumber());
        }
      }
    } catch (IOException e) {
      throw new AbortException(
          "Could not load outlier scores: " + e.getMessage() + " when loading " + file, e);
    }

    OutlierScoreMeta meta;
    if (inverted) {
      meta = new InvertedOutlierScoreMeta(minmax.getMin(), minmax.getMax());
    } else {
      meta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax());
    }
    DoubleRelation scoresult =
        new MaterializedDoubleRelation(
            "External Outlier", "external-outlier", scores, relation.getDBIDs());
    OutlierResult or = new OutlierResult(meta, scoresult);

    // Apply scaling
    if (scaling instanceof OutlierScalingFunction) {
      ((OutlierScalingFunction) scaling).prepare(or);
    }
    DoubleMinMax mm = new DoubleMinMax();
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      double val = scoresult.doubleValue(iditer);
      val = scaling.getScaled(val);
      scores.putDouble(iditer, val);
      mm.put(val);
    }
    meta = new BasicOutlierScoreMeta(mm.getMin(), mm.getMax());
    or = new OutlierResult(meta, scoresult);
    return or;
  }
예제 #2
0
  /**
   * Evaluate a single outlier result as histogram.
   *
   * @param database Database to process
   * @param or Outlier result
   * @return Result
   */
  public HistogramResult<DoubleVector> evaluateOutlierResult(Database database, OutlierResult or) {
    if (scaling instanceof OutlierScalingFunction) {
      OutlierScalingFunction oscaling = (OutlierScalingFunction) scaling;
      oscaling.prepare(or);
    }

    ModifiableDBIDs ids = DBIDUtil.newHashSet(or.getScores().getDBIDs());
    DBIDs outlierIds = DatabaseUtil.getObjectsByLabelMatch(database, positiveClassName);
    // first value for outliers, second for each object
    // If we have useful (finite) min/max, use these for binning.
    double min = scaling.getMin();
    double max = scaling.getMax();
    final ObjHistogram<DoubleDoublePair> hist;
    if (Double.isInfinite(min)
        || Double.isNaN(min)
        || Double.isInfinite(max)
        || Double.isNaN(max)) {
      hist =
          new AbstractObjDynamicHistogram<DoubleDoublePair>(bins) {
            @Override
            public DoubleDoublePair aggregate(DoubleDoublePair first, DoubleDoublePair second) {
              first.first += second.first;
              first.second += second.second;
              return first;
            }

            @Override
            protected DoubleDoublePair makeObject() {
              return new DoubleDoublePair(0., 0.);
            }

            @Override
            protected DoubleDoublePair cloneForCache(DoubleDoublePair data) {
              return new DoubleDoublePair(data.first, data.second);
            }

            @Override
            protected DoubleDoublePair downsample(Object[] data, int start, int end, int size) {
              DoubleDoublePair sum = new DoubleDoublePair(0, 0);
              for (int i = start; i < end; i++) {
                DoubleDoublePair p = (DoubleDoublePair) data[i];
                if (p != null) {
                  sum.first += p.first;
                  sum.second += p.second;
                }
              }
              return sum;
            }
          };
    } else {
      hist =
          new AbstractObjStaticHistogram<DoubleDoublePair>(bins, min, max) {
            @Override
            protected DoubleDoublePair makeObject() {
              return new DoubleDoublePair(0., 0.);
            }

            @Override
            public void putData(double coord, DoubleDoublePair data) {
              DoubleDoublePair exist = get(coord);
              exist.first += data.first;
              exist.second += data.second;
            }
          };
    }

    // first fill histogram only with values of outliers
    DoubleDoublePair negative, positive;
    if (!splitfreq) {
      negative = new DoubleDoublePair(1. / ids.size(), 0);
      positive = new DoubleDoublePair(0, 1. / ids.size());
    } else {
      negative = new DoubleDoublePair(1. / (ids.size() - outlierIds.size()), 0);
      positive = new DoubleDoublePair(0, 1. / outlierIds.size());
    }
    ids.removeDBIDs(outlierIds);
    // fill histogram with values of each object
    for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
      double result = or.getScores().doubleValue(iter);
      result = scaling.getScaled(result);
      if (result > Double.NEGATIVE_INFINITY && result < Double.POSITIVE_INFINITY) {
        hist.putData(result, negative);
      }
    }
    for (DBIDIter iter = outlierIds.iter(); iter.valid(); iter.advance()) {
      double result = or.getScores().doubleValue(iter);
      result = scaling.getScaled(result);
      if (result > Double.NEGATIVE_INFINITY && result < Double.POSITIVE_INFINITY) {
        hist.putData(result, positive);
      }
    }
    Collection<DoubleVector> collHist = new ArrayList<>(hist.getNumBins());
    for (ObjHistogram.Iter<DoubleDoublePair> iter = hist.iter(); iter.valid(); iter.advance()) {
      DoubleDoublePair data = iter.getValue();
      DoubleVector row = new DoubleVector(new double[] {iter.getCenter(), data.first, data.second});
      collHist.add(row);
    }
    return new HistogramResult<>("Outlier Score Histogram", "outlier-histogram", collHist);
  }