/**
 * Run the algorithm.
 *
 * @param database Database to use
 * @param relation Relation to use
 * @return Outlier result
 */
public OutlierResult run(Database database, Relation<?> relation) {
  WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
  DoubleMinMax minmax = new DoubleMinMax();

  // Parse the external score file: every non-comment line must contain exactly
  // one token matching the ID pattern and one token matching the score pattern.
  try (InputStream in = FileUtil.tryGzipInput(new FileInputStream(file)); //
      TokenizedReader reader = CSVReaderFormat.DEFAULT_FORMAT.makeReader()) {
    Tokenizer tokenizer = reader.getTokenizer();
    CharSequence buf = reader.getBuffer();
    Matcher mi = idpattern.matcher(buf), ms = scorepattern.matcher(buf);
    reader.reset(in);
    while (reader.nextLineExceptComments()) {
      Integer id = null;
      double score = Double.NaN;
      for ( /* initialized by nextLineExceptComments */ ; tokenizer.valid(); tokenizer.advance()) {
        mi.region(tokenizer.getStart(), tokenizer.getEnd());
        ms.region(tokenizer.getStart(), tokenizer.getEnd());
        final boolean mif = mi.find();
        final boolean msf = ms.find();
        if (mif && msf) {
          throw new AbortException("ID pattern and score pattern both match value: " + tokenizer.getSubstring());
        }
        if (mif) {
          if (id != null) {
            throw new AbortException("ID pattern matched twice: previous value " + id + " second value: " + tokenizer.getSubstring());
          }
          // The ID is whatever follows the matched prefix within the token.
          id = Integer.parseInt(buf.subSequence(mi.end(), tokenizer.getEnd()).toString());
        }
        if (msf) {
          if (!Double.isNaN(score)) {
            throw new AbortException("Score pattern matched twice: previous value " + score + " second value: " + tokenizer.getSubstring());
          }
          score = ParseUtil.parseDouble(buf, ms.end(), tokenizer.getEnd());
        }
      }
      if (id != null && !Double.isNaN(score)) {
        scores.putDouble(DBIDUtil.importInteger(id), score);
        minmax.put(score);
      } else if (id == null && Double.isNaN(score)) {
        LOG.warning("Line matched neither the ID pattern nor the score pattern (and is not a comment): " + reader.getLineNumber());
      } else {
        throw new AbortException("Line matched only the ID pattern or only the score pattern: " + reader.getLineNumber());
      }
    }
  } catch (IOException e) {
    throw new AbortException("Could not load outlier scores: " + e.getMessage() + " when loading " + file, e);
  }

  OutlierScoreMeta meta;
  if (inverted) {
    meta = new InvertedOutlierScoreMeta(minmax.getMin(), minmax.getMax());
  } else {
    meta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax());
  }
  DoubleRelation scoresult = new MaterializedDoubleRelation("External Outlier", "external-outlier", scores, relation.getDBIDs());
  OutlierResult or = new OutlierResult(meta, scoresult);

  // Apply scaling, then materialize the scaled scores and track the new value range.
  if (scaling instanceof OutlierScalingFunction) {
    ((OutlierScalingFunction) scaling).prepare(or);
  }
  DoubleMinMax mm = new DoubleMinMax();
  for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
    double val = scoresult.doubleValue(iditer);
    val = scaling.getScaled(val);
    scores.putDouble(iditer, val);
    mm.put(val);
  }
  meta = new BasicOutlierScoreMeta(mm.getMin(), mm.getMax());
  or = new OutlierResult(meta, scoresult);
  return or;
}
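/*
 * The per-token ID/score matching above is easiest to see in isolation. The
 * following is a minimal, self-contained sketch using only java.util.regex,
 * independent of ELKI's TokenizedReader/Tokenizer; the "id=" / "score="
 * prefixes and the sample line are made-up placeholders, not the actual
 * default patterns of the class above.
 */
import java.util.regex.Matcher;
import java.util.regex.Pattern;

class ScoreLineParseSketch {
  public static void main(String[] args) {
    Pattern idpattern = Pattern.compile("^id=");       // hypothetical ID prefix pattern
    Pattern scorepattern = Pattern.compile("^score="); // hypothetical score prefix pattern
    String line = "id=42 score=0.75";                  // hypothetical input line

    Integer id = null;
    double score = Double.NaN;
    // A whitespace split stands in for the Tokenizer; each token may match at
    // most one of the two patterns, and the value is whatever follows the match.
    for (String token : line.split("\\s+")) {
      Matcher mi = idpattern.matcher(token), ms = scorepattern.matcher(token);
      if (mi.find()) {
        id = Integer.parseInt(token.substring(mi.end()));
      } else if (ms.find()) {
        score = Double.parseDouble(token.substring(ms.end()));
      }
    }
    System.out.println("id=" + id + ", score=" + score); // id=42, score=0.75
  }
}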
/**
 * Evaluate a single outlier result as histogram.
 *
 * @param database Database to process
 * @param or Outlier result
 * @return Histogram result
 */
public HistogramResult<DoubleVector> evaluateOutlierResult(Database database, OutlierResult or) {
  if (scaling instanceof OutlierScalingFunction) {
    OutlierScalingFunction oscaling = (OutlierScalingFunction) scaling;
    oscaling.prepare(or);
  }
  ModifiableDBIDs ids = DBIDUtil.newHashSet(or.getScores().getDBIDs());
  DBIDs outlierIds = DatabaseUtil.getObjectsByLabelMatch(database, positiveClassName);

  // Each histogram bin holds a pair: the first component accumulates the
  // non-outlier (negative) frequency, the second the outlier (positive) frequency.
  // If we have a useful (finite) min/max, use these for static binning;
  // otherwise fall back to a dynamic histogram.
  double min = scaling.getMin();
  double max = scaling.getMax();
  final ObjHistogram<DoubleDoublePair> hist;
  if (Double.isInfinite(min) || Double.isNaN(min) || Double.isInfinite(max) || Double.isNaN(max)) {
    hist = new AbstractObjDynamicHistogram<DoubleDoublePair>(bins) {
      @Override
      public DoubleDoublePair aggregate(DoubleDoublePair first, DoubleDoublePair second) {
        first.first += second.first;
        first.second += second.second;
        return first;
      }

      @Override
      protected DoubleDoublePair makeObject() {
        return new DoubleDoublePair(0., 0.);
      }

      @Override
      protected DoubleDoublePair cloneForCache(DoubleDoublePair data) {
        return new DoubleDoublePair(data.first, data.second);
      }

      @Override
      protected DoubleDoublePair downsample(Object[] data, int start, int end, int size) {
        DoubleDoublePair sum = new DoubleDoublePair(0, 0);
        for (int i = start; i < end; i++) {
          DoubleDoublePair p = (DoubleDoublePair) data[i];
          if (p != null) {
            sum.first += p.first;
            sum.second += p.second;
          }
        }
        return sum;
      }
    };
  } else {
    hist = new AbstractObjStaticHistogram<DoubleDoublePair>(bins, min, max) {
      @Override
      protected DoubleDoublePair makeObject() {
        return new DoubleDoublePair(0., 0.);
      }

      @Override
      public void putData(double coord, DoubleDoublePair data) {
        DoubleDoublePair exist = get(coord);
        exist.first += data.first;
        exist.second += data.second;
      }
    };
  }

  // Relative frequency increments: with splitfreq, outliers and non-outliers
  // are normalized separately; otherwise both relative to the full data set size.
  DoubleDoublePair negative, positive;
  if (!splitfreq) {
    negative = new DoubleDoublePair(1. / ids.size(), 0);
    positive = new DoubleDoublePair(0, 1. / ids.size());
  } else {
    negative = new DoubleDoublePair(1. / (ids.size() - outlierIds.size()), 0);
    positive = new DoubleDoublePair(0, 1. / outlierIds.size());
  }
  ids.removeDBIDs(outlierIds);

  // Fill the histogram: first the non-outlier objects, then the outliers.
  for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
    double result = or.getScores().doubleValue(iter);
    result = scaling.getScaled(result);
    if (result > Double.NEGATIVE_INFINITY && result < Double.POSITIVE_INFINITY) {
      hist.putData(result, negative);
    }
  }
  for (DBIDIter iter = outlierIds.iter(); iter.valid(); iter.advance()) {
    double result = or.getScores().doubleValue(iter);
    result = scaling.getScaled(result);
    if (result > Double.NEGATIVE_INFINITY && result < Double.POSITIVE_INFINITY) {
      hist.putData(result, positive);
    }
  }

  // Convert to the result format: one row per bin with
  // (bin center, non-outlier frequency, outlier frequency).
  Collection<DoubleVector> collHist = new ArrayList<>(hist.getNumBins());
  for (ObjHistogram.Iter<DoubleDoublePair> iter = hist.iter(); iter.valid(); iter.advance()) {
    DoubleDoublePair data = iter.getValue();
    DoubleVector row = new DoubleVector(new double[] { iter.getCenter(), data.first, data.second });
    collHist.add(row);
  }
  return new HistogramResult<>("Outlier Score Histogram", "outlier-histogram", collHist);
}
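/*
 * A minimal sketch of the binning performed above, using plain arrays instead
 * of ELKI's ObjHistogram/DoubleDoublePair: each bin accumulates a pair of
 * relative frequencies (non-outlier mass first, outlier mass second), which is
 * what ends up in the three-column histogram rows. The normalization mirrors
 * the splitfreq case. The scores, labels, bin count and [0, 1] value range are
 * made-up placeholders.
 */
import java.util.Arrays;

class OutlierHistogramSketch {
  public static void main(String[] args) {
    double[] scaled = { 0.05, 0.10, 0.20, 0.85, 0.90 };       // hypothetical scaled scores in [0, 1]
    boolean[] outlier = { false, false, false, true, true };  // hypothetical positive-class labels
    int bins = 5;
    double min = 0.0, max = 1.0;

    int nPositive = 0;
    for (boolean b : outlier) {
      nPositive += b ? 1 : 0;
    }
    int nNegative = scaled.length - nPositive;

    // hist[b][0] = non-outlier frequency, hist[b][1] = outlier frequency (cf. DoubleDoublePair)
    double[][] hist = new double[bins][2];
    for (int i = 0; i < scaled.length; i++) {
      int b = Math.min(bins - 1, (int) ((scaled[i] - min) / (max - min) * bins));
      hist[b][outlier[i] ? 1 : 0] += 1. / (outlier[i] ? nPositive : nNegative);
    }

    // One output row per bin: bin center, non-outlier frequency, outlier frequency.
    for (int b = 0; b < bins; b++) {
      double center = min + (b + 0.5) * (max - min) / bins;
      System.out.println(center + " " + Arrays.toString(hist[b]));
    }
  }
}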