/**
 * Close the underlying reader.
 * <p>
 * An {@link IOException} raised while closing is not propagated; it is handed
 * to {@code getLogger().exception(...)} instead.
 */
@Override
public void cleanup() {
  try {
    reader.close();
  }
  catch (IOException closeFailure) {
    getLogger().exception(closeFailure);
  }
}
/**
 * Begin reading from a new input stream by resetting the reader onto it.
 *
 * @param in Input stream to read from
 */
@Override
public void initStream(InputStream in) {
  this.reader.reset(in);
}
/**
 * Constructor.
 * <p>
 * Creates the reader from the given format and keeps a reference to that
 * reader's tokenizer.
 *
 * @param format Reader format
 */
public AbstractStreamingParser(CSVReaderFormat format) {
  // The no-argument superclass constructor is invoked implicitly.
  this.reader = format.makeReader();
  this.tokenizer = this.reader.getTokenizer();
}
@Override public void parse(InputStream in, DistanceCacheWriter cache) { reader.reset(in); int min = Integer.MAX_VALUE, max = Integer.MIN_VALUE; IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("Parsing distance matrix", LOG) : null; try { while (reader.nextLineExceptComments()) { LOG.incrementProcessed(prog); if (!tokenizer.valid()) { throw new IllegalArgumentException( "Less than three values in line " + reader.getLineNumber()); } int id1, id2; try { id1 = (int) tokenizer.getLongBase10(); tokenizer.advance(); } catch (NumberFormatException e) { throw new IllegalArgumentException( "Error in line " + reader.getLineNumber() + ": id1 is not an integer!"); } if (!tokenizer.valid()) { throw new IllegalArgumentException( "Less than three values in line " + reader.getLineNumber()); } try { id2 = (int) tokenizer.getLongBase10(); tokenizer.advance(); } catch (NumberFormatException e) { throw new IllegalArgumentException( "Error in line " + reader.getLineNumber() + ": id2 is not an integer!"); } if (!tokenizer.valid()) { throw new IllegalArgumentException( "Less than three values in line " + reader.getLineNumber()); } // Track minimum and maximum if (id1 < id2) { min = (id1 < min) ? id1 : min; max = (id2 > min) ? id2 : max; } else { min = (id2 < min) ? id2 : min; max = (id1 > min) ? 
id1 : max; } try { double distance = tokenizer.getDouble(); cache.put(id1, id2, distance); } catch (IllegalArgumentException e) { throw new IllegalArgumentException( "Error in line " + reader.getLineNumber() + ":" + e.getMessage(), e); } tokenizer.advance(); if (tokenizer.valid()) { throw new IllegalArgumentException( "More than three values in line " + reader.getLineNumber()); } } } catch (IOException e) { throw new IllegalArgumentException( "Error while parsing line " + reader.getLineNumber() + "."); } LOG.setCompleted(prog); // check if all distance values are specified for (int i1 = min; i1 <= max; i1++) { for (int i2 = i1 + 1; i2 <= max; i2++) { if (!cache.containsKey(i1, i2)) { throw new IllegalArgumentException( "Distance value for " + i1 + " to " + i2 + " is missing!"); } } } }
/**
 * Constructor.
 * <p>
 * Creates the reader from the given format and keeps a reference to that
 * reader's tokenizer.
 *
 * @param format Input format
 */
public AsciiDistanceParser(CSVReaderFormat format) {
  // The no-argument superclass constructor is invoked implicitly.
  this.reader = format.makeReader();
  this.tokenizer = this.reader.getTokenizer();
}
/** * Run the algorithm. * * @param database Database to use * @param relation Relation to use * @return Result */ public OutlierResult run(Database database, Relation<?> relation) { WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); DoubleMinMax minmax = new DoubleMinMax(); try (InputStream in = FileUtil.tryGzipInput(new FileInputStream(file)); // TokenizedReader reader = CSVReaderFormat.DEFAULT_FORMAT.makeReader()) { Tokenizer tokenizer = reader.getTokenizer(); CharSequence buf = reader.getBuffer(); Matcher mi = idpattern.matcher(buf), ms = scorepattern.matcher(buf); reader.reset(in); while (reader.nextLineExceptComments()) { Integer id = null; double score = Double.NaN; for ( /* initialized by nextLineExceptComments */ ; tokenizer.valid(); tokenizer.advance()) { mi.region(tokenizer.getStart(), tokenizer.getEnd()); ms.region(tokenizer.getStart(), tokenizer.getEnd()); final boolean mif = mi.find(); final boolean msf = ms.find(); if (mif && msf) { throw new AbortException( "ID pattern and score pattern both match value: " + tokenizer.getSubstring()); } if (mif) { if (id != null) { throw new AbortException( "ID pattern matched twice: previous value " + id + " second value: " + tokenizer.getSubstring()); } id = Integer.parseInt(buf.subSequence(mi.end(), tokenizer.getEnd()).toString()); } if (msf) { if (!Double.isNaN(score)) { throw new AbortException( "Score pattern matched twice: previous value " + score + " second value: " + tokenizer.getSubstring()); } score = ParseUtil.parseDouble(buf, ms.end(), tokenizer.getEnd()); } } if (id != null && !Double.isNaN(score)) { scores.putDouble(DBIDUtil.importInteger(id), score); minmax.put(score); } else if (id == null && Double.isNaN(score)) { LOG.warning( "Line did not match either ID nor score nor comment: " + reader.getLineNumber()); } else { throw new AbortException( "Line matched only ID or only SCORE patterns: " + reader.getLineNumber()); } } } catch 
(IOException e) { throw new AbortException( "Could not load outlier scores: " + e.getMessage() + " when loading " + file, e); } OutlierScoreMeta meta; if (inverted) { meta = new InvertedOutlierScoreMeta(minmax.getMin(), minmax.getMax()); } else { meta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax()); } DoubleRelation scoresult = new MaterializedDoubleRelation( "External Outlier", "external-outlier", scores, relation.getDBIDs()); OutlierResult or = new OutlierResult(meta, scoresult); // Apply scaling if (scaling instanceof OutlierScalingFunction) { ((OutlierScalingFunction) scaling).prepare(or); } DoubleMinMax mm = new DoubleMinMax(); for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { double val = scoresult.doubleValue(iditer); val = scaling.getScaled(val); scores.putDouble(iditer, val); mm.put(val); } meta = new BasicOutlierScoreMeta(mm.getMin(), mm.getMax()); or = new OutlierResult(meta, scoresult); return or; }