Esempio n. 1
0
  @Override
  public void parse(InputStream in, DistanceCacheWriter cache) {
    reader.reset(in);

    int min = Integer.MAX_VALUE, max = Integer.MIN_VALUE;
    IndefiniteProgress prog =
        LOG.isVerbose() ? new IndefiniteProgress("Parsing distance matrix", LOG) : null;
    try {
      while (reader.nextLineExceptComments()) {
        LOG.incrementProcessed(prog);
        if (!tokenizer.valid()) {
          throw new IllegalArgumentException(
              "Less than three values in line " + reader.getLineNumber());
        }
        int id1, id2;
        try {
          id1 = (int) tokenizer.getLongBase10();
          tokenizer.advance();
        } catch (NumberFormatException e) {
          throw new IllegalArgumentException(
              "Error in line " + reader.getLineNumber() + ": id1 is not an integer!");
        }
        if (!tokenizer.valid()) {
          throw new IllegalArgumentException(
              "Less than three values in line " + reader.getLineNumber());
        }

        try {
          id2 = (int) tokenizer.getLongBase10();
          tokenizer.advance();
        } catch (NumberFormatException e) {
          throw new IllegalArgumentException(
              "Error in line " + reader.getLineNumber() + ": id2 is not an integer!");
        }
        if (!tokenizer.valid()) {
          throw new IllegalArgumentException(
              "Less than three values in line " + reader.getLineNumber());
        }

        // Track minimum and maximum
        if (id1 < id2) {
          min = (id1 < min) ? id1 : min;
          max = (id2 > min) ? id2 : max;
        } else {
          min = (id2 < min) ? id2 : min;
          max = (id1 > min) ? id1 : max;
        }

        try {
          double distance = tokenizer.getDouble();
          cache.put(id1, id2, distance);
        } catch (IllegalArgumentException e) {
          throw new IllegalArgumentException(
              "Error in line " + reader.getLineNumber() + ":" + e.getMessage(), e);
        }
        tokenizer.advance();
        if (tokenizer.valid()) {
          throw new IllegalArgumentException(
              "More than three values in line " + reader.getLineNumber());
        }
      }
    } catch (IOException e) {
      throw new IllegalArgumentException(
          "Error while parsing line " + reader.getLineNumber() + ".");
    }

    LOG.setCompleted(prog);

    // check if all distance values are specified
    for (int i1 = min; i1 <= max; i1++) {
      for (int i2 = i1 + 1; i2 <= max; i2++) {
        if (!cache.containsKey(i1, i2)) {
          throw new IllegalArgumentException(
              "Distance value for " + i1 + " to " + i2 + " is missing!");
        }
      }
    }
  }
  /**
   * Run the algorithm.
   *
   * @param database Database to use
   * @param relation Relation to use
   * @return Result
   */
  public OutlierResult run(Database database, Relation<?> relation) {
    WritableDoubleDataStore scores =
        DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);

    DoubleMinMax minmax = new DoubleMinMax();

    try (InputStream in = FileUtil.tryGzipInput(new FileInputStream(file)); //
        TokenizedReader reader = CSVReaderFormat.DEFAULT_FORMAT.makeReader()) {
      Tokenizer tokenizer = reader.getTokenizer();
      CharSequence buf = reader.getBuffer();
      Matcher mi = idpattern.matcher(buf), ms = scorepattern.matcher(buf);
      reader.reset(in);
      while (reader.nextLineExceptComments()) {
        Integer id = null;
        double score = Double.NaN;
        for (
        /* initialized by nextLineExceptComments */ ; tokenizer.valid(); tokenizer.advance()) {
          mi.region(tokenizer.getStart(), tokenizer.getEnd());
          ms.region(tokenizer.getStart(), tokenizer.getEnd());
          final boolean mif = mi.find();
          final boolean msf = ms.find();
          if (mif && msf) {
            throw new AbortException(
                "ID pattern and score pattern both match value: " + tokenizer.getSubstring());
          }
          if (mif) {
            if (id != null) {
              throw new AbortException(
                  "ID pattern matched twice: previous value "
                      + id
                      + " second value: "
                      + tokenizer.getSubstring());
            }
            id = Integer.parseInt(buf.subSequence(mi.end(), tokenizer.getEnd()).toString());
          }
          if (msf) {
            if (!Double.isNaN(score)) {
              throw new AbortException(
                  "Score pattern matched twice: previous value "
                      + score
                      + " second value: "
                      + tokenizer.getSubstring());
            }
            score = ParseUtil.parseDouble(buf, ms.end(), tokenizer.getEnd());
          }
        }
        if (id != null && !Double.isNaN(score)) {
          scores.putDouble(DBIDUtil.importInteger(id), score);
          minmax.put(score);
        } else if (id == null && Double.isNaN(score)) {
          LOG.warning(
              "Line did not match either ID nor score nor comment: " + reader.getLineNumber());
        } else {
          throw new AbortException(
              "Line matched only ID or only SCORE patterns: " + reader.getLineNumber());
        }
      }
    } catch (IOException e) {
      throw new AbortException(
          "Could not load outlier scores: " + e.getMessage() + " when loading " + file, e);
    }

    OutlierScoreMeta meta;
    if (inverted) {
      meta = new InvertedOutlierScoreMeta(minmax.getMin(), minmax.getMax());
    } else {
      meta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax());
    }
    DoubleRelation scoresult =
        new MaterializedDoubleRelation(
            "External Outlier", "external-outlier", scores, relation.getDBIDs());
    OutlierResult or = new OutlierResult(meta, scoresult);

    // Apply scaling
    if (scaling instanceof OutlierScalingFunction) {
      ((OutlierScalingFunction) scaling).prepare(or);
    }
    DoubleMinMax mm = new DoubleMinMax();
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      double val = scoresult.doubleValue(iditer);
      val = scaling.getScaled(val);
      scores.putDouble(iditer, val);
      mm.put(val);
    }
    meta = new BasicOutlierScoreMeta(mm.getMin(), mm.getMax());
    or = new OutlierResult(meta, scoresult);
    return or;
  }