Example no. 1
  /** Runs the wrapper with the specified arguments. */
  @Override
  public void run() throws UnableToComplyException {
    MultipleObjectsBundle data = generator.loadData();
    if (LOG.isVerbose()) {
      LOG.verbose("Writing output ...");
    }
    try {
      if (outputFile.exists()) {
        if (LOG.isVerbose()) {
          LOG.verbose(
              "The file "
                  + outputFile
                  + " already exists, "
                  + "the generator result will be APPENDED.");
        }
      }

      try (OutputStreamWriter outStream = new FileWriter(outputFile, true)) {
        writeClusters(outStream, data);
      }
    } catch (IOException e) {
      throw new UnableToComplyException(e);
    }
    if (LOG.isVerbose()) {
      LOG.verbose("Done.");
    }
  }
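For context, writeClusters is not shown in this example. A minimal sketch of what it might look like, assuming MultipleObjectsBundle's dataLength(), metaLength() and data(row, col) accessors; the output format chosen here is illustrative only:
  /** Hypothetical sketch of writeClusters: one object per line, space-separated columns. */
  private void writeClusters(OutputStreamWriter outStream, MultipleObjectsBundle data)
      throws IOException {
    for (int row = 0; row < data.dataLength(); row++) {
      StringBuilder buf = new StringBuilder();
      for (int col = 0; col < data.metaLength(); col++) {
        if (col > 0) {
          buf.append(' ');
        }
        buf.append(data.data(row, col));
      }
      outStream.write(buf.append('\n').toString());
    }
  }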
Example no. 2
  @Override
  public Clustering<BiclusterWithInversionsModel> biclustering() {
    double[][] mat = RelationUtil.relationAsMatrix(relation, rowIDs);

    BiclusterCandidate cand = new BiclusterCandidate(getRowDim(), getColDim());

    Clustering<BiclusterWithInversionsModel> result =
        new Clustering<>("Cheng-and-Church", "Cheng and Church Biclustering");
    ModifiableDBIDs noise = DBIDUtil.newHashSet(relation.getDBIDs());

    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Extracting Cluster", n, LOG) : null;
    for (int i = 0; i < n; i++) {
      cand.reset();
      multipleNodeDeletion(mat, cand);
      if (LOG.isVeryVerbose()) {
        LOG.veryverbose(
            "Residue after Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
      singleNodeDeletion(mat, cand);
      if (LOG.isVeryVerbose()) {
        LOG.veryverbose(
            "Residue after Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
      nodeAddition(mat, cand);
      if (LOG.isVeryVerbose()) {
        LOG.veryverbose(
            "Residue after Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
      cand.maskMatrix(mat, dist);
      BiclusterWithInversionsModel model =
          new BiclusterWithInversionsModel(colsBitsetToIDs(cand.cols), rowsBitsetToIDs(cand.irow));
      final ArrayDBIDs cids = rowsBitsetToIDs(cand.rows);
      noise.removeDBIDs(cids);
      result.addToplevelCluster(new Cluster<>(cids, model));

      if (LOG.isVerbose()) {
        LOG.verbose("Score of bicluster " + (i + 1) + ": " + cand.residue + "\n");
        LOG.verbose("Number of rows: " + cand.rowcard + "\n");
        LOG.verbose("Number of columns: " + cand.colcard + "\n");
        // LOG.verbose("Total number of masked values: " + maskedVals.size() +
        // "\n");
      }
      LOG.incrementProcessed(prog);
    }
    // Add a noise cluster, full-dimensional.
    if (!noise.isEmpty()) {
      long[] allcols = BitsUtil.ones(getColDim());
      BiclusterWithInversionsModel model =
          new BiclusterWithInversionsModel(colsBitsetToIDs(allcols), DBIDUtil.EMPTYDBIDS);
      result.addToplevelCluster(new Cluster<>(noise, true, model));
    }
    LOG.ensureCompleted(prog);
    return result;
  }
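The residue field tracked by the candidate is the Cheng-and-Church mean squared residue H(I, J). A self-contained sketch of that score, simplified to a plain double[][] with non-empty boolean row/column masks (this is not ELKI's BiclusterCandidate, and it omits the inverted-row handling):
  /** Sketch: mean squared residue of the submatrix selected by rows/cols. */
  static double meanSquaredResidue(double[][] mat, boolean[] rows, boolean[] cols) {
    int nrow = 0, ncol = 0;
    for (boolean r : rows) {
      if (r) {
        nrow++;
      }
    }
    for (boolean c : cols) {
      if (c) {
        ncol++;
      }
    }
    // Row, column and overall means of the selected submatrix.
    double[] rowMean = new double[rows.length];
    double[] colMean = new double[cols.length];
    double allMean = 0.;
    for (int i = 0; i < rows.length; i++) {
      if (rows[i]) {
        for (int j = 0; j < cols.length; j++) {
          if (cols[j]) {
            rowMean[i] += mat[i][j] / ncol;
            colMean[j] += mat[i][j] / nrow;
            allMean += mat[i][j] / (nrow * (double) ncol);
          }
        }
      }
    }
    // H(I, J): mean of the squared residues a_ij - a_iJ - a_Ij + a_IJ.
    double h = 0.;
    for (int i = 0; i < rows.length; i++) {
      if (rows[i]) {
        for (int j = 0; j < cols.length; j++) {
          if (cols[j]) {
            double res = mat[i][j] - rowMean[i] - colMean[j] + allMean;
            h += res * res;
          }
        }
      }
    }
    return h / (nrow * (double) ncol);
  }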
Example no. 3
  /**
   * Process a database
   *
   * @param database Database to process
   * @param relation Relation to process
   * @return Histogram of ranking qualities
   */
  public HistogramResult<DoubleVector> run(Database database, Relation<O> relation) {
    final DistanceQuery<O> distanceQuery =
        database.getDistanceQuery(relation, getDistanceFunction());
    final KNNQuery<O> knnQuery = database.getKNNQuery(distanceQuery, relation.size());

    if (LOG.isVerbose()) {
      LOG.verbose("Preprocessing clusters...");
    }
    // Cluster by labels
    Collection<Cluster<Model>> split =
        (new ByLabelOrAllInOneClustering()).run(database).getAllClusters();

    DoubleStaticHistogram hist = new DoubleStaticHistogram(numbins, 0.0, 1.0);

    if (LOG.isVerbose()) {
      LOG.verbose("Processing points...");
    }
    FiniteProgress progress =
        LOG.isVerbose()
            ? new FiniteProgress("Computing ROC AUC values", relation.size(), LOG)
            : null;

    MeanVariance mv = new MeanVariance();
    // For each object, evaluate the ROC AUC of its kNN ranking against its class.
    for (Cluster<?> clus : split) {
      for (DBIDIter iter = clus.getIDs().iter(); iter.valid(); iter.advance()) {
        KNNList knn = knnQuery.getKNNForDBID(iter, relation.size());
        double result = new ROCEvaluation().evaluate(clus, knn);

        mv.put(result);
        hist.increment(result, 1. / relation.size());

        LOG.incrementProcessed(progress);
      }
    }
    LOG.ensureCompleted(progress);

    // Transform Histogram into a Double Vector array.
    Collection<DoubleVector> res = new ArrayList<>(numbins);
    for (DoubleStaticHistogram.Iter iter = hist.iter(); iter.valid(); iter.advance()) {
      DoubleVector row = new DoubleVector(new double[] {iter.getCenter(), iter.getValue()});
      res.add(row);
    }
    HistogramResult<DoubleVector> result =
        new HistogramResult<>("Ranking Quality Histogram", "ranking-histogram", res);
    result.addHeader("Mean: " + mv.getMean() + " Variance: " + mv.getSampleVariance());
    return result;
  }
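ROCEvaluation's internals are not shown above. Conceptually, the AUC of a single ranking can be computed by counting, for each negative, how many positives were ranked ahead of it. A minimal sketch under that assumption (tie handling omitted; names are illustrative):
  /** Hypothetical sketch: ROC AUC of a ranking, with cluster members as positives. */
  static double rocAuc(Set<Integer> positives, int[] rankedIds) {
    long pos = 0, neg = 0, correctPairs = 0;
    for (int id : rankedIds) {
      if (positives.contains(id)) {
        pos++; // positives seen so far
      } else {
        neg++;
        correctPairs += pos; // every positive seen so far outranks this negative
      }
    }
    // Degenerate rankings (no positives, or no negatives) get the neutral 0.5.
    return (pos == 0 || neg == 0) ? 0.5 : correctPairs / (double) (pos * neg);
  }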
Example no. 4
  /**
   * Runs the DBSCAN algorithm on the specified partition of the database in the given subspace. If
   * parameter {@code ids} is null DBSCAN will be applied to the whole database.
   *
   * @param relation the database holding the objects to run DBSCAN on
   * @param ids the IDs of the database defining the partition to run DBSCAN on - if this parameter
   *     is null DBSCAN will be applied to the whole database
   * @param subspace the subspace to run DBSCAN on
   * @return the clustering result of the DBSCAN run
   */
  private List<Cluster<Model>> runDBSCAN(Relation<V> relation, DBIDs ids, Subspace subspace) {
    // distance function
    distanceFunction.setSelectedDimensions(subspace.getDimensions());

    ProxyDatabase proxy;
    if (ids == null) {
      // TODO: in this case, we might want to use an index - the proxy below
      // will prevent this!
      ids = relation.getDBIDs();
    }

    proxy = new ProxyDatabase(ids, relation);

    DBSCAN<V> dbscan = new DBSCAN<>(distanceFunction, epsilon, minpts);
    // run DBSCAN
    if (LOG.isVerbose()) {
      LOG.verbose("\nRun DBSCAN on subspace " + subspace.dimensonsToString());
    }
    Clustering<Model> dbsres = dbscan.run(proxy);

    // separate cluster and noise
    List<Cluster<Model>> clusterAndNoise = dbsres.getAllClusters();
    List<Cluster<Model>> clusters = new ArrayList<>();
    for (Cluster<Model> c : clusterAndNoise) {
      if (!c.isNoise()) {
        clusters.add(c);
      }
    }
    return clusters;
  }
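A hypothetical caller sketch: SUBCLU-style algorithms invoke runDBSCAN once per candidate subspace and keep only the subspaces that produced at least one cluster. This assumes Subspace offers a single-dimension constructor; the variable names are illustrative:
    // Probe every 1-dimensional subspace of the full database (ids == null).
    List<Subspace> seeds = new ArrayList<>();
    for (int d = 0; d < RelationUtil.dimensionality(relation); d++) {
      Subspace s = new Subspace(d);
      if (!runDBSCAN(relation, null, s).isEmpty()) {
        seeds.add(s);
      }
    }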
Example no. 5
  /**
   * Run the Eclat algorithm
   *
   * @param db Database to process
   * @param relation Bit vector relation
   * @return Frequent patterns found
   */
  public FrequentItemsetsResult run(Database db, final Relation<BitVector> relation) {
    // TODO: implement with resizable arrays, to not need dim.
    final int dim = RelationUtil.dimensionality(relation);
    final VectorFieldTypeInformation<BitVector> meta = RelationUtil.assumeVectorField(relation);
    // Compute absolute minsupport
    final int minsupp = getMinimumSupport(relation.size());

    LOG.verbose("Build 1-dimensional transaction lists.");
    Duration ctime = LOG.newDuration(STAT + "eclat.transposition.time").begin();
    DBIDs[] idx = buildIndex(relation, dim, minsupp);
    LOG.statistics(ctime.end());

    FiniteProgress prog =
        LOG.isVerbose() ? new FiniteProgress("Building frequent itemsets", idx.length, LOG) : null;
    Duration etime = LOG.newDuration(STAT + "eclat.extraction.time").begin();
    final List<Itemset> solution = new ArrayList<>();
    for (int i = 0; i < idx.length; i++) {
      LOG.incrementProcessed(prog);
      extractItemsets(idx, i, minsupp, solution);
    }
    LOG.ensureCompleted(prog);
    Collections.sort(solution);
    LOG.statistics(etime.end());

    LOG.statistics(new LongStatistic(STAT + "frequent-itemsets", solution.size()));
    return new FrequentItemsetsResult("Eclat", "eclat", solution, meta);
  }
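The core of extractItemsets (not shown) is intersecting transaction-id lists: the support of {A, B} is |tids(A) ∩ tids(B)|. A sketch of that merge step over sorted int[] arrays instead of ELKI's DBIDs:
  /** Sketch: intersection of two sorted transaction-id lists (the Eclat merge step). */
  static int[] intersect(int[] a, int[] b) {
    int[] out = new int[Math.min(a.length, b.length)];
    int i = 0, j = 0, k = 0;
    while (i < a.length && j < b.length) {
      if (a[i] < b[j]) {
        i++;
      } else if (a[i] > b[j]) {
        j++;
      } else {
        out[k++] = a[i]; // this transaction contains both items
        i++;
        j++;
      }
    }
    return Arrays.copyOf(out, k);
  }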
Example no. 6
  private void loadCache(DistanceParser parser, File matrixfile) throws IOException {
    cache =
        new TLongFloatHashMap(
            Constants.DEFAULT_CAPACITY,
            Constants.DEFAULT_LOAD_FACTOR,
            -1L,
            Float.POSITIVE_INFINITY);
    min = Integer.MAX_VALUE;
    max = Integer.MIN_VALUE;
    // try-with-resources: close the (possibly gzip-compressed) input even if parsing fails.
    try (InputStream in =
        new BufferedInputStream(FileUtil.tryGzipInput(new FileInputStream(matrixfile)))) {
      parser.parse(
          in,
          new DistanceCacheWriter() {
            @Override
            public void put(int id1, int id2, double distance) {
              if (id1 < id2) {
                min = id1 < min ? id1 : min;
                max = id2 > max ? id2 : max;
              } else {
                min = id2 < min ? id2 : min;
                max = id1 > max ? id1 : max;
              }
              cache.put(makeKey(id1, id2), (float) distance);
            }

            @Override
            public boolean containsKey(int id1, int id2) {
              return cache.containsKey(makeKey(id1, id2));
            }
          });
    }
    if (min != 0) {
      LOG.verbose(
          "Distance matrix is supposed to be 0-indexed. Choosing offset "
              + min
              + " to compensate.");
    }
  }
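makeKey is not shown in this example. A plausible sketch, assuming it packs the two ids order-independently into a single long so that each pair needs only one map entry:
  /** Hypothetical sketch of makeKey: symmetric packing of two ids into one long. */
  protected static long makeKey(int id1, int id2) {
    // Smaller id in the high 32 bits, so that makeKey(a, b) == makeKey(b, a).
    return (id1 < id2)
        ? (((long) id1) << 32) | (id2 & 0xFFFFFFFFL)
        : (((long) id2) << 32) | (id1 & 0xFFFFFFFFL);
  }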
Example no. 7
  /**
   * Run the algorithm
   *
   * @param db Database
   * @param relation Relation
   * @return Clustering hierarchy
   */
  public PointerHierarchyRepresentationResult run(Database db, Relation<O> relation) {
    DistanceQuery<O> dq = db.getDistanceQuery(relation, getDistanceFunction());
    ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
    final int size = ids.size();

    if (size > 0x10000) {
      throw new AbortException(
          "This implementation does not scale to data sets larger than "
              + 0x10000
              + " instances (~17 GB RAM), which results in an integer overflow.");
    }
    if (Linkage.SINGLE.equals(linkage)) {
      LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!");
    }

    // Compute the initial (lower triangular) distance matrix.
    double[] scratch = new double[triangleSize(size)];
    DBIDArrayIter ix = ids.iter(), iy = ids.iter(), ij = ids.iter();
    // Position counter - must agree with computeOffset!
    int pos = 0;
    boolean square =
        Linkage.WARD.equals(linkage)
            && !(SquaredEuclideanDistanceFunction.class.isInstance(getDistanceFunction()));
    for (int x = 0; ix.valid(); x++, ix.advance()) {
      iy.seek(0);
      for (int y = 0; y < x; y++, iy.advance()) {
        scratch[pos] = dq.distance(ix, iy);
        // Ward uses variances -- i.e. squared values
        if (square) {
          scratch[pos] *= scratch[pos];
        }
        pos++;
      }
    }

    // Initialize space for result:
    WritableDBIDDataStore parent =
        DataStoreUtil.makeDBIDStorage(
            ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
    WritableDoubleDataStore height =
        DataStoreUtil.makeDoubleStorage(
            ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
    WritableIntegerDataStore csize =
        DataStoreUtil.makeIntegerStorage(
            ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
    for (DBIDIter it = ids.iter(); it.valid(); it.advance()) {
      parent.put(it, it);
      height.put(it, Double.POSITIVE_INFINITY);
      csize.put(it, 1);
    }

    // Repeat until everything merged, except the desired number of clusters:
    FiniteProgress prog =
        LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", size - 1, LOG) : null;
    for (int i = 1; i < size; i++) {
      double min = Double.POSITIVE_INFINITY;
      int minx = -1, miny = -1;
      for (ix.seek(0); ix.valid(); ix.advance()) {
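        // Merged points carry a finite height (set below); only points that
        // still represent a cluster (height == +infinity) participate.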
        if (height.doubleValue(ix) < Double.POSITIVE_INFINITY) {
          continue;
        }
        final int xbase = triangleSize(ix.getOffset());
        for (iy.seek(0); iy.getOffset() < ix.getOffset(); iy.advance()) {
          if (height.doubleValue(iy) < Double.POSITIVE_INFINITY) {
            continue;
          }
          final int idx = xbase + iy.getOffset();
          if (scratch[idx] <= min) {
            min = scratch[idx];
            minx = ix.getOffset();
            miny = iy.getOffset();
          }
        }
      }
      assert (minx >= 0 && miny >= 0);
      // Avoid allocating memory, by reusing existing iterators:
      ix.seek(minx);
      iy.seek(miny);
      // Perform merge in data structure: x -> y
      // Since y < x, prefer keeping y, dropping x.
      int sizex = csize.intValue(ix), sizey = csize.intValue(iy);
      height.put(ix, min);
      parent.put(ix, iy);
      csize.put(iy, sizex + sizey);

      // Update distance matrix. Note: miny < minx
      final int xbase = triangleSize(minx), ybase = triangleSize(miny);
      // Write to (y, j), with j < y
      for (ij.seek(0); ij.getOffset() < miny; ij.advance()) {
        if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) {
          continue;
        }
        final int sizej = csize.intValue(ij);
        scratch[ybase + ij.getOffset()] =
            linkage.combine(
                sizex,
                scratch[xbase + ij.getOffset()],
                sizey,
                scratch[ybase + ij.getOffset()],
                sizej,
                min);
      }
      // Write to (j, y), with y < j < x
      for (ij.seek(miny + 1); ij.getOffset() < minx; ij.advance()) {
        if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) {
          continue;
        }
        final int jbase = triangleSize(ij.getOffset());
        final int sizej = csize.intValue(ij);
        scratch[jbase + miny] =
            linkage.combine(
                sizex, scratch[xbase + ij.getOffset()], sizey, scratch[jbase + miny], sizej, min);
      }
      // Write to (j, y), with y < x < j
      for (ij.seek(minx + 1); ij.valid(); ij.advance()) {
        if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) {
          continue;
        }
        final int jbase = triangleSize(ij.getOffset());
        final int sizej = csize.intValue(ij);
        scratch[jbase + miny] =
            linkage.combine(sizex, scratch[jbase + minx], sizey, scratch[jbase + miny], sizej, min);
      }
      LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);

    return new PointerHierarchyRepresentationResult(ids, parent, height);
  }
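triangleSize, referenced throughout, is assumed to index a lower-triangular matrix packed row by row (the "position counter" comment above must agree with it). A sketch consistent with the offsets used in the code:
  /**
   * Sketch: number of matrix cells in rows 0 .. x-1 of the packed lower
   * triangle, so the distance of the pair (x, y) with y < x is stored at
   * scratch[triangleSize(x) + y].
   */
  protected static int triangleSize(int x) {
    return (x * (x - 1)) >>> 1;
  }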
Example no. 8
  protected void autoEvaluateOutliers(ResultHierarchy hier, Result newResult) {
    Collection<OutlierResult> outliers =
        ResultUtil.filterResults(hier, newResult, OutlierResult.class);
    if (LOG.isDebugging()) {
      LOG.debug("Number of new outlier results: " + outliers.size());
    }
    if (!outliers.isEmpty()) {
      Database db = ResultUtil.findDatabase(hier);
      ResultUtil.ensureClusteringResult(db, db);
      Collection<Clustering<?>> clusterings = ResultUtil.filterResults(hier, db, Clustering.class);
      if (clusterings.isEmpty()) {
        LOG.warning(
            "Could not find a clustering result, even after running 'ensureClusteringResult'?!?");
        return;
      }
      Clustering<?> basec = clusterings.iterator().next();
      // Find the minority class label
      int min = Integer.MAX_VALUE;
      int total = 0;
      String label = null;
      if (basec.getAllClusters().size() > 1) {
        for (Cluster<?> c : basec.getAllClusters()) {
          final int csize = c.getIDs().size();
          total += csize;
          if (csize < min) {
            min = csize;
            label = c.getName();
          }
        }
      }
      if (label == null) {
        LOG.warning("Could not evaluate outlier results, as I could not find a minority label.");
        return;
      }
      if (min == 1) {
        LOG.warning(
            "The minority class label had a single object. Try using 'ClassLabelFilter' to identify the class label column.");
      }
      if (min > 0.05 * total) {
        LOG.warning(
            "The minority class I discovered (labeled '"
                + label
                + "') has "
                + (min * 100. / total)
                + "% of objects. Outlier classes should be more rare!");
      }
      LOG.verbose("Evaluating using minority class: " + label);
      Pattern pat = Pattern.compile("^" + Pattern.quote(label) + "$");
      // Evaluate rankings.
      new OutlierRankingEvaluation(pat).processNewResult(hier, newResult);
      // Compute the ROC curve
      new OutlierROCCurve(pat).processNewResult(hier, newResult);
      // Compute precision at k, for k up to twice the minority class size
      new OutlierPrecisionAtKCurve(pat, min << 1).processNewResult(hier, newResult);
      // Compute the precision-recall curve
      new OutlierPrecisionRecallCurve(pat).processNewResult(hier, newResult);
      // Compute the outlier score histogram
      new ComputeOutlierHistogram(pat, 50, new LinearScaling(), false)
          .processNewResult(hier, newResult);
    }
  }