Example #1
0
 /**
  * Completes the preparation pass: derives one scaling reference value and one random generator
  * per attribute from the statistics in {@code mvs}, then releases those statistics.
  *
  * <p>The reference is either the sample standard deviation or the observed value range of the
  * attribute, multiplied by {@code percentage}. A zero or NaN result is replaced by
  * {@code percentage} itself.
  */
 @Override
 protected void prepareComplete() {
   final StringBuilder debug = LOG.isDebuggingFine() ? new StringBuilder() : null;
   scalingreferencevalues = new double[dimensionality];
   randomPerAttribute = new Random[dimensionality];

   final boolean byStddev = scalingreference == ScalingReference.STDDEV;
   // Range-based scaling is only derived here when no explicit minima/maxima are set.
   final boolean byExtent =
       !byStddev
           && scalingreference == ScalingReference.MINMAX
           && minima.length == 0
           && maxima.length == 0;

   if (byStddev || byExtent) {
     if (debug != null) {
       debug.append(byStddev ? "Standard deviation per attribute: " : "extension per attribute: ");
     }
     for (int dim = 0; dim < dimensionality; dim++) {
       double reference;
       if (byStddev) {
         reference = mvs[dim].getSampleStddev() * percentage;
       } else {
         reference = (mvs[dim].getMax() - mvs[dim].getMin()) * percentage;
       }
       // Degenerate attributes (constant or undefined spread) fall back to the plain percentage.
       if (reference == 0 || Double.isNaN(reference)) {
         reference = percentage;
       }
       scalingreferencevalues[dim] = reference;
       randomPerAttribute[dim] = new Random(RANDOM.nextLong());
       if (debug != null) {
         debug.append(" ").append(dim).append(": ").append(scalingreferencevalues[dim] / percentage);
       }
     }
   }

   // The collected statistics are not needed beyond this point.
   mvs = null;
   if (debug != null) {
     LOG.debugFine(debug.toString());
   }
 }
Example #2
0
  /**
   * Node addition phase (Algorithm 3 of Cheng and Church).
   *
   * <p>Repeatedly re-adds currently unselected columns and rows whose residue does not exceed the
   * residue of the candidate. If {@code useinverted} is set, unselected rows are additionally
   * tested in inverted form. The loop ends after a pass in which nothing was added.
   *
   * @param mat Data matrix
   * @param cand Bicluster candidate
   */
  private void nodeAddition(final double[][] mat, final BiclusterCandidate cand) {
    cand.updateRowAndColumnMeans(mat, true);
    cand.computeMeanSquaredDeviation(mat);

    boolean progress;
    do {
      // Written from inside the anonymous visitors, hence a final array:
      // [0] = a column was added, [1] = a row was added.
      final boolean[] grew = {false, false};

      // Step 2: add columns
      final CellVisitor columnAdder =
          new CellVisitor() {
            @Override
            public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
              assert (!selcol);
              if (cand.computeColResidue(mat, col) <= cand.residue) {
                cand.selectColumn(col, true);
                grew[0] = true;
              }
              return false;
            }
          };
      cand.visitRow(mat, 0, CellVisitor.NOT_SELECTED, columnAdder);

      // Step 3: recompute values, so the row tests below see the enlarged column set
      if (grew[0]) {
        cand.updateRowAndColumnMeans(mat, true);
        cand.computeMeanSquaredDeviation(mat);
      }

      // Step 4: try adding rows.
      final CellVisitor rowAdder =
          new CellVisitor() {
            @Override
            public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
              assert (!selrow);
              if (cand.computeRowResidue(mat, row, false) <= cand.residue) {
                cand.selectRow(row, true);
                grew[1] = true;
              }
              return false;
            }
          };
      cand.visitColumn(mat, 0, CellVisitor.NOT_SELECTED, rowAdder);

      // Step 5: try adding inverted rows.
      if (useinverted) {
        final CellVisitor invertedRowAdder =
            new CellVisitor() {
              @Override
              public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
                assert (!selrow);
                if (cand.computeRowResidue(mat, row, true) <= cand.residue) {
                  cand.selectRow(row, true);
                  cand.invertRow(row, true);
                  grew[1] = true;
                }
                return false;
              }
            };
        cand.visitColumn(mat, 0, CellVisitor.NOT_SELECTED, invertedRowAdder);
      }

      if (grew[1]) {
        cand.updateRowAndColumnMeans(mat, true);
        cand.computeMeanSquaredDeviation(mat);
        if (LOG.isDebuggingFine()) {
          LOG.debugFine(
              "Residue in Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
        }
      }

      progress = grew[0] || grew[1];
    } while (progress);
  }
Example #3
0
  /**
   * Single node deletion (Algorithm 1 of Cheng and Church).
   *
   * <p>While the residue exceeds {@code delta}, removes the one selected row or column with the
   * largest residue, then recomputes means and residue. Rows are only considered while more than
   * two rows are selected, and likewise for columns.
   *
   * <p>Inverted rows are not supported in this method.
   *
   * @param mat Data matrix
   * @param cand Bicluster candidate
   */
  private void singleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) {
    // Precondition: cand.residue is up to date on entry.
    for (; ; ) {
      if (!(cand.residue > delta)) {
        break;
      }
      if (!(cand.colcard > 2 || cand.rowcard > 2)) {
        break;
      }

      // Shared running maximum and the indices achieving it; arrays because the anonymous
      // visitors need final-but-mutable state. worstIdx[0] = row, worstIdx[1] = column.
      final double[] worst = {Double.NEGATIVE_INFINITY};
      final int[] worstIdx = {-1, -1};

      // Test rows
      if (cand.rowcard > 2) {
        final CellVisitor rowScan =
            new CellVisitor() {
              @Override
              public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
                assert (selrow);
                final double res = cand.computeRowResidue(mat, row, false);
                if (worst[0] < res) {
                  worst[0] = res;
                  worstIdx[0] = row;
                }
                return false;
              }
            };
        cand.visitColumn(mat, 0, CellVisitor.SELECTED, rowScan);
      }

      // Test columns: a column is only recorded if it beats the best row found above.
      if (cand.colcard > 2) {
        final CellVisitor columnScan =
            new CellVisitor() {
              @Override
              public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
                assert (selcol);
                final double res = cand.computeColResidue(mat, col);
                if (worst[0] < res) {
                  worst[0] = res;
                  worstIdx[1] = col;
                }
                return false;
              }
            };
        cand.visitRow(mat, 0, CellVisitor.SELECTED, columnScan);
      }

      // A recorded column takes precedence over the best row.
      final int worstCol = worstIdx[1];
      if (worstCol < 0) {
        final int worstRow = worstIdx[0];
        assert (worstRow >= 0);
        cand.selectRow(worstRow, false);
      } else {
        cand.selectColumn(worstCol, false);
      }

      // TODO: incremental update could be much faster?
      cand.updateRowAndColumnMeans(mat, false);
      cand.computeMeanSquaredDeviation(mat);
      if (LOG.isDebuggingFine()) {
        LOG.debugFine(
            "Residue in Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
    }
  }
Example #4
0
  /**
   * Algorithm 2 of Cheng and Church.
   *
   * <p>While the residue exceeds {@code delta}, removes in bulk every selected row, and then every
   * selected column, whose residue exceeds {@code alpha} times the current residue. Bulk removal
   * of rows (columns) is only attempted while more than {@code MIN_ROW_REMOVE_THRESHOLD}
   * ({@code MIN_COLUMN_REMOVE_THRESHOLD}) of them are selected, and stops as soon as that minimum
   * is reached. The loop ends when a full pass removes nothing.
   *
   * <p>Inverted rows are not supported in this method.
   *
   * @param mat Data matrix
   * @param cand Bicluster candidate
   */
  private void multipleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) {
    cand.updateRowAndColumnMeans(mat, false);
    cand.computeMeanSquaredDeviation(mat);

    // Note: assumes that cand.residue = H(I,J)
    while (cand.residue > delta) {
      // Written from the anonymous visitors: [0] = rows removed, [1] = columns removed.
      final boolean[] modified = {false, false};

      // Step 2: remove rows above threshold
      if (cand.rowcard > MIN_ROW_REMOVE_THRESHOLD) {
        // Fixed for the whole scan: all rows are judged against the same residue.
        final double alphaResidue = alpha * cand.residue;
        cand.visitColumn(
            mat,
            0,
            CellVisitor.SELECTED,
            new CellVisitor() {
              @Override
              public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
                assert (selrow);
                if (cand.computeRowResidue(mat, row, false) > alphaResidue) {
                  cand.selectRow(row, false);
                  modified[0] = true;
                }
                // Stop only once the minimum row count is reached. The previous condition
                // (rowcard > threshold) ended the scan after the first row while above the
                // minimum, and kept deleting once the minimum had been reached.
                // NOTE(review): relies on "return true to stop", consistent with the full-scan
                // visitors in this class that always return false - confirm against CellVisitor.
                return (cand.rowcard <= MIN_ROW_REMOVE_THRESHOLD);
              }
            });

        // Step 3: update residue
        if (modified[0]) {
          cand.updateRowAndColumnMeans(mat, false);
          cand.computeMeanSquaredDeviation(mat);
        }
      }

      // Step 4: remove columns above threshold
      if (cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD) {
        // Recomputed here so the column test uses the residue after any row removal.
        final double alphaResidue = alpha * cand.residue;
        cand.visitRow(
            mat,
            0,
            CellVisitor.SELECTED,
            new CellVisitor() {
              @Override
              public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
                assert (selcol);
                if (cand.computeColResidue(mat, col) > alphaResidue) {
                  cand.selectColumn(col, false);
                  modified[1] = true;
                }
                // Same stop rule as for rows: halt once the minimum column count is reached.
                return (cand.colcard <= MIN_COLUMN_REMOVE_THRESHOLD);
              }
            });
        if (modified[1]) {
          cand.updateRowAndColumnMeans(mat, false);
          cand.computeMeanSquaredDeviation(mat);
        }
      }

      if (LOG.isDebuggingFine()) {
        LOG.debugFine(
            "Residue in Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
      }
      // Step 5: if nothing has been removed, stop here. Single node deletion is
      // executed next in the main loop, as per algorithm 4.
      if (!modified[0] && !modified[1]) {
        break;
      }
    }
  }
Example #5
0
  /**
   * Performs the SUBCLU algorithm on the given database.
   *
   * <p>Runs DBSCAN on every single attribute first, then works bottom-up: candidate subspaces of
   * the next dimensionality are generated from the subspaces that contained clusters, and each
   * candidate is clustered only within the clusters of its best lower-dimensional subspace. The
   * search stops when a dimensionality yields no subspace with clusters.
   *
   * <p>The resulting clustering is also stored in the {@code result} field.
   *
   * @param relation Relation to process
   * @return Clustering result
   */
  public Clustering<SubspaceModel> run(Relation<V> relation) {
    final int dimensionality = RelationUtil.dimensionality(relation);

    StepProgress stepprog = LOG.isVerbose() ? new StepProgress(dimensionality) : null;

    // Generate all 1-dimensional clusters
    LOG.beginStep(stepprog, 1, "Generate all 1-dimensional clusters.");

    // mapping of (dimensionality - 1) to the subspaces of that dimensionality containing clusters
    HashMap<Integer, List<Subspace>> subspaceMap = new HashMap<>();

    // list of 1-dimensional subspaces containing clusters
    List<Subspace> s_1 = new ArrayList<>();
    subspaceMap.put(0, s_1);

    // mapping of subspaces to list of clusters
    TreeMap<Subspace, List<Cluster<Model>>> clusterMap =
        new TreeMap<>(new Subspace.DimensionComparator());

    for (int d = 0; d < dimensionality; d++) {
      Subspace currentSubspace = new Subspace(d);
      // null ids: presumably "cluster all objects" - confirm in runDBSCAN
      List<Cluster<Model>> clusters = runDBSCAN(relation, null, currentSubspace);

      if (LOG.isDebuggingFiner()) {
        StringBuilder msg = new StringBuilder();
        msg.append('\n')
            .append(clusters.size())
            .append(" clusters in subspace ")
            .append(currentSubspace.dimensonsToString())
            .append(": \n");
        for (Cluster<Model> cluster : clusters) {
          msg.append("      ").append(cluster.getIDs()).append('\n');
        }
        LOG.debugFiner(msg.toString());
      }

      if (!clusters.isEmpty()) {
        s_1.add(currentSubspace);
        clusterMap.put(currentSubspace, clusters);
      }
    }

    // Generate (d+1)-dimensional clusters from d-dimensional clusters
    for (int d = 0; d < dimensionality - 1; d++) {
      if (stepprog != null) {
        stepprog.beginStep(
            d + 2,
            "Generate "
                + (d + 2)
                + "-dimensional clusters from "
                + (d + 1)
                + "-dimensional clusters.",
            LOG);
      }

      List<Subspace> subspaces = subspaceMap.get(d);
      if (subspaces == null || subspaces.isEmpty()) {
        // Nothing to extend: mark all remaining steps as not applicable and stop.
        if (stepprog != null) {
          for (int dim = d + 1; dim < dimensionality - 1; dim++) {
            stepprog.beginStep(
                dim + 2,
                "Generation of "
                    + (dim + 2)
                    + "-dimensional clusters not applicable, because no more "
                    + (d + 2)
                    + "-dimensional subspaces found.",
                LOG);
          }
        }
        break;
      }

      List<Subspace> candidates = generateSubspaceCandidates(subspaces);
      List<Subspace> s_d = new ArrayList<>();

      for (Subspace candidate : candidates) {
        Subspace bestSubspace = bestSubspace(subspaces, candidate, clusterMap);
        if (LOG.isDebuggingFine()) {
          LOG.debugFine(
              "best subspace of "
                  + candidate.dimensonsToString()
                  + ": "
                  + bestSubspace.dimensonsToString());
        }

        // Only the members of the best subspace's clusters are re-clustered in the candidate.
        List<Cluster<Model>> bestSubspaceClusters = clusterMap.get(bestSubspace);
        List<Cluster<Model>> clusters = new ArrayList<>();
        for (Cluster<Model> cluster : bestSubspaceClusters) {
          clusters.addAll(runDBSCAN(relation, cluster.getIDs(), candidate));
        }

        if (LOG.isDebuggingFine()) {
          StringBuilder msg = new StringBuilder();
          msg.append(clusters.size())
              .append(" cluster(s) in subspace ")
              .append(candidate)
              .append(": \n");
          for (Cluster<Model> c : clusters) {
            msg.append("      ").append(c.getIDs()).append('\n');
          }
          LOG.debugFine(msg.toString());
        }

        if (!clusters.isEmpty()) {
          s_d.add(candidate);
          clusterMap.put(candidate, clusters);
        }
      }

      if (!s_d.isEmpty()) {
        subspaceMap.put(d + 1, s_d);
      }
    }

    // build result, iterating the subspaces in descending comparator order
    int numClusters = 1;
    result = new Clustering<>("SUBCLU clustering", "subclu-clustering");
    for (Subspace subspace : clusterMap.descendingKeySet()) {
      List<Cluster<Model>> clusters = clusterMap.get(subspace);
      for (Cluster<Model> cluster : clusters) {
        Cluster<SubspaceModel> newCluster = new Cluster<>(cluster.getIDs());
        newCluster.setModel(new SubspaceModel(subspace, Centroid.make(relation, cluster.getIDs())));
        newCluster.setName("cluster_" + numClusters++);
        result.addToplevelCluster(newCluster);
      }
    }

    LOG.setCompleted(stepprog);
    return result;
  }