Example #1
0
  /**
   * Builds a flat clustering of {@code parent} from the current instance-to-cluster mapping.
   *
   * <p>The dataset is first stored via {@code setDataset}. Position {@code i} of the array
   * returned by {@code getMapping()} is then read as the cluster ID of the i-th instance, and
   * that instance is added to the cluster returned by {@code createCluster(id, capacity)}.
   *
   * @param parent dataset whose instances are distributed into clusters
   * @return the clustering, with the dataset added to its lookup and {@code props} applied
   *     when they are not {@code null}
   * @throws RuntimeException when {@code getMapping()} returns {@code null}
   */
  @Override
  public Clustering getClustering(Dataset<E> parent) {
    setDataset(parent);

    // heuristic sizing only: roughly sqrt(n) clusters with instances spread evenly
    int expectedClusters = (int) Math.sqrt(dataset.size());
    Clustering clustering = new ClusterList(expectedClusters);
    int expectedPerCluster = (int) (parent.size() / (float) expectedClusters);

    int[] clusterIds = getMapping();
    if (clusterIds == null) {
      // try some cutoff method?
      throw new RuntimeException("don't know how to get clusters..");
    }

    int position = 0;
    for (int clusterId : clusterIds) {
      // NOTE(review): called once per instance; presumably returns the already existing
      // cluster when the ID was seen before - confirm against ClusterList
      Cluster target = clustering.createCluster(clusterId, expectedPerCluster);
      target.add(dataset.get(position));
      position++;
    }

    clustering.lookupAdd(dataset);
    if (props != null) {
      clustering.setParams(props);
    }
    return clustering;
  }
Example #2
0
 /**
  * Dummy mapping for debugging purposes: replaces the tree data with a fresh
  * {@code DynamicTreeData} and maps every instance index of the dataset to itself.
  */
 public void createMapping() {
   treeData = new DynamicTreeData();
   mapping = new int[dataset.size()];
   // identity assignment; fill order is irrelevant
   for (int idx = mapping.length - 1; idx >= 0; idx--) {
     mapping[idx] = idx;
   }
 }
Example #3
0
  /**
   * Runs single-linkage row clustering on the 6-instance Kumar dataset with the proximity
   * matrix retained, then checks the matrix dimensions, the number of leaves, the root
   * height and the number of distinct merge heights.
   */
  @Test
  public void testSingleLinkage() {
    Dataset<? extends Instance> dataset = FakeClustering.kumarData();
    assertEquals(6, dataset.size());

    Props pref = new Props();
    pref.put(AlgParams.LINKAGE, SingleLinkage.name);
    pref.put(AlgParams.CLUSTERING_TYPE, ClusteringType.ROWS_CLUSTERING);
    // ask the algorithm to keep the proximity matrix so it can be inspected below
    pref.put(PropType.PERFORMANCE, AlgParams.KEEP_PROXIMITY, true);
    HierarchicalResult result = subject.hierarchy(dataset, pref);

    Matrix similarityMatrix = result.getProximityMatrix();
    assertNotNull(similarityMatrix);
    // expected value goes first so that a failure message reports the right sides
    assertEquals(dataset.size(), similarityMatrix.rowsCount());
    assertEquals(dataset.size(), similarityMatrix.columnsCount());

    System.out.println("kumar - single");
    DendroTreeData tree = result.getTreeData();
    tree.print();
    assertEquals(dataset.size(), tree.numLeaves());
    DendroNode root = tree.getRoot();
    assertEquals(0.21587033144922904, root.getHeight(), DELTA);

    // heights closer than 1e-7 are counted as one level
    int levels = tree.distinctHeights(1e-7);
    // TODO: in this example nodes #7 and #8 are on different level,
    // but their height is the same. should we consider those as different
    assertEquals(4, levels);
  }
Example #4
0
  /**
   * Returns a cached clustering of the wine dataset made of three clusters named
   * "cluster A", "cluster B" and "cluster C".
   *
   * <p>The clustering is built on the first call, stored in {@code simpleResponse} and
   * returned as the same instance on every later call.
   *
   * @see <a
   *     href="http://alias-i.com/lingpipe/docs/api/com/aliasi/classify/PrecisionRecallEvaluation.html">
   *     LingPipe PrecisionRecallEvaluation</a>
   * @return the shared clustering holding clusters A, B and C, in that order
   */
  public static Clustering wineClustering() {
    if (simpleResponse != null) {
      return simpleResponse;
    }

    simpleResponse = new ClusterList(3);

    Cluster first = new BaseCluster(13);
    first.setName("cluster A");
    first.setAttribute(0, first.attributeBuilder().create("x", BasicAttrType.INTEGER));

    Cluster second = new BaseCluster(9);
    second.setName("cluster B");
    second.setAttribute(0, second.attributeBuilder().create("x", BasicAttrType.INTEGER));

    Cluster third = new BaseCluster(5);
    third.setName("cluster C");
    third.setAttribute(0, third.attributeBuilder().create("x", BasicAttrType.INTEGER));

    Dataset<Instance> wines = wine();
    System.out.println("dataset size " + wines.size());

    // owner[i] selects the cluster (0 = A, 1 = B, 2 = C) that receives instance i;
    // instances are added in ascending index order
    Cluster[] targets = {first, second, third};
    int[] owner = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, // 0-8   -> A
      1, // 9     -> B
      2, // 10    -> C
      1, 1, // 11-12 -> B
      0, 0, // 13-14 -> A
      2, // 15    -> C
      1, 1, 1, 1, 1, // 16-20 -> B
      0, // 21    -> A
      2, 2, 2, 2, // 22-25 -> C
      1 // 26    -> B
    };
    for (int idx = 0; idx < owner.length; idx++) {
      targets[owner[idx]].add(wines.instance(idx));
    }

    simpleResponse.add(first);
    simpleResponse.add(second);
    simpleResponse.add(third);
    return simpleResponse;
  }
Example #5
0
 /**
  * Returns how many items are being clustered, depending on {@code resultType}: the
  * dataset's attribute count for column clustering, its instance count for row clustering.
  *
  * @return number of attributes or number of instances of the dataset
  * @throws RuntimeException when {@code resultType} is {@code null} or any other value
  */
 @Override
 public int size() {
   // a switch on null would raise a bare NullPointerException; guard it so an unset
   // type produces the descriptive error below instead
   if (resultType != null) {
     switch (resultType) {
       case COLUMNS_CLUSTERING:
         return dataset.attributeCount();
       case ROWS_CLUSTERING:
         return dataset.size();
       default:
         break;
     }
   }
   throw new RuntimeException("Don't know wether cluster rows or columns.");
 }
Example #6
0
  /**
   * Runs single-linkage row clustering on the 17-instance school dataset and checks the
   * leaf count, root height, total node count, distinct heights and tree depth.
   */
  @Test
  public void testSingleLinkageSchool() {
    Dataset<? extends Instance> school = FakeClustering.schoolData();
    assertEquals(17, school.size());

    Props params = new Props();
    params.put(AlgParams.LINKAGE, SingleLinkage.name);
    params.put(AlgParams.CLUSTERING_TYPE, ClusteringType.ROWS_CLUSTERING);

    HierarchicalResult hierarchy = subject.hierarchy(school, params);
    System.out.println("school - single");
    DendroTreeData dendrogram = hierarchy.getTreeData();
    dendrogram.print();

    assertEquals(school.size(), dendrogram.numLeaves());
    DendroNode top = dendrogram.getRoot();
    assertEquals(32.542734980330046, top.getHeight(), DELTA);
    // expected node count is 2n - 1 for n instances
    assertEquals(2 * school.size() - 1, dendrogram.numNodes());

    assertEquals(16, dendrogram.distinctHeights());
    assertEquals(8, dendrogram.treeLevels());
  }
Example #7
0
  /**
   * Runs single-linkage row clustering on the 6-instance Kumar dataset with
   * {@code SMALLEST_FIRST} switched off and checks the leaf count, root height, distinct
   * heights and tree depth of the resulting dendrogram.
   */
  @Test
  public void testInverseSorting() {
    Dataset<? extends Instance> kumar = FakeClustering.kumarData();
    assertEquals(6, kumar.size());

    Props params = new Props();
    params.put(AlgParams.LINKAGE, SingleLinkage.name);
    params.put(AlgParams.CLUSTERING_TYPE, ClusteringType.ROWS_CLUSTERING);
    // inverse ordering
    params.put(AlgParams.SMALLEST_FIRST, false);

    HierarchicalResult hierarchy = subject.hierarchy(kumar, params);
    System.out.println("kumar - inverse");
    DendroTreeData dendrogram = hierarchy.getTreeData();
    dendrogram.print();

    assertEquals(kumar.size(), dendrogram.numLeaves());
    DendroNode top = dendrogram.getRoot();
    assertEquals(0.10198039027185574, top.getHeight(), DELTA);

    assertEquals(5, dendrogram.distinctHeights());
    assertEquals(4, dendrogram.treeLevels());
  }
Example #8
0
 /**
  * Stores the new cutoff, rebuilds the flat clustering from the dendrogram and returns it.
  *
  * <p>When the tree has a root, {@code checkCutoff} is run on it; the fresh assignment array
  * replaces {@code mapping} only if at least one cluster was produced. A non-null
  * {@code noise} collection is appended as an extra cluster named "Noise". Finally the
  * dataset, the dendrogram mapping (when present) and this object are added to the
  * clustering's lookup.
  *
  * @param cutoff cutoff value passed on to {@code checkCutoff}
  * @return the newly built clustering
  */
 @Override
 public Clustering updateCutoff(double cutoff) {
   this.cutoff = cutoff;
   int[] assignment = new int[dataset.size()];
   int expectedClusters = (int) Math.sqrt(dataset.size());
   colorGenerator.reset();
   num = 0; // human readable
   Clustering flat = new ClusterList(expectedClusters);

   DendroNode top = treeData.getRoot();
   if (top != null) {
     checkCutoff(top, cutoff, flat, assignment);
     if (flat.size() < 1) {
       LOG.info("failed to cutoff dendrogram, cut = {}", cutoff);
     } else {
       mapping = assignment;
     }
   }

   if (noise != null) {
     // collect the noise instances into a dedicated cluster
     Cluster noiseCluster = new BaseCluster<>(noise.size());
     noiseCluster.setColor(colorGenerator.next());
     int noiseId = num++;
     noiseCluster.setClusterId(noiseId);
     noiseCluster.setParent(getDataset());
     noiseCluster.setName("Noise");
     noiseCluster.setAttributes(getDataset().getAttributes());
     // NOTE(review): mapping is not refreshed above when the root is missing or the cut
     // produced no cluster, so it may be null or stale here - confirm this is intended
     for (Instance member : noise) {
       noiseCluster.add(member);
       mapping[member.getIndex()] = noiseId;
     }
     flat.add(noiseCluster);
   }

   // add input dataset to clustering lookup
   flat.lookupAdd(dataset);
   if (dendroMapping != null) {
     flat.lookupAdd(dendroMapping);
   }
   flat.lookupAdd(this);
   return flat;
 }
Example #9
0
 /**
  * TODO: make sure this test is correct
  *
  * <p>Builds eight 2-D instances (three at (0,0), five at (1,1)), puts the first four into
  * one cluster and the last four into another, and compares the score with a fixed value.
  */
 @Ignore
 public void testScore() throws ScoreException {
   Clustering clustering = new ClusterList(2);
   Dataset<? extends Instance> data = new ArrayDataset(8, 2);
   for (int i = 0; i < 8; i++) {
     // first three points sit at the origin, the remaining five at (1, 1);
     // the label switches from "0" to "1" after the fourth instance
     double coord = (i < 3) ? 0 : 1;
     String label = (i < 4) ? "0" : "1";
     data.builder().create(new double[] {coord, coord}, label);
   }
   assertEquals(8, data.size());

   Cluster lower = clustering.createCluster(0, 4);
   Cluster upper = clustering.createCluster(1, 4);
   final int half = 4;
   for (int i = 0; i < half; i++) {
     lower.add(data.get(i));
     upper.add(data.get(half + i));
   }
   assertEquals(2, clustering.size());
   assertEquals(0.14039740914097984, subject.score(clustering), delta);
 }