@Override public Clustering getClustering(Dataset<E> parent) { setDataset(parent); int estClusters = (int) Math.sqrt(dataset.size()); Clustering result = new ClusterList(estClusters); // estimated capacity int perCluster = (int) (parent.size() / (float) estClusters); int[] assign = getMapping(); if (assign != null) { int id; Cluster clust; for (int i = 0; i < assign.length; i++) { id = assign[i]; clust = result.createCluster(id, perCluster); clust.add(dataset.get(i)); } } else { // try some cutoff method? throw new RuntimeException("don't know how to get clusters.."); } // proximity.printLower(5, 2); // similarity.print(4, 2); result.lookupAdd(dataset); if (props != null) { result.setParams(props); } return result; }
/**
 * Dummy mapping for debugging purposes: installs a fresh {@code DynamicTreeData} and an identity
 * mapping in which position {@code i} holds the value {@code i}.
 */
public void createMapping() {
    treeData = new DynamicTreeData();
    final int count = dataset.size();
    int[] identity = new int[count];
    for (int pos = 0; pos < count; pos++) {
        identity[pos] = pos;
    }
    mapping = identity;
}
@Test public void testSingleLinkage() { Dataset<? extends Instance> dataset = FakeClustering.kumarData(); assertEquals(6, dataset.size()); Props pref = new Props(); pref.put(AlgParams.LINKAGE, SingleLinkage.name); pref.put(AlgParams.CLUSTERING_TYPE, ClusteringType.ROWS_CLUSTERING); pref.put(PropType.PERFORMANCE, AlgParams.KEEP_PROXIMITY, true); HierarchicalResult result = subject.hierarchy(dataset, pref); Matrix similarityMatrix = result.getProximityMatrix(); assertNotNull(similarityMatrix); assertEquals(similarityMatrix.rowsCount(), dataset.size()); assertEquals(similarityMatrix.columnsCount(), dataset.size()); System.out.println("kumar - single"); DendroTreeData tree = result.getTreeData(); tree.print(); assertEquals(dataset.size(), tree.numLeaves()); DendroNode root = tree.getRoot(); assertEquals(0.21587033144922904, root.getHeight(), DELTA); int levels = tree.distinctHeights(1e-7); // TODO: in this example nodes #7 and #8 are on different level, // but their height is the same. should we consider those as different assertEquals(4, levels); }
/** * @see http://alias-i.com/lingpipe/docs/api/com/aliasi/classify/PrecisionRecallEvaluation.html * @return */ public static Clustering wineClustering() { if (simpleResponse == null) { simpleResponse = new ClusterList(3); Cluster a = new BaseCluster(13); a.setName("cluster A"); a.setAttribute(0, a.attributeBuilder().create("x", BasicAttrType.INTEGER)); Cluster b = new BaseCluster(9); b.setName("cluster B"); b.setAttribute(0, b.attributeBuilder().create("x", BasicAttrType.INTEGER)); Cluster c = new BaseCluster(5); c.setName("cluster C"); c.setAttribute(0, c.attributeBuilder().create("x", BasicAttrType.INTEGER)); Dataset<Instance> data = wine(); System.out.println("dataset size " + data.size()); // cabernet 9x -> a for (int i = 0; i < 9; i++) { a.add(data.instance(i)); } // cabernet 2x => b b.add(data.instance(9)); // cabernet 1x => c c.add(data.instance(10)); b.add(data.instance(11)); b.add(data.instance(12)); // syrah 2x -> a for (int i = 13; i < 15; i++) { a.add(data.instance(i)); } // syrah 2x -> c c.add(data.instance(15)); // syrah 5x -> b for (int i = 16; i < 21; i++) { b.add(data.instance(i)); } a.add(data.instance(21)); // pinot 4x -> c for (int i = 22; i < 26; i++) { c.add(data.instance(i)); } // pinot -> cabernet cluster b.add(data.instance(26)); simpleResponse.add(a); simpleResponse.add(b); simpleResponse.add(c); } return simpleResponse; }
/**
 * Returns the number of items being clustered: the attribute count for column clustering, the
 * instance count for row clustering.
 *
 * @return {@code dataset.attributeCount()} or {@code dataset.size()}, depending on
 *     {@code resultType}
 * @throws IllegalStateException when {@code resultType} is {@code null} or is neither
 *     {@code COLUMNS_CLUSTERING} nor {@code ROWS_CLUSTERING}
 */
@Override
public int size() {
    // Switching on a null reference would raise a bare NullPointerException; check first so the
    // caller gets the descriptive error below instead.
    if (resultType != null) {
        switch (resultType) {
            case COLUMNS_CLUSTERING:
                return dataset.attributeCount();
            case ROWS_CLUSTERING:
                return dataset.size();
            default:
                break;
        }
    }
    throw new IllegalStateException(
            "Don't know whether to cluster rows or columns (resultType = " + resultType + ").");
}
/**
 * Single-linkage row clustering of the 17-instance school dataset: verifies leaf count, root
 * height, total node count, distinct heights and tree levels of the resulting dendrogram.
 */
@Test
public void testSingleLinkageSchool() {
    Dataset<? extends Instance> school = FakeClustering.schoolData();
    assertEquals(17, school.size());

    Props params = new Props();
    params.put(AlgParams.LINKAGE, SingleLinkage.name);
    params.put(AlgParams.CLUSTERING_TYPE, ClusteringType.ROWS_CLUSTERING);

    HierarchicalResult hierarchy = subject.hierarchy(school, params);
    System.out.println("school - single");

    DendroTreeData dendrogram = hierarchy.getTreeData();
    dendrogram.print();

    assertEquals(school.size(), dendrogram.numLeaves());
    assertEquals(32.542734980330046, dendrogram.getRoot().getHeight(), DELTA);
    assertEquals(2 * school.size() - 1, dendrogram.numNodes());
    assertEquals(16, dendrogram.distinctHeights());
    assertEquals(8, dendrogram.treeLevels());
}
@Test public void testInverseSorting() { Dataset<? extends Instance> dataset = FakeClustering.kumarData(); assertEquals(6, dataset.size()); Props pref = new Props(); pref.put(AlgParams.LINKAGE, SingleLinkage.name); pref.put(AlgParams.CLUSTERING_TYPE, ClusteringType.ROWS_CLUSTERING); // inverse ordering pref.put(AlgParams.SMALLEST_FIRST, false); HierarchicalResult result = subject.hierarchy(dataset, pref); System.out.println("kumar - inverse"); DendroTreeData tree = result.getTreeData(); tree.print(); assertEquals(dataset.size(), tree.numLeaves()); DendroNode root = tree.getRoot(); assertEquals(0.10198039027185574, root.getHeight(), DELTA); assertEquals(5, tree.distinctHeights()); assertEquals(4, tree.treeLevels()); }
@Override public Clustering updateCutoff(double cutoff) { this.cutoff = cutoff; int[] assign = new int[dataset.size()]; int estClusters = (int) Math.sqrt(dataset.size()); colorGenerator.reset(); num = 0; // human readable Clustering clusters = new ClusterList(estClusters); DendroNode root = treeData.getRoot(); if (root != null) { checkCutoff(root, cutoff, clusters, assign); if (clusters.size() > 0) { mapping = assign; } else { LOG.info("failed to cutoff dendrogram, cut = {}", cutoff); } } // add input dataset to clustering lookup if (noise != null) { Cluster clust = new BaseCluster<>(noise.size()); clust.setColor(colorGenerator.next()); clust.setClusterId(num++); clust.setParent(getDataset()); clust.setName("Noise"); clust.setAttributes(getDataset().getAttributes()); for (Instance ins : noise) { clust.add(ins); mapping[ins.getIndex()] = num - 1; } clusters.add(clust); } clusters.lookupAdd(dataset); if (dendroMapping != null) { clusters.lookupAdd(dendroMapping); } clusters.lookupAdd(this); return clusters; }
/**
 * TODO: make sure this test is correct
 *
 * <p>Builds eight two-dimensional instances (three at (0, 0), five at (1, 1)), splits them into
 * two clusters of four by index, and compares the score against a fixed value.
 *
 * <p>NOTE(review): only {@code @Ignore} is visible on this method, no {@code @Test}; confirm
 * whether the test annotation was removed on purpose.
 */
@Ignore
public void testScore() throws ScoreException {
    Clustering clustering = new ClusterList(2);
    Dataset<? extends Instance> data = new ArrayDataset(8, 2);
    for (int row = 0; row < 8; row++) {
        // rows 0-2 sit at the origin, the rest at (1, 1); rows 0-3 are labelled "0", rows 4-7 "1"
        double coord = (row < 3) ? 0 : 1;
        String label = (row < 4) ? "0" : "1";
        data.builder().create(new double[] {coord, coord}, label);
    }
    assertEquals(8, data.size());

    Cluster first = clustering.createCluster(0, 4);
    Cluster second = clustering.createCluster(1, 4);
    final int half = 4;
    for (int row = 0; row < half; row++) {
        first.add(data.get(row));
        second.add(data.get(row + half));
    }
    assertEquals(2, clustering.size());
    assertEquals(0.14039740914097984, subject.score(clustering), delta);
}