/**
 * Based on "Details of the Adjusted Rand index and Clustering algorithms Supplement to the paper
 * “An empirical study on Principal Component Analysis for clustering gene expression data” (to
 * appear in Bioinformatics)"
 *
 * @return clustering of a 10-instance random dataset into three clusters of sizes 2, 3 and 5
 */
public Clustering<Instance, Cluster<Instance>> pcaData() {
    Clustering<Instance, Cluster<Instance>> clustering = new ClusterList<>(3);
    Random random = new Random();
    int capacity = 10;
    Dataset<? extends Instance> dataset = new ArrayDataset<>(capacity, 2);
    dataset.attributeBuilder().create("x1", "NUMERIC");
    dataset.attributeBuilder().create("x2", "NUMERIC");
    InstanceBuilder<? extends Instance> instances = dataset.builder();
    // three clusters with sizes 2, 3 and 5
    BaseCluster first = new BaseCluster(2);
    clustering.add(first);
    BaseCluster second = new BaseCluster(3);
    clustering.add(second);
    BaseCluster third = new BaseCluster(5);
    clustering.add(third);
    // class labels u1/u2/u3 are deliberately spread across clusters
    first.add(next(random, instances, "u1"));
    first.add(next(random, instances, "u2"));
    second.add(next(random, instances, "u1"));
    second.add(next(random, instances, "u2"));
    second.add(next(random, instances, "u2"));
    third.add(next(random, instances, "u2"));
    third.add(next(random, instances, "u3"));
    third.add(next(random, instances, "u3"));
    third.add(next(random, instances, "u3"));
    third.add(next(random, instances, "u3"));
    clustering.lookupAdd(dataset);
    return clustering;
}
/** Dummy mapping for debugging purposes: identity mapping over the whole dataset. */
public void createMapping() {
    treeData = new DynamicTreeData();
    int n = dataset.size();
    mapping = new int[n];
    // identity: instance at position idx maps to idx
    for (int idx = 0; idx < n; idx++) {
        mapping[idx] = idx;
    }
}
@Override public Clustering getClustering(Dataset<E> parent) { setDataset(parent); int estClusters = (int) Math.sqrt(dataset.size()); Clustering result = new ClusterList(estClusters); // estimated capacity int perCluster = (int) (parent.size() / (float) estClusters); int[] assign = getMapping(); if (assign != null) { int id; Cluster clust; for (int i = 0; i < assign.length; i++) { id = assign[i]; clust = result.createCluster(id, perCluster); clust.add(dataset.get(i)); } } else { // try some cutoff method? throw new RuntimeException("don't know how to get clusters.."); } // proximity.printLower(5, 2); // similarity.print(4, 2); result.lookupAdd(dataset); if (props != null) { result.setParams(props); } return result; }
/** * Sample covariance * * @param dataset * @return */ protected double covariance(Dataset<E> dataset) { Matrix m = dataset.asMatrix(); Matrix cov = new SymmetricMatrixDiag(m.columnsCount()); DenseVector mean = new DenseVector(dataset.attributeCount()); for (int i = 0; i < mean.size(); i++) { mean.set(i, dataset.getAttribute(i).statistics(StatsNum.MEAN)); } Vector v; double res, sum = 0.0; for (int i = 0; i < m.rowsCount(); i++) { v = m.getRowVector(i).minus(mean); res = v.dot(v); sum += res; } return sum / (m.rowsCount() - 1); /* for (int i = 0; i < m.columnsCount(); i++) { mean = dataset.getAttribute(i).statistics(StatsNum.AVG); cov.set(i, i, dataset.getAttribute(i).statistics(StatsNum.VARIANCE)); for (int j = 0; j < i; j++) { //cov.set(i, j, mean); } } */ // return cov; }
@Test public void testSingleLinkage() { Dataset<? extends Instance> dataset = FakeClustering.kumarData(); assertEquals(6, dataset.size()); Props pref = new Props(); pref.put(AlgParams.LINKAGE, SingleLinkage.name); pref.put(AlgParams.CLUSTERING_TYPE, ClusteringType.ROWS_CLUSTERING); pref.put(PropType.PERFORMANCE, AlgParams.KEEP_PROXIMITY, true); HierarchicalResult result = subject.hierarchy(dataset, pref); Matrix similarityMatrix = result.getProximityMatrix(); assertNotNull(similarityMatrix); assertEquals(similarityMatrix.rowsCount(), dataset.size()); assertEquals(similarityMatrix.columnsCount(), dataset.size()); System.out.println("kumar - single"); DendroTreeData tree = result.getTreeData(); tree.print(); assertEquals(dataset.size(), tree.numLeaves()); DendroNode root = tree.getRoot(); assertEquals(0.21587033144922904, root.getHeight(), DELTA); int levels = tree.distinctHeights(1e-7); // TODO: in this example nodes #7 and #8 are on different level, // but their height is the same. should we consider those as different assertEquals(4, levels); }
/**
 * Ground-truth clustering of the {@link #wine()} dataset: each cluster holds
 * exactly the instances of one wine variety.
 *
 * @return lazily created clustering: cabernet (13), syrah (9), pinot (5)
 */
public static Clustering wineCorrect() {
    if (simpleClustering == null) {
        simpleClustering = new ClusterList(3);
        // capacities match the actual class sizes: 13 + 9 + 5 = 27
        Cluster a = new BaseCluster(13);
        a.setName("cabernet");
        a.attributeBuilder().create("x", BasicAttrType.INTEGER);
        Cluster b = new BaseCluster(9);
        b.setName("syrah");
        b.attributeBuilder().create("x", BasicAttrType.INTEGER);
        Cluster c = new BaseCluster(5);
        c.setName("pinot");
        c.attributeBuilder().create("x", BasicAttrType.INTEGER);
        Dataset<Instance> data = wine();
        // cabernet: instances 0-12
        for (int i = 0; i < 13; i++) {
            a.add(data.instance(i));
        }
        // syrah: instances 13-21
        for (int i = 13; i < 22; i++) {
            b.add(data.instance(i));
        }
        // pinot: instances 22-26
        for (int i = 22; i < 27; i++) {
            c.add(data.instance(i));
        }
        simpleClustering.add(a);
        simpleClustering.add(b);
        simpleClustering.add(c);
    }
    return simpleClustering;
}
/**
 * Number of items being clustered.
 *
 * @return attribute count for column clustering, instance count for row clustering
 * @throws RuntimeException when the result type is neither rows nor columns
 */
@Override
public int size() {
    switch (resultType) {
        case COLUMNS_CLUSTERING:
            return dataset.attributeCount();
        case ROWS_CLUSTERING:
            return dataset.size();
    }
    // fixed typo in the original message ("wether")
    throw new RuntimeException("Don't know whether to cluster rows or columns.");
}
public static Clustering irisWrong() { if (irisWrong == null) { irisWrong = new ClusterList(3); Cluster a = new BaseCluster(50); a.setName("cluster 1"); a.setAttributes(irisData.getAttributes()); // add few instances to first cluster a.add(irisData.instance(0)); a.add(irisData.instance(1)); a.add(irisData.instance(2)); a.add(irisData.instance(149)); Cluster b = new BaseCluster(50); b.setName("cluster 2"); b.setAttributes(irisData.getAttributes()); b.add(irisData.instance(3)); b.add(irisData.instance(4)); b.add(irisData.instance(5)); b.add(irisData.instance(6)); Cluster c = new BaseCluster(50); c.setName("cluster 3"); c.setAttributes(irisData.getAttributes()); // rest goes to the last cluster for (int i = 7; i < 149; i++) { c.add(irisData.instance(i)); } irisWrong.add(a); irisWrong.add(b); irisWrong.add(c); } return irisWrong; }
/** Benchmarks complete-linkage clustering of the iris dataset for every registered algorithm. */
@Test
public void testCompleteLinkage() {
    Dataset<? extends Instance> dataset = FakeDatasets.irisDataset();
    for (AgglomerativeClustering algorithm : algorithms) {
        String label = algorithm.getName() + " complete link - " + dataset.getName();
        NanoBench.create()
            .cpuAndMemory()
            .measurements(2)
            .measure(label, new HclustBenchmark().completeLinkage(algorithm, dataset));
    }
}
/**
 * Testing dataset from Kumar (chapter 8, page 519): six 2-D points.
 *
 * @return lazily created dataset with 6 instances and numeric attributes x, y
 */
public static Dataset<? extends Instance> kumarData() {
    if (kumar == null) {
        // capacity matches the 6 instances added below (was undersized at 4)
        kumar = new ArrayDataset<>(6, 2);
        kumar.attributeBuilder().create("x", BasicAttrType.NUMERIC);
        kumar.attributeBuilder().create("y", BasicAttrType.NUMERIC);
        kumar.builder().create(new double[] {0.40, 0.53}, "1");
        kumar.builder().create(new double[] {0.22, 0.38}, "2");
        kumar.builder().create(new double[] {0.35, 0.32}, "3");
        kumar.builder().create(new double[] {0.26, 0.19}, "4");
        kumar.builder().create(new double[] {0.08, 0.41}, "5");
        kumar.builder().create(new double[] {0.45, 0.30}, "6");
    }
    return kumar;
}
/** Clustering attributes (columns) must yield an attribute-sized proximity matrix. */
@Test
public void testColumnClustering() throws IOException {
    Dataset<? extends Instance> dataset = FakeClustering.schoolData();
    Props pref = new Props();
    pref.put(AlgParams.LINKAGE, SingleLinkage.name);
    pref.put(AlgParams.CLUSTERING_TYPE, ClusteringType.COLUMNS_CLUSTERING);
    pref.put(PropType.PERFORMANCE, AlgParams.KEEP_PROXIMITY, true);
    HierarchicalResult result = subject.hierarchy(dataset, pref);
    Matrix similarityMatrix = result.getProximityMatrix();
    assertNotNull(similarityMatrix);
    // expected value goes first, per JUnit convention
    assertEquals(dataset.attributeCount(), similarityMatrix.rowsCount());
    assertEquals(dataset.attributeCount(), similarityMatrix.columnsCount());
    result.getTreeData().print();
}
/**
 * Resolves an instance through the index mapping.
 *
 * @param index external index
 * @return instance stored at the mapped position
 * @throws RuntimeException when no dataset is attached
 */
@Override
public E getInstance(int index) {
    if (dataset == null) {
        throw new RuntimeException("dataset is null");
    }
    return dataset.instance(getMappedIndex(index));
}
public Clustering<Instance, Cluster<Instance>> oneClassPerCluster() { Clustering<Instance, Cluster<Instance>> oneClass = new ClusterList(3); int size = 10; Random rand = new Random(); Dataset<? extends Instance> data = new ArrayDataset<>(size, 2); data.attributeBuilder().create("x1", "NUMERIC"); data.attributeBuilder().create("x2", "NUMERIC"); for (int i = 0; i < size; i++) { Instance inst = next(rand, data.builder(), "same class"); // cluster with single class BaseCluster clust = new BaseCluster(1); clust.add(inst); oneClass.add(clust); } oneClass.lookupAdd(data); return oneClass; }
/** Single linkage over the school dataset: verifies tree shape and root height. */
@Test
public void testSingleLinkageSchool() {
    Dataset<? extends Instance> schoolData = FakeClustering.schoolData();
    assertEquals(17, schoolData.size());
    Props params = new Props();
    params.put(AlgParams.LINKAGE, SingleLinkage.name);
    params.put(AlgParams.CLUSTERING_TYPE, ClusteringType.ROWS_CLUSTERING);
    HierarchicalResult result = subject.hierarchy(schoolData, params);
    System.out.println("school - single");
    DendroTreeData tree = result.getTreeData();
    tree.print();
    assertEquals(schoolData.size(), tree.numLeaves());
    DendroNode root = tree.getRoot();
    assertEquals(32.542734980330046, root.getHeight(), DELTA);
    // a binary dendrogram over n leaves has 2n - 1 nodes
    assertEquals(2 * schoolData.size() - 1, tree.numNodes());
    assertEquals(16, tree.distinctHeights());
    assertEquals(8, tree.treeLevels());
}
@Test public void testInverseSorting() { Dataset<? extends Instance> dataset = FakeClustering.kumarData(); assertEquals(6, dataset.size()); Props pref = new Props(); pref.put(AlgParams.LINKAGE, SingleLinkage.name); pref.put(AlgParams.CLUSTERING_TYPE, ClusteringType.ROWS_CLUSTERING); // inverse ordering pref.put(AlgParams.SMALLEST_FIRST, false); HierarchicalResult result = subject.hierarchy(dataset, pref); System.out.println("kumar - inverse"); DendroTreeData tree = result.getTreeData(); tree.print(); assertEquals(dataset.size(), tree.numLeaves()); DendroNode root = tree.getRoot(); assertEquals(0.10198039027185574, root.getHeight(), DELTA); assertEquals(5, tree.distinctHeights()); assertEquals(4, tree.treeLevels()); }
@Before public void setUp() { irisDataset(); // preload irisClusters = new ClusterList(3); Cluster a = new BaseCluster(50); a.setName("cluster 1"); a.setClusterId(0); a.setAttributes(irisData.getAttributes()); Cluster b = new BaseCluster(50); b.setName("cluster 2"); b.setAttributes(irisData.getAttributes()); b.setClusterId(1); Cluster c = new BaseCluster(50); c.setName("cluster 3"); c.setAttributes(irisData.getAttributes()); c.setClusterId(2); for (int i = 0; i < 50; i++) { a.add(irisData.instance(i)); b.add(irisData.instance(i + 50)); c.add(irisData.instance(i + 100)); } irisClusters.add(a); irisClusters.add(b); irisClusters.add(c); }
/** * Pretty bad clustering result, one class contained in two clusters * * @return */ public static Clustering irisWrong2() { if (irisWrong2 == null) { irisWrong2 = new ClusterList(3); Cluster a = new BaseCluster(50); a.setName("cluster 1"); // will contain 30 elements of first class a.setAttributes(irisData.getAttributes()); for (int i = 0; i < 30; i++) { a.add(irisData.instance(i)); } Cluster b = new BaseCluster(50); b.setName("cluster 2"); // will contain 20 elements of first class b.setAttributes(irisData.getAttributes()); for (int i = 30; i < 50; i++) { b.add(irisData.instance(i)); } Cluster c = new BaseCluster(50); c.setName("cluster 3"); c.setAttributes(irisData.getAttributes()); // the rest (100) goes to the last cluster for (int i = 50; i < 150; i++) { c.add(irisData.instance(i)); } irisWrong2.add(a); irisWrong2.add(b); irisWrong2.add(c); } return irisWrong2; }
/**
 * Fictive ground-truth clustering of iris built from class labels (the dataset
 * is sorted by species, 50 instances each).
 *
 * @return lazily created three-cluster clustering
 */
public static Clustering iris() {
    if (irisClusters == null) {
        irisDataset();
        irisClusters = new ClusterList(3);
        Cluster[] parts = new Cluster[3];
        for (int k = 0; k < 3; k++) {
            parts[k] = new BaseCluster(50);
            parts[k].setName("cluster " + (k + 1));
            parts[k].setAttributes(irisData.getAttributes());
        }
        // one species per cluster, 50 instances each
        for (int i = 0; i < 50; i++) {
            parts[0].add(irisData.instance(i));
            parts[1].add(irisData.instance(i + 50));
            parts[2].add(irisData.instance(i + 100));
        }
        for (Cluster clust : parts) {
            irisClusters.add(clust);
        }
    }
    return irisClusters;
}
/**
 * Re-cuts the dendrogram at the given height and materializes the resulting
 * flat clustering, updating the instance-to-cluster mapping on success.
 *
 * @param cutoff tree height at which the dendrogram is cut
 * @return clustering induced by the cut (plus a "Noise" cluster when noise is present)
 */
@Override
public Clustering updateCutoff(double cutoff) {
    this.cutoff = cutoff;
    int[] assign = new int[dataset.size()];
    // rough estimate of the cluster count, used only as an initial capacity
    int estClusters = (int) Math.sqrt(dataset.size());
    colorGenerator.reset();
    num = 0; // human readable
    Clustering clusters = new ClusterList(estClusters);
    DendroNode root = treeData.getRoot();
    if (root != null) {
        // walks the tree, filling `clusters` and the instance->cluster table `assign`
        checkCutoff(root, cutoff, clusters, assign);
        if (clusters.size() > 0) {
            mapping = assign;
        } else {
            LOG.info("failed to cutoff dendrogram, cut = {}", cutoff);
        }
    }
    // add input dataset to clustering lookup
    if (noise != null) {
        // NOTE(review): when the cut produced no clusters, `mapping` below still
        // refers to the previous mapping — confirm this is intended
        Cluster clust = new BaseCluster<>(noise.size());
        clust.setColor(colorGenerator.next());
        clust.setClusterId(num++);
        clust.setParent(getDataset());
        clust.setName("Noise");
        clust.setAttributes(getDataset().getAttributes());
        for (Instance ins : noise) {
            clust.add(ins);
            // noise instances map to the id of the noise cluster
            mapping[ins.getIndex()] = num - 1;
        }
        clusters.add(clust);
    }
    clusters.lookupAdd(dataset);
    if (dendroMapping != null) {
        clusters.lookupAdd(dendroMapping);
    }
    clusters.lookupAdd(this);
    return clusters;
}
/**
 * Synthetic one-attribute wine dataset: 13 cabernet, 9 syrah and 5 pinot
 * instances (27 total) with well-separated x values per class.
 *
 * @return lazily created dataset
 */
public static Dataset<Instance> wine() {
    if (wine == null) {
        wine = new SampleDataset(27);
        wine.attributeBuilder().create("x", BasicAttrType.INTEGER);
        // cabernet: x = 0..12
        for (int i = 0; i < 13; i++) {
            wine.builder().create(new double[] {i}, "cabernet");
        }
        // syrah: x = 13, 16, ..., 37
        for (int i = 0; i < 9; i++) {
            wine.builder().create(new double[] {i * 3 + 13}, "syrah");
        }
        // pinot: x = 50, 54, ..., 66
        for (int i = 0; i < 5; i++) {
            wine.builder().create(new double[] {i * 4 + 50}, "pinot");
        }
    }
    return wine;
}
/**
 * Loads the space-separated school dataset (17 rows, 4 attributes) on first use.
 *
 * @return lazily loaded dataset
 */
public static Dataset<? extends Instance> schoolData() {
    if (school == null) {
        school = new ArrayDataset(17, 4);
        school.setName("school data");
        CsvLoader loader = new CsvLoader();
        // class label lives in the 5th column, values are space-separated
        loader.setClassIndex(4);
        loader.setSeparator(' ');
        try {
            loader.load(fixture.schoolData(), school);
        } catch (IOException ex) {
            Exceptions.printStackTrace(ex);
        }
    }
    return school;
}
/** TODO: make sure this test is correct */
@Ignore
public void testScore() throws ScoreException {
    Clustering c = new ClusterList(2);
    Dataset<? extends Instance> d = new ArrayDataset(8, 2);
    // three {0,0} points then five {1,1} points; labels split 4/4 with one overlap
    double[][] points = {
        {0, 0}, {0, 0}, {0, 0}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1},
    };
    String[] labels = {"0", "0", "0", "0", "1", "1", "1", "1"};
    for (int i = 0; i < points.length; i++) {
        d.builder().create(points[i], labels[i]);
    }
    assertEquals(8, d.size());
    Cluster a = c.createCluster(0, 4);
    Cluster b = c.createCluster(1, 4);
    for (int i = 0; i < 4; i++) {
        a.add(d.get(i));
        b.add(d.get(i + 4));
    }
    assertEquals(2, c.size());
    assertEquals(0.14039740914097984, subject.score(c), delta);
}
/**
 * Response clustering that deliberately scatters the wine classes across
 * clusters, for testing precision/recall style measures.
 *
 * @see http://alias-i.com/lingpipe/docs/api/com/aliasi/classify/PrecisionRecallEvaluation.html
 * @return lazily created clustering: A = 9 cabernet + 3 syrah, B = 3 cabernet +
 *     5 syrah + 1 pinot, C = 1 cabernet + 1 syrah + 4 pinot
 */
public static Clustering wineClustering() {
    if (simpleResponse == null) {
        simpleResponse = new ClusterList(3);
        Cluster a = new BaseCluster(13);
        a.setName("cluster A");
        a.setAttribute(0, a.attributeBuilder().create("x", BasicAttrType.INTEGER));
        Cluster b = new BaseCluster(9);
        b.setName("cluster B");
        b.setAttribute(0, b.attributeBuilder().create("x", BasicAttrType.INTEGER));
        Cluster c = new BaseCluster(5);
        c.setName("cluster C");
        c.setAttribute(0, c.attributeBuilder().create("x", BasicAttrType.INTEGER));
        Dataset<Instance> data = wine();
        System.out.println("dataset size " + data.size());
        // cabernet 9x -> a (instances 0-8)
        for (int i = 0; i < 9; i++) {
            a.add(data.instance(i));
        }
        // cabernet 3x -> b (instances 9, 11, 12)
        b.add(data.instance(9));
        // cabernet 1x -> c (instance 10)
        c.add(data.instance(10));
        b.add(data.instance(11));
        b.add(data.instance(12));
        // syrah 2x -> a (instances 13, 14)
        for (int i = 13; i < 15; i++) {
            a.add(data.instance(i));
        }
        // syrah 1x -> c (instance 15)
        c.add(data.instance(15));
        // syrah 5x -> b (instances 16-20)
        for (int i = 16; i < 21; i++) {
            b.add(data.instance(i));
        }
        // syrah 1x -> a (instance 21)
        a.add(data.instance(21));
        // pinot 4x -> c (instances 22-25)
        for (int i = 22; i < 26; i++) {
            c.add(data.instance(i));
        }
        // pinot 1x -> b (instance 26)
        b.add(data.instance(26));
        simpleResponse.add(a);
        simpleResponse.add(b);
        simpleResponse.add(c);
    }
    return simpleResponse;
}