@Test public void testGetCentroidDoubleArrayArray() { try { Centroid.getCentroid(badNd); fail("Should throw IllegalArgumentException"); } catch (IllegalArgumentException e) { } // A 20-D centroid double[] centroid = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; assertArrayEquals(centroid, Centroid.getCentroid(goodNd), 1E-9); // A 3-D array double[][] threeD = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {10, 11, 12}, {13, 14, 15}, {16, 17, 18}}; // Its centroid double[] centroid3d = {8.5, 9.5, 10.5}; assertArrayEquals(centroid3d, Centroid.getCentroid(threeD), 1E-9); // A 2-D array double[][] twoD = {{1, 2}, {4, 5}, {7, 8}, {10, 11}, {13, 14}, {16, 17}}; // Its centroid double[] centroid2d = {8.5, 9.5}; assertArrayEquals(centroid2d, Centroid.getCentroid(twoD), 1E-9); // A 1-D array double[][] oneD = {{1}, {4}, {7}, {10}, {13}, {16}}; // Its centroid double[] centroid1d = {8.5}; assertArrayEquals(centroid1d, Centroid.getCentroid(oneD), 1E-9); }
/**
 * Computes the Euclidean (L2) distance between a point and a centroid.
 *
 * @param point    the point whose coordinates are read via {@code getDim(i)}
 * @param centroid the centroid to measure against
 * @return the square root of the sum of squared per-dimension differences
 * @throws IllegalArgumentException if the point and the centroid report a
 *         different number of dimensions
 */
private double euclideanDistance(Features point, Centroid centroid) {
    if (point.getNumberOfDim() != centroid.getNumberOfDim()) {
        // Report both sizes so a mismatch can be diagnosed from the stack trace alone.
        throw new IllegalArgumentException(
            "Dimension mismatch: point has " + point.getNumberOfDim()
                + " dimension(s) but centroid has " + centroid.getNumberOfDim());
    }

    double sumOfSquares = 0;
    for (int i = 0; i < point.getNumberOfDim(); i++) {
        // Compute the coordinate difference once instead of twice per dimension.
        // NOTE(review): assumes getDim returns a numeric primitive (presumably double) - confirm.
        double delta = point.getDim(i) - centroid.getDim(i);
        sumOfSquares += delta * delta;
    }
    return Math.sqrt(sumOfSquares);
}
/**
 * Checks {@code Centroid.getCentroid} against the expected scalar result for
 * each of the fixtures {@code oneDa} through {@code oneDj}.
 */
@Test
public void testGetCentroidDoubleArray() {
    // Absolute tolerance shared by every comparison below.
    final double tolerance = 1E-9;

    assertEquals(10.5, Centroid.getCentroid(oneDa), tolerance);
    assertEquals(10.0, Centroid.getCentroid(oneDb), tolerance);
    assertEquals(9.0, Centroid.getCentroid(oneDc), tolerance);
    assertEquals(8.5, Centroid.getCentroid(oneDd), tolerance);
    assertEquals(8.0, Centroid.getCentroid(oneDe), tolerance);
    assertEquals(7.5, Centroid.getCentroid(oneDf), tolerance);
    assertEquals(7.0, Centroid.getCentroid(oneDg), tolerance);
    assertEquals(6.5, Centroid.getCentroid(oneDh), tolerance);
    assertEquals(6.0, Centroid.getCentroid(oneDi), tolerance);
    assertEquals(5.5, Centroid.getCentroid(oneDj), tolerance);
}
/**
 * Clusters the data returned by {@code cubishTestData(1)} with
 * {@code BallKMeans} and prints the first ten coordinates of each resulting
 * centroid, one centroid per line. The test makes no assertions; it passes
 * as long as clustering and printing complete without throwing.
 */
@Test
public void testBasicClustering() {
    // Number of leading coordinates printed for every centroid.
    final int coordinatesShown = 10;

    List<? extends WeightedVector> points = cubishTestData(1);
    BruteSearch searcher = new BruteSearch(new EuclideanDistanceMeasure());
    BallKMeans clusterer = new BallKMeans(searcher, 6, 20);
    clusterer.cluster(points);

    for (Centroid c : clusterer) {
        int dim = 0;
        while (dim < coordinatesShown) {
            System.out.printf("%10.4f", c.get(dim));
            dim++;
        }
        System.out.printf("\n");
    }
}
@Test public void testClustering() { UpdatableSearcher searcher = new BruteSearch(new EuclideanDistanceMeasure()); BallKMeans clusterer = new BallKMeans(searcher, 1 << NUM_DIMENSIONS, NUM_ITERATIONS); long startTime = System.currentTimeMillis(); clusterer.cluster(syntheticData.getFirst()); long endTime = System.currentTimeMillis(); assertEquals( "Total weight not preserved", totalWeight(syntheticData.getFirst()), totalWeight(clusterer), 1e-9); // Verify that each corner of the cube has a centroid very nearby. // This is probably FALSE for large-dimensional spaces! double maxWeight = 0; for (Vector mean : syntheticData.getSecond()) { WeightedThing<Vector> v = searcher.search(mean, 1).get(0); maxWeight = Math.max(v.getWeight(), maxWeight); } assertTrue("Maximum weight too large " + maxWeight, maxWeight < 0.05); double clusterTime = (endTime - startTime) / 1000.0; System.out.printf( "%s\n%.2f for clustering\n%.1f us per row\n\n", searcher.getClass().getName(), clusterTime, clusterTime / syntheticData.getFirst().size() * 1e6); // verify that the total weight of the centroids near each corner is correct double[] cornerWeights = new double[1 << NUM_DIMENSIONS]; Searcher trueFinder = new BruteSearch(new EuclideanDistanceMeasure()); for (Vector trueCluster : syntheticData.getSecond()) { trueFinder.add(trueCluster); } for (Centroid centroid : clusterer) { WeightedThing<Vector> closest = trueFinder.search(centroid, 1).get(0); cornerWeights[((Centroid) closest.getValue()).getIndex()] += centroid.getWeight(); } int expectedNumPoints = NUM_DATA_POINTS / (1 << NUM_DIMENSIONS); for (double v : cornerWeights) { System.out.printf("%f ", v); } System.out.println(); for (double v : cornerWeights) { assertEquals(expectedNumPoints, v, 0); } }
@Test public void testInitialization() { // start with super clusterable data List<? extends WeightedVector> data = cubishTestData(0.01); // just do initialization of ball k-means. This should drop a point into each of the clusters BallKMeans r = new BallKMeans(new BruteSearch(new EuclideanDistanceMeasure()), 6, 20); r.cluster(data); // put the centroids into a matrix Matrix x = new DenseMatrix(6, 5); int row = 0; for (Centroid c : r) { x.viewRow(row).assign(c.viewPart(0, 5)); row++; } // verify that each column looks right. Should contain zeros except for a single 6. final Vector columnNorms = x.aggregateColumns( new VectorFunction() { @Override public double apply(Vector f) { // return the sum of three discrepancy measures return Math.abs(f.minValue()) + Math.abs(f.maxValue() - 6) + Math.abs(f.norm(1) - 6); } }); // verify all errors are nearly zero assertEquals(0, columnNorms.norm(1) / columnNorms.size(), 0.1); // verify that the centroids are a permutation of the original ones SingularValueDecomposition svd = new SingularValueDecomposition(x); Vector s = svd.getS().viewDiagonal().assign(Functions.div(6)); assertEquals(5, s.getLengthSquared(), 0.05); assertEquals(5, s.norm(1), 0.05); }
/**
 * Writes one output record per input pair, using the string forms of the
 * centroid key and of the integer value as the output key and value.
 *
 * @param key     centroid whose {@code toString()} becomes the output key
 * @param value   value whose {@code toString()} becomes the output value
 * @param context context the record is written to
 * @throws IOException          propagated from {@code context.write}
 * @throws InterruptedException propagated from {@code context.write}
 */
@Override
protected void map(Centroid key, IntWritable value, Context context)
        throws IOException, InterruptedException {
    Text outputKey = new Text(key.toString());
    Text outputValue = new Text(value.toString());
    context.write(outputKey, outputValue);
}
/**
 * Writes one output record per input pair: the output key is the centroid's
 * string form, a single space, and the point's string form; the output value
 * is an empty {@code Text}.
 *
 * @param key     centroid whose {@code toString()} starts the output line
 * @param value   point whose {@code toString()} ends the output line
 * @param context context the record is written to
 * @throws IOException          propagated from {@code context.write}
 * @throws InterruptedException propagated from {@code context.write}
 */
@Override
protected void map(Centroid key, Point value, Context context)
        throws IOException, InterruptedException {
    String centroidText = key.toString();
    String pointText = value.toString();
    Text line = new Text(centroidText + " " + pointText);
    Text emptyValue = new Text("");
    context.write(line, emptyValue);
}