@Test public void testRun() throws Exception { Path input = getTestTempDirPath("input"); Path output = getTestTempDirPath("output"); Path seedsPath = getTestTempDirPath("seeds"); List<VectorWritable> points = getPointsWritable(REFERENCE); List<VectorWritable> seeds = getPointsWritable(SEEDS); Configuration conf = new Configuration(); ClusteringTestUtils.writePointsToFile(points, true, new Path(input, "file1"), fs, conf); ClusteringTestUtils.writePointsToFile(seeds, true, new Path(seedsPath, "part-seeds"), fs, conf); String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), input.toString(), optKey(VectorDistanceSimilarityJob.SEEDS), seedsPath.toString(), optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(), optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName() }; ToolRunner.run(new Configuration(), new VectorDistanceSimilarityJob(), args); int expect = SEEDS.length * REFERENCE.length; DummyOutputCollector<StringTuple, DoubleWritable> collector = new DummyOutputCollector<StringTuple, DoubleWritable>(); // for (Pair<StringTuple, DoubleWritable> record : new SequenceFileIterable<StringTuple, DoubleWritable>( new Path(output, "part-m-00000"), conf)) { collector.collect(record.getFirst(), record.getSecond()); } assertEquals(expect, collector.getData().size()); }
/**
 * Writes the k-means reference points and a prior {@link ClusterClassifier}
 * to temporary paths, runs a {@link ClusterIterator} configured with a
 * {@link KMeansClusteringPolicy} for five iterations, and checks that the
 * prior and every {@code classifier-<i>} read back from the output path
 * contain exactly three models. Each classifier's clusters are also printed
 * to standard output.
 *
 * @throws IOException declared by the test; propagated from the file-system
 *         and classifier I/O calls
 */
@Test
public void testSeqFileClusterIteratorKMeans() throws IOException {
  final int numIterations = 5;
  final int expectedModels = 3;

  Path pointsDir = getTestTempDirPath("points");
  Path priorDir = getTestTempDirPath("prior");
  Path outputDir = getTestTempDirPath("output");

  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);

  // Input points for the iteration.
  List<VectorWritable> referencePoints =
      TestKmeansClustering.getPointsWritable(TestKmeansClustering.REFERENCE);
  Path pointsFile = new Path(pointsDir, "file1");
  ClusteringTestUtils.writePointsToFile(referencePoints, pointsFile, fs, conf);

  // Persist the prior classifier that seeds the iteration.
  Path priorClassifierPath = new Path(priorDir, "priorClassifier");
  ClusterClassifier prior = newClusterClassifier();
  writeClassifier(prior, conf, priorClassifierPath, fs);
  assertEquals(expectedModels, prior.getModels().size());

  System.out.println("Prior");
  for (Cluster model : prior.getModels()) {
    System.out.println(model.asFormatString(null));
  }

  ClusteringPolicy kMeansPolicy = new KMeansClusteringPolicy();
  ClusterIterator clusterIterator = new ClusterIterator(kMeansPolicy);
  clusterIterator.iterate(pointsDir, priorClassifierPath, outputDir, numIterations);

  // One classifier is expected per iteration, named classifier-1 .. classifier-5.
  for (int iteration = 1; iteration <= numIterations; iteration++) {
    System.out.println("Classifier-" + iteration);
    Path classifierPath = new Path(outputDir, "classifier-" + iteration);
    ClusterClassifier posterior = readClassifier(conf, classifierPath, fs);
    assertEquals(expectedModels, posterior.getModels().size());
    for (Cluster model : posterior.getModels()) {
      System.out.println(model.asFormatString(null));
    }
  }
}
/**
 * Builds a random structured model via
 * {@code ClusteringTestUtils.randomStructuredModel(20, 100)}, sparsifies it
 * with {@code TopicModelUtils.sparsifyTopicTermCounts} at a threshold of 0.5,
 * and hands the original and sparsified matrices to the
 * {@code assertColumnNormsEqualOrZero} and {@code assertFewerNonzeros} checks.
 */
@Test
public void test() {
  final double sparsifyThreshold = 0.5;

  Matrix denseCounts = ClusteringTestUtils.randomStructuredModel(20, 100);
  Matrix sparseCounts =
      TopicModelUtils.sparsifyTopicTermCounts(denseCounts, sparsifyThreshold);

  assertColumnNormsEqualOrZero(denseCounts, sparseCounts);
  assertFewerNonzeros(denseCounts, sparseCounts);
}