/**
 * Runs {@code VectorDistanceSimilarityJob} through {@code ToolRunner} with a Euclidean
 * distance measure, using the {@code REFERENCE} points as input and the {@code SEEDS}
 * points as seeds, then checks the job's output.
 *
 * <p>The test first verifies that the tool reports a zero exit status, then reads the
 * {@code part-m-00000} sequence file from the output directory into a
 * {@code DummyOutputCollector} and asserts that the collected data holds
 * {@code SEEDS.length * REFERENCE.length} entries.
 *
 * @throws Exception if writing the fixtures, running the job or reading the output fails
 */
@Test
 public void testRun() throws Exception {
   Path input = getTestTempDirPath("input");
   Path output = getTestTempDirPath("output");
   Path seedsPath = getTestTempDirPath("seeds");
   List<VectorWritable> points = getPointsWritable(REFERENCE);
   List<VectorWritable> seeds = getPointsWritable(SEEDS);
   Configuration conf = new Configuration();
   ClusteringTestUtils.writePointsToFile(points, true, new Path(input, "file1"), fs, conf);
   ClusteringTestUtils.writePointsToFile(seeds, true, new Path(seedsPath, "part-seeds"), fs, conf);
   String[] args = {
     optKey(DefaultOptionCreator.INPUT_OPTION),
     input.toString(),
     optKey(VectorDistanceSimilarityJob.SEEDS),
     seedsPath.toString(),
     optKey(DefaultOptionCreator.OUTPUT_OPTION),
     output.toString(),
     optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION),
     EuclideanDistanceMeasure.class.getName()
   };
   // Fail fast on a non-zero exit status instead of letting a failed run surface later
   // as a missing or empty output file.
   int exitCode = ToolRunner.run(new Configuration(), new VectorDistanceSimilarityJob(), args);
   assertEquals("VectorDistanceSimilarityJob returned a non-zero exit status", 0, exitCode);
   int expect = SEEDS.length * REFERENCE.length;
   DummyOutputCollector<StringTuple, DoubleWritable> collector =
       new DummyOutputCollector<StringTuple, DoubleWritable>();
   // Replay every record of the job output into the collector so its contents can be counted.
   for (Pair<StringTuple, DoubleWritable> record :
       new SequenceFileIterable<StringTuple, DoubleWritable>(
           new Path(output, "part-m-00000"), conf)) {
     collector.collect(record.getFirst(), record.getSecond());
   }
   assertEquals(expect, collector.getData().size());
 }
Esempio n. 2
0
  /**
   * Exercises {@code ClusterIterator} with a {@code KMeansClusteringPolicy} over
   * file-based input.
   *
   * <p>The reference points and a prior classifier are written under temporary
   * directories, the iterator is run for 5 iterations, and each of
   * {@code classifier-1} .. {@code classifier-5} in the output directory is read back
   * and expected to contain 3 models. Prior and posterior models are printed to
   * standard output.
   *
   * @throws IOException if a fixture cannot be written or a classifier cannot be read
   */
  @Test
  public void testSeqFileClusterIteratorKMeans() throws IOException {
    final Path pointsDir = getTestTempDirPath("points");
    final Path priorDir = getTestTempDirPath("prior");
    final Path outputDir = getTestTempDirPath("output");
    final Configuration configuration = new Configuration();
    final FileSystem fileSystem = FileSystem.get(configuration);

    // Write the reference points that the iterator will read.
    final List<VectorWritable> referencePoints =
        TestKmeansClustering.getPointsWritable(TestKmeansClustering.REFERENCE);
    final Path pointsFile = new Path(pointsDir, "file1");
    ClusteringTestUtils.writePointsToFile(referencePoints, pointsFile, fileSystem, configuration);

    // Write the prior classifier and dump its models.
    final Path priorClassifierPath = new Path(priorDir, "priorClassifier");
    final ClusterClassifier priorClassifier = newClusterClassifier();
    writeClassifier(priorClassifier, configuration, priorClassifierPath, fileSystem);
    assertEquals(3, priorClassifier.getModels().size());
    System.out.println("Prior");
    for (Cluster priorModel : priorClassifier.getModels()) {
      System.out.println(priorModel.asFormatString(null));
    }

    final ClusteringPolicy kMeansPolicy = new KMeansClusteringPolicy();
    final ClusterIterator clusterIterator = new ClusterIterator(kMeansPolicy);
    clusterIterator.iterate(pointsDir, priorClassifierPath, outputDir, 5);

    // Each iteration's classifier is read back and checked.
    int iteration = 1;
    while (iteration <= 5) {
      System.out.println("Classifier-" + iteration);
      final Path classifierPath = new Path(outputDir, "classifier-" + iteration);
      final ClusterClassifier posteriorClassifier =
          readClassifier(configuration, classifierPath, fileSystem);
      assertEquals(3, posteriorClassifier.getModels().size());
      for (Cluster posteriorModel : posteriorClassifier.getModels()) {
        System.out.println(posteriorModel.asFormatString(null));
      }
      iteration++;
    }
  }
Esempio n. 3
0
 /**
  * Sparsifies a model produced by {@code ClusteringTestUtils.randomStructuredModel(20, 100)}
  * with a threshold of 0.5 and compares it with the original through
  * {@code assertColumnNormsEqualOrZero} and {@code assertFewerNonzeros}.
  */
 @Test
 public void test() {
   final Matrix denseCounts = ClusteringTestUtils.randomStructuredModel(20, 100);
   final Matrix sparsifiedCounts = TopicModelUtils.sparsifyTopicTermCounts(denseCounts, 0.5);

   assertColumnNormsEqualOrZero(denseCounts, sparsifiedCounts);
   assertFewerNonzeros(denseCounts, sparsifiedCounts);
 }