Beispiel #1
0
 /**
  * Converts the raw input to vectors, runs canopy clustering over them with the given distance
  * measure and thresholds, and prints the resulting clusters.
  *
  * <p>The work happens in three steps, in this order:
  *
  * <ol>
  *   <li>{@code InputDriver.runJob} reads {@code input} and writes the converted data to the
  *       {@code DIRECTORY_CONTAINING_CONVERTED_INPUT} child of {@code output}, using
  *       {@code org.apache.mahout.math.RandomAccessSparseVector} as the vector class name.
  *   <li>{@code CanopyDriver.run} is invoked on that converted data with a fresh
  *       {@code Configuration}, writing into {@code output}.
  *   <li>A {@code ClusterDumper} is built over {@code <output>/clusters-0-final} and
  *       {@code <output>/clusteredPoints}, and {@code printClusters(null)} is called on it.
  * </ol>
  *
  * <p>NOTE(review): this method does not delete {@code output} itself; if the directory must be
  * cleared before a run, that presumably happens in the caller -- confirm.
  *
  * @param input the Path of the input directory
  * @param output the Path of the output directory
  * @param measure the DistanceMeasure to use
  * @param t1 the canopy T1 threshold
  * @param t2 the canopy T2 threshold
  * @throws Exception if any of the conversion, clustering, or dumping steps fails
  */
 private static void run(Path input, Path output, DistanceMeasure measure, double t1, double t2)
     throws Exception {
   // Step 1: convert the raw input into vectors under the output directory.
   Path vectorDir = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
   String vectorClassName = "org.apache.mahout.math.RandomAccessSparseVector";
   InputDriver.runJob(input, vectorDir, vectorClassName);

   // Step 2: canopy clustering over the converted vectors.
   // NOTE(review): the trailing (true, 0.0, false) presumably mean "also cluster the points",
   // "classification threshold 0.0", and "not sequential" -- verify against CanopyDriver.run.
   Configuration conf = new Configuration();
   CanopyDriver.run(conf, vectorDir, output, measure, t1, t2, true, 0.0, false);

   // Step 3: dump the final clusters together with the clustered points.
   Path finalClusters = new Path(output, "clusters-0-final");
   Path clusteredPoints = new Path(output, "clusteredPoints");
   new ClusterDumper(finalClusters, clusteredPoints).printClusters(null);
 }