/** * Run the canopy clustering job on an input dataset using the given distance measure, t1 and t2 * parameters. All output data will be written to the output directory, which will be initially * deleted if it exists. The clustered points will reside in the path <output>/clustered-points. * By default, the job expects the a file containing synthetic_control.data as obtained from * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a * directory named "testdata", and writes output to a directory named "output". * * @param input the String denoting the input directory path * @param output the String denoting the output directory path * @param measure the DistanceMeasure to use * @param t1 the canopy T1 threshold * @param t2 the canopy T2 threshold */ private static void run(Path input, Path output, DistanceMeasure measure, double t1, double t2) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); InputDriver.runJob( input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); CanopyDriver.run( new Configuration(), directoryContainingConvertedInput, output, measure, t1, t2, true, 0.0, false); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper( new Path(output, "clusters-0-final"), new Path(output, "clusteredPoints")); clusterDumper.printClusters(null); }