/**
 * Command-line entry point: declares the supported options, parses {@code args}, copies the
 * parsed values into this instance's fields, then calls {@code init()} and
 * {@code printClusters(null)}.
 *
 * @param args the raw command-line arguments
 * @return {@code -1} when {@code parseArguments(args)} yields {@code null}; {@code 0} once the
 *     clusters have been printed
 * @throws NumberFormatException if the substring-length or top-terms option is present but is
 *     not a parsable integer
 * @throws Exception if any of the delegated steps fails
 */
@Override
public int run(String[] args) throws Exception {
  // --- Option declarations -------------------------------------------------------------------
  // NOTE(review): the trailing boolean on the first option presumably marks it as required, and
  // the trailing "text" on the last one presumably is its default value -- confirm against the
  // addOption overloads of the superclass.
  addOption(
      SEQ_FILE_DIR_OPTION,
      "s",
      "The directory containing Sequence Files for the Clusters",
      true);
  addOption(
      OUTPUT_OPTION,
      "o",
      "Optional output directory. Default is to output to the console.");
  addOption(SUBSTRING_OPTION, "b", "The number of chars of the asFormatString() to print");
  addOption(NUM_WORDS_OPTION, "n", "The number of top terms to print");
  addOption(
      JSON_OPTION,
      "j",
      "Output the centroid as JSON. Otherwise it substitues in the terms for vector cell entries");
  addOption(
      POINTS_DIR_OPTION,
      "p",
      "The directory containing points sequence files mapping input vectors to their cluster. "
          + "If specified, then the program will output the points associated with a cluster");
  addOption(DICTIONARY_OPTION, "d", "The dictionary file");
  addOption(DICTIONARY_TYPE_OPTION, "dt", "The dictionary file type (text|sequencefile)", "text");

  // --- Parsing -------------------------------------------------------------------------------
  boolean parseFailed = parseArguments(args) == null;
  if (parseFailed) {
    return -1;
  }

  // --- Transfer parsed values into fields ----------------------------------------------------
  seqFileDir = new Path(getOption(SEQ_FILE_DIR_OPTION));

  // The points directory is only overwritten when the option was actually supplied.
  if (hasOption(POINTS_DIR_OPTION)) {
    String pointsLocation = getOption(POINTS_DIR_OPTION);
    pointsDir = new Path(pointsLocation);
  }

  outputFile = getOption(OUTPUT_OPTION);

  if (hasOption(SUBSTRING_OPTION)) {
    int requestedLength = Integer.parseInt(getOption(SUBSTRING_OPTION));
    // A negative length is ignored, leaving the current subString value untouched.
    boolean usable = requestedLength >= 0;
    if (usable) {
      subString = requestedLength;
    }
  }

  // The flag is only ever switched on here; it is never reset to false.
  if (hasOption(JSON_OPTION)) {
    useJSON = true;
  }

  termDictionary = getOption(DICTIONARY_OPTION);
  dictionaryFormat = getOption(DICTIONARY_TYPE_OPTION);

  if (hasOption(NUM_WORDS_OPTION)) {
    String topTermsArg = getOption(NUM_WORDS_OPTION);
    numTopFeatures = Integer.parseInt(topTermsArg);
  }

  // --- Execute -------------------------------------------------------------------------------
  init();
  printClusters(null);
  return 0;
}
/** * Run the canopy clustering job on an input dataset using the given distance measure, t1 and t2 * parameters. All output data will be written to the output directory, which will be initially * deleted if it exists. The clustered points will reside in the path <output>/clustered-points. * By default, the job expects the a file containing synthetic_control.data as obtained from * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a * directory named "testdata", and writes output to a directory named "output". * * @param input the String denoting the input directory path * @param output the String denoting the output directory path * @param measure the DistanceMeasure to use * @param t1 the canopy T1 threshold * @param t2 the canopy T2 threshold */ private static void run(Path input, Path output, DistanceMeasure measure, double t1, double t2) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); InputDriver.runJob( input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); CanopyDriver.run( new Configuration(), directoryContainingConvertedInput, output, measure, t1, t2, true, 0.0, false); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper( new Path(output, "clusters-0-final"), new Path(output, "clusteredPoints")); clusterDumper.printClusters(null); }