Beispiel #1
0
  /**
   * Command-line entry point: registers the supported options, parses {@code args}, copies the
   * parsed values into this object's fields, then calls {@code init()} followed by
   * {@code printClusters(null)}.
   *
   * @param args the raw command-line arguments handed to {@code parseArguments}
   * @return {@code -1} when {@code parseArguments} returns {@code null}, {@code 0} otherwise
   * @throws Exception anything raised while parsing, initialising or printing; a non-numeric
   *     value for the substring or top-terms option surfaces as a {@link NumberFormatException}
   */
  @Override
  public int run(String[] args) throws Exception {
    // Longer help texts are named up front so the registrations below stay one per line.
    final String jsonHelp =
        "Output the centroid as JSON.  Otherwise it substitues in the terms "
            + "for vector cell entries";
    final String pointsHelp =
        "The directory containing points sequence files mapping input vectors to their cluster.  "
            + "If specified, then the program will output the points associated with a cluster";

    // Registration order is kept exactly as before.
    addOption(
        SEQ_FILE_DIR_OPTION, "s", "The directory containing Sequence Files for the Clusters", true);
    addOption(
        OUTPUT_OPTION, "o", "Optional output directory. Default is to output to the console.");
    addOption(SUBSTRING_OPTION, "b", "The number of chars of the asFormatString() to print");
    addOption(NUM_WORDS_OPTION, "n", "The number of top terms to print");
    addOption(JSON_OPTION, "j", jsonHelp);
    addOption(POINTS_DIR_OPTION, "p", pointsHelp);
    addOption(DICTIONARY_OPTION, "d", "The dictionary file");
    addOption(DICTIONARY_TYPE_OPTION, "dt", "The dictionary file type (text|sequencefile)", "text");

    // A null parse result means there is nothing to run with; report failure to the caller.
    boolean parsed = parseArguments(args) != null;
    if (!parsed) {
      return -1;
    }

    String clusterDirArg = getOption(SEQ_FILE_DIR_OPTION);
    seqFileDir = new Path(clusterDirArg);

    if (hasOption(POINTS_DIR_OPTION)) {
      String pointsDirArg = getOption(POINTS_DIR_OPTION);
      pointsDir = new Path(pointsDirArg);
    }

    outputFile = getOption(OUTPUT_OPTION);

    if (hasOption(SUBSTRING_OPTION)) {
      String lengthArg = getOption(SUBSTRING_OPTION);
      int requestedLength = Integer.parseInt(lengthArg);
      // Negative lengths are ignored; the field keeps whatever value it already had.
      boolean nonNegative = requestedLength >= 0;
      if (nonNegative) {
        subString = requestedLength;
      }
    }

    // The flag is only ever switched on here, never reset to false.
    if (hasOption(JSON_OPTION)) {
      useJSON = true;
    }

    termDictionary = getOption(DICTIONARY_OPTION);
    dictionaryFormat = getOption(DICTIONARY_TYPE_OPTION);

    if (hasOption(NUM_WORDS_OPTION)) {
      String topTermsArg = getOption(NUM_WORDS_OPTION);
      numTopFeatures = Integer.parseInt(topTermsArg);
    }

    init();
    printClusters(null);
    return 0;
  }
Beispiel #2
0
 /**
  * Runs canopy clustering on an input dataset with the given distance measure and T1/T2
  * thresholds, then prints the resulting clusters.
  *
  * <p>The steps, in order:
  *
  * <ol>
  *   <li>{@code InputDriver.runJob} converts {@code input} into the directory
  *       {@code DIRECTORY_CONTAINING_CONVERTED_INPUT} beneath {@code output}, using
  *       {@code org.apache.mahout.math.RandomAccessSparseVector} as the vector class name.
  *   <li>{@code CanopyDriver.run} is invoked on the converted data with a fresh
  *       {@code Configuration}, writing under {@code output}.
  *   <li>A {@code ClusterDumper} is pointed at {@code clusters-0-final} and
  *       {@code clusteredPoints} beneath {@code output} and asked to print the clusters.
  * </ol>
  *
  * <p>NOTE(review): this method does not itself clear {@code output} or apply any default
  * directory names; if that happens, it is done by the caller or the drivers - confirm there.
  *
  * @param input the input directory path
  * @param output the output directory path
  * @param measure the DistanceMeasure to use
  * @param t1 the canopy T1 threshold
  * @param t2 the canopy T2 threshold
  * @throws Exception anything raised by the conversion job, the canopy driver or the dumper
  */
 private static void run(Path input, Path output, DistanceMeasure measure, double t1, double t2)
     throws Exception {
   // Stage 1: convert the raw input into vectors stored beneath the output directory.
   Path convertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
   String vectorClassName = "org.apache.mahout.math.RandomAccessSparseVector";
   InputDriver.runJob(input, convertedInput, vectorClassName);

   // Stage 2: canopy clustering over the converted vectors.
   // The trailing (true, 0.0, false) arguments are passed through unchanged; presumably they are
   // run-clustering / classification-threshold / run-sequential - verify against CanopyDriver.
   Configuration conf = new Configuration();
   CanopyDriver.run(conf, convertedInput, output, measure, t1, t2, true, 0.0, false);

   // Stage 3: dump the final clusters together with their assigned points.
   Path finalClusters = new Path(output, "clusters-0-final");
   Path clusteredPoints = new Path(output, "clusteredPoints");
   ClusterDumper dumper = new ClusterDumper(finalClusters, clusteredPoints);
   dumper.printClusters(null);
 }