Example No. 1
  @Override
  public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.distanceMeasureOption().create());
    addOption(DefaultOptionCreator.t1Option().create());
    addOption(DefaultOptionCreator.t2Option().create());
    addOption(DefaultOptionCreator.overwriteOption().create());

    Map<String, List<String>> argMap = parseArguments(args);
    if (argMap == null) {
      return -1;
    }

    Path input = getInputPath();
    Path output = getOutputPath();
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
      HadoopUtil.delete(new Configuration(), output);
    }
    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
    double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
    double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);

    run(input, output, measure, t1, t2);
    return 0;
  }
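
This run() follows Mahout's AbstractJob pattern, so the job is normally launched through Hadoop's ToolRunner. A minimal sketch of the matching entry point, assuming the method above lives in a class extending org.apache.mahout.common.AbstractJob (the class name ClusteringDriver used below is hypothetical):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

// Hypothetical launcher; ClusteringDriver stands in for the class that defines run() above.
public final class ClusteringDriverLauncher {
  public static void main(String[] args) throws Exception {
    // ToolRunner parses the generic Hadoop options (-D, -conf, ...) and then calls run(args)
    System.exit(ToolRunner.run(new Configuration(), new ClusteringDriver(), args));
  }
}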
Example No. 2
  private Map<String, List<String>> handleArgs(String[] args) throws IOException {
    addOutputOption();
    addOption(
        "eigenInput",
        "ei",
        "The Path for purported eigenVector input files (SequenceFile<WritableComparable,VectorWritable>.",
        null);
    addOption(
        "corpusInput",
        "ci",
        "The Path for corpus input files (SequenceFile<WritableComparable,VectorWritable>.");
    addOption(DefaultOptionCreator.outputOption().create());
    addOption(DefaultOptionCreator.helpOption());
    addOption("inMemory", "mem", "Buffer eigen matrix into memory (if you have enough!)", "false");
    addOption("maxError", "err", "Maximum acceptable error", "0.05");
    addOption("minEigenvalue", "mev", "Minimum eigenvalue to keep the vector for", "0.0");
    addOption("maxEigens", "max", "Maximum number of eigenvectors to keep (0 means all)", "0");

    return parseArguments(args);
  }
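
handleArgs() only registers options and delegates to AbstractJob's parseArguments. A hedged sketch of a caller inside the same class, assuming getOption/hasOption and the registered defaults behave as in AbstractJob (the surrounding run() method is not shown in this example):

  Map<String, List<String>> argMap = handleArgs(args);
  if (argMap == null) {
    return -1; // parsing failed or --help was requested
  }
  // the defaults registered above ("false", "0.05", "0.0", "0") back these lookups
  boolean inMemory = Boolean.parseBoolean(getOption("inMemory"));
  double maxError = Double.parseDouble(getOption("maxError"));
  double minEigenValue = Double.parseDouble(getOption("minEigenvalue"));
  int maxEigens = Integer.parseInt(getOption("maxEigens"));
  Path output = getOutputPath();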
Example No. 3
  @Override
  public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    addInputOption();
    addOutputOption();
    addOption(MinhashOptionCreator.minClusterSizeOption().create());
    addOption(MinhashOptionCreator.minVectorSizeOption().create());
    addOption(MinhashOptionCreator.hashTypeOption().create());
    addOption(MinhashOptionCreator.numHashFunctionsOption().create());
    addOption(MinhashOptionCreator.keyGroupsOption().create());
    addOption(MinhashOptionCreator.numReducersOption().create());
    addOption(MinhashOptionCreator.debugOutputOption().create());
    addOption(DefaultOptionCreator.overwriteOption().create());

    if (parseArguments(args) == null) {
      return -1;
    }

    Path input = getInputPath();
    Path output = getOutputPath();
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
      HadoopUtil.delete(getConf(), output);
    }
    int minClusterSize = Integer.parseInt(getOption(MinhashOptionCreator.MIN_CLUSTER_SIZE));
    int minVectorSize = Integer.parseInt(getOption(MinhashOptionCreator.MIN_VECTOR_SIZE));
    String hashType = getOption(MinhashOptionCreator.HASH_TYPE);
    int numHashFunctions = Integer.parseInt(getOption(MinhashOptionCreator.NUM_HASH_FUNCTIONS));
    int keyGroups = Integer.parseInt(getOption(MinhashOptionCreator.KEY_GROUPS));
    int numReduceTasks = Integer.parseInt(getOption(MinhashOptionCreator.NUM_REDUCERS));
    boolean debugOutput = hasOption(MinhashOptionCreator.DEBUG_OUTPUT);

    runJob(
        input,
        output,
        minClusterSize,
        minVectorSize,
        hashType,
        numHashFunctions,
        keyGroups,
        numReduceTasks,
        debugOutput);
    return 0;
  }
Example No. 4
  public static void main(String[] args)
      throws IOException, InterruptedException, ClassNotFoundException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();
    Option inputDirOpt = DefaultOptionCreator.inputOption().create();

    Option outputOpt = DefaultOptionCreator.outputOption().create();

    Option helpOpt = DefaultOptionCreator.helpOption();
    Option recordSplitterOpt =
        obuilder
            .withLongName("splitterPattern")
            .withArgument(
                abuilder.withName("splitterPattern").withMinimum(1).withMaximum(1).create())
            .withDescription(
                "Regular Expression pattern used to split given line into fields."
                    + " Default value splits comma or tab separated fields."
                    + " Default Value: \"[ ,\\t]*\\t[ ,\\t]*\" ")
            .withShortName("regex")
            .create();
    Option encodingOpt =
        obuilder
            .withLongName("encoding")
            .withArgument(abuilder.withName("encoding").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) The file encoding.  Default value: UTF-8")
            .withShortName("e")
            .create();
    Group group =
        gbuilder
            .withName("Options")
            .withOption(inputDirOpt)
            .withOption(outputOpt)
            .withOption(helpOpt)
            .withOption(recordSplitterOpt)
            .withOption(encodingOpt)
            .create();

    try {
      Parser parser = new Parser();
      parser.setGroup(group);
      CommandLine cmdLine = parser.parse(args);

      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return;
      }
      Parameters params = new Parameters();
      if (cmdLine.hasOption(recordSplitterOpt)) {
        params.set("splitPattern", (String) cmdLine.getValue(recordSplitterOpt));
      }

      String encoding = "UTF-8";
      if (cmdLine.hasOption(encodingOpt)) {
        encoding = (String) cmdLine.getValue(encodingOpt);
      }
      params.set("encoding", encoding);
      String inputDir = (String) cmdLine.getValue(inputDirOpt);
      String outputDir = (String) cmdLine.getValue(outputOpt);
      params.set("input", inputDir);
      params.set("output", outputDir);
      params.set("groupingFieldCount", "2");
      params.set("gfield0", "1");
      params.set("gfield1", "2");
      params.set("selectedFieldCount", "1");
      params.set("field0", "3");
      params.set("maxTransactionLength", "100");
      KeyBasedStringTupleGrouper.startJob(params);

    } catch (OptionException ex) {
      CommandLineUtil.printHelp(group);
    }
  }
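
The default splitterPattern quoted in the option description treats each tab, together with any surrounding spaces or commas, as a single field separator. A self-contained illustration using plain java.util.regex (not part of the driver above):

import java.util.regex.Pattern;

public class SplitterPatternDemo {
  public static void main(String[] args) {
    Pattern splitter = Pattern.compile("[ ,\t]*\t[ ,\t]*");
    String[] fields = splitter.split("alice\t42 ,\tbob");
    // prints alice, 42, bob -- each tab plus surrounding spaces/commas is consumed as one separator
    for (String field : fields) {
      System.out.println(field);
    }
  }
}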
Example No. 5
  /**
   * Takes in two arguments:
   *
   * <ol>
   *   <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live
   *   <li>The output {@link org.apache.hadoop.fs.Path} where to write the classifier as a {@link
   *       org.apache.hadoop.io.SequenceFile}
   * </ol>
   */
  public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dirInputPathOpt = DefaultOptionCreator.inputOption().create();

    Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create();

    Option categoriesOpt =
        obuilder
            .withLongName("categories")
            .withArgument(abuilder.withName("categories").withMinimum(1).withMaximum(1).create())
            .withDescription(
                "Location of the categories file.  One entry per line. "
                    + "Will be used to make a string match in Wikipedia Category field")
            .withShortName("c")
            .create();

    Option exactMatchOpt =
        obuilder
            .withLongName("exactMatch")
            .withDescription(
                "If set, then the category name must exactly match the "
                    + "entry in the categories file. Default is false")
            .withShortName("e")
            .create();

    Option allOpt =
        obuilder
            .withLongName("all")
            .withDescription("If set, Select all files. Default is false")
            .withShortName("all")
            .create();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Group group =
        gbuilder
            .withName("Options")
            .withOption(categoriesOpt)
            .withOption(dirInputPathOpt)
            .withOption(dirOutputPathOpt)
            .withOption(exactMatchOpt)
            .withOption(allOpt)
            .withOption(helpOpt)
            .create();

    Parser parser = new Parser();
    parser.setGroup(group);
    parser.setHelpOption(helpOpt);
    try {
      CommandLine cmdLine = parser.parse(args);
      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return;
      }

      String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
      String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);

      String catFile = "";
      if (cmdLine.hasOption(categoriesOpt)) {
        catFile = (String) cmdLine.getValue(categoriesOpt);
      }

      boolean all = cmdLine.hasOption(allOpt);
      runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), all);
    } catch (OptionException | InterruptedException | ClassNotFoundException e) {
      log.error("Exception", e);
      CommandLineUtil.printHelp(group);
    }
  }
Example No. 6
  @Override
  public int run(String[] args) throws Exception {
    String path = System.getProperty("user.dir");

    addInputOption();
    addOutputOption();

    addOption(ALPHA_I, "a", "smoothing parameter", String.valueOf(1.0f));
    addOption(
        buildOption(
            TRAIN_COMPLEMENTARY, "c", "train complementary?", false, false, String.valueOf(false)));
    addOption(LABEL_INDEX, "li", "The path to store the label index in", false);
    addOption(DefaultOptionCreator.overwriteOption().create());
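    // Note: unlike the other examples, this snippet never calls parseArguments(args); the
    // options registered above are not read back, and the paths, alphaI and
    // trainComplementary used below are fixed in the code.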

    Path labPath = new Path(path + "/../out/labelindex/");

    long labelSize = createLabelIndex(labPath);
    float alphaI = 1.0F;
    boolean trainComplementary = true;

    HadoopUtil.setSerializations(getConf());
    HadoopUtil.cacheFiles(labPath, getConf());
    HadoopUtil.delete(getConf(), new Path("/tmp/summedObservations"));
    HadoopUtil.delete(getConf(), new Path("/tmp/weights"));
    HadoopUtil.delete(getConf(), new Path("/tmp/thetas"));

    // Add up all the vectors with the same labels, while mapping the labels into our index
    Job indexInstances =
        prepareJob(
            new Path(path + "/../out/training"),
            new Path("/tmp/summedObservations"),
            SequenceFileInputFormat.class,
            IndexInstancesMapper.class,
            IntWritable.class,
            VectorWritable.class,
            VectorSumReducer.class,
            IntWritable.class,
            VectorWritable.class,
            SequenceFileOutputFormat.class);
    indexInstances.setCombinerClass(VectorSumReducer.class);
    boolean succeeded = indexInstances.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }
    // Sum up all the weights from the previous step, per label and per feature
    Job weightSummer =
        prepareJob(
            new Path("/tmp/summedObservations"),
            new Path("/tmp/weights"),
            SequenceFileInputFormat.class,
            WeightsMapper.class,
            Text.class,
            VectorWritable.class,
            VectorSumReducer.class,
            Text.class,
            VectorWritable.class,
            SequenceFileOutputFormat.class);
    weightSummer.getConfiguration().set(WeightsMapper.NUM_LABELS, String.valueOf(labelSize));
    weightSummer.setCombinerClass(VectorSumReducer.class);
    succeeded = weightSummer.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }

    // Put the per label and per feature vectors into the cache
    HadoopUtil.cacheFiles(new Path("/tmp/weights"), getConf());

    if (trainComplementary) {
      // Calculate the per label theta normalizers, write out to LABEL_THETA_NORMALIZER vector
      // see http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf - Section 3.2, Weight
      // Magnitude Errors
      Job thetaSummer =
          prepareJob(
              new Path("/tmp/summedObservations"),
              new Path("/tmp/thetas"),
              SequenceFileInputFormat.class,
              ThetaMapper.class,
              Text.class,
              VectorWritable.class,
              VectorSumReducer.class,
              Text.class,
              VectorWritable.class,
              SequenceFileOutputFormat.class);
      thetaSummer.setCombinerClass(VectorSumReducer.class);
      thetaSummer.getConfiguration().setFloat(ThetaMapper.ALPHA_I, alphaI);
      thetaSummer
          .getConfiguration()
          .setBoolean(ThetaMapper.TRAIN_COMPLEMENTARY, trainComplementary);
      succeeded = thetaSummer.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }
    }

    // Put the per label theta normalizers into the cache
    HadoopUtil.cacheFiles(new Path("/tmp/thetas"), getConf());

    // Validate our model and then write it out to the official output
    getConf().setFloat(ThetaMapper.ALPHA_I, alphaI);
    getConf().setBoolean(NaiveBayesModel.COMPLEMENTARY_MODEL, trainComplementary);
    NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(new Path("/tmp/"), getConf());
    naiveBayesModel.validate();
    naiveBayesModel.serialize(new Path(path + "/../out/model"), getConf());

    return 0;
  }
  public static int main2(String[] args, Configuration conf) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Option inputDirOpt =
        obuilder
            .withLongName("input")
            .withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription(
                "The Directory on HDFS containing the collapsed, properly formatted files having "
                    + "one doc per line")
            .withShortName("i")
            .create();

    Option dictOpt =
        obuilder
            .withLongName("dictionary")
            .withRequired(false)
            .withArgument(abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create())
            .withDescription("The path to the term-dictionary format is ... ")
            .withShortName("d")
            .create();

    Option dfsOpt =
        obuilder
            .withLongName("dfs")
            .withRequired(false)
            .withArgument(abuilder.withName("dfs").withMinimum(1).withMaximum(1).create())
            .withDescription("HDFS namenode URI")
            .withShortName("dfs")
            .create();

    Option numTopicsOpt =
        obuilder
            .withLongName("numTopics")
            .withRequired(true)
            .withArgument(abuilder.withName("numTopics").withMinimum(1).withMaximum(1).create())
            .withDescription("Number of topics to learn")
            .withShortName("top")
            .create();

    Option outputTopicFileOpt =
        obuilder
            .withLongName("topicOutputFile")
            .withRequired(true)
            .withArgument(
                abuilder.withName("topicOutputFile").withMinimum(1).withMaximum(1).create())
            .withDescription("File to write out p(term | topic)")
            .withShortName("to")
            .create();

    Option outputDocFileOpt =
        obuilder
            .withLongName("docOutputFile")
            .withRequired(true)
            .withArgument(abuilder.withName("docOutputFile").withMinimum(1).withMaximum(1).create())
            .withDescription("File to write out p(topic | docid)")
            .withShortName("do")
            .create();

    Option alphaOpt =
        obuilder
            .withLongName("alpha")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("alpha")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault("0.1")
                    .create())
            .withDescription("Smoothing parameter for p(topic | document) prior")
            .withShortName("a")
            .create();

    Option etaOpt =
        obuilder
            .withLongName("eta")
            .withRequired(false)
            .withArgument(
                abuilder.withName("eta").withMinimum(1).withMaximum(1).withDefault("0.1").create())
            .withDescription("Smoothing parameter for p(term | topic)")
            .withShortName("e")
            .create();

    Option maxIterOpt =
        obuilder
            .withLongName("maxIterations")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("maxIterations")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault(10)
                    .create())
            .withDescription("Maximum number of training passes")
            .withShortName("m")
            .create();

    Option modelCorpusFractionOption =
        obuilder
            .withLongName("modelCorpusFraction")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("modelCorpusFraction")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault(0.0)
                    .create())
            .withShortName("mcf")
            .withDescription("For online updates, initial value of |model|/|corpus|")
            .create();

    Option burnInOpt =
        obuilder
            .withLongName("burnInIterations")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("burnInIterations")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault(5)
                    .create())
            .withDescription("Minimum number of iterations")
            .withShortName("b")
            .create();

    Option convergenceOpt =
        obuilder
            .withLongName("convergence")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("convergence")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault("0.0")
                    .create())
            .withDescription("Fractional rate of perplexity to consider convergence")
            .withShortName("c")
            .create();

    Option reInferDocTopicsOpt =
        obuilder
            .withLongName("reInferDocTopics")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("reInferDocTopics")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault("no")
                    .create())
            .withDescription("re-infer p(topic | doc) : [no | randstart | continue]")
            .withShortName("rdt")
            .create();

    Option numTrainThreadsOpt =
        obuilder
            .withLongName("numTrainThreads")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("numTrainThreads")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault("1")
                    .create())
            .withDescription("number of threads to train with")
            .withShortName("ntt")
            .create();

    Option numUpdateThreadsOpt =
        obuilder
            .withLongName("numUpdateThreads")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("numUpdateThreads")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault("1")
                    .create())
            .withDescription("number of threads to update the model with")
            .withShortName("nut")
            .create();

    Option verboseOpt =
        obuilder
            .withLongName("verbose")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("verbose")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault("false")
                    .create())
            .withDescription(
                "print verbose information, like top-terms in each topic, during iteration")
            .withShortName("v")
            .create();

    Group group =
        gbuilder
            .withName("Options")
            .withOption(inputDirOpt)
            .withOption(numTopicsOpt)
            .withOption(alphaOpt)
            .withOption(etaOpt)
            .withOption(maxIterOpt)
            .withOption(burnInOpt)
            .withOption(convergenceOpt)
            .withOption(dictOpt)
            .withOption(reInferDocTopicsOpt)
            .withOption(outputDocFileOpt)
            .withOption(outputTopicFileOpt)
            .withOption(dfsOpt)
            .withOption(numTrainThreadsOpt)
            .withOption(numUpdateThreadsOpt)
            .withOption(modelCorpusFractionOption)
            .withOption(verboseOpt)
            .create();

    try {
      Parser parser = new Parser();

      parser.setGroup(group);
      parser.setHelpOption(helpOpt);
      CommandLine cmdLine = parser.parse(args);
      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return -1;
      }

      String inputDirString = (String) cmdLine.getValue(inputDirOpt);
      String dictDirString = cmdLine.hasOption(dictOpt) ? (String) cmdLine.getValue(dictOpt) : null;
      int numTopics = Integer.parseInt((String) cmdLine.getValue(numTopicsOpt));
      double alpha = Double.parseDouble((String) cmdLine.getValue(alphaOpt));
      double eta = Double.parseDouble((String) cmdLine.getValue(etaOpt));
      int maxIterations = Integer.parseInt((String) cmdLine.getValue(maxIterOpt));
      int burnInIterations = (Integer) cmdLine.getValue(burnInOpt);
      double minFractionalErrorChange =
          Double.parseDouble((String) cmdLine.getValue(convergenceOpt));
      int numTrainThreads = Integer.parseInt((String) cmdLine.getValue(numTrainThreadsOpt));
      int numUpdateThreads = Integer.parseInt((String) cmdLine.getValue(numUpdateThreadsOpt));
      String topicOutFile = (String) cmdLine.getValue(outputTopicFileOpt);
      String docOutFile = (String) cmdLine.getValue(outputDocFileOpt);
      // String reInferDocTopics = (String)cmdLine.getValue(reInferDocTopicsOpt);
      boolean verbose = Boolean.parseBoolean((String) cmdLine.getValue(verboseOpt));
      double modelCorpusFraction = (Double) cmdLine.getValue(modelCorpusFractionOption);

      long start = System.nanoTime();

      if (conf.get("fs.default.name") == null) {
        String dfsNameNode = (String) cmdLine.getValue(dfsOpt);
        conf.set("fs.default.name", dfsNameNode);
      }
      String[] terms = loadDictionary(dictDirString, conf);
      logTime("dictionary loading", System.nanoTime() - start);
      start = System.nanoTime();
      Matrix corpus = loadVectors(inputDirString, conf);
      logTime("vector seqfile corpus loading", System.nanoTime() - start);
      start = System.nanoTime();
      InMemoryCollapsedVariationalBayes0 cvb0 =
          new InMemoryCollapsedVariationalBayes0(
              corpus,
              terms,
              numTopics,
              alpha,
              eta,
              numTrainThreads,
              numUpdateThreads,
              modelCorpusFraction);
      logTime("cvb0 init", System.nanoTime() - start);

      start = System.nanoTime();
      cvb0.setVerbose(verbose);
      cvb0.iterateUntilConvergence(minFractionalErrorChange, maxIterations, burnInIterations);
      logTime("total training time", System.nanoTime() - start);

      /*
      if ("randstart".equalsIgnoreCase(reInferDocTopics)) {
        cvb0.inferDocuments(0.0, 100, true);
      } else if ("continue".equalsIgnoreCase(reInferDocTopics)) {
        cvb0.inferDocuments(0.0, 100, false);
      }
       */

      start = System.nanoTime();
      cvb0.writeModel(new Path(topicOutFile));
      DistributedRowMatrixWriter.write(new Path(docOutFile), conf, cvb0.docTopicCounts);
      logTime("printTopics", System.nanoTime() - start);
    } catch (OptionException e) {
      log.error("Error while parsing options", e);
      CommandLineUtil.printHelp(group);
    }
    return 0;
  }
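
A conventional entry point can simply hand main2 a fresh Hadoop Configuration; a minimal sketch, assuming it sits in the same class:

  public static void main(String[] args) throws Exception {
    // new Configuration() picks up the default Hadoop settings from the classpath
    System.exit(main2(args, new Configuration()));
  }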
  @Override
  public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("numberOfColumns", "r", "Number of columns in the input matrix", false);
    addOption(
        "similarityClassname",
        "s",
        "Name of distributed similarity class to instantiate, alternatively use "
            + "one of the predefined similarities ("
            + VectorSimilarityMeasures.list()
            + ')');
    addOption(
        "maxSimilaritiesPerRow",
        "m",
        "Number of maximum similarities per row (default: "
            + DEFAULT_MAX_SIMILARITIES_PER_ROW
            + ')',
        String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW));
    addOption(
        "excludeSelfSimilarity",
        "ess",
        "compute similarity of rows to themselves?",
        String.valueOf(false));
    addOption("threshold", "tr", "discard row pairs with a similarity value below this", false);
    addOption(DefaultOptionCreator.overwriteOption().create());

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
      return -1;
    }

    int numberOfColumns;

    if (hasOption("numberOfColumns")) {
      // Number of columns explicitly specified via CLI
      numberOfColumns = Integer.parseInt(getOption("numberOfColumns"));
    } else {
      // else get the number of columns by determining the cardinality of a vector in the input
      // matrix
      numberOfColumns = getDimensions(getInputPath());
    }

    String similarityClassnameArg = getOption("similarityClassname");
    String similarityClassname;
    try {
      similarityClassname = VectorSimilarityMeasures.valueOf(similarityClassnameArg).getClassname();
    } catch (IllegalArgumentException iae) {
      similarityClassname = similarityClassnameArg;
    }

    // Clear the output and temp paths if the overwrite option has been set
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
      // Clear the temp path
      HadoopUtil.delete(getConf(), getTempPath());
      // Clear the output path
      HadoopUtil.delete(getConf(), getOutputPath());
    }

    int maxSimilaritiesPerRow = Integer.parseInt(getOption("maxSimilaritiesPerRow"));
    boolean excludeSelfSimilarity = Boolean.parseBoolean(getOption("excludeSelfSimilarity"));
    double threshold =
        hasOption("threshold") ? Double.parseDouble(getOption("threshold")) : NO_THRESHOLD;

    Path weightsPath = getTempPath("weights");
    Path normsPath = getTempPath("norms.bin");
    Path numNonZeroEntriesPath = getTempPath("numNonZeroEntries.bin");
    Path maxValuesPath = getTempPath("maxValues.bin");
    Path pairwiseSimilarityPath = getTempPath("pairwiseSimilarity");

    AtomicInteger currentPhase = new AtomicInteger();

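    // shouldRunNextPhase(...) is inherited from AbstractJob: it advances currentPhase and
    // returns false for phases that fall outside an optional --startPhase/--endPhase range,
    // which lets individual stages of this pipeline be re-run selectively.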
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job normsAndTranspose =
          prepareJob(
              getInputPath(),
              weightsPath,
              VectorNormMapper.class,
              IntWritable.class,
              VectorWritable.class,
              MergeVectorsReducer.class,
              IntWritable.class,
              VectorWritable.class);
      normsAndTranspose.setCombinerClass(MergeVectorsCombiner.class);
      Configuration normsAndTransposeConf = normsAndTranspose.getConfiguration();
      normsAndTransposeConf.set(THRESHOLD, String.valueOf(threshold));
      normsAndTransposeConf.set(NORMS_PATH, normsPath.toString());
      normsAndTransposeConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
      normsAndTransposeConf.set(MAXVALUES_PATH, maxValuesPath.toString());
      normsAndTransposeConf.set(SIMILARITY_CLASSNAME, similarityClassname);
      boolean succeeded = normsAndTranspose.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job pairwiseSimilarity =
          prepareJob(
              weightsPath,
              pairwiseSimilarityPath,
              CooccurrencesMapper.class,
              IntWritable.class,
              VectorWritable.class,
              SimilarityReducer.class,
              IntWritable.class,
              VectorWritable.class);
      pairwiseSimilarity.setCombinerClass(VectorSumReducer.class);
      Configuration pairwiseConf = pairwiseSimilarity.getConfiguration();
      pairwiseConf.set(THRESHOLD, String.valueOf(threshold));
      pairwiseConf.set(NORMS_PATH, normsPath.toString());
      pairwiseConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
      pairwiseConf.set(MAXVALUES_PATH, maxValuesPath.toString());
      pairwiseConf.set(SIMILARITY_CLASSNAME, similarityClassname);
      pairwiseConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns);
      pairwiseConf.setBoolean(EXCLUDE_SELF_SIMILARITY, excludeSelfSimilarity);
      boolean succeeded = pairwiseSimilarity.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job asMatrix =
          prepareJob(
              pairwiseSimilarityPath,
              getOutputPath(),
              UnsymmetrifyMapper.class,
              IntWritable.class,
              VectorWritable.class,
              MergeToTopKSimilaritiesReducer.class,
              IntWritable.class,
              VectorWritable.class);
      asMatrix.setCombinerClass(MergeToTopKSimilaritiesReducer.class);
      asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);
      boolean succeeded = asMatrix.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }
    }

    return 0;
  }
  /**
   * Takes in two arguments:
   *
   * <ol>
   *   <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live
   *   <li>The output {@link org.apache.hadoop.fs.Path} where to write the classifier as a {@link
   *       org.apache.hadoop.io.SequenceFile}
   * </ol>
   */
  public static void main(String[] args) throws IOException, InterruptedException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dirInputPathOpt = DefaultOptionCreator.inputOption().create();

    Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create();

    Option categoriesOpt =
        obuilder
            .withLongName("categories")
            .withRequired(true)
            .withArgument(abuilder.withName("categories").withMinimum(1).withMaximum(1).create())
            .withDescription(
                "Location of the categories file.  One entry per line. "
                    + "Will be used to make a string match in Wikipedia Category field")
            .withShortName("c")
            .create();

    Option exactMatchOpt =
        obuilder
            .withLongName("exactMatch")
            .withDescription(
                "If set, then the category name must exactly match the "
                    + "entry in the categories file. Default is false")
            .withShortName("e")
            .create();
    Option analyzerOpt =
        obuilder
            .withLongName("analyzer")
            .withRequired(false)
            .withArgument(abuilder.withName("analyzer").withMinimum(1).withMaximum(1).create())
            .withDescription("The analyzer to use, must have a no argument constructor")
            .withShortName("a")
            .create();
    Option helpOpt = DefaultOptionCreator.helpOption();

    Group group =
        gbuilder
            .withName("Options")
            .withOption(categoriesOpt)
            .withOption(dirInputPathOpt)
            .withOption(dirOutputPathOpt)
            .withOption(exactMatchOpt)
            .withOption(analyzerOpt)
            .withOption(helpOpt)
            .create();

    Parser parser = new Parser();
    parser.setGroup(group);
    try {
      CommandLine cmdLine = parser.parse(args);
      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return;
      }

      String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
      String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);
      String catFile = (String) cmdLine.getValue(categoriesOpt);
      Class<? extends Analyzer> analyzerClass = WikipediaAnalyzer.class;
      if (cmdLine.hasOption(analyzerOpt)) {
        String className = cmdLine.getValue(analyzerOpt).toString();
        analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
        // try instantiating it, b/c there isn't any point in setting it if
        // you can't instantiate it
        ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
      }
      runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), analyzerClass);
    } catch (OptionException | ClassNotFoundException e) {
      log.error("Exception", e);
      CommandLineUtil.printHelp(group);
    }
  }