Example #1
0
  /**
   * Parses the command line and stores the results in the static fields {@code inputFile},
   * {@code modelFile}, {@code showAuc}, {@code showScores} and {@code showConfusion}.
   *
   * <p>{@code --input} and {@code --model} are required and take at most one value each; {@code
   * --help}, {@code --quiet}, {@code --auc}, {@code --scores} and {@code --confusion} are plain
   * flags. The {@code --quiet} flag is accepted but not read by this method.
   *
   * @param args the raw command-line arguments
   * @return {@code true} when a command line was produced and the fields were populated; {@code
   *     false} when {@code parseAndHelp} returned {@code null}
   */
  private static boolean parseArgs(String[] args) {
    DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
    ArgumentBuilder valueBuilder = new ArgumentBuilder();

    // Flag-style options (no argument).
    Option helpOpt =
        optionBuilder.withDescription("print this list").withLongName("help").create();
    Option quietOpt =
        optionBuilder.withDescription("be extra quiet").withLongName("quiet").create();
    Option aucOpt = optionBuilder.withDescription("print AUC").withLongName("auc").create();
    Option confusionOpt =
        optionBuilder
            .withDescription("print confusion matrix")
            .withLongName("confusion")
            .create();
    Option scoresOpt =
        optionBuilder.withDescription("print scores").withLongName("scores").create();

    // Required options carrying at most one value.
    Option inputOpt =
        optionBuilder
            .withLongName("input")
            .withDescription("where to get training data")
            .withArgument(valueBuilder.withMaximum(1).withName("input").create())
            .withRequired(true)
            .create();
    Option modelOpt =
        optionBuilder
            .withLongName("model")
            .withDescription("where to get a model")
            .withArgument(valueBuilder.withMaximum(1).withName("model").create())
            .withRequired(true)
            .create();

    // Registration order is kept as-is.
    Group allOptions =
        new GroupBuilder()
            .withOption(helpOpt)
            .withOption(quietOpt)
            .withOption(aucOpt)
            .withOption(scoresOpt)
            .withOption(confusionOpt)
            .withOption(inputOpt)
            .withOption(modelOpt)
            .create();

    Parser cliParser = new Parser();
    cliParser.setGroup(allOptions);
    cliParser.setHelpOption(helpOpt);
    cliParser.setHelpTrigger("--help");
    cliParser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));

    CommandLine parsed = cliParser.parseAndHelp(args);
    if (parsed != null) {
      inputFile = getStringArgument(parsed, inputOpt);
      modelFile = getStringArgument(parsed, modelOpt);
      showAuc = getBooleanArgument(parsed, aucOpt);
      showScores = getBooleanArgument(parsed, scoresOpt);
      showConfusion = getBooleanArgument(parsed, confusionOpt);
      return true;
    }
    return false;
  }
  /**
   * Command-line entry point. Parses the options below and passes them to {@code runJob}.
   *
   * <ol>
   *   <li>{@code input} (required): the input {@link org.apache.hadoop.fs.Path} where the input
   *       documents live
   *   <li>{@code output} (required): the output {@link org.apache.hadoop.fs.Path} where to write
   *       the classifier as a {@link org.apache.hadoop.io.SequenceFile}
   *   <li>{@code categories} (optional): location of the categories file, one entry per line;
   *       an empty string is passed on when the option is absent
   *   <li>{@code exactMatch} (optional flag): the category name must exactly match an entry in
   *       the categories file
   *   <li>{@code all} (optional flag): select all files
   *   <li>{@code help} (optional flag): print the usage text and return without running the job
   * </ol>
   *
   * <p>Option-parsing errors, {@link InterruptedException} and {@link ClassNotFoundException} are
   * logged and followed by the usage text; they are not rethrown.
   *
   * @param args the raw command-line arguments
   * @throws IOException declared on this method; NOTE(review): presumably propagated from {@code
   *     runJob}, confirm against its signature
   */
  public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dirInputPathOpt =
        obuilder
            .withLongName("input")
            .withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription("The input directory path")
            .withShortName("i")
            .create();

    Option dirOutputPathOpt =
        obuilder
            .withLongName("output")
            .withRequired(true)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory Path")
            .withShortName("o")
            .create();

    Option categoriesOpt =
        obuilder
            .withLongName("categories")
            .withArgument(abuilder.withName("categories").withMinimum(1).withMaximum(1).create())
            .withDescription(
                "Location of the categories file.  One entry per line. "
                    + "Will be used to make a string match in Wikipedia Category field")
            .withShortName("c")
            .create();

    Option exactMatchOpt =
        obuilder
            .withLongName("exactMatch")
            .withDescription(
                "If set, then the category name must exactly match the "
                    + "entry in the categories file. Default is false")
            .withShortName("e")
            .create();

    Option allOpt =
        obuilder
            .withLongName("all")
            .withDescription("If set, Select all files. Default is false")
            .withShortName("all")
            .create();

    Option helpOpt =
        obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create();

    Group group =
        gbuilder
            .withName("Options")
            .withOption(categoriesOpt)
            .withOption(dirInputPathOpt)
            .withOption(dirOutputPathOpt)
            .withOption(exactMatchOpt)
            .withOption(allOpt)
            .withOption(helpOpt)
            .create();

    Parser parser = new Parser();
    parser.setGroup(group);
    parser.setHelpOption(helpOpt);
    try {
      CommandLine cmdLine = parser.parse(args);
      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return;
      }

      String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
      String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);

      // An absent categories option is represented by the empty string.
      String catFile = "";
      if (cmdLine.hasOption(categoriesOpt)) {
        catFile = (String) cmdLine.getValue(categoriesOpt);
      }

      boolean exactMatch = cmdLine.hasOption(exactMatchOpt);
      boolean all = cmdLine.hasOption(allOpt);
      runJob(inputPath, outputPath, catFile, exactMatch, all);
    } catch (OptionException e) {
      log.error("Exception", e);
      CommandLineUtil.printHelp(group);
    } catch (InterruptedException e) {
      // Restore the interrupt flag so code further up the stack can still observe it.
      Thread.currentThread().interrupt();
      log.error("Exception", e);
      CommandLineUtil.printHelp(group);
    } catch (ClassNotFoundException e) {
      log.error("Exception", e);
      CommandLineUtil.printHelp(group);
    }
  }
  /**
   * Parses the training command line, stores the results in the static fields of {@code
   * TrainLogistic} and configures the optimizer of {@code lrs}.
   *
   * <p>Options: {@code --input} and {@code --output} are required; {@code --passes} (default
   * "2"), {@code --lambda} (default "1e-4") and {@code --rate} (default "1e-3") are optional. Each
   * takes at most one value.
   *
   * @param args the raw command-line arguments
   * @return {@code true} when a command line was produced and the settings were applied; {@code
   *     false} when {@code parseAndHelp} returned {@code null}
   */
  public static boolean parseArgs(String[] args) {
    DefaultOptionBuilder builder = new DefaultOptionBuilder();

    Option help = builder.withLongName("help").withDescription("print this list").create();

    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
    Option inputFile =
        builder
            .withLongName("input")
            .withRequired(true)
            .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
            .withDescription("where to get training data")
            .create();

    // The help text here used to repeat the --input description verbatim.
    Option outputFile =
        builder
            .withLongName("output")
            .withRequired(true)
            .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
            .withDescription("where to write the output")
            .create();

    Option passes =
        builder
            .withLongName("passes")
            .withArgument(
                argumentBuilder.withName("passes").withDefault("2").withMaximum(1).create())
            .withDescription("the number of times to pass over the input data")
            .create();

    Option lambda =
        builder
            .withLongName("lambda")
            .withArgument(
                argumentBuilder.withName("lambda").withDefault("1e-4").withMaximum(1).create())
            .withDescription("the amount of coefficient decay to use")
            .create();

    Option rate =
        builder
            .withLongName("rate")
            .withArgument(
                argumentBuilder
                    .withName("learningRate")
                    .withDefault("1e-3")
                    .withMaximum(1)
                    .create())
            .withDescription("the learning rate")
            .create();

    Group normalArgs =
        new GroupBuilder()
            .withOption(help)
            .withOption(inputFile)
            .withOption(outputFile)
            .withOption(passes)
            .withOption(lambda)
            .withOption(rate)
            .create();

    Parser parser = new Parser();
    parser.setHelpOption(help);
    parser.setHelpTrigger("--help");
    parser.setGroup(normalArgs);
    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
    CommandLine cmdLine = parser.parseAndHelp(args);

    if (cmdLine == null) {
      return false;
    }

    // The local Option variables shadow the static fields, hence the class-qualified names.
    TrainLogistic.inputFile = getStringArgument(cmdLine, inputFile);
    TrainLogistic.outputFile = getStringArgument(cmdLine, outputFile);
    TrainLogistic.passes = getIntegerArgument(cmdLine, passes);

    // --rate becomes the step size, --lambda the regularization parameter and --passes the
    // iteration count; the updater and mini-batch fraction are fixed.
    lrs.optimizer()
        .setStepSize(getDoubleArgument(cmdLine, rate))
        .setUpdater(new L1Updater())
        .setRegParam(getDoubleArgument(cmdLine, lambda))
        .setNumIterations(TrainLogistic.passes)
        .setMiniBatchFraction(1.0);
    lrs.setIntercept(true);

    return true;
  }
  /**
   * Parses the command line and stores the results in the static fields {@code inputFile},
   * {@code modelFile}, {@code outputFile}, {@code idColumn} and {@code maxScoreOnly}.
   *
   * <p>{@code --input}, {@code --model}, {@code --output} and {@code --idcolumn} are required and
   * take at most one value each; {@code --help}, {@code --quiet} and {@code --maxscoreonly} are
   * plain flags. The {@code --quiet} flag is accepted but not read by this method.
   *
   * @param args the raw command-line arguments
   * @return {@code true} when a command line was produced and the fields were populated; {@code
   *     false} when {@code parseAndHelp} returned {@code null}
   */
  private static boolean parseArgs(String[] args) {
    DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
    ArgumentBuilder valueBuilder = new ArgumentBuilder();

    // Flag-style options (no argument).
    Option helpOpt =
        optionBuilder.withDescription("print this list").withLongName("help").create();
    Option quietOpt =
        optionBuilder.withDescription("be extra quiet").withLongName("quiet").create();
    Option maxScoreOnlyOpt =
        optionBuilder
            .withDescription("only output the target label with max scores")
            .withLongName("maxscoreonly")
            .create();

    // Required options carrying at most one value.
    Option inputOpt =
        optionBuilder
            .withLongName("input")
            .withDescription("where to get training data")
            .withArgument(valueBuilder.withMaximum(1).withName("input").create())
            .withRequired(true)
            .create();
    Option modelOpt =
        optionBuilder
            .withLongName("model")
            .withDescription("where to get the trained model")
            .withArgument(valueBuilder.withMaximum(1).withName("model").create())
            .withRequired(true)
            .create();
    Option outputOpt =
        optionBuilder
            .withLongName("output")
            .withDescription("the file path to output scores")
            .withArgument(valueBuilder.withMaximum(1).withName("output").create())
            .withRequired(true)
            .create();
    Option idColumnOpt =
        optionBuilder
            .withLongName("idcolumn")
            .withDescription("the name of the id column for each record")
            .withArgument(valueBuilder.withMaximum(1).withName("idcolumn").create())
            .withRequired(true)
            .create();

    // Registration order is kept as-is.
    Group allOptions =
        new GroupBuilder()
            .withOption(helpOpt)
            .withOption(quietOpt)
            .withOption(inputOpt)
            .withOption(modelOpt)
            .withOption(outputOpt)
            .withOption(idColumnOpt)
            .withOption(maxScoreOnlyOpt)
            .create();

    Parser cliParser = new Parser();
    cliParser.setGroup(allOptions);
    cliParser.setHelpOption(helpOpt);
    cliParser.setHelpTrigger("--help");
    cliParser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));

    CommandLine parsed = cliParser.parseAndHelp(args);
    if (parsed != null) {
      inputFile = getStringArgument(parsed, inputOpt);
      modelFile = getStringArgument(parsed, modelOpt);
      outputFile = getStringArgument(parsed, outputOpt);
      idColumn = getStringArgument(parsed, idColumnOpt);
      maxScoreOnly = getBooleanArgument(parsed, maxScoreOnlyOpt);
      return true;
    }
    return false;
  }
  /**
   * Parses the command line, loads the dictionary and corpus, trains an {@code
   * InMemoryCollapsedVariationalBayes0} model and writes the topic model and the document/topic
   * counts to the requested output paths.
   *
   * <p>Required options: {@code input}, {@code numTopics}, {@code topicOutputFile}, {@code
   * docOutputFile}. All other options are optional; most declare a default value. The {@code
   * reInferDocTopics} option is accepted but currently has no effect because the code that
   * consumed it is commented out.
   *
   * @param args the raw command-line arguments
   * @param conf the Hadoop configuration; when it has no {@code fs.default.name} and a {@code
   *     dfs} value was supplied, that value is stored under {@code fs.default.name}
   * @return {@code -1} when help was requested; {@code 0} otherwise, including when option
   *     parsing failed (the error is logged and the usage text printed)
   * @throws Exception anything other than an {@code OptionException} raised while loading,
   *     training or writing
   */
  public static int main2(String[] args, Configuration conf) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Option inputDirOpt =
        obuilder
            .withLongName("input")
            .withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription(
                "The Directory on HDFS containing the collapsed, properly formatted files having "
                    + "one doc per line")
            .withShortName("i")
            .create();

    Option dictOpt =
        obuilder
            .withLongName("dictionary")
            .withRequired(false)
            .withArgument(abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create())
            .withDescription("The path to the term-dictionary format is ... ")
            .withShortName("d")
            .create();

    Option dfsOpt =
        obuilder
            .withLongName("dfs")
            .withRequired(false)
            .withArgument(abuilder.withName("dfs").withMinimum(1).withMaximum(1).create())
            .withDescription("HDFS namenode URI")
            .withShortName("dfs")
            .create();

    Option numTopicsOpt =
        obuilder
            .withLongName("numTopics")
            .withRequired(true)
            .withArgument(abuilder.withName("numTopics").withMinimum(1).withMaximum(1).create())
            .withDescription("Number of topics to learn")
            .withShortName("top")
            .create();

    Option outputTopicFileOpt =
        obuilder
            .withLongName("topicOutputFile")
            .withRequired(true)
            .withArgument(
                abuilder.withName("topicOutputFile").withMinimum(1).withMaximum(1).create())
            .withDescription("File to write out p(term | topic)")
            .withShortName("to")
            .create();

    Option outputDocFileOpt =
        obuilder
            .withLongName("docOutputFile")
            .withRequired(true)
            .withArgument(abuilder.withName("docOutputFile").withMinimum(1).withMaximum(1).create())
            .withDescription("File to write out p(topic | docid)")
            .withShortName("do")
            .create();

    Option alphaOpt =
        obuilder
            .withLongName("alpha")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("alpha")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault("0.1")
                    .create())
            .withDescription("Smoothing parameter for p(topic | document) prior")
            .withShortName("a")
            .create();

    Option etaOpt =
        obuilder
            .withLongName("eta")
            .withRequired(false)
            .withArgument(
                abuilder.withName("eta").withMinimum(1).withMaximum(1).withDefault("0.1").create())
            .withDescription("Smoothing parameter for p(term | topic)")
            .withShortName("e")
            .create();

    // NOTE: this default is an Integer, not a String; see the value extraction below.
    Option maxIterOpt =
        obuilder
            .withLongName("maxIterations")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("maxIterations")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault(10)
                    .create())
            .withDescription("Maximum number of training passes")
            .withShortName("m")
            .create();

    // NOTE: this default is a Double, not a String; see the value extraction below.
    Option modelCorpusFractionOption =
        obuilder
            .withLongName("modelCorpusFraction")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("modelCorpusFraction")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault(0.0)
                    .create())
            .withShortName("mcf")
            .withDescription("For online updates, initial value of |model|/|corpus|")
            .create();

    // NOTE: this default is an Integer, not a String; see the value extraction below.
    Option burnInOpt =
        obuilder
            .withLongName("burnInIterations")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("burnInIterations")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault(5)
                    .create())
            .withDescription("Minimum number of iterations")
            .withShortName("b")
            .create();

    Option convergenceOpt =
        obuilder
            .withLongName("convergence")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("convergence")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault("0.0")
                    .create())
            .withDescription("Fractional rate of perplexity to consider convergence")
            .withShortName("c")
            .create();

    Option reInferDocTopicsOpt =
        obuilder
            .withLongName("reInferDocTopics")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("reInferDocTopics")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault("no")
                    .create())
            .withDescription("re-infer p(topic | doc) : [no | randstart | continue]")
            .withShortName("rdt")
            .create();

    Option numTrainThreadsOpt =
        obuilder
            .withLongName("numTrainThreads")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("numTrainThreads")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault("1")
                    .create())
            .withDescription("number of threads to train with")
            .withShortName("ntt")
            .create();

    Option numUpdateThreadsOpt =
        obuilder
            .withLongName("numUpdateThreads")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("numUpdateThreads")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault("1")
                    .create())
            .withDescription("number of threads to update the model with")
            .withShortName("nut")
            .create();

    Option verboseOpt =
        obuilder
            .withLongName("verbose")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("verbose")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault("false")
                    .create())
            .withDescription(
                "print verbose information, like top-terms in each topic, during iteration")
            .withShortName("v")
            .create();

    Group group =
        gbuilder
            .withName("Options")
            .withOption(inputDirOpt)
            .withOption(numTopicsOpt)
            .withOption(alphaOpt)
            .withOption(etaOpt)
            .withOption(maxIterOpt)
            .withOption(burnInOpt)
            .withOption(convergenceOpt)
            .withOption(dictOpt)
            .withOption(reInferDocTopicsOpt)
            .withOption(outputDocFileOpt)
            .withOption(outputTopicFileOpt)
            .withOption(dfsOpt)
            .withOption(numTrainThreadsOpt)
            .withOption(numUpdateThreadsOpt)
            .withOption(modelCorpusFractionOption)
            .withOption(verboseOpt)
            .create();

    try {
      Parser parser = new Parser();

      parser.setGroup(group);
      parser.setHelpOption(helpOpt);
      CommandLine cmdLine = parser.parse(args);
      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return -1;
      }

      String inputDirString = (String) cmdLine.getValue(inputDirOpt);
      String dictDirString = cmdLine.hasOption(dictOpt) ? (String) cmdLine.getValue(dictOpt) : null;
      int numTopics = Integer.parseInt((String) cmdLine.getValue(numTopicsOpt));
      double alpha = Double.parseDouble((String) cmdLine.getValue(alphaOpt));
      double eta = Double.parseDouble((String) cmdLine.getValue(etaOpt));
      // maxIterations, burnInIterations and modelCorpusFraction declare non-String defaults
      // (Integer / Double), whereas the other values in this method are read as Strings. A hard
      // cast to either type fails for one of the two cases, so normalise through
      // String.valueOf(...) before parsing.
      int maxIterations = Integer.parseInt(String.valueOf(cmdLine.getValue(maxIterOpt)));
      int burnInIterations = Integer.parseInt(String.valueOf(cmdLine.getValue(burnInOpt)));
      double minFractionalErrorChange =
          Double.parseDouble((String) cmdLine.getValue(convergenceOpt));
      int numTrainThreads = Integer.parseInt((String) cmdLine.getValue(numTrainThreadsOpt));
      int numUpdateThreads = Integer.parseInt((String) cmdLine.getValue(numUpdateThreadsOpt));
      String topicOutFile = (String) cmdLine.getValue(outputTopicFileOpt);
      String docOutFile = (String) cmdLine.getValue(outputDocFileOpt);
      // String reInferDocTopics = (String)cmdLine.getValue(reInferDocTopicsOpt);
      boolean verbose = Boolean.parseBoolean((String) cmdLine.getValue(verboseOpt));
      double modelCorpusFraction =
          Double.parseDouble(String.valueOf(cmdLine.getValue(modelCorpusFractionOption)));

      long start = System.nanoTime();

      if (conf.get("fs.default.name") == null) {
        // --dfs is optional and has no default, so only override when a value was supplied.
        String dfsNameNode = (String) cmdLine.getValue(dfsOpt);
        if (dfsNameNode != null) {
          conf.set("fs.default.name", dfsNameNode);
        }
      }
      String[] terms = loadDictionary(dictDirString, conf);
      logTime("dictionary loading", System.nanoTime() - start);
      start = System.nanoTime();
      Matrix corpus = loadVectors(inputDirString, conf);
      logTime("vector seqfile corpus loading", System.nanoTime() - start);
      start = System.nanoTime();
      InMemoryCollapsedVariationalBayes0 cvb0 =
          new InMemoryCollapsedVariationalBayes0(
              corpus,
              terms,
              numTopics,
              alpha,
              eta,
              numTrainThreads,
              numUpdateThreads,
              modelCorpusFraction);
      logTime("cvb0 init", System.nanoTime() - start);

      start = System.nanoTime();
      cvb0.setVerbose(verbose);
      cvb0.iterateUntilConvergence(minFractionalErrorChange, maxIterations, burnInIterations);
      logTime("total training time", System.nanoTime() - start);

      /*
      if ("randstart".equalsIgnoreCase(reInferDocTopics)) {
        cvb0.inferDocuments(0.0, 100, true);
      } else if ("continue".equalsIgnoreCase(reInferDocTopics)) {
        cvb0.inferDocuments(0.0, 100, false);
      }
       */

      start = System.nanoTime();
      cvb0.writeModel(new Path(topicOutFile));
      DistributedRowMatrixWriter.write(new Path(docOutFile), conf, cvb0.docTopicCounts);
      logTime("printTopics", System.nanoTime() - start);
    } catch (OptionException e) {
      log.error("Error while parsing options", e);
      CommandLineUtil.printHelp(group);
    }
    // NOTE(review): a parsing failure also ends up here and reports 0; kept for compatibility.
    return 0;
  }