/**
 * Parses the command line and copies the recognised values into the static fields
 * {@code inputFile}, {@code modelFile}, {@code showAuc}, {@code showScores} and
 * {@code showConfusion}.
 *
 * <p>The options {@code input} and {@code model} are required and accept at most one value each;
 * {@code help}, {@code quiet}, {@code auc}, {@code confusion} and {@code scores} are value-less
 * flags. The {@code quiet} flag is accepted but not read by this method.
 *
 * @param args the raw command-line arguments
 * @return {@code false} when {@code Parser.parseAndHelp} yields no command line, {@code true}
 *     once all fields have been populated
 */
private static boolean parseArgs(String[] args) {
  DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();

  // Value-less flags.
  Option helpFlag = optionBuilder.withDescription("print this list").withLongName("help").create();
  Option quietFlag = optionBuilder.withDescription("be extra quiet").withLongName("quiet").create();
  Option aucFlag = optionBuilder.withDescription("print AUC").withLongName("auc").create();
  Option confusionFlag =
      optionBuilder.withDescription("print confusion matrix").withLongName("confusion").create();
  Option scoresFlag = optionBuilder.withDescription("print scores").withLongName("scores").create();

  // Required options that carry a single value.
  ArgumentBuilder valueBuilder = new ArgumentBuilder();
  Option inputFileOption =
      optionBuilder
          .withDescription("where to get training data")
          .withLongName("input")
          .withArgument(valueBuilder.withName("input").withMaximum(1).create())
          .withRequired(true)
          .create();
  Option modelFileOption =
      optionBuilder
          .withDescription("where to get a model")
          .withLongName("model")
          .withArgument(valueBuilder.withName("model").withMaximum(1).create())
          .withRequired(true)
          .create();

  // The registration order below is the order used by the original group definition.
  GroupBuilder groupBuilder = new GroupBuilder();
  groupBuilder.withOption(helpFlag);
  groupBuilder.withOption(quietFlag);
  groupBuilder.withOption(aucFlag);
  groupBuilder.withOption(scoresFlag);
  groupBuilder.withOption(confusionFlag);
  groupBuilder.withOption(inputFileOption);
  groupBuilder.withOption(modelFileOption);
  Group allOptions = groupBuilder.create();

  Parser cliParser = new Parser();
  cliParser.setGroup(allOptions);
  cliParser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
  cliParser.setHelpOption(helpFlag);
  cliParser.setHelpTrigger("--help");

  CommandLine parsed = cliParser.parseAndHelp(args);
  if (parsed != null) {
    inputFile = getStringArgument(parsed, inputFileOption);
    modelFile = getStringArgument(parsed, modelFileOption);
    showAuc = getBooleanArgument(parsed, aucFlag);
    showScores = getBooleanArgument(parsed, scoresFlag);
    showConfusion = getBooleanArgument(parsed, confusionFlag);
    return true;
  }
  return false;
}
/** * Takes in two arguments: * * <ol> * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live * <li>The output {@link org.apache.hadoop.fs.Path} where to write the classifier as a {@link * org.apache.hadoop.io.SequenceFile} * </ol> */ public static void main(String[] args) throws IOException { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option dirInputPathOpt = obuilder .withLongName("input") .withRequired(true) .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create()) .withDescription("The input directory path") .withShortName("i") .create(); Option dirOutputPathOpt = obuilder .withLongName("output") .withRequired(true) .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create()) .withDescription("The output directory Path") .withShortName("o") .create(); Option categoriesOpt = obuilder .withLongName("categories") .withArgument(abuilder.withName("categories").withMinimum(1).withMaximum(1).create()) .withDescription( "Location of the categories file. One entry per line. " + "Will be used to make a string match in Wikipedia Category field") .withShortName("c") .create(); Option exactMatchOpt = obuilder .withLongName("exactMatch") .withDescription( "If set, then the category name must exactly match the " + "entry in the categories file. Default is false") .withShortName("e") .create(); Option allOpt = obuilder .withLongName("all") .withDescription("If set, Select all files. 
Default is false") .withShortName("all") .create(); Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create(); Group group = gbuilder .withName("Options") .withOption(categoriesOpt) .withOption(dirInputPathOpt) .withOption(dirOutputPathOpt) .withOption(exactMatchOpt) .withOption(allOpt) .withOption(helpOpt) .create(); Parser parser = new Parser(); parser.setGroup(group); parser.setHelpOption(helpOpt); try { CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } String inputPath = (String) cmdLine.getValue(dirInputPathOpt); String outputPath = (String) cmdLine.getValue(dirOutputPathOpt); String catFile = ""; if (cmdLine.hasOption(categoriesOpt)) { catFile = (String) cmdLine.getValue(categoriesOpt); } boolean all = false; if (cmdLine.hasOption(allOpt)) { all = true; } runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), all); } catch (OptionException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } catch (InterruptedException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } catch (ClassNotFoundException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } }
/**
 * Parses the training command line, stores the results in {@code TrainLogistic.inputFile},
 * {@code TrainLogistic.outputFile} and {@code TrainLogistic.passes}, and configures the
 * optimizer of {@code lrs}.
 *
 * <p>Options: {@code input} and {@code output} are required; {@code passes} (default "2"),
 * {@code lambda} (default "1e-4") and {@code rate} (default "1e-3") are optional. Each accepts
 * at most one value.
 *
 * <p>The optimizer receives the learning rate as step size, a new {@code L1Updater}, lambda as
 * the regularisation parameter, the number of passes as the iteration count and a mini-batch
 * fraction of 1.0; {@code lrs} is also told to use an intercept.
 *
 * @param args the raw command-line arguments
 * @return {@code false} when {@code Parser.parseAndHelp} yields no command line, {@code true}
 *     once the fields and the optimizer have been configured
 */
public static boolean parseArgs(String[] args) {
  DefaultOptionBuilder builder = new DefaultOptionBuilder();

  Option help = builder.withLongName("help").withDescription("print this list").create();

  // Locals carry an "Option" suffix so they do not shadow the static fields they populate.
  ArgumentBuilder argumentBuilder = new ArgumentBuilder();
  Option inputFileOption =
      builder
          .withLongName("input")
          .withRequired(true)
          .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
          .withDescription("where to get training data")
          .create();

  Option outputFileOption =
      builder
          .withLongName("output")
          .withRequired(true)
          .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
          // Previously repeated the --input help text ("where to get training data").
          .withDescription("where to write the output")
          .create();

  Option passesOption =
      builder
          .withLongName("passes")
          .withArgument(
              argumentBuilder.withName("passes").withDefault("2").withMaximum(1).create())
          .withDescription("the number of times to pass over the input data")
          .create();

  Option lambdaOption =
      builder
          .withLongName("lambda")
          .withArgument(
              argumentBuilder.withName("lambda").withDefault("1e-4").withMaximum(1).create())
          .withDescription("the amount of coefficient decay to use")
          .create();

  Option rateOption =
      builder
          .withLongName("rate")
          .withArgument(
              argumentBuilder
                  .withName("learningRate")
                  .withDefault("1e-3")
                  .withMaximum(1)
                  .create())
          .withDescription("the learning rate")
          .create();

  Group normalArgs =
      new GroupBuilder()
          .withOption(help)
          .withOption(inputFileOption)
          .withOption(outputFileOption)
          .withOption(passesOption)
          .withOption(lambdaOption)
          .withOption(rateOption)
          .create();

  Parser parser = new Parser();
  parser.setHelpOption(help);
  parser.setHelpTrigger("--help");
  parser.setGroup(normalArgs);
  parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));

  CommandLine cmdLine = parser.parseAndHelp(args);
  if (cmdLine == null) {
    return false;
  }

  TrainLogistic.inputFile = getStringArgument(cmdLine, inputFileOption);
  TrainLogistic.outputFile = getStringArgument(cmdLine, outputFileOption);
  TrainLogistic.passes = getIntegerArgument(cmdLine, passesOption);

  lrs.optimizer()
      .setStepSize(getDoubleArgument(cmdLine, rateOption))
      .setUpdater(new L1Updater())
      .setRegParam(getDoubleArgument(cmdLine, lambdaOption))
      .setNumIterations(TrainLogistic.passes)
      .setMiniBatchFraction(1.0);
  lrs.setIntercept(true);

  return true;
}
/**
 * Parses the command line and copies the recognised values into the static fields
 * {@code inputFile}, {@code modelFile}, {@code outputFile}, {@code idColumn} and
 * {@code maxScoreOnly}.
 *
 * <p>The options {@code input}, {@code model}, {@code output} and {@code idcolumn} are required
 * and accept at most one value each; {@code help}, {@code quiet} and {@code maxscoreonly} are
 * value-less flags. The {@code quiet} flag is accepted but not read by this method.
 *
 * @param args the raw command-line arguments
 * @return {@code false} when {@code Parser.parseAndHelp} yields no command line, {@code true}
 *     once all fields have been populated
 */
private static boolean parseArgs(String[] args) {
  DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();

  Option helpFlag = optionBuilder.withDescription("print this list").withLongName("help").create();
  Option quietFlag = optionBuilder.withDescription("be extra quiet").withLongName("quiet").create();

  // Required single-valued options.
  ArgumentBuilder valueBuilder = new ArgumentBuilder();
  Option inputFileOption =
      optionBuilder
          .withDescription("where to get training data")
          .withLongName("input")
          .withArgument(valueBuilder.withName("input").withMaximum(1).create())
          .withRequired(true)
          .create();
  Option modelFileOption =
      optionBuilder
          .withDescription("where to get the trained model")
          .withLongName("model")
          .withArgument(valueBuilder.withName("model").withMaximum(1).create())
          .withRequired(true)
          .create();
  Option outputFileOption =
      optionBuilder
          .withDescription("the file path to output scores")
          .withLongName("output")
          .withArgument(valueBuilder.withName("output").withMaximum(1).create())
          .withRequired(true)
          .create();
  Option idColumnOption =
      optionBuilder
          .withDescription("the name of the id column for each record")
          .withLongName("idcolumn")
          .withArgument(valueBuilder.withName("idcolumn").withMaximum(1).create())
          .withRequired(true)
          .create();

  Option maxScoreOnlyOption =
      optionBuilder
          .withDescription("only output the target label with max scores")
          .withLongName("maxscoreonly")
          .create();

  // The registration order below is the order used by the original group definition.
  GroupBuilder groupBuilder = new GroupBuilder();
  groupBuilder.withOption(helpFlag);
  groupBuilder.withOption(quietFlag);
  groupBuilder.withOption(inputFileOption);
  groupBuilder.withOption(modelFileOption);
  groupBuilder.withOption(outputFileOption);
  groupBuilder.withOption(idColumnOption);
  groupBuilder.withOption(maxScoreOnlyOption);
  Group allOptions = groupBuilder.create();

  Parser cliParser = new Parser();
  cliParser.setGroup(allOptions);
  cliParser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
  cliParser.setHelpOption(helpFlag);
  cliParser.setHelpTrigger("--help");

  CommandLine parsed = cliParser.parseAndHelp(args);
  if (parsed != null) {
    inputFile = getStringArgument(parsed, inputFileOption);
    modelFile = getStringArgument(parsed, modelFileOption);
    outputFile = getStringArgument(parsed, outputFileOption);
    idColumn = getStringArgument(parsed, idColumnOption);
    maxScoreOnly = getBooleanArgument(parsed, maxScoreOnlyOption);
    return true;
  }
  return false;
}
public static int main2(String[] args, Configuration conf) throws Exception { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option helpOpt = DefaultOptionCreator.helpOption(); Option inputDirOpt = obuilder .withLongName("input") .withRequired(true) .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create()) .withDescription( "The Directory on HDFS containing the collapsed, properly formatted files having " + "one doc per line") .withShortName("i") .create(); Option dictOpt = obuilder .withLongName("dictionary") .withRequired(false) .withArgument(abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create()) .withDescription("The path to the term-dictionary format is ... ") .withShortName("d") .create(); Option dfsOpt = obuilder .withLongName("dfs") .withRequired(false) .withArgument(abuilder.withName("dfs").withMinimum(1).withMaximum(1).create()) .withDescription("HDFS namenode URI") .withShortName("dfs") .create(); Option numTopicsOpt = obuilder .withLongName("numTopics") .withRequired(true) .withArgument(abuilder.withName("numTopics").withMinimum(1).withMaximum(1).create()) .withDescription("Number of topics to learn") .withShortName("top") .create(); Option outputTopicFileOpt = obuilder .withLongName("topicOutputFile") .withRequired(true) .withArgument( abuilder.withName("topicOutputFile").withMinimum(1).withMaximum(1).create()) .withDescription("File to write out p(term | topic)") .withShortName("to") .create(); Option outputDocFileOpt = obuilder .withLongName("docOutputFile") .withRequired(true) .withArgument(abuilder.withName("docOutputFile").withMinimum(1).withMaximum(1).create()) .withDescription("File to write out p(topic | docid)") .withShortName("do") .create(); Option alphaOpt = obuilder .withLongName("alpha") .withRequired(false) .withArgument( abuilder .withName("alpha") .withMinimum(1) .withMaximum(1) 
.withDefault("0.1") .create()) .withDescription("Smoothing parameter for p(topic | document) prior") .withShortName("a") .create(); Option etaOpt = obuilder .withLongName("eta") .withRequired(false) .withArgument( abuilder.withName("eta").withMinimum(1).withMaximum(1).withDefault("0.1").create()) .withDescription("Smoothing parameter for p(term | topic)") .withShortName("e") .create(); Option maxIterOpt = obuilder .withLongName("maxIterations") .withRequired(false) .withArgument( abuilder .withName("maxIterations") .withMinimum(1) .withMaximum(1) .withDefault(10) .create()) .withDescription("Maximum number of training passes") .withShortName("m") .create(); Option modelCorpusFractionOption = obuilder .withLongName("modelCorpusFraction") .withRequired(false) .withArgument( abuilder .withName("modelCorpusFraction") .withMinimum(1) .withMaximum(1) .withDefault(0.0) .create()) .withShortName("mcf") .withDescription("For online updates, initial value of |model|/|corpus|") .create(); Option burnInOpt = obuilder .withLongName("burnInIterations") .withRequired(false) .withArgument( abuilder .withName("burnInIterations") .withMinimum(1) .withMaximum(1) .withDefault(5) .create()) .withDescription("Minimum number of iterations") .withShortName("b") .create(); Option convergenceOpt = obuilder .withLongName("convergence") .withRequired(false) .withArgument( abuilder .withName("convergence") .withMinimum(1) .withMaximum(1) .withDefault("0.0") .create()) .withDescription("Fractional rate of perplexity to consider convergence") .withShortName("c") .create(); Option reInferDocTopicsOpt = obuilder .withLongName("reInferDocTopics") .withRequired(false) .withArgument( abuilder .withName("reInferDocTopics") .withMinimum(1) .withMaximum(1) .withDefault("no") .create()) .withDescription("re-infer p(topic | doc) : [no | randstart | continue]") .withShortName("rdt") .create(); Option numTrainThreadsOpt = obuilder .withLongName("numTrainThreads") .withRequired(false) .withArgument( abuilder 
.withName("numTrainThreads") .withMinimum(1) .withMaximum(1) .withDefault("1") .create()) .withDescription("number of threads to train with") .withShortName("ntt") .create(); Option numUpdateThreadsOpt = obuilder .withLongName("numUpdateThreads") .withRequired(false) .withArgument( abuilder .withName("numUpdateThreads") .withMinimum(1) .withMaximum(1) .withDefault("1") .create()) .withDescription("number of threads to update the model with") .withShortName("nut") .create(); Option verboseOpt = obuilder .withLongName("verbose") .withRequired(false) .withArgument( abuilder .withName("verbose") .withMinimum(1) .withMaximum(1) .withDefault("false") .create()) .withDescription( "print verbose information, like top-terms in each topic, during iteration") .withShortName("v") .create(); Group group = gbuilder .withName("Options") .withOption(inputDirOpt) .withOption(numTopicsOpt) .withOption(alphaOpt) .withOption(etaOpt) .withOption(maxIterOpt) .withOption(burnInOpt) .withOption(convergenceOpt) .withOption(dictOpt) .withOption(reInferDocTopicsOpt) .withOption(outputDocFileOpt) .withOption(outputTopicFileOpt) .withOption(dfsOpt) .withOption(numTrainThreadsOpt) .withOption(numUpdateThreadsOpt) .withOption(modelCorpusFractionOption) .withOption(verboseOpt) .create(); try { Parser parser = new Parser(); parser.setGroup(group); parser.setHelpOption(helpOpt); CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return -1; } String inputDirString = (String) cmdLine.getValue(inputDirOpt); String dictDirString = cmdLine.hasOption(dictOpt) ? 
(String) cmdLine.getValue(dictOpt) : null; int numTopics = Integer.parseInt((String) cmdLine.getValue(numTopicsOpt)); double alpha = Double.parseDouble((String) cmdLine.getValue(alphaOpt)); double eta = Double.parseDouble((String) cmdLine.getValue(etaOpt)); int maxIterations = Integer.parseInt((String) cmdLine.getValue(maxIterOpt)); int burnInIterations = (Integer) cmdLine.getValue(burnInOpt); double minFractionalErrorChange = Double.parseDouble((String) cmdLine.getValue(convergenceOpt)); int numTrainThreads = Integer.parseInt((String) cmdLine.getValue(numTrainThreadsOpt)); int numUpdateThreads = Integer.parseInt((String) cmdLine.getValue(numUpdateThreadsOpt)); String topicOutFile = (String) cmdLine.getValue(outputTopicFileOpt); String docOutFile = (String) cmdLine.getValue(outputDocFileOpt); // String reInferDocTopics = (String)cmdLine.getValue(reInferDocTopicsOpt); boolean verbose = Boolean.parseBoolean((String) cmdLine.getValue(verboseOpt)); double modelCorpusFraction = (Double) cmdLine.getValue(modelCorpusFractionOption); long start = System.nanoTime(); if (conf.get("fs.default.name") == null) { String dfsNameNode = (String) cmdLine.getValue(dfsOpt); conf.set("fs.default.name", dfsNameNode); } String[] terms = loadDictionary(dictDirString, conf); logTime("dictionary loading", System.nanoTime() - start); start = System.nanoTime(); Matrix corpus = loadVectors(inputDirString, conf); logTime("vector seqfile corpus loading", System.nanoTime() - start); start = System.nanoTime(); InMemoryCollapsedVariationalBayes0 cvb0 = new InMemoryCollapsedVariationalBayes0( corpus, terms, numTopics, alpha, eta, numTrainThreads, numUpdateThreads, modelCorpusFraction); logTime("cvb0 init", System.nanoTime() - start); start = System.nanoTime(); cvb0.setVerbose(verbose); cvb0.iterateUntilConvergence(minFractionalErrorChange, maxIterations, burnInIterations); logTime("total training time", System.nanoTime() - start); /* if ("randstart".equalsIgnoreCase(reInferDocTopics)) { 
cvb0.inferDocuments(0.0, 100, true); } else if ("continue".equalsIgnoreCase(reInferDocTopics)) { cvb0.inferDocuments(0.0, 100, false); } */ start = System.nanoTime(); cvb0.writeModel(new Path(topicOutFile)); DistributedRowMatrixWriter.write(new Path(docOutFile), conf, cvb0.docTopicCounts); logTime("printTopics", System.nanoTime() - start); } catch (OptionException e) { log.error("Error while parsing options", e); CommandLineUtil.printHelp(group); } return 0; }