Esempio n. 1
0
  /**
   * Parses the command line and stores the results in this class's static settings.
   *
   * <p>Recognized switches are {@code --help}, {@code --quiet}, {@code --auc}, {@code --scores}
   * and {@code --confusion}; {@code --input} and {@code --model} are required and take at most one
   * value each. {@code --quiet} is accepted but its value is not read by this method.
   *
   * @param args the raw command-line arguments
   * @return {@code false} if {@code parseAndHelp} produced no command line, {@code true} once the
   *     settings have been stored
   */
  private static boolean parseArgs(String[] args) {
    DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
    ArgumentBuilder valueBuilder = new ArgumentBuilder();

    // Value-less switches.
    Option helpOption =
        optionBuilder.withLongName("help").withDescription("print this list").create();
    Option quietOption =
        optionBuilder.withLongName("quiet").withDescription("be extra quiet").create();
    Option aucOption = optionBuilder.withLongName("auc").withDescription("print AUC").create();
    Option confusionOption =
        optionBuilder
            .withLongName("confusion")
            .withDescription("print confusion matrix")
            .create();
    Option scoresOption =
        optionBuilder.withLongName("scores").withDescription("print scores").create();

    // Required options that carry a single value.
    Option inputOption =
        optionBuilder
            .withLongName("input")
            .withDescription("where to get training data")
            .withRequired(true)
            .withArgument(valueBuilder.withName("input").withMaximum(1).create())
            .create();
    Option modelOption =
        optionBuilder
            .withLongName("model")
            .withDescription("where to get a model")
            .withRequired(true)
            .withArgument(valueBuilder.withName("model").withMaximum(1).create())
            .create();

    GroupBuilder groupBuilder = new GroupBuilder();
    groupBuilder.withOption(helpOption);
    groupBuilder.withOption(quietOption);
    groupBuilder.withOption(aucOption);
    groupBuilder.withOption(scoresOption);
    groupBuilder.withOption(confusionOption);
    groupBuilder.withOption(inputOption);
    groupBuilder.withOption(modelOption);
    Group allOptions = groupBuilder.create();

    Parser cliParser = new Parser();
    cliParser.setGroup(allOptions);
    cliParser.setHelpOption(helpOption);
    cliParser.setHelpTrigger("--help");
    cliParser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));

    CommandLine parsed = cliParser.parseAndHelp(args);
    if (parsed != null) {
      inputFile = getStringArgument(parsed, inputOption);
      modelFile = getStringArgument(parsed, modelOption);
      showAuc = getBooleanArgument(parsed, aucOption);
      showScores = getBooleanArgument(parsed, scoresOption);
      showConfusion = getBooleanArgument(parsed, confusionOption);
      return true;
    }
    return false;
  }
  /**
   * Command-line entry point: parses the options below and passes them to {@code runJob}.
   *
   * <ul>
   *   <li>{@code --input}/{@code -i} (required): the input directory path
   *   <li>{@code --output}/{@code -o} (required): the output directory path
   *   <li>{@code --categories}/{@code -c}: location of the categories file; an empty string is
   *       passed on when the option is omitted
   *   <li>{@code --exactMatch}/{@code -e}: switch, forwarded as a boolean
   *   <li>{@code --all}/{@code -all}: switch, forwarded as a boolean
   *   <li>{@code --help}/{@code -h}: print the usage text and return
   * </ul>
   *
   * <p>Option-parsing errors, interruption and {@link ClassNotFoundException} are logged and
   * followed by the usage text; they are not rethrown.
   *
   * @param args the raw command-line arguments
   * @throws IOException if thrown while running the job
   */
  public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dirInputPathOpt =
        obuilder
            .withLongName("input")
            .withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription("The input directory path")
            .withShortName("i")
            .create();

    Option dirOutputPathOpt =
        obuilder
            .withLongName("output")
            .withRequired(true)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory Path")
            .withShortName("o")
            .create();

    Option categoriesOpt =
        obuilder
            .withLongName("categories")
            .withArgument(abuilder.withName("categories").withMinimum(1).withMaximum(1).create())
            .withDescription(
                "Location of the categories file.  One entry per line. "
                    + "Will be used to make a string match in Wikipedia Category field")
            .withShortName("c")
            .create();

    Option exactMatchOpt =
        obuilder
            .withLongName("exactMatch")
            .withDescription(
                "If set, then the category name must exactly match the "
                    + "entry in the categories file. Default is false")
            .withShortName("e")
            .create();

    Option allOpt =
        obuilder
            .withLongName("all")
            .withDescription("If set, Select all files. Default is false")
            .withShortName("all")
            .create();

    Option helpOpt =
        obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create();

    Group group =
        gbuilder
            .withName("Options")
            .withOption(categoriesOpt)
            .withOption(dirInputPathOpt)
            .withOption(dirOutputPathOpt)
            .withOption(exactMatchOpt)
            .withOption(allOpt)
            .withOption(helpOpt)
            .create();

    Parser parser = new Parser();
    parser.setGroup(group);
    parser.setHelpOption(helpOpt);
    try {
      CommandLine cmdLine = parser.parse(args);
      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return;
      }

      String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
      String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);

      // The categories file is optional; an empty string stands in for "not given".
      String catFile = "";
      if (cmdLine.hasOption(categoriesOpt)) {
        catFile = (String) cmdLine.getValue(categoriesOpt);
      }

      boolean exactMatch = cmdLine.hasOption(exactMatchOpt);
      boolean all = cmdLine.hasOption(allOpt);
      runJob(inputPath, outputPath, catFile, exactMatch, all);
    } catch (OptionException e) {
      log.error("Exception", e);
      CommandLineUtil.printHelp(group);
    } catch (InterruptedException e) {
      // Restore the interrupt status so code further up the stack can still observe it.
      Thread.currentThread().interrupt();
      log.error("Exception", e);
      CommandLineUtil.printHelp(group);
    } catch (ClassNotFoundException e) {
      log.error("Exception", e);
      CommandLineUtil.printHelp(group);
    }
  }
Esempio n. 3
0
  /**
   * Parses the training command line and configures {@code TrainLogistic}'s static state.
   *
   * <p>{@code --input} and {@code --output} are required. {@code --passes} (default "2"), {@code
   * --lambda} (default "1e-4") and {@code --rate} (default "1e-3") are optional. After parsing,
   * the values are stored in the static fields and applied to the optimizer of {@code lrs}.
   *
   * @param args the raw command-line arguments
   * @return {@code false} if {@code parseAndHelp} produced no command line, {@code true} once the
   *     configuration has been applied
   */
  public static boolean parseArgs(String[] args) {
    DefaultOptionBuilder builder = new DefaultOptionBuilder();

    Option help = builder.withLongName("help").withDescription("print this list").create();

    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
    Option inputFile =
        builder
            .withLongName("input")
            .withRequired(true)
            .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
            .withDescription("where to get training data")
            .create();

    // The description used to repeat the --input text; it now describes the output option.
    Option outputFile =
        builder
            .withLongName("output")
            .withRequired(true)
            .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
            .withDescription("where to write the output")
            .create();

    Option passes =
        builder
            .withLongName("passes")
            .withArgument(
                argumentBuilder.withName("passes").withDefault("2").withMaximum(1).create())
            .withDescription("the number of times to pass over the input data")
            .create();

    Option lambda =
        builder
            .withLongName("lambda")
            .withArgument(
                argumentBuilder.withName("lambda").withDefault("1e-4").withMaximum(1).create())
            .withDescription("the amount of coefficient decay to use")
            .create();

    Option rate =
        builder
            .withLongName("rate")
            .withArgument(
                argumentBuilder
                    .withName("learningRate")
                    .withDefault("1e-3")
                    .withMaximum(1)
                    .create())
            .withDescription("the learning rate")
            .create();

    Group normalArgs =
        new GroupBuilder()
            .withOption(help)
            .withOption(inputFile)
            .withOption(outputFile)
            .withOption(passes)
            .withOption(lambda)
            .withOption(rate)
            .create();

    Parser parser = new Parser();
    parser.setHelpOption(help);
    parser.setHelpTrigger("--help");
    parser.setGroup(normalArgs);
    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
    CommandLine cmdLine = parser.parseAndHelp(args);

    if (cmdLine == null) {
      return false;
    }

    // The local Option variables shadow the static fields of the same name, hence the
    // explicit TrainLogistic qualification on the assignments below.
    TrainLogistic.inputFile = getStringArgument(cmdLine, inputFile);
    TrainLogistic.outputFile = getStringArgument(cmdLine, outputFile);
    TrainLogistic.passes = getIntegerArgument(cmdLine, passes);

    lrs.optimizer()
        .setStepSize(getDoubleArgument(cmdLine, rate))
        .setUpdater(new L1Updater())
        .setRegParam(getDoubleArgument(cmdLine, lambda))
        .setNumIterations(TrainLogistic.passes)
        .setMiniBatchFraction(1.0);
    lrs.setIntercept(true);

    return true;
  }
Esempio n. 4
0
  /**
   * Command-line entry point: parses the options, fills a {@code Parameters} object and starts
   * the job through {@code KeyBasedStringTupleGrouper.startJob}.
   *
   * <p>Besides the input, output and help options obtained from {@code DefaultOptionCreator}, it
   * accepts {@code --splitterPattern}/{@code -regex} and {@code --encoding}/{@code -e}. The
   * encoding falls back to "UTF-8" when not given. The grouping and field parameters are fixed
   * values set in this method. If option parsing fails, the usage text is printed.
   *
   * @param args the raw command-line arguments
   */
  public static void main(String[] args)
      throws IOException, InterruptedException, ClassNotFoundException {
    DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
    ArgumentBuilder valueBuilder = new ArgumentBuilder();
    GroupBuilder groupBuilder = new GroupBuilder();

    Option inputOption = DefaultOptionCreator.inputOption().create();
    Option outputOption = DefaultOptionCreator.outputOption().create();
    Option helpOption = DefaultOptionCreator.helpOption();

    Option splitterOption =
        optionBuilder
            .withLongName("splitterPattern")
            .withArgument(
                valueBuilder.withName("splitterPattern").withMinimum(1).withMaximum(1).create())
            .withDescription(
                "Regular Expression pattern used to split given line into fields."
                    + " Default value splits comma or tab separated fields."
                    + " Default Value: \"[ ,\\t]*\\t[ ,\\t]*\" ")
            .withShortName("regex")
            .create();

    Option charsetOption =
        optionBuilder
            .withLongName("encoding")
            .withArgument(valueBuilder.withName("encoding").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) The file encoding.  Default value: UTF-8")
            .withShortName("e")
            .create();

    groupBuilder.withName("Options");
    groupBuilder.withOption(inputOption);
    groupBuilder.withOption(outputOption);
    groupBuilder.withOption(helpOption);
    groupBuilder.withOption(splitterOption);
    groupBuilder.withOption(charsetOption);
    Group options = groupBuilder.create();

    try {
      Parser cliParser = new Parser();
      cliParser.setGroup(options);
      CommandLine parsed = cliParser.parse(args);

      if (parsed.hasOption(helpOption)) {
        CommandLineUtil.printHelp(options);
        return;
      }

      Parameters jobParams = new Parameters();

      // The split pattern is only forwarded when the user supplied one.
      if (parsed.hasOption(splitterOption)) {
        jobParams.set("splitPattern", (String) parsed.getValue(splitterOption));
      }

      String charsetName =
          parsed.hasOption(charsetOption) ? (String) parsed.getValue(charsetOption) : "UTF-8";
      jobParams.set("encoding", charsetName);

      jobParams.set("input", (String) parsed.getValue(inputOption));
      jobParams.set("output", (String) parsed.getValue(outputOption));

      // Fixed grouping and field selection for this driver.
      jobParams.set("groupingFieldCount", "2");
      jobParams.set("gfield0", "1");
      jobParams.set("gfield1", "2");
      jobParams.set("selectedFieldCount", "1");
      jobParams.set("field0", "3");
      jobParams.set("maxTransactionLength", "100");

      KeyBasedStringTupleGrouper.startJob(jobParams);
    } catch (OptionException ex) {
      CommandLineUtil.printHelp(options);
    }
  }
Esempio n. 5
0
  /**
   * Parses the scoring command line and stores the results in this class's static settings.
   *
   * <p>{@code --input}, {@code --model}, {@code --output} and {@code --idcolumn} are required and
   * take at most one value each. {@code --maxscoreonly}, {@code --quiet} and {@code --help} are
   * value-less switches; {@code --quiet} is accepted but its value is not read by this method.
   *
   * @param args the raw command-line arguments
   * @return {@code false} if {@code parseAndHelp} produced no command line, {@code true} once the
   *     settings have been stored
   */
  private static boolean parseArgs(String[] args) {
    DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
    ArgumentBuilder valueBuilder = new ArgumentBuilder();

    Option helpOption =
        optionBuilder.withLongName("help").withDescription("print this list").create();
    Option quietOption =
        optionBuilder.withLongName("quiet").withDescription("be extra quiet").create();

    // Required single-valued options.
    Option inputOption =
        optionBuilder
            .withLongName("input")
            .withDescription("where to get training data")
            .withRequired(true)
            .withArgument(valueBuilder.withName("input").withMaximum(1).create())
            .create();
    Option modelOption =
        optionBuilder
            .withLongName("model")
            .withDescription("where to get the trained model")
            .withRequired(true)
            .withArgument(valueBuilder.withName("model").withMaximum(1).create())
            .create();
    Option outputOption =
        optionBuilder
            .withLongName("output")
            .withDescription("the file path to output scores")
            .withRequired(true)
            .withArgument(valueBuilder.withName("output").withMaximum(1).create())
            .create();
    Option idOption =
        optionBuilder
            .withLongName("idcolumn")
            .withDescription("the name of the id column for each record")
            .withRequired(true)
            .withArgument(valueBuilder.withName("idcolumn").withMaximum(1).create())
            .create();

    Option maxOnlyOption =
        optionBuilder
            .withLongName("maxscoreonly")
            .withDescription("only output the target label with max scores")
            .create();

    GroupBuilder groupBuilder = new GroupBuilder();
    groupBuilder.withOption(helpOption);
    groupBuilder.withOption(quietOption);
    groupBuilder.withOption(inputOption);
    groupBuilder.withOption(modelOption);
    groupBuilder.withOption(outputOption);
    groupBuilder.withOption(idOption);
    groupBuilder.withOption(maxOnlyOption);
    Group allOptions = groupBuilder.create();

    Parser cliParser = new Parser();
    cliParser.setGroup(allOptions);
    cliParser.setHelpOption(helpOption);
    cliParser.setHelpTrigger("--help");
    cliParser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));

    CommandLine parsed = cliParser.parseAndHelp(args);
    if (parsed != null) {
      inputFile = getStringArgument(parsed, inputOption);
      modelFile = getStringArgument(parsed, modelOption);
      outputFile = getStringArgument(parsed, outputOption);
      idColumn = getStringArgument(parsed, idOption);
      maxScoreOnly = getBooleanArgument(parsed, maxOnlyOption);
      return true;
    }
    return false;
  }
Esempio n. 6
0
  /**
   * Command-line entry point: converts one ARFF file, or every {@code .arff} file in a directory,
   * by calling {@code writeFile} for each input.
   *
   * <ul>
   *   <li>{@code --input}/{@code -d} (required): file or directory to convert
   *   <li>{@code --output}/{@code -o} (required): output directory
   *   <li>{@code --dictOut}/{@code -t} (required): file for the label bindings
   *   <li>{@code --max}/{@code -m}: maximum number of vectors; defaults to {@link Long#MAX_VALUE}
   *   <li>{@code --delimiter}/{@code -l}: dictionary delimiter; defaults to a tab
   *   <li>{@code --json-dictonary}/{@code -j}: switch, forwarded as a boolean
   *   <li>{@code --help}/{@code -h}: print the usage text and return
   * </ul>
   *
   * @param args the raw command-line arguments
   * @throws IOException if the input directory cannot be listed, or if thrown while writing
   * @throws IllegalArgumentException if {@code --max} is negative
   */
  public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt =
        obuilder
            .withLongName("input")
            .withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription(
                "The file or directory containing the ARFF files.  If it is a directory, all .arff files will be converted")
            .withShortName("d")
            .create();

    Option outputOpt =
        obuilder
            .withLongName("output")
            .withRequired(true)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription(
                "The output directory.  Files will have the same name as the input, but with the extension .mvc")
            .withShortName("o")
            .create();

    Option maxOpt =
        obuilder
            .withLongName("max")
            .withRequired(false)
            .withArgument(abuilder.withName("max").withMinimum(1).withMaximum(1).create())
            .withDescription(
                "The maximum number of vectors to output.  If not specified, then it will loop over all docs")
            .withShortName("m")
            .create();

    Option dictOutOpt =
        obuilder
            .withLongName("dictOut")
            .withRequired(true)
            .withArgument(abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create())
            .withDescription("The file to output the label bindings")
            .withShortName("t")
            .create();

    Option jsonDictonaryOpt =
        obuilder
            .withLongName("json-dictonary")
            .withRequired(false)
            .withDescription("Write dictonary in JSON format")
            .withShortName("j")
            .create();

    Option delimiterOpt =
        obuilder
            .withLongName("delimiter")
            .withRequired(false)
            .withArgument(abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create())
            .withDescription("The delimiter for outputing the dictionary")
            .withShortName("l")
            .create();

    Option helpOpt =
        obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create();
    Group group =
        gbuilder
            .withName("Options")
            .withOption(inputOpt)
            .withOption(outputOpt)
            .withOption(maxOpt)
            .withOption(helpOpt)
            .withOption(dictOutOpt)
            .withOption(jsonDictonaryOpt)
            .withOption(delimiterOpt)
            .create();

    try {
      Parser parser = new Parser();
      parser.setGroup(group);
      CommandLine cmdLine = parser.parse(args);

      if (cmdLine.hasOption(helpOpt)) {

        CommandLineUtil.printHelp(group);
        return;
      }
      if (cmdLine.hasOption(inputOpt)) {
        File input = new File(cmdLine.getValue(inputOpt).toString());
        long maxDocs = Long.MAX_VALUE;
        if (cmdLine.hasOption(maxOpt)) {
          maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString());
        }
        if (maxDocs < 0) {
          throw new IllegalArgumentException("maxDocs must be >= 0");
        }
        String outDir = cmdLine.getValue(outputOpt).toString();
        log.info("Output Dir: {}", outDir);

        String delimiter =
            cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString() : "\t";
        File dictOut = new File(cmdLine.getValue(dictOutOpt).toString());
        boolean jsonDictonary = cmdLine.hasOption(jsonDictonaryOpt);
        // A single model instance is shared by every file converted in this run.
        ARFFModel model = new MapBackedARFFModel();
        if (input.exists() && input.isDirectory()) {
          File[] files =
              input.listFiles(
                  new FilenameFilter() {
                    @Override
                    public boolean accept(File file, String name) {
                      return name.endsWith(".arff");
                    }
                  });

          // listFiles returns null when the directory cannot be read; fail with a clear
          // message instead of a NullPointerException in the loop below.
          if (files == null) {
            throw new IOException("Unable to list .arff files in directory: " + input);
          }

          for (File file : files) {
            writeFile(outDir, file, maxDocs, model, dictOut, delimiter, jsonDictonary);
          }
        } else {
          writeFile(outDir, input, maxDocs, model, dictOut, delimiter, jsonDictonary);
        }
      }

    } catch (OptionException e) {
      log.error("Exception", e);
      CommandLineUtil.printHelp(group);
    }
  }
  /**
   * Command-line driver for an in-memory collapsed variational Bayes (CVB0) run: parses the
   * options, loads the dictionary and corpus, trains, and writes the topic and document outputs.
   *
   * <p>Required options are {@code --input}, {@code --numTopics}, {@code --topicOutputFile} and
   * {@code --docOutputFile}. The other options fall back to the defaults declared on their
   * arguments. {@code --reInferDocTopics} is accepted but its value is currently not used.
   *
   * <p>Numeric option values are converted through {@code toString()} before parsing. A value
   * typed on the command line arrives as a {@code String}, while some defaults are declared as
   * boxed numbers, so direct casts would throw {@link ClassCastException} for one of the two.
   *
   * @param args the raw command-line arguments
   * @param conf the configuration to use; if it has no {@code fs.default.name}, the value of
   *     {@code --dfs} is applied when one was supplied
   * @return {@code 0} after a completed run; {@code -1} if help was requested or the options
   *     could not be parsed
   * @throws Exception if thrown while loading, training or writing
   */
  public static int main2(String[] args, Configuration conf) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Option inputDirOpt =
        obuilder
            .withLongName("input")
            .withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription(
                "The Directory on HDFS containing the collapsed, properly formatted files having "
                    + "one doc per line")
            .withShortName("i")
            .create();

    Option dictOpt =
        obuilder
            .withLongName("dictionary")
            .withRequired(false)
            .withArgument(abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create())
            .withDescription("The path to the term-dictionary format is ... ")
            .withShortName("d")
            .create();

    Option dfsOpt =
        obuilder
            .withLongName("dfs")
            .withRequired(false)
            .withArgument(abuilder.withName("dfs").withMinimum(1).withMaximum(1).create())
            .withDescription("HDFS namenode URI")
            .withShortName("dfs")
            .create();

    Option numTopicsOpt =
        obuilder
            .withLongName("numTopics")
            .withRequired(true)
            .withArgument(abuilder.withName("numTopics").withMinimum(1).withMaximum(1).create())
            .withDescription("Number of topics to learn")
            .withShortName("top")
            .create();

    Option outputTopicFileOpt =
        obuilder
            .withLongName("topicOutputFile")
            .withRequired(true)
            .withArgument(
                abuilder.withName("topicOutputFile").withMinimum(1).withMaximum(1).create())
            .withDescription("File to write out p(term | topic)")
            .withShortName("to")
            .create();

    Option outputDocFileOpt =
        obuilder
            .withLongName("docOutputFile")
            .withRequired(true)
            .withArgument(abuilder.withName("docOutputFile").withMinimum(1).withMaximum(1).create())
            .withDescription("File to write out p(topic | docid)")
            .withShortName("do")
            .create();

    Option alphaOpt =
        obuilder
            .withLongName("alpha")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("alpha")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault("0.1")
                    .create())
            .withDescription("Smoothing parameter for p(topic | document) prior")
            .withShortName("a")
            .create();

    Option etaOpt =
        obuilder
            .withLongName("eta")
            .withRequired(false)
            .withArgument(
                abuilder.withName("eta").withMinimum(1).withMaximum(1).withDefault("0.1").create())
            .withDescription("Smoothing parameter for p(term | topic)")
            .withShortName("e")
            .create();

    Option maxIterOpt =
        obuilder
            .withLongName("maxIterations")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("maxIterations")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault(10)
                    .create())
            .withDescription("Maximum number of training passes")
            .withShortName("m")
            .create();

    Option modelCorpusFractionOption =
        obuilder
            .withLongName("modelCorpusFraction")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("modelCorpusFraction")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault(0.0)
                    .create())
            .withShortName("mcf")
            .withDescription("For online updates, initial value of |model|/|corpus|")
            .create();

    Option burnInOpt =
        obuilder
            .withLongName("burnInIterations")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("burnInIterations")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault(5)
                    .create())
            .withDescription("Minimum number of iterations")
            .withShortName("b")
            .create();

    Option convergenceOpt =
        obuilder
            .withLongName("convergence")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("convergence")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault("0.0")
                    .create())
            .withDescription("Fractional rate of perplexity to consider convergence")
            .withShortName("c")
            .create();

    Option reInferDocTopicsOpt =
        obuilder
            .withLongName("reInferDocTopics")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("reInferDocTopics")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault("no")
                    .create())
            .withDescription("re-infer p(topic | doc) : [no | randstart | continue]")
            .withShortName("rdt")
            .create();

    Option numTrainThreadsOpt =
        obuilder
            .withLongName("numTrainThreads")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("numTrainThreads")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault("1")
                    .create())
            .withDescription("number of threads to train with")
            .withShortName("ntt")
            .create();

    Option numUpdateThreadsOpt =
        obuilder
            .withLongName("numUpdateThreads")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("numUpdateThreads")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault("1")
                    .create())
            .withDescription("number of threads to update the model with")
            .withShortName("nut")
            .create();

    Option verboseOpt =
        obuilder
            .withLongName("verbose")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("verbose")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault("false")
                    .create())
            .withDescription(
                "print verbose information, like top-terms in each topic, during iteration")
            .withShortName("v")
            .create();

    // The help option must be part of the group, otherwise the parser does not recognize it
    // on the command line and the hasOption(helpOpt) check below can never succeed.
    Group group =
        gbuilder
            .withName("Options")
            .withOption(inputDirOpt)
            .withOption(numTopicsOpt)
            .withOption(alphaOpt)
            .withOption(etaOpt)
            .withOption(maxIterOpt)
            .withOption(burnInOpt)
            .withOption(convergenceOpt)
            .withOption(dictOpt)
            .withOption(reInferDocTopicsOpt)
            .withOption(outputDocFileOpt)
            .withOption(outputTopicFileOpt)
            .withOption(dfsOpt)
            .withOption(numTrainThreadsOpt)
            .withOption(numUpdateThreadsOpt)
            .withOption(modelCorpusFractionOption)
            .withOption(verboseOpt)
            .withOption(helpOpt)
            .create();

    try {
      Parser parser = new Parser();

      parser.setGroup(group);
      parser.setHelpOption(helpOpt);
      CommandLine cmdLine = parser.parse(args);
      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return -1;
      }

      String inputDirString = (String) cmdLine.getValue(inputDirOpt);
      String dictDirString = cmdLine.hasOption(dictOpt) ? (String) cmdLine.getValue(dictOpt) : null;
      // toString() before parsing: values may be Strings (typed by the user) or boxed
      // numbers (declared defaults), so a direct cast is not safe for either type.
      int numTopics = Integer.parseInt(cmdLine.getValue(numTopicsOpt).toString());
      double alpha = Double.parseDouble(cmdLine.getValue(alphaOpt).toString());
      double eta = Double.parseDouble(cmdLine.getValue(etaOpt).toString());
      int maxIterations = Integer.parseInt(cmdLine.getValue(maxIterOpt).toString());
      int burnInIterations = Integer.parseInt(cmdLine.getValue(burnInOpt).toString());
      double minFractionalErrorChange =
          Double.parseDouble(cmdLine.getValue(convergenceOpt).toString());
      int numTrainThreads = Integer.parseInt(cmdLine.getValue(numTrainThreadsOpt).toString());
      int numUpdateThreads = Integer.parseInt(cmdLine.getValue(numUpdateThreadsOpt).toString());
      String topicOutFile = (String) cmdLine.getValue(outputTopicFileOpt);
      String docOutFile = (String) cmdLine.getValue(outputDocFileOpt);
      boolean verbose = Boolean.parseBoolean(cmdLine.getValue(verboseOpt).toString());
      double modelCorpusFraction =
          Double.parseDouble(cmdLine.getValue(modelCorpusFractionOption).toString());

      long start = System.nanoTime();

      if (conf.get("fs.default.name") == null) {
        // --dfs is optional; only override the configuration when a value was supplied.
        String dfsNameNode = (String) cmdLine.getValue(dfsOpt);
        if (dfsNameNode != null) {
          conf.set("fs.default.name", dfsNameNode);
        }
      }
      String[] terms = loadDictionary(dictDirString, conf);
      logTime("dictionary loading", System.nanoTime() - start);
      start = System.nanoTime();
      Matrix corpus = loadVectors(inputDirString, conf);
      logTime("vector seqfile corpus loading", System.nanoTime() - start);
      start = System.nanoTime();
      InMemoryCollapsedVariationalBayes0 cvb0 =
          new InMemoryCollapsedVariationalBayes0(
              corpus,
              terms,
              numTopics,
              alpha,
              eta,
              numTrainThreads,
              numUpdateThreads,
              modelCorpusFraction);
      logTime("cvb0 init", System.nanoTime() - start);

      start = System.nanoTime();
      cvb0.setVerbose(verbose);
      cvb0.iterateUntilConvergence(minFractionalErrorChange, maxIterations, burnInIterations);
      logTime("total training time", System.nanoTime() - start);

      // NOTE: re-inference of document topics (--reInferDocTopics) is not performed here;
      // the option is parsed but its value is ignored.

      start = System.nanoTime();
      cvb0.writeModel(new Path(topicOutFile));
      DistributedRowMatrixWriter.write(new Path(docOutFile), conf, cvb0.docTopicCounts);
      logTime("printTopics", System.nanoTime() - start);
    } catch (OptionException e) {
      log.error("Error while parsing options", e);
      CommandLineUtil.printHelp(group);
      // A command line that could not be parsed must not be reported as success.
      return -1;
    }
    return 0;
  }
  /**
   * Command-line entry point: parses the options below and passes them to {@code runJob}.
   *
   * <ul>
   *   <li>the input and output options supplied by {@code DefaultOptionCreator}
   *   <li>{@code --categories}/{@code -c} (required): location of the categories file
   *   <li>{@code --exactMatch}/{@code -e}: switch, forwarded as a boolean
   *   <li>{@code --analyzer}/{@code -a}: analyzer class name; {@code WikipediaAnalyzer} is used
   *       when omitted
   *   <li>the help option supplied by {@code DefaultOptionCreator}: print usage and return
   * </ul>
   *
   * <p>Option-parsing errors and an unknown analyzer class are logged and followed by the usage
   * text; they are not rethrown.
   *
   * @param args the raw command-line arguments
   */
  public static void main(String[] args) throws IOException, InterruptedException {
    DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
    ArgumentBuilder valueBuilder = new ArgumentBuilder();
    GroupBuilder groupBuilder = new GroupBuilder();

    Option inputOption = DefaultOptionCreator.inputOption().create();
    Option outputOption = DefaultOptionCreator.outputOption().create();

    Option categoriesOption =
        optionBuilder
            .withLongName("categories")
            .withRequired(true)
            .withArgument(valueBuilder.withName("categories").withMinimum(1).withMaximum(1).create())
            .withDescription(
                "Location of the categories file.  One entry per line. "
                    + "Will be used to make a string match in Wikipedia Category field")
            .withShortName("c")
            .create();

    Option exactMatchOption =
        optionBuilder
            .withLongName("exactMatch")
            .withDescription(
                "If set, then the category name must exactly match the "
                    + "entry in the categories file. Default is false")
            .withShortName("e")
            .create();

    Option analyzerOption =
        optionBuilder
            .withLongName("analyzer")
            .withRequired(false)
            .withArgument(valueBuilder.withName("analyzer").withMinimum(1).withMaximum(1).create())
            .withDescription("The analyzer to use, must have a no argument constructor")
            .withShortName("a")
            .create();

    Option helpOption = DefaultOptionCreator.helpOption();

    groupBuilder.withName("Options");
    groupBuilder.withOption(categoriesOption);
    groupBuilder.withOption(inputOption);
    groupBuilder.withOption(outputOption);
    groupBuilder.withOption(exactMatchOption);
    groupBuilder.withOption(analyzerOption);
    groupBuilder.withOption(helpOption);
    Group options = groupBuilder.create();

    Parser cliParser = new Parser();
    cliParser.setGroup(options);
    try {
      CommandLine parsed = cliParser.parse(args);
      if (parsed.hasOption(helpOption)) {
        CommandLineUtil.printHelp(options);
        return;
      }

      String source = (String) parsed.getValue(inputOption);
      String destination = (String) parsed.getValue(outputOption);
      String categoriesFile = (String) parsed.getValue(categoriesOption);

      // Start from the default analyzer and replace it only if one was named explicitly.
      Class<? extends Analyzer> analyzerType = WikipediaAnalyzer.class;
      if (parsed.hasOption(analyzerOption)) {
        String analyzerName = parsed.getValue(analyzerOption).toString();
        analyzerType = Class.forName(analyzerName).asSubclass(Analyzer.class);
        // Instantiate once up front: a class that cannot be constructed is rejected here
        // rather than later inside the job.
        ClassUtils.instantiateAs(analyzerType, Analyzer.class);
      }

      boolean exactMatch = parsed.hasOption(exactMatchOption);
      runJob(source, destination, categoriesFile, exactMatch, analyzerType);
    } catch (OptionException e) {
      log.error("Exception", e);
      CommandLineUtil.printHelp(options);
    } catch (ClassNotFoundException e) {
      log.error("Exception", e);
      CommandLineUtil.printHelp(options);
    }
  }
Esempio n. 9
0
  /**
   * Command-line entry point that prints the top words for each topic of an LDA output.
   *
   * <p>Recognized options:
   *
   * <ul>
   *   <li>{@code --input} / {@code -i} (required): path to an LDA output (a state)
   *   <li>{@code --dict} / {@code -d} (required): dictionary file to read
   *   <li>{@code --output} / {@code -o} (optional): directory for the top words; it is created if
   *       it does not exist
   *   <li>{@code --words} / {@code -w} (optional): number of words to print, default 20
   *   <li>{@code --dictionaryType} / {@code -dt} (optional): {@code text} (default) or {@code
   *       sequencefile}
   *   <li>{@code --help} / {@code -h}: print usage and return
   * </ul>
   *
   * @param args the raw command-line arguments
   * @throws OptionException if the arguments cannot be parsed; usage is printed before the
   *     exception is rethrown
   * @throws IllegalArgumentException if the dictionary type is neither {@code text} nor {@code
   *     sequencefile}
   * @throws IOException if the output directory does not exist and cannot be created
   * @throws Exception any other failure raised while loading the dictionary or computing and
   *     printing the top words
   */
  public static void main(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt =
        obuilder
            .withLongName("input")
            .withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription("Path to an LDA output (a state)")
            .withShortName("i")
            .create();

    Option dictOpt =
        obuilder
            .withLongName("dict")
            .withRequired(true)
            .withArgument(abuilder.withName("dict").withMinimum(1).withMaximum(1).create())
            .withDescription(
                "Dictionary to read in, in the same format as one created by "
                    + "org.apache.mahout.utils.vectors.lucene.Driver")
            .withShortName("d")
            .create();

    Option outOpt =
        obuilder
            .withLongName("output")
            .withRequired(false)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription("Output directory to write top words")
            .withShortName("o")
            .create();

    Option wordOpt =
        obuilder
            .withLongName("words")
            .withRequired(false)
            .withArgument(
                abuilder.withName("words").withMinimum(0).withMaximum(1).withDefault("20").create())
            .withDescription("Number of words to print")
            .withShortName("w")
            .create();
    Option dictTypeOpt =
        obuilder
            .withLongName("dictionaryType")
            .withRequired(false)
            .withArgument(
                abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create())
            .withDescription("The dictionary file type (text|sequencefile)")
            .withShortName("dt")
            .create();
    Option helpOpt =
        obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create();

    // helpOpt must be part of the group: otherwise the parser does not know about --help/-h,
    // the hasOption(helpOpt) check below can never succeed, and the option is missing from the
    // printed usage.
    Group group =
        gbuilder
            .withName("Options")
            .withOption(dictOpt)
            .withOption(outOpt)
            .withOption(wordOpt)
            .withOption(inputOpt)
            .withOption(dictTypeOpt)
            .withOption(helpOpt)
            .create();
    try {
      Parser parser = new Parser();
      parser.setGroup(group);
      // Tell the parser which option is the help flag so that a lone --help is not rejected for
      // the missing required --input/--dict options.
      parser.setHelpOption(helpOpt);
      CommandLine cmdLine = parser.parse(args);

      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return;
      }

      String input = cmdLine.getValue(inputOpt).toString();
      String dictFile = cmdLine.getValue(dictOpt).toString();
      int numWords = 20;
      if (cmdLine.hasOption(wordOpt)) {
        numWords = Integer.parseInt(cmdLine.getValue(wordOpt).toString());
      }
      Configuration config = new Configuration();

      String dictionaryType = "text";
      if (cmdLine.hasOption(dictTypeOpt)) {
        dictionaryType = cmdLine.getValue(dictTypeOpt).toString();
      }

      // The dictionary is loaded through a different VectorHelper overload depending on its type.
      List<String> wordList;
      if ("text".equals(dictionaryType)) {
        wordList = Arrays.asList(VectorHelper.loadTermDictionary(new File(dictFile)));
      } else if ("sequencefile".equals(dictionaryType)) {
        wordList = Arrays.asList(VectorHelper.loadTermDictionary(config, dictFile));
      } else {
        throw new IllegalArgumentException("Invalid dictionary format");
      }

      List<Queue<Pair<String, Double>>> topWords =
          topWordsForTopics(input, config, wordList, numWords);

      // output stays null when --output is not given; printTopWords receives it as-is.
      File output = null;
      if (cmdLine.hasOption(outOpt)) {
        output = new File(cmdLine.getValue(outOpt).toString());
        if (!output.exists() && !output.mkdirs()) {
          throw new IOException("Could not create directory: " + output);
        }
      }
      printTopWords(topWords, output);
    } catch (OptionException e) {
      // Show usage to the user, then propagate so the caller still sees the failure.
      CommandLineUtil.printHelp(group);
      throw e;
    }
  }