Ejemplo n.º 1
0
  private Map<String, List<String>> handleArgs(String[] args) throws IOException {
    addOutputOption();
    addOption(
        "eigenInput",
        "ei",
        "The Path for purported eigenVector input files (SequenceFile<WritableComparable,VectorWritable>.",
        null);
    addOption(
        "corpusInput",
        "ci",
        "The Path for corpus input files (SequenceFile<WritableComparable,VectorWritable>.");
    addOption(DefaultOptionCreator.outputOption().create());
    addOption(DefaultOptionCreator.helpOption());
    addOption("inMemory", "mem", "Buffer eigen matrix into memory (if you have enough!)", "false");
    addOption("maxError", "err", "Maximum acceptable error", "0.05");
    addOption("minEigenvalue", "mev", "Minimum eigenvalue to keep the vector for", "0.0");
    addOption("maxEigens", "max", "Maximum number of eigenvectors to keep (0 means all)", "0");

    return parseArguments(args);
  }
Ejemplo n.º 2
0
  public static void main(String[] args)
      throws IOException, InterruptedException, ClassNotFoundException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();
    Option inputDirOpt = DefaultOptionCreator.inputOption().create();

    Option outputOpt = DefaultOptionCreator.outputOption().create();

    Option helpOpt = DefaultOptionCreator.helpOption();
    Option recordSplitterOpt =
        obuilder
            .withLongName("splitterPattern")
            .withArgument(
                abuilder.withName("splitterPattern").withMinimum(1).withMaximum(1).create())
            .withDescription(
                "Regular Expression pattern used to split given line into fields."
                    + " Default value splits comma or tab separated fields."
                    + " Default Value: \"[ ,\\t]*\\t[ ,\\t]*\" ")
            .withShortName("regex")
            .create();
    Option encodingOpt =
        obuilder
            .withLongName("encoding")
            .withArgument(abuilder.withName("encoding").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) The file encoding.  Default value: UTF-8")
            .withShortName("e")
            .create();
    Group group =
        gbuilder
            .withName("Options")
            .withOption(inputDirOpt)
            .withOption(outputOpt)
            .withOption(helpOpt)
            .withOption(recordSplitterOpt)
            .withOption(encodingOpt)
            .create();

    try {
      Parser parser = new Parser();
      parser.setGroup(group);
      CommandLine cmdLine = parser.parse(args);

      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return;
      }
      Parameters params = new Parameters();
      if (cmdLine.hasOption(recordSplitterOpt)) {
        params.set("splitPattern", (String) cmdLine.getValue(recordSplitterOpt));
      }

      String encoding = "UTF-8";
      if (cmdLine.hasOption(encodingOpt)) {
        encoding = (String) cmdLine.getValue(encodingOpt);
      }
      params.set("encoding", encoding);
      String inputDir = (String) cmdLine.getValue(inputDirOpt);
      String outputDir = (String) cmdLine.getValue(outputOpt);
      params.set("input", inputDir);
      params.set("output", outputDir);
      params.set("groupingFieldCount", "2");
      params.set("gfield0", "1");
      params.set("gfield1", "2");
      params.set("selectedFieldCount", "1");
      params.set("field0", "3");
      params.set("maxTransactionLength", "100");
      KeyBasedStringTupleGrouper.startJob(params);

    } catch (OptionException ex) {
      CommandLineUtil.printHelp(group);
    }
  }
  public static int main2(String[] args, Configuration conf) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Option inputDirOpt =
        obuilder
            .withLongName("input")
            .withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription(
                "The Directory on HDFS containing the collapsed, properly formatted files having "
                    + "one doc per line")
            .withShortName("i")
            .create();

    Option dictOpt =
        obuilder
            .withLongName("dictionary")
            .withRequired(false)
            .withArgument(abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create())
            .withDescription("The path to the term-dictionary format is ... ")
            .withShortName("d")
            .create();

    Option dfsOpt =
        obuilder
            .withLongName("dfs")
            .withRequired(false)
            .withArgument(abuilder.withName("dfs").withMinimum(1).withMaximum(1).create())
            .withDescription("HDFS namenode URI")
            .withShortName("dfs")
            .create();

    Option numTopicsOpt =
        obuilder
            .withLongName("numTopics")
            .withRequired(true)
            .withArgument(abuilder.withName("numTopics").withMinimum(1).withMaximum(1).create())
            .withDescription("Number of topics to learn")
            .withShortName("top")
            .create();

    Option outputTopicFileOpt =
        obuilder
            .withLongName("topicOutputFile")
            .withRequired(true)
            .withArgument(
                abuilder.withName("topicOutputFile").withMinimum(1).withMaximum(1).create())
            .withDescription("File to write out p(term | topic)")
            .withShortName("to")
            .create();

    Option outputDocFileOpt =
        obuilder
            .withLongName("docOutputFile")
            .withRequired(true)
            .withArgument(abuilder.withName("docOutputFile").withMinimum(1).withMaximum(1).create())
            .withDescription("File to write out p(topic | docid)")
            .withShortName("do")
            .create();

    Option alphaOpt =
        obuilder
            .withLongName("alpha")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("alpha")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault("0.1")
                    .create())
            .withDescription("Smoothing parameter for p(topic | document) prior")
            .withShortName("a")
            .create();

    Option etaOpt =
        obuilder
            .withLongName("eta")
            .withRequired(false)
            .withArgument(
                abuilder.withName("eta").withMinimum(1).withMaximum(1).withDefault("0.1").create())
            .withDescription("Smoothing parameter for p(term | topic)")
            .withShortName("e")
            .create();

    Option maxIterOpt =
        obuilder
            .withLongName("maxIterations")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("maxIterations")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault(10)
                    .create())
            .withDescription("Maximum number of training passes")
            .withShortName("m")
            .create();

    Option modelCorpusFractionOption =
        obuilder
            .withLongName("modelCorpusFraction")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("modelCorpusFraction")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault(0.0)
                    .create())
            .withShortName("mcf")
            .withDescription("For online updates, initial value of |model|/|corpus|")
            .create();

    Option burnInOpt =
        obuilder
            .withLongName("burnInIterations")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("burnInIterations")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault(5)
                    .create())
            .withDescription("Minimum number of iterations")
            .withShortName("b")
            .create();

    Option convergenceOpt =
        obuilder
            .withLongName("convergence")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("convergence")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault("0.0")
                    .create())
            .withDescription("Fractional rate of perplexity to consider convergence")
            .withShortName("c")
            .create();

    Option reInferDocTopicsOpt =
        obuilder
            .withLongName("reInferDocTopics")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("reInferDocTopics")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault("no")
                    .create())
            .withDescription("re-infer p(topic | doc) : [no | randstart | continue]")
            .withShortName("rdt")
            .create();

    Option numTrainThreadsOpt =
        obuilder
            .withLongName("numTrainThreads")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("numTrainThreads")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault("1")
                    .create())
            .withDescription("number of threads to train with")
            .withShortName("ntt")
            .create();

    Option numUpdateThreadsOpt =
        obuilder
            .withLongName("numUpdateThreads")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("numUpdateThreads")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault("1")
                    .create())
            .withDescription("number of threads to update the model with")
            .withShortName("nut")
            .create();

    Option verboseOpt =
        obuilder
            .withLongName("verbose")
            .withRequired(false)
            .withArgument(
                abuilder
                    .withName("verbose")
                    .withMinimum(1)
                    .withMaximum(1)
                    .withDefault("false")
                    .create())
            .withDescription(
                "print verbose information, like top-terms in each topic, during iteration")
            .withShortName("v")
            .create();

    Group group =
        gbuilder
            .withName("Options")
            .withOption(inputDirOpt)
            .withOption(numTopicsOpt)
            .withOption(alphaOpt)
            .withOption(etaOpt)
            .withOption(maxIterOpt)
            .withOption(burnInOpt)
            .withOption(convergenceOpt)
            .withOption(dictOpt)
            .withOption(reInferDocTopicsOpt)
            .withOption(outputDocFileOpt)
            .withOption(outputTopicFileOpt)
            .withOption(dfsOpt)
            .withOption(numTrainThreadsOpt)
            .withOption(numUpdateThreadsOpt)
            .withOption(modelCorpusFractionOption)
            .withOption(verboseOpt)
            .create();

    try {
      Parser parser = new Parser();

      parser.setGroup(group);
      parser.setHelpOption(helpOpt);
      CommandLine cmdLine = parser.parse(args);
      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return -1;
      }

      String inputDirString = (String) cmdLine.getValue(inputDirOpt);
      String dictDirString = cmdLine.hasOption(dictOpt) ? (String) cmdLine.getValue(dictOpt) : null;
      int numTopics = Integer.parseInt((String) cmdLine.getValue(numTopicsOpt));
      double alpha = Double.parseDouble((String) cmdLine.getValue(alphaOpt));
      double eta = Double.parseDouble((String) cmdLine.getValue(etaOpt));
      int maxIterations = Integer.parseInt((String) cmdLine.getValue(maxIterOpt));
      int burnInIterations = (Integer) cmdLine.getValue(burnInOpt);
      double minFractionalErrorChange =
          Double.parseDouble((String) cmdLine.getValue(convergenceOpt));
      int numTrainThreads = Integer.parseInt((String) cmdLine.getValue(numTrainThreadsOpt));
      int numUpdateThreads = Integer.parseInt((String) cmdLine.getValue(numUpdateThreadsOpt));
      String topicOutFile = (String) cmdLine.getValue(outputTopicFileOpt);
      String docOutFile = (String) cmdLine.getValue(outputDocFileOpt);
      // String reInferDocTopics = (String)cmdLine.getValue(reInferDocTopicsOpt);
      boolean verbose = Boolean.parseBoolean((String) cmdLine.getValue(verboseOpt));
      double modelCorpusFraction = (Double) cmdLine.getValue(modelCorpusFractionOption);

      long start = System.nanoTime();

      if (conf.get("fs.default.name") == null) {
        String dfsNameNode = (String) cmdLine.getValue(dfsOpt);
        conf.set("fs.default.name", dfsNameNode);
      }
      String[] terms = loadDictionary(dictDirString, conf);
      logTime("dictionary loading", System.nanoTime() - start);
      start = System.nanoTime();
      Matrix corpus = loadVectors(inputDirString, conf);
      logTime("vector seqfile corpus loading", System.nanoTime() - start);
      start = System.nanoTime();
      InMemoryCollapsedVariationalBayes0 cvb0 =
          new InMemoryCollapsedVariationalBayes0(
              corpus,
              terms,
              numTopics,
              alpha,
              eta,
              numTrainThreads,
              numUpdateThreads,
              modelCorpusFraction);
      logTime("cvb0 init", System.nanoTime() - start);

      start = System.nanoTime();
      cvb0.setVerbose(verbose);
      cvb0.iterateUntilConvergence(minFractionalErrorChange, maxIterations, burnInIterations);
      logTime("total training time", System.nanoTime() - start);

      /*
      if ("randstart".equalsIgnoreCase(reInferDocTopics)) {
        cvb0.inferDocuments(0.0, 100, true);
      } else if ("continue".equalsIgnoreCase(reInferDocTopics)) {
        cvb0.inferDocuments(0.0, 100, false);
      }
       */

      start = System.nanoTime();
      cvb0.writeModel(new Path(topicOutFile));
      DistributedRowMatrixWriter.write(new Path(docOutFile), conf, cvb0.docTopicCounts);
      logTime("printTopics", System.nanoTime() - start);
    } catch (OptionException e) {
      log.error("Error while parsing options", e);
      CommandLineUtil.printHelp(group);
    }
    return 0;
  }
Ejemplo n.º 4
0
  /**
   * Takes in two arguments:
   *
   * <ol>
   *   <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live
   *   <li>The output {@link org.apache.hadoop.fs.Path} where to write the classifier as a {@link
   *       org.apache.hadoop.io.SequenceFile}
   * </ol>
   */
  public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dirInputPathOpt = DefaultOptionCreator.inputOption().create();

    Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create();

    Option categoriesOpt =
        obuilder
            .withLongName("categories")
            .withArgument(abuilder.withName("categories").withMinimum(1).withMaximum(1).create())
            .withDescription(
                "Location of the categories file.  One entry per line. "
                    + "Will be used to make a string match in Wikipedia Category field")
            .withShortName("c")
            .create();

    Option exactMatchOpt =
        obuilder
            .withLongName("exactMatch")
            .withDescription(
                "If set, then the category name must exactly match the "
                    + "entry in the categories file. Default is false")
            .withShortName("e")
            .create();

    Option allOpt =
        obuilder
            .withLongName("all")
            .withDescription("If set, Select all files. Default is false")
            .withShortName("all")
            .create();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Group group =
        gbuilder
            .withName("Options")
            .withOption(categoriesOpt)
            .withOption(dirInputPathOpt)
            .withOption(dirOutputPathOpt)
            .withOption(exactMatchOpt)
            .withOption(allOpt)
            .withOption(helpOpt)
            .create();

    Parser parser = new Parser();
    parser.setGroup(group);
    parser.setHelpOption(helpOpt);
    try {
      CommandLine cmdLine = parser.parse(args);
      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return;
      }

      String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
      String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);

      String catFile = "";
      if (cmdLine.hasOption(categoriesOpt)) {
        catFile = (String) cmdLine.getValue(categoriesOpt);
      }

      boolean all = false;
      if (cmdLine.hasOption(allOpt)) {
        all = true;
      }
      runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), all);
    } catch (OptionException e) {
      log.error("Exception", e);
      CommandLineUtil.printHelp(group);
    } catch (InterruptedException e) {
      log.error("Exception", e);
      CommandLineUtil.printHelp(group);
    } catch (ClassNotFoundException e) {
      log.error("Exception", e);
      CommandLineUtil.printHelp(group);
    }
  }
  /**
   * Takes in two arguments:
   *
   * <ol>
   *   <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live
   *   <li>The output {@link org.apache.hadoop.fs.Path} where to write the classifier as a {@link
   *       org.apache.hadoop.io.SequenceFile}
   * </ol>
   */
  public static void main(String[] args) throws IOException, InterruptedException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dirInputPathOpt = DefaultOptionCreator.inputOption().create();

    Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create();

    Option categoriesOpt =
        obuilder
            .withLongName("categories")
            .withRequired(true)
            .withArgument(abuilder.withName("categories").withMinimum(1).withMaximum(1).create())
            .withDescription(
                "Location of the categories file.  One entry per line. "
                    + "Will be used to make a string match in Wikipedia Category field")
            .withShortName("c")
            .create();

    Option exactMatchOpt =
        obuilder
            .withLongName("exactMatch")
            .withDescription(
                "If set, then the category name must exactly match the "
                    + "entry in the categories file. Default is false")
            .withShortName("e")
            .create();
    Option analyzerOpt =
        obuilder
            .withLongName("analyzer")
            .withRequired(false)
            .withArgument(abuilder.withName("analyzer").withMinimum(1).withMaximum(1).create())
            .withDescription("The analyzer to use, must have a no argument constructor")
            .withShortName("a")
            .create();
    Option helpOpt = DefaultOptionCreator.helpOption();

    Group group =
        gbuilder
            .withName("Options")
            .withOption(categoriesOpt)
            .withOption(dirInputPathOpt)
            .withOption(dirOutputPathOpt)
            .withOption(exactMatchOpt)
            .withOption(analyzerOpt)
            .withOption(helpOpt)
            .create();

    Parser parser = new Parser();
    parser.setGroup(group);
    try {
      CommandLine cmdLine = parser.parse(args);
      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return;
      }

      String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
      String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);
      String catFile = (String) cmdLine.getValue(categoriesOpt);
      Class<? extends Analyzer> analyzerClass = WikipediaAnalyzer.class;
      if (cmdLine.hasOption(analyzerOpt)) {
        String className = cmdLine.getValue(analyzerOpt).toString();
        analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
        // try instantiating it, b/c there isn't any point in setting it if
        // you can't instantiate it
        ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
      }
      runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), analyzerClass);
    } catch (OptionException e) {
      log.error("Exception", e);
      CommandLineUtil.printHelp(group);
    } catch (ClassNotFoundException e) {
      log.error("Exception", e);
      CommandLineUtil.printHelp(group);
    }
  }