Пример #1
0
  /**
   * Takes in two arguments:
   *
   * <ol>
   *   <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live
   *   <li>The output {@link org.apache.hadoop.fs.Path} where to write the classifier as a {@link
   *       org.apache.hadoop.io.SequenceFile}
   * </ol>
   */
  public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dirInputPathOpt = DefaultOptionCreator.inputOption().create();

    Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create();

    Option categoriesOpt =
        obuilder
            .withLongName("categories")
            .withArgument(abuilder.withName("categories").withMinimum(1).withMaximum(1).create())
            .withDescription(
                "Location of the categories file.  One entry per line. "
                    + "Will be used to make a string match in Wikipedia Category field")
            .withShortName("c")
            .create();

    Option exactMatchOpt =
        obuilder
            .withLongName("exactMatch")
            .withDescription(
                "If set, then the category name must exactly match the "
                    + "entry in the categories file. Default is false")
            .withShortName("e")
            .create();

    Option allOpt =
        obuilder
            .withLongName("all")
            .withDescription("If set, Select all files. Default is false")
            .withShortName("all")
            .create();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Group group =
        gbuilder
            .withName("Options")
            .withOption(categoriesOpt)
            .withOption(dirInputPathOpt)
            .withOption(dirOutputPathOpt)
            .withOption(exactMatchOpt)
            .withOption(allOpt)
            .withOption(helpOpt)
            .create();

    Parser parser = new Parser();
    parser.setGroup(group);
    parser.setHelpOption(helpOpt);
    try {
      CommandLine cmdLine = parser.parse(args);
      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return;
      }

      String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
      String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);

      String catFile = "";
      if (cmdLine.hasOption(categoriesOpt)) {
        catFile = (String) cmdLine.getValue(categoriesOpt);
      }

      boolean all = false;
      if (cmdLine.hasOption(allOpt)) {
        all = true;
      }
      runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), all);
    } catch (OptionException e) {
      log.error("Exception", e);
      CommandLineUtil.printHelp(group);
    } catch (InterruptedException e) {
      log.error("Exception", e);
      CommandLineUtil.printHelp(group);
    } catch (ClassNotFoundException e) {
      log.error("Exception", e);
      CommandLineUtil.printHelp(group);
    }
  }
Пример #2
0
  public static void main(String[] args)
      throws IOException, InterruptedException, ClassNotFoundException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();
    Option inputDirOpt = DefaultOptionCreator.inputOption().create();

    Option outputOpt = DefaultOptionCreator.outputOption().create();

    Option helpOpt = DefaultOptionCreator.helpOption();
    Option recordSplitterOpt =
        obuilder
            .withLongName("splitterPattern")
            .withArgument(
                abuilder.withName("splitterPattern").withMinimum(1).withMaximum(1).create())
            .withDescription(
                "Regular Expression pattern used to split given line into fields."
                    + " Default value splits comma or tab separated fields."
                    + " Default Value: \"[ ,\\t]*\\t[ ,\\t]*\" ")
            .withShortName("regex")
            .create();
    Option encodingOpt =
        obuilder
            .withLongName("encoding")
            .withArgument(abuilder.withName("encoding").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) The file encoding.  Default value: UTF-8")
            .withShortName("e")
            .create();
    Group group =
        gbuilder
            .withName("Options")
            .withOption(inputDirOpt)
            .withOption(outputOpt)
            .withOption(helpOpt)
            .withOption(recordSplitterOpt)
            .withOption(encodingOpt)
            .create();

    try {
      Parser parser = new Parser();
      parser.setGroup(group);
      CommandLine cmdLine = parser.parse(args);

      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return;
      }
      Parameters params = new Parameters();
      if (cmdLine.hasOption(recordSplitterOpt)) {
        params.set("splitPattern", (String) cmdLine.getValue(recordSplitterOpt));
      }

      String encoding = "UTF-8";
      if (cmdLine.hasOption(encodingOpt)) {
        encoding = (String) cmdLine.getValue(encodingOpt);
      }
      params.set("encoding", encoding);
      String inputDir = (String) cmdLine.getValue(inputDirOpt);
      String outputDir = (String) cmdLine.getValue(outputOpt);
      params.set("input", inputDir);
      params.set("output", outputDir);
      params.set("groupingFieldCount", "2");
      params.set("gfield0", "1");
      params.set("gfield1", "2");
      params.set("selectedFieldCount", "1");
      params.set("field0", "3");
      params.set("maxTransactionLength", "100");
      KeyBasedStringTupleGrouper.startJob(params);

    } catch (OptionException ex) {
      CommandLineUtil.printHelp(group);
    }
  }
  /**
   * Takes in two arguments:
   *
   * <ol>
   *   <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live
   *   <li>The output {@link org.apache.hadoop.fs.Path} where to write the classifier as a {@link
   *       org.apache.hadoop.io.SequenceFile}
   * </ol>
   */
  public static void main(String[] args) throws IOException, InterruptedException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dirInputPathOpt = DefaultOptionCreator.inputOption().create();

    Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create();

    Option categoriesOpt =
        obuilder
            .withLongName("categories")
            .withRequired(true)
            .withArgument(abuilder.withName("categories").withMinimum(1).withMaximum(1).create())
            .withDescription(
                "Location of the categories file.  One entry per line. "
                    + "Will be used to make a string match in Wikipedia Category field")
            .withShortName("c")
            .create();

    Option exactMatchOpt =
        obuilder
            .withLongName("exactMatch")
            .withDescription(
                "If set, then the category name must exactly match the "
                    + "entry in the categories file. Default is false")
            .withShortName("e")
            .create();
    Option analyzerOpt =
        obuilder
            .withLongName("analyzer")
            .withRequired(false)
            .withArgument(abuilder.withName("analyzer").withMinimum(1).withMaximum(1).create())
            .withDescription("The analyzer to use, must have a no argument constructor")
            .withShortName("a")
            .create();
    Option helpOpt = DefaultOptionCreator.helpOption();

    Group group =
        gbuilder
            .withName("Options")
            .withOption(categoriesOpt)
            .withOption(dirInputPathOpt)
            .withOption(dirOutputPathOpt)
            .withOption(exactMatchOpt)
            .withOption(analyzerOpt)
            .withOption(helpOpt)
            .create();

    Parser parser = new Parser();
    parser.setGroup(group);
    try {
      CommandLine cmdLine = parser.parse(args);
      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return;
      }

      String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
      String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);
      String catFile = (String) cmdLine.getValue(categoriesOpt);
      Class<? extends Analyzer> analyzerClass = WikipediaAnalyzer.class;
      if (cmdLine.hasOption(analyzerOpt)) {
        String className = cmdLine.getValue(analyzerOpt).toString();
        analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
        // try instantiating it, b/c there isn't any point in setting it if
        // you can't instantiate it
        ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
      }
      runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), analyzerClass);
    } catch (OptionException e) {
      log.error("Exception", e);
      CommandLineUtil.printHelp(group);
    } catch (ClassNotFoundException e) {
      log.error("Exception", e);
      CommandLineUtil.printHelp(group);
    }
  }