/** * Takes in two arguments: * * <ol> * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live * <li>The output {@link org.apache.hadoop.fs.Path} where to write the classifier as a {@link * org.apache.hadoop.io.SequenceFile} * </ol> */ public static void main(String[] args) throws IOException { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option dirInputPathOpt = DefaultOptionCreator.inputOption().create(); Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create(); Option categoriesOpt = obuilder .withLongName("categories") .withArgument(abuilder.withName("categories").withMinimum(1).withMaximum(1).create()) .withDescription( "Location of the categories file. One entry per line. " + "Will be used to make a string match in Wikipedia Category field") .withShortName("c") .create(); Option exactMatchOpt = obuilder .withLongName("exactMatch") .withDescription( "If set, then the category name must exactly match the " + "entry in the categories file. Default is false") .withShortName("e") .create(); Option allOpt = obuilder .withLongName("all") .withDescription("If set, Select all files. Default is false") .withShortName("all") .create(); Option helpOpt = DefaultOptionCreator.helpOption(); Group group = gbuilder .withName("Options") .withOption(categoriesOpt) .withOption(dirInputPathOpt) .withOption(dirOutputPathOpt) .withOption(exactMatchOpt) .withOption(allOpt) .withOption(helpOpt) .create(); Parser parser = new Parser(); parser.setGroup(group); parser.setHelpOption(helpOpt); try { CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } String inputPath = (String) cmdLine.getValue(dirInputPathOpt); String outputPath = (String) cmdLine.getValue(dirOutputPathOpt); String catFile = ""; if (cmdLine.hasOption(categoriesOpt)) { catFile = (String) cmdLine.getValue(categoriesOpt); } boolean all = false; if (cmdLine.hasOption(allOpt)) { all = true; } runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), all); } catch (OptionException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } catch (InterruptedException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } catch (ClassNotFoundException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } }
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option inputDirOpt = DefaultOptionCreator.inputOption().create(); Option outputOpt = DefaultOptionCreator.outputOption().create(); Option helpOpt = DefaultOptionCreator.helpOption(); Option recordSplitterOpt = obuilder .withLongName("splitterPattern") .withArgument( abuilder.withName("splitterPattern").withMinimum(1).withMaximum(1).create()) .withDescription( "Regular Expression pattern used to split given line into fields." + " Default value splits comma or tab separated fields." + " Default Value: \"[ ,\\t]*\\t[ ,\\t]*\" ") .withShortName("regex") .create(); Option encodingOpt = obuilder .withLongName("encoding") .withArgument(abuilder.withName("encoding").withMinimum(1).withMaximum(1).create()) .withDescription("(Optional) The file encoding. Default value: UTF-8") .withShortName("e") .create(); Group group = gbuilder .withName("Options") .withOption(inputDirOpt) .withOption(outputOpt) .withOption(helpOpt) .withOption(recordSplitterOpt) .withOption(encodingOpt) .create(); try { Parser parser = new Parser(); parser.setGroup(group); CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } Parameters params = new Parameters(); if (cmdLine.hasOption(recordSplitterOpt)) { params.set("splitPattern", (String) cmdLine.getValue(recordSplitterOpt)); } String encoding = "UTF-8"; if (cmdLine.hasOption(encodingOpt)) { encoding = (String) cmdLine.getValue(encodingOpt); } params.set("encoding", encoding); String inputDir = (String) cmdLine.getValue(inputDirOpt); String outputDir = (String) cmdLine.getValue(outputOpt); params.set("input", inputDir); params.set("output", outputDir); params.set("groupingFieldCount", "2"); params.set("gfield0", "1"); params.set("gfield1", "2"); params.set("selectedFieldCount", "1"); params.set("field0", "3"); params.set("maxTransactionLength", "100"); KeyBasedStringTupleGrouper.startJob(params); } catch (OptionException ex) { CommandLineUtil.printHelp(group); } }
/** * Takes in two arguments: * * <ol> * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live * <li>The output {@link org.apache.hadoop.fs.Path} where to write the classifier as a {@link * org.apache.hadoop.io.SequenceFile} * </ol> */ public static void main(String[] args) throws IOException, InterruptedException { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option dirInputPathOpt = DefaultOptionCreator.inputOption().create(); Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create(); Option categoriesOpt = obuilder .withLongName("categories") .withRequired(true) .withArgument(abuilder.withName("categories").withMinimum(1).withMaximum(1).create()) .withDescription( "Location of the categories file. One entry per line. " + "Will be used to make a string match in Wikipedia Category field") .withShortName("c") .create(); Option exactMatchOpt = obuilder .withLongName("exactMatch") .withDescription( "If set, then the category name must exactly match the " + "entry in the categories file. Default is false") .withShortName("e") .create(); Option analyzerOpt = obuilder .withLongName("analyzer") .withRequired(false) .withArgument(abuilder.withName("analyzer").withMinimum(1).withMaximum(1).create()) .withDescription("The analyzer to use, must have a no argument constructor") .withShortName("a") .create(); Option helpOpt = DefaultOptionCreator.helpOption(); Group group = gbuilder .withName("Options") .withOption(categoriesOpt) .withOption(dirInputPathOpt) .withOption(dirOutputPathOpt) .withOption(exactMatchOpt) .withOption(analyzerOpt) .withOption(helpOpt) .create(); Parser parser = new Parser(); parser.setGroup(group); try { CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } String inputPath = (String) cmdLine.getValue(dirInputPathOpt); String outputPath = (String) cmdLine.getValue(dirOutputPathOpt); String catFile = (String) cmdLine.getValue(categoriesOpt); Class<? extends Analyzer> analyzerClass = WikipediaAnalyzer.class; if (cmdLine.hasOption(analyzerOpt)) { String className = cmdLine.getValue(analyzerOpt).toString(); analyzerClass = Class.forName(className).asSubclass(Analyzer.class); // try instantiating it, b/c there isn't any point in setting it if // you can't instantiate it ClassUtils.instantiateAs(analyzerClass, Analyzer.class); } runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), analyzerClass); } catch (OptionException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } catch (ClassNotFoundException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } }