/** * Takes in two arguments: * * <ol> * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live * <li>The output {@link org.apache.hadoop.fs.Path} where to write the classifier as a {@link * org.apache.hadoop.io.SequenceFile} * </ol> */ public static void main(String[] args) throws IOException { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option dirInputPathOpt = obuilder .withLongName("input") .withRequired(true) .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create()) .withDescription("The input directory path") .withShortName("i") .create(); Option dirOutputPathOpt = obuilder .withLongName("output") .withRequired(true) .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create()) .withDescription("The output directory Path") .withShortName("o") .create(); Option categoriesOpt = obuilder .withLongName("categories") .withArgument(abuilder.withName("categories").withMinimum(1).withMaximum(1).create()) .withDescription( "Location of the categories file. One entry per line. " + "Will be used to make a string match in Wikipedia Category field") .withShortName("c") .create(); Option exactMatchOpt = obuilder .withLongName("exactMatch") .withDescription( "If set, then the category name must exactly match the " + "entry in the categories file. Default is false") .withShortName("e") .create(); Option allOpt = obuilder .withLongName("all") .withDescription("If set, Select all files. 
Default is false") .withShortName("all") .create(); Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create(); Group group = gbuilder .withName("Options") .withOption(categoriesOpt) .withOption(dirInputPathOpt) .withOption(dirOutputPathOpt) .withOption(exactMatchOpt) .withOption(allOpt) .withOption(helpOpt) .create(); Parser parser = new Parser(); parser.setGroup(group); parser.setHelpOption(helpOpt); try { CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } String inputPath = (String) cmdLine.getValue(dirInputPathOpt); String outputPath = (String) cmdLine.getValue(dirOutputPathOpt); String catFile = ""; if (cmdLine.hasOption(categoriesOpt)) { catFile = (String) cmdLine.getValue(categoriesOpt); } boolean all = false; if (cmdLine.hasOption(allOpt)) { all = true; } runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), all); } catch (OptionException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } catch (InterruptedException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } catch (ClassNotFoundException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } }
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option inputDirOpt = DefaultOptionCreator.inputOption().create(); Option outputOpt = DefaultOptionCreator.outputOption().create(); Option helpOpt = DefaultOptionCreator.helpOption(); Option recordSplitterOpt = obuilder .withLongName("splitterPattern") .withArgument( abuilder.withName("splitterPattern").withMinimum(1).withMaximum(1).create()) .withDescription( "Regular Expression pattern used to split given line into fields." + " Default value splits comma or tab separated fields." + " Default Value: \"[ ,\\t]*\\t[ ,\\t]*\" ") .withShortName("regex") .create(); Option encodingOpt = obuilder .withLongName("encoding") .withArgument(abuilder.withName("encoding").withMinimum(1).withMaximum(1).create()) .withDescription("(Optional) The file encoding. 
Default value: UTF-8") .withShortName("e") .create(); Group group = gbuilder .withName("Options") .withOption(inputDirOpt) .withOption(outputOpt) .withOption(helpOpt) .withOption(recordSplitterOpt) .withOption(encodingOpt) .create(); try { Parser parser = new Parser(); parser.setGroup(group); CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } Parameters params = new Parameters(); if (cmdLine.hasOption(recordSplitterOpt)) { params.set("splitPattern", (String) cmdLine.getValue(recordSplitterOpt)); } String encoding = "UTF-8"; if (cmdLine.hasOption(encodingOpt)) { encoding = (String) cmdLine.getValue(encodingOpt); } params.set("encoding", encoding); String inputDir = (String) cmdLine.getValue(inputDirOpt); String outputDir = (String) cmdLine.getValue(outputOpt); params.set("input", inputDir); params.set("output", outputDir); params.set("groupingFieldCount", "2"); params.set("gfield0", "1"); params.set("gfield1", "2"); params.set("selectedFieldCount", "1"); params.set("field0", "3"); params.set("maxTransactionLength", "100"); KeyBasedStringTupleGrouper.startJob(params); } catch (OptionException ex) { CommandLineUtil.printHelp(group); } }
public static void main(String[] args) throws IOException { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option inputOpt = obuilder .withLongName("input") .withRequired(true) .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create()) .withDescription( "The file or directory containing the ARFF files. If it is a directory, all .arff files will be converted") .withShortName("d") .create(); Option outputOpt = obuilder .withLongName("output") .withRequired(true) .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create()) .withDescription( "The output directory. Files will have the same name as the input, but with the extension .mvc") .withShortName("o") .create(); Option maxOpt = obuilder .withLongName("max") .withRequired(false) .withArgument(abuilder.withName("max").withMinimum(1).withMaximum(1).create()) .withDescription( "The maximum number of vectors to output. 
If not specified, then it will loop over all docs") .withShortName("m") .create(); Option dictOutOpt = obuilder .withLongName("dictOut") .withRequired(true) .withArgument(abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()) .withDescription("The file to output the label bindings") .withShortName("t") .create(); Option jsonDictonaryOpt = obuilder .withLongName("json-dictonary") .withRequired(false) .withDescription("Write dictonary in JSON format") .withShortName("j") .create(); Option delimiterOpt = obuilder .withLongName("delimiter") .withRequired(false) .withArgument(abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()) .withDescription("The delimiter for outputing the dictionary") .withShortName("l") .create(); Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create(); Group group = gbuilder .withName("Options") .withOption(inputOpt) .withOption(outputOpt) .withOption(maxOpt) .withOption(helpOpt) .withOption(dictOutOpt) .withOption(jsonDictonaryOpt) .withOption(delimiterOpt) .create(); try { Parser parser = new Parser(); parser.setGroup(group); CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } if (cmdLine.hasOption(inputOpt)) { // Lucene case File input = new File(cmdLine.getValue(inputOpt).toString()); long maxDocs = Long.MAX_VALUE; if (cmdLine.hasOption(maxOpt)) { maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString()); } if (maxDocs < 0) { throw new IllegalArgumentException("maxDocs must be >= 0"); } String outDir = cmdLine.getValue(outputOpt).toString(); log.info("Output Dir: {}", outDir); String delimiter = cmdLine.hasOption(delimiterOpt) ? 
cmdLine.getValue(delimiterOpt).toString() : "\t"; File dictOut = new File(cmdLine.getValue(dictOutOpt).toString()); boolean jsonDictonary = cmdLine.hasOption(jsonDictonaryOpt); ARFFModel model = new MapBackedARFFModel(); if (input.exists() && input.isDirectory()) { File[] files = input.listFiles( new FilenameFilter() { @Override public boolean accept(File file, String name) { return name.endsWith(".arff"); } }); for (File file : files) { writeFile(outDir, file, maxDocs, model, dictOut, delimiter, jsonDictonary); } } else { writeFile(outDir, input, maxDocs, model, dictOut, delimiter, jsonDictonary); } } } catch (OptionException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } }
public static int main2(String[] args, Configuration conf) throws Exception { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option helpOpt = DefaultOptionCreator.helpOption(); Option inputDirOpt = obuilder .withLongName("input") .withRequired(true) .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create()) .withDescription( "The Directory on HDFS containing the collapsed, properly formatted files having " + "one doc per line") .withShortName("i") .create(); Option dictOpt = obuilder .withLongName("dictionary") .withRequired(false) .withArgument(abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create()) .withDescription("The path to the term-dictionary format is ... ") .withShortName("d") .create(); Option dfsOpt = obuilder .withLongName("dfs") .withRequired(false) .withArgument(abuilder.withName("dfs").withMinimum(1).withMaximum(1).create()) .withDescription("HDFS namenode URI") .withShortName("dfs") .create(); Option numTopicsOpt = obuilder .withLongName("numTopics") .withRequired(true) .withArgument(abuilder.withName("numTopics").withMinimum(1).withMaximum(1).create()) .withDescription("Number of topics to learn") .withShortName("top") .create(); Option outputTopicFileOpt = obuilder .withLongName("topicOutputFile") .withRequired(true) .withArgument( abuilder.withName("topicOutputFile").withMinimum(1).withMaximum(1).create()) .withDescription("File to write out p(term | topic)") .withShortName("to") .create(); Option outputDocFileOpt = obuilder .withLongName("docOutputFile") .withRequired(true) .withArgument(abuilder.withName("docOutputFile").withMinimum(1).withMaximum(1).create()) .withDescription("File to write out p(topic | docid)") .withShortName("do") .create(); Option alphaOpt = obuilder .withLongName("alpha") .withRequired(false) .withArgument( abuilder .withName("alpha") .withMinimum(1) .withMaximum(1) 
.withDefault("0.1") .create()) .withDescription("Smoothing parameter for p(topic | document) prior") .withShortName("a") .create(); Option etaOpt = obuilder .withLongName("eta") .withRequired(false) .withArgument( abuilder.withName("eta").withMinimum(1).withMaximum(1).withDefault("0.1").create()) .withDescription("Smoothing parameter for p(term | topic)") .withShortName("e") .create(); Option maxIterOpt = obuilder .withLongName("maxIterations") .withRequired(false) .withArgument( abuilder .withName("maxIterations") .withMinimum(1) .withMaximum(1) .withDefault(10) .create()) .withDescription("Maximum number of training passes") .withShortName("m") .create(); Option modelCorpusFractionOption = obuilder .withLongName("modelCorpusFraction") .withRequired(false) .withArgument( abuilder .withName("modelCorpusFraction") .withMinimum(1) .withMaximum(1) .withDefault(0.0) .create()) .withShortName("mcf") .withDescription("For online updates, initial value of |model|/|corpus|") .create(); Option burnInOpt = obuilder .withLongName("burnInIterations") .withRequired(false) .withArgument( abuilder .withName("burnInIterations") .withMinimum(1) .withMaximum(1) .withDefault(5) .create()) .withDescription("Minimum number of iterations") .withShortName("b") .create(); Option convergenceOpt = obuilder .withLongName("convergence") .withRequired(false) .withArgument( abuilder .withName("convergence") .withMinimum(1) .withMaximum(1) .withDefault("0.0") .create()) .withDescription("Fractional rate of perplexity to consider convergence") .withShortName("c") .create(); Option reInferDocTopicsOpt = obuilder .withLongName("reInferDocTopics") .withRequired(false) .withArgument( abuilder .withName("reInferDocTopics") .withMinimum(1) .withMaximum(1) .withDefault("no") .create()) .withDescription("re-infer p(topic | doc) : [no | randstart | continue]") .withShortName("rdt") .create(); Option numTrainThreadsOpt = obuilder .withLongName("numTrainThreads") .withRequired(false) .withArgument( abuilder 
.withName("numTrainThreads") .withMinimum(1) .withMaximum(1) .withDefault("1") .create()) .withDescription("number of threads to train with") .withShortName("ntt") .create(); Option numUpdateThreadsOpt = obuilder .withLongName("numUpdateThreads") .withRequired(false) .withArgument( abuilder .withName("numUpdateThreads") .withMinimum(1) .withMaximum(1) .withDefault("1") .create()) .withDescription("number of threads to update the model with") .withShortName("nut") .create(); Option verboseOpt = obuilder .withLongName("verbose") .withRequired(false) .withArgument( abuilder .withName("verbose") .withMinimum(1) .withMaximum(1) .withDefault("false") .create()) .withDescription( "print verbose information, like top-terms in each topic, during iteration") .withShortName("v") .create(); Group group = gbuilder .withName("Options") .withOption(inputDirOpt) .withOption(numTopicsOpt) .withOption(alphaOpt) .withOption(etaOpt) .withOption(maxIterOpt) .withOption(burnInOpt) .withOption(convergenceOpt) .withOption(dictOpt) .withOption(reInferDocTopicsOpt) .withOption(outputDocFileOpt) .withOption(outputTopicFileOpt) .withOption(dfsOpt) .withOption(numTrainThreadsOpt) .withOption(numUpdateThreadsOpt) .withOption(modelCorpusFractionOption) .withOption(verboseOpt) .create(); try { Parser parser = new Parser(); parser.setGroup(group); parser.setHelpOption(helpOpt); CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return -1; } String inputDirString = (String) cmdLine.getValue(inputDirOpt); String dictDirString = cmdLine.hasOption(dictOpt) ? 
(String) cmdLine.getValue(dictOpt) : null; int numTopics = Integer.parseInt((String) cmdLine.getValue(numTopicsOpt)); double alpha = Double.parseDouble((String) cmdLine.getValue(alphaOpt)); double eta = Double.parseDouble((String) cmdLine.getValue(etaOpt)); int maxIterations = Integer.parseInt((String) cmdLine.getValue(maxIterOpt)); int burnInIterations = (Integer) cmdLine.getValue(burnInOpt); double minFractionalErrorChange = Double.parseDouble((String) cmdLine.getValue(convergenceOpt)); int numTrainThreads = Integer.parseInt((String) cmdLine.getValue(numTrainThreadsOpt)); int numUpdateThreads = Integer.parseInt((String) cmdLine.getValue(numUpdateThreadsOpt)); String topicOutFile = (String) cmdLine.getValue(outputTopicFileOpt); String docOutFile = (String) cmdLine.getValue(outputDocFileOpt); // String reInferDocTopics = (String)cmdLine.getValue(reInferDocTopicsOpt); boolean verbose = Boolean.parseBoolean((String) cmdLine.getValue(verboseOpt)); double modelCorpusFraction = (Double) cmdLine.getValue(modelCorpusFractionOption); long start = System.nanoTime(); if (conf.get("fs.default.name") == null) { String dfsNameNode = (String) cmdLine.getValue(dfsOpt); conf.set("fs.default.name", dfsNameNode); } String[] terms = loadDictionary(dictDirString, conf); logTime("dictionary loading", System.nanoTime() - start); start = System.nanoTime(); Matrix corpus = loadVectors(inputDirString, conf); logTime("vector seqfile corpus loading", System.nanoTime() - start); start = System.nanoTime(); InMemoryCollapsedVariationalBayes0 cvb0 = new InMemoryCollapsedVariationalBayes0( corpus, terms, numTopics, alpha, eta, numTrainThreads, numUpdateThreads, modelCorpusFraction); logTime("cvb0 init", System.nanoTime() - start); start = System.nanoTime(); cvb0.setVerbose(verbose); cvb0.iterateUntilConvergence(minFractionalErrorChange, maxIterations, burnInIterations); logTime("total training time", System.nanoTime() - start); /* if ("randstart".equalsIgnoreCase(reInferDocTopics)) { 
cvb0.inferDocuments(0.0, 100, true); } else if ("continue".equalsIgnoreCase(reInferDocTopics)) { cvb0.inferDocuments(0.0, 100, false); } */ start = System.nanoTime(); cvb0.writeModel(new Path(topicOutFile)); DistributedRowMatrixWriter.write(new Path(docOutFile), conf, cvb0.docTopicCounts); logTime("printTopics", System.nanoTime() - start); } catch (OptionException e) { log.error("Error while parsing options", e); CommandLineUtil.printHelp(group); } return 0; }
/** * Takes in two arguments: * * <ol> * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live * <li>The output {@link org.apache.hadoop.fs.Path} where to write the classifier as a {@link * org.apache.hadoop.io.SequenceFile} * </ol> */ public static void main(String[] args) throws IOException, InterruptedException { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option dirInputPathOpt = DefaultOptionCreator.inputOption().create(); Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create(); Option categoriesOpt = obuilder .withLongName("categories") .withRequired(true) .withArgument(abuilder.withName("categories").withMinimum(1).withMaximum(1).create()) .withDescription( "Location of the categories file. One entry per line. " + "Will be used to make a string match in Wikipedia Category field") .withShortName("c") .create(); Option exactMatchOpt = obuilder .withLongName("exactMatch") .withDescription( "If set, then the category name must exactly match the " + "entry in the categories file. 
Default is false") .withShortName("e") .create(); Option analyzerOpt = obuilder .withLongName("analyzer") .withRequired(false) .withArgument(abuilder.withName("analyzer").withMinimum(1).withMaximum(1).create()) .withDescription("The analyzer to use, must have a no argument constructor") .withShortName("a") .create(); Option helpOpt = DefaultOptionCreator.helpOption(); Group group = gbuilder .withName("Options") .withOption(categoriesOpt) .withOption(dirInputPathOpt) .withOption(dirOutputPathOpt) .withOption(exactMatchOpt) .withOption(analyzerOpt) .withOption(helpOpt) .create(); Parser parser = new Parser(); parser.setGroup(group); try { CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } String inputPath = (String) cmdLine.getValue(dirInputPathOpt); String outputPath = (String) cmdLine.getValue(dirOutputPathOpt); String catFile = (String) cmdLine.getValue(categoriesOpt); Class<? extends Analyzer> analyzerClass = WikipediaAnalyzer.class; if (cmdLine.hasOption(analyzerOpt)) { String className = cmdLine.getValue(analyzerOpt).toString(); analyzerClass = Class.forName(className).asSubclass(Analyzer.class); // try instantiating it, b/c there isn't any point in setting it if // you can't instantiate it ClassUtils.instantiateAs(analyzerClass, Analyzer.class); } runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), analyzerClass); } catch (OptionException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } catch (ClassNotFoundException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } }
/**
 * Reads an LDA state and a term dictionary, computes the top words per topic,
 * and prints them (optionally to files under an output directory).
 */
public static void main(String[] args) throws Exception {
  DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
  ArgumentBuilder abuilder = new ArgumentBuilder();
  GroupBuilder gbuilder = new GroupBuilder();

  Option inputOpt = obuilder
      .withLongName("input")
      .withRequired(true)
      .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
      .withDescription("Path to an LDA output (a state)")
      .withShortName("i")
      .create();

  Option dictOpt = obuilder
      .withLongName("dict")
      .withRequired(true)
      .withArgument(abuilder.withName("dict").withMinimum(1).withMaximum(1).create())
      .withDescription("Dictionary to read in, in the same format as one created by "
          + "org.apache.mahout.utils.vectors.lucene.Driver")
      .withShortName("d")
      .create();

  Option outOpt = obuilder
      .withLongName("output")
      .withRequired(false)
      .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
      .withDescription("Output directory to write top words")
      .withShortName("o")
      .create();

  Option wordOpt = obuilder
      .withLongName("words")
      .withRequired(false)
      .withArgument(
          abuilder.withName("words").withMinimum(0).withMaximum(1).withDefault("20").create())
      .withDescription("Number of words to print")
      .withShortName("w")
      .create();

  Option dictTypeOpt = obuilder
      .withLongName("dictionaryType")
      .withRequired(false)
      .withArgument(abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create())
      .withDescription("The dictionary file type (text|sequencefile)")
      .withShortName("dt")
      .create();

  Option helpOpt =
      obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create();

  Group group = gbuilder
      .withName("Options")
      .withOption(dictOpt)
      .withOption(outOpt)
      .withOption(wordOpt)
      .withOption(inputOpt)
      .withOption(dictTypeOpt)
      // FIX: helpOpt was built and queried below via cmdLine.hasOption(helpOpt)
      // but never registered with the group, so "-h"/"--help" was rejected by
      // the parser as an unknown option instead of printing usage.
      .withOption(helpOpt)
      .create();

  try {
    Parser parser = new Parser();
    parser.setGroup(group);
    CommandLine cmdLine = parser.parse(args);

    if (cmdLine.hasOption(helpOpt)) {
      CommandLineUtil.printHelp(group);
      return;
    }

    String input = cmdLine.getValue(inputOpt).toString();
    String dictFile = cmdLine.getValue(dictOpt).toString();

    // Number of top words per topic (default 20).
    int numWords = 20;
    if (cmdLine.hasOption(wordOpt)) {
      numWords = Integer.parseInt(cmdLine.getValue(wordOpt).toString());
    }

    Configuration config = new Configuration();

    String dictionaryType = "text";
    if (cmdLine.hasOption(dictTypeOpt)) {
      dictionaryType = cmdLine.getValue(dictTypeOpt).toString();
    }

    List<String> wordList;
    if ("text".equals(dictionaryType)) {
      wordList = Arrays.asList(VectorHelper.loadTermDictionary(new File(dictFile)));
    } else if ("sequencefile".equals(dictionaryType)) {
      wordList = Arrays.asList(VectorHelper.loadTermDictionary(config, dictFile));
    } else {
      throw new IllegalArgumentException("Invalid dictionary format");
    }

    List<Queue<Pair<String, Double>>> topWords =
        topWordsForTopics(input, config, wordList, numWords);

    // Optional output directory; created on demand.
    File output = null;
    if (cmdLine.hasOption(outOpt)) {
      output = new File(cmdLine.getValue(outOpt).toString());
      if (!output.exists() && !output.mkdirs()) {
        throw new IOException("Could not create directory: " + output);
      }
    }
    printTopWords(topWords, output);
  } catch (OptionException e) {
    CommandLineUtil.printHelp(group);
    throw e;
  }
}