@Override
public int run(String[] args) throws Exception {
  addInputOption();
  addOutputOption();
  addOption(DefaultOptionCreator.distanceMeasureOption().create());
  addOption(DefaultOptionCreator.t1Option().create());
  addOption(DefaultOptionCreator.t2Option().create());
  addOption(DefaultOptionCreator.overwriteOption().create());

  Map<String, List<String>> argMap = parseArguments(args);
  if (argMap == null) {
    return -1;
  }

  Path input = getInputPath();
  Path output = getOutputPath();
  if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
    HadoopUtil.delete(new Configuration(), output);
  }
  String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
  double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
  double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
  DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);

  run(input, output, measure, t1, t2);
  return 0;
}
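// --- Illustrative launch sketch (not part of the original source) --------------------------
// Assuming the run() method above belongs to a Tool/AbstractJob subclass (CanopyLikeDriver is a
// placeholder name), it would typically be started through ToolRunner so that generic Hadoop
// arguments are handled before run() is called. The paths and the distance-measure class are
// example values, and the long option names ("--input", "--output", "--distanceMeasure",
// "--t1", "--t2", "--overwrite") are assumed to be the ones produced by DefaultOptionCreator.
public static void exampleLaunch() throws Exception {
  int exitCode = org.apache.hadoop.util.ToolRunner.run(
      new org.apache.hadoop.conf.Configuration(), new CanopyLikeDriver(), new String[] {
          "--input", "hdfs:///data/vectors",
          "--output", "hdfs:///data/canopies",
          "--distanceMeasure", "org.apache.mahout.common.distance.EuclideanDistanceMeasure",
          "--t1", "3.0",
          "--t2", "1.5",
          "--overwrite"});
  System.exit(exitCode);
}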
private Map<String, List<String>> handleArgs(String[] args) throws IOException {
  addOutputOption();
  addOption("eigenInput", "ei",
      "The Path for purported eigenVector input files (SequenceFile<WritableComparable,VectorWritable>).",
      null);
  addOption("corpusInput", "ci",
      "The Path for corpus input files (SequenceFile<WritableComparable,VectorWritable>).");
  addOption(DefaultOptionCreator.outputOption().create());
  addOption(DefaultOptionCreator.helpOption());
  addOption("inMemory", "mem", "Buffer eigen matrix into memory (if you have enough!)", "false");
  addOption("maxError", "err", "Maximum acceptable error", "0.05");
  addOption("minEigenvalue", "mev", "Minimum eigenvalue to keep the vector for", "0.0");
  addOption("maxEigens", "max", "Maximum number of eigenvectors to keep (0 means all)", "0");
  return parseArguments(args);
}
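// --- Illustrative usage sketch (not part of the original source) ---------------------------
// A hypothetical caller for handleArgs(...): it would typically be invoked from run(), with the
// option values read back through the AbstractJob-style accessors (hasOption/getOption) used by
// the other drivers in this listing. The option names match the ones registered above;
// runVerification(...) is a made-up worker method used only to show where the values would go.
public int exampleRun(String[] args) throws IOException {
  Map<String, List<String>> parsedArgs = handleArgs(args);
  if (parsedArgs == null) {
    return -1;
  }
  boolean inMemory = Boolean.parseBoolean(getOption("inMemory"));
  double maxError = Double.parseDouble(getOption("maxError"));
  double minEigenValue = Double.parseDouble(getOption("minEigenvalue"));
  int maxEigens = Integer.parseInt(getOption("maxEigens"));
  runVerification(inMemory, maxError, minEigenValue, maxEigens); // hypothetical worker method
  return 0;
}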
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
  addInputOption();
  addOutputOption();
  addOption(MinhashOptionCreator.minClusterSizeOption().create());
  addOption(MinhashOptionCreator.minVectorSizeOption().create());
  addOption(MinhashOptionCreator.hashTypeOption().create());
  addOption(MinhashOptionCreator.numHashFunctionsOption().create());
  addOption(MinhashOptionCreator.keyGroupsOption().create());
  addOption(MinhashOptionCreator.numReducersOption().create());
  addOption(MinhashOptionCreator.debugOutputOption().create());
  addOption(DefaultOptionCreator.overwriteOption().create());

  if (parseArguments(args) == null) {
    return -1;
  }

  Path input = getInputPath();
  Path output = getOutputPath();
  if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
    HadoopUtil.delete(getConf(), output);
  }
  int minClusterSize = Integer.parseInt(getOption(MinhashOptionCreator.MIN_CLUSTER_SIZE));
  int minVectorSize = Integer.parseInt(getOption(MinhashOptionCreator.MIN_VECTOR_SIZE));
  String hashType = getOption(MinhashOptionCreator.HASH_TYPE);
  int numHashFunctions = Integer.parseInt(getOption(MinhashOptionCreator.NUM_HASH_FUNCTIONS));
  int keyGroups = Integer.parseInt(getOption(MinhashOptionCreator.KEY_GROUPS));
  int numReduceTasks = Integer.parseInt(getOption(MinhashOptionCreator.NUM_REDUCERS));
  boolean debugOutput = hasOption(MinhashOptionCreator.DEBUG_OUTPUT);

  runJob(input, output, minClusterSize, minVectorSize, hashType, numHashFunctions, keyGroups,
      numReduceTasks, debugOutput);
  return 0;
}
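// --- Illustrative usage sketch (not part of the original source) ---------------------------
// A direct, programmatic call to the runJob(...) overload invoked at the end of run() above.
// The signature is inferred from that call site; the paths and parameter values are
// hypothetical examples, and the hash-type string is only a placeholder.
public int exampleRunJob() throws IOException, ClassNotFoundException, InterruptedException {
  Path input = new Path("hdfs:///data/item-vectors");
  Path output = new Path("hdfs:///data/minhash-clusters");
  int minClusterSize = 10;
  int minVectorSize = 5;
  String hashType = "LINEAR"; // placeholder value
  int numHashFunctions = 10;
  int keyGroups = 2;
  int numReduceTasks = 4;
  boolean debugOutput = false;
  runJob(input, output, minClusterSize, minVectorSize, hashType, numHashFunctions, keyGroups,
      numReduceTasks, debugOutput);
  return 0;
}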
public static void main(String[] args)
    throws IOException, InterruptedException, ClassNotFoundException {
  DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
  ArgumentBuilder abuilder = new ArgumentBuilder();
  GroupBuilder gbuilder = new GroupBuilder();

  Option inputDirOpt = DefaultOptionCreator.inputOption().create();
  Option outputOpt = DefaultOptionCreator.outputOption().create();
  Option helpOpt = DefaultOptionCreator.helpOption();
  Option recordSplitterOpt = obuilder.withLongName("splitterPattern")
      .withArgument(abuilder.withName("splitterPattern").withMinimum(1).withMaximum(1).create())
      .withDescription("Regular Expression pattern used to split given line into fields."
          + " Default value splits comma or tab separated fields."
          + " Default Value: \"[ ,\\t]*\\t[ ,\\t]*\" ")
      .withShortName("regex")
      .create();
  Option encodingOpt = obuilder.withLongName("encoding")
      .withArgument(abuilder.withName("encoding").withMinimum(1).withMaximum(1).create())
      .withDescription("(Optional) The file encoding. Default value: UTF-8")
      .withShortName("e")
      .create();

  Group group = gbuilder.withName("Options")
      .withOption(inputDirOpt)
      .withOption(outputOpt)
      .withOption(helpOpt)
      .withOption(recordSplitterOpt)
      .withOption(encodingOpt)
      .create();

  try {
    Parser parser = new Parser();
    parser.setGroup(group);
    CommandLine cmdLine = parser.parse(args);

    if (cmdLine.hasOption(helpOpt)) {
      CommandLineUtil.printHelp(group);
      return;
    }

    Parameters params = new Parameters();
    if (cmdLine.hasOption(recordSplitterOpt)) {
      params.set("splitPattern", (String) cmdLine.getValue(recordSplitterOpt));
    }

    String encoding = "UTF-8";
    if (cmdLine.hasOption(encodingOpt)) {
      encoding = (String) cmdLine.getValue(encodingOpt);
    }
    params.set("encoding", encoding);
    String inputDir = (String) cmdLine.getValue(inputDirOpt);
    String outputDir = (String) cmdLine.getValue(outputOpt);
    params.set("input", inputDir);
    params.set("output", outputDir);
    params.set("groupingFieldCount", "2");
    params.set("gfield0", "1");
    params.set("gfield1", "2");
    params.set("selectedFieldCount", "1");
    params.set("field0", "3");
    params.set("maxTransactionLength", "100");
    KeyBasedStringTupleGrouper.startJob(params);
  } catch (OptionException ex) {
    CommandLineUtil.printHelp(group);
  }
}
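// --- Illustrative invocation sketch (not part of the original source) ----------------------
// Example arguments for the main() above. The paths and the comma-splitting regex are
// hypothetical; "--input"/"--output" are assumed to be the long names created by
// DefaultOptionCreator, while "--splitterPattern" and "--encoding" are registered explicitly
// in the method.
public static void exampleInvocation()
    throws IOException, InterruptedException, ClassNotFoundException {
  main(new String[] {
      "--input", "hdfs:///data/transactions",
      "--output", "hdfs:///data/grouped-tuples",
      "--splitterPattern", "[ ,\\t]*,[ ,\\t]*",
      "--encoding", "UTF-8"});
}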
/**
 * Takes in two arguments:
 *
 * <ol>
 *   <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live
 *   <li>The output {@link org.apache.hadoop.fs.Path} to which the classifier is written as a
 *       {@link org.apache.hadoop.io.SequenceFile}
 * </ol>
 */
public static void main(String[] args) throws IOException {
  DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
  ArgumentBuilder abuilder = new ArgumentBuilder();
  GroupBuilder gbuilder = new GroupBuilder();

  Option dirInputPathOpt = DefaultOptionCreator.inputOption().create();
  Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create();
  Option categoriesOpt = obuilder.withLongName("categories")
      .withArgument(abuilder.withName("categories").withMinimum(1).withMaximum(1).create())
      .withDescription("Location of the categories file. One entry per line. "
          + "Will be used to make a string match in Wikipedia Category field")
      .withShortName("c")
      .create();
  Option exactMatchOpt = obuilder.withLongName("exactMatch")
      .withDescription("If set, then the category name must exactly match the "
          + "entry in the categories file. Default is false")
      .withShortName("e")
      .create();
  Option allOpt = obuilder.withLongName("all")
      .withDescription("If set, select all files. Default is false")
      .withShortName("all")
      .create();
  Option helpOpt = DefaultOptionCreator.helpOption();

  Group group = gbuilder.withName("Options")
      .withOption(categoriesOpt)
      .withOption(dirInputPathOpt)
      .withOption(dirOutputPathOpt)
      .withOption(exactMatchOpt)
      .withOption(allOpt)
      .withOption(helpOpt)
      .create();

  Parser parser = new Parser();
  parser.setGroup(group);
  parser.setHelpOption(helpOpt);
  try {
    CommandLine cmdLine = parser.parse(args);
    if (cmdLine.hasOption(helpOpt)) {
      CommandLineUtil.printHelp(group);
      return;
    }

    String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
    String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);

    String catFile = "";
    if (cmdLine.hasOption(categoriesOpt)) {
      catFile = (String) cmdLine.getValue(categoriesOpt);
    }

    boolean all = false;
    if (cmdLine.hasOption(allOpt)) {
      all = true;
    }

    runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), all);
  } catch (OptionException e) {
    log.error("Exception", e);
    CommandLineUtil.printHelp(group);
  } catch (InterruptedException e) {
    log.error("Exception", e);
    CommandLineUtil.printHelp(group);
  } catch (ClassNotFoundException e) {
    log.error("Exception", e);
    CommandLineUtil.printHelp(group);
  }
}
@Override
public int run(String[] args) throws Exception {
  String path = System.getProperty("user.dir");

  // Options are registered here, but note that this variant never calls parseArguments(args):
  // the input/output locations, alphaI and trainComplementary are fixed in the code below.
  addInputOption();
  addOutputOption();
  addOption(ALPHA_I, "a", "smoothing parameter", String.valueOf(1.0f));
  addOption(buildOption(TRAIN_COMPLEMENTARY, "c", "train complementary?", false, false,
      String.valueOf(false)));
  addOption(LABEL_INDEX, "li", "The path to store the label index in", false);
  addOption(DefaultOptionCreator.overwriteOption().create());

  Path labPath = new Path(path + "/../out/labelindex/");
  long labelSize = createLabelIndex(labPath);
  float alphaI = 1.0F;
  boolean trainComplementary = true;

  HadoopUtil.setSerializations(getConf());
  HadoopUtil.cacheFiles(labPath, getConf());
  HadoopUtil.delete(getConf(), new Path("/tmp/summedObservations"));
  HadoopUtil.delete(getConf(), new Path("/tmp/weights"));
  HadoopUtil.delete(getConf(), new Path("/tmp/thetas"));

  // Add up all the vectors with the same labels, while mapping the labels into our index
  Job indexInstances = prepareJob(
      new Path(path + "/../out/training"),
      new Path("/tmp/summedObservations"),
      SequenceFileInputFormat.class,
      IndexInstancesMapper.class, IntWritable.class, VectorWritable.class,
      VectorSumReducer.class, IntWritable.class, VectorWritable.class,
      SequenceFileOutputFormat.class);
  indexInstances.setCombinerClass(VectorSumReducer.class);
  boolean succeeded = indexInstances.waitForCompletion(true);
  if (!succeeded) {
    return -1;
  }

  // Sum up all the weights from the previous step, per label and per feature
  Job weightSummer = prepareJob(
      new Path("/tmp/summedObservations"),
      new Path("/tmp/weights"),
      SequenceFileInputFormat.class,
      WeightsMapper.class, Text.class, VectorWritable.class,
      VectorSumReducer.class, Text.class, VectorWritable.class,
      SequenceFileOutputFormat.class);
  weightSummer.getConfiguration().set(WeightsMapper.NUM_LABELS, String.valueOf(labelSize));
  weightSummer.setCombinerClass(VectorSumReducer.class);
  succeeded = weightSummer.waitForCompletion(true);
  if (!succeeded) {
    return -1;
  }

  // Put the per label and per feature vectors into the cache
  HadoopUtil.cacheFiles(new Path("/tmp/weights"), getConf());

  if (trainComplementary) {
    // Calculate the per label theta normalizers, write out to LABEL_THETA_NORMALIZER vector
    // see http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf - Section 3.2, Weight
    // Magnitude Errors
    Job thetaSummer = prepareJob(
        new Path("/tmp/summedObservations"),
        new Path("/tmp/thetas"),
        SequenceFileInputFormat.class,
        ThetaMapper.class, Text.class, VectorWritable.class,
        VectorSumReducer.class, Text.class, VectorWritable.class,
        SequenceFileOutputFormat.class);
    thetaSummer.setCombinerClass(VectorSumReducer.class);
    thetaSummer.getConfiguration().setFloat(ThetaMapper.ALPHA_I, alphaI);
    thetaSummer.getConfiguration().setBoolean(ThetaMapper.TRAIN_COMPLEMENTARY, trainComplementary);
    succeeded = thetaSummer.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }
  }

  // Put the per label theta normalizers into the cache
  HadoopUtil.cacheFiles(new Path("/tmp/thetas"), getConf());

  // Validate our model and then write it out to the official output
  getConf().setFloat(ThetaMapper.ALPHA_I, alphaI);
  getConf().setBoolean(NaiveBayesModel.COMPLEMENTARY_MODEL, trainComplementary);
  NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(new Path("/tmp/"), getConf());
  naiveBayesModel.validate();
  naiveBayesModel.serialize(new Path(path + "/../out/model"), getConf());

  return 0;
}
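// --- Illustrative launch sketch (not part of the original source) --------------------------
// Assuming this run() lives in a Tool/AbstractJob subclass (TrainNaiveBayesLikeJob is a
// placeholder name), it can be started through ToolRunner. As written above, the job reads its
// training data from <user.dir>/../out/training and stages intermediates under /tmp, so the
// working directory matters more than the registered --input/--output options.
public static void exampleLaunch() throws Exception {
  int exitCode = org.apache.hadoop.util.ToolRunner.run(
      new org.apache.hadoop.conf.Configuration(), new TrainNaiveBayesLikeJob(), new String[0]);
  System.exit(exitCode);
}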
public static int main2(String[] args, Configuration conf) throws Exception {
  DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
  ArgumentBuilder abuilder = new ArgumentBuilder();
  GroupBuilder gbuilder = new GroupBuilder();

  Option helpOpt = DefaultOptionCreator.helpOption();
  Option inputDirOpt = obuilder.withLongName("input").withRequired(true)
      .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
      .withDescription("The Directory on HDFS containing the collapsed, properly formatted files having "
          + "one doc per line")
      .withShortName("i")
      .create();
  Option dictOpt = obuilder.withLongName("dictionary").withRequired(false)
      .withArgument(abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create())
      .withDescription("The path to the term-dictionary format is ... ")
      .withShortName("d")
      .create();
  Option dfsOpt = obuilder.withLongName("dfs").withRequired(false)
      .withArgument(abuilder.withName("dfs").withMinimum(1).withMaximum(1).create())
      .withDescription("HDFS namenode URI")
      .withShortName("dfs")
      .create();
  Option numTopicsOpt = obuilder.withLongName("numTopics").withRequired(true)
      .withArgument(abuilder.withName("numTopics").withMinimum(1).withMaximum(1).create())
      .withDescription("Number of topics to learn")
      .withShortName("top")
      .create();
  Option outputTopicFileOpt = obuilder.withLongName("topicOutputFile").withRequired(true)
      .withArgument(abuilder.withName("topicOutputFile").withMinimum(1).withMaximum(1).create())
      .withDescription("File to write out p(term | topic)")
      .withShortName("to")
      .create();
  Option outputDocFileOpt = obuilder.withLongName("docOutputFile").withRequired(true)
      .withArgument(abuilder.withName("docOutputFile").withMinimum(1).withMaximum(1).create())
      .withDescription("File to write out p(topic | docid)")
      .withShortName("do")
      .create();
  // Use String defaults throughout so that cmdLine.getValue(...) is always a String, whether the
  // value comes from the command line or from the default, and can be parsed uniformly below.
  Option alphaOpt = obuilder.withLongName("alpha").withRequired(false)
      .withArgument(abuilder.withName("alpha").withMinimum(1).withMaximum(1).withDefault("0.1").create())
      .withDescription("Smoothing parameter for p(topic | document) prior")
      .withShortName("a")
      .create();
  Option etaOpt = obuilder.withLongName("eta").withRequired(false)
      .withArgument(abuilder.withName("eta").withMinimum(1).withMaximum(1).withDefault("0.1").create())
      .withDescription("Smoothing parameter for p(term | topic)")
      .withShortName("e")
      .create();
  Option maxIterOpt = obuilder.withLongName("maxIterations").withRequired(false)
      .withArgument(abuilder.withName("maxIterations").withMinimum(1).withMaximum(1).withDefault("10").create())
      .withDescription("Maximum number of training passes")
      .withShortName("m")
      .create();
  Option modelCorpusFractionOption = obuilder.withLongName("modelCorpusFraction").withRequired(false)
      .withArgument(abuilder.withName("modelCorpusFraction").withMinimum(1).withMaximum(1).withDefault("0.0").create())
      .withShortName("mcf")
      .withDescription("For online updates, initial value of |model|/|corpus|")
      .create();
  Option burnInOpt = obuilder.withLongName("burnInIterations").withRequired(false)
      .withArgument(abuilder.withName("burnInIterations").withMinimum(1).withMaximum(1).withDefault("5").create())
      .withDescription("Minimum number of iterations")
      .withShortName("b")
      .create();
  Option convergenceOpt = obuilder.withLongName("convergence").withRequired(false)
      .withArgument(abuilder.withName("convergence").withMinimum(1).withMaximum(1).withDefault("0.0").create())
      .withDescription("Fractional rate of perplexity to consider convergence")
      .withShortName("c")
      .create();
  Option reInferDocTopicsOpt = obuilder.withLongName("reInferDocTopics").withRequired(false)
      .withArgument(abuilder.withName("reInferDocTopics").withMinimum(1).withMaximum(1).withDefault("no").create())
      .withDescription("re-infer p(topic | doc) : [no | randstart | continue]")
      .withShortName("rdt")
      .create();
  Option numTrainThreadsOpt = obuilder.withLongName("numTrainThreads").withRequired(false)
      .withArgument(abuilder.withName("numTrainThreads").withMinimum(1).withMaximum(1).withDefault("1").create())
      .withDescription("number of threads to train with")
      .withShortName("ntt")
      .create();
  Option numUpdateThreadsOpt = obuilder.withLongName("numUpdateThreads").withRequired(false)
      .withArgument(abuilder.withName("numUpdateThreads").withMinimum(1).withMaximum(1).withDefault("1").create())
      .withDescription("number of threads to update the model with")
      .withShortName("nut")
      .create();
  Option verboseOpt = obuilder.withLongName("verbose").withRequired(false)
      .withArgument(abuilder.withName("verbose").withMinimum(1).withMaximum(1).withDefault("false").create())
      .withDescription("print verbose information, like top-terms in each topic, during iteration")
      .withShortName("v")
      .create();

  Group group = gbuilder.withName("Options")
      .withOption(inputDirOpt)
      .withOption(numTopicsOpt)
      .withOption(alphaOpt)
      .withOption(etaOpt)
      .withOption(maxIterOpt)
      .withOption(burnInOpt)
      .withOption(convergenceOpt)
      .withOption(dictOpt)
      .withOption(reInferDocTopicsOpt)
      .withOption(outputDocFileOpt)
      .withOption(outputTopicFileOpt)
      .withOption(dfsOpt)
      .withOption(numTrainThreadsOpt)
      .withOption(numUpdateThreadsOpt)
      .withOption(modelCorpusFractionOption)
      .withOption(verboseOpt)
      .create();

  try {
    Parser parser = new Parser();
    parser.setGroup(group);
    parser.setHelpOption(helpOpt);
    CommandLine cmdLine = parser.parse(args);
    if (cmdLine.hasOption(helpOpt)) {
      CommandLineUtil.printHelp(group);
      return -1;
    }

    String inputDirString = (String) cmdLine.getValue(inputDirOpt);
    String dictDirString = cmdLine.hasOption(dictOpt) ? (String) cmdLine.getValue(dictOpt) : null;
    int numTopics = Integer.parseInt((String) cmdLine.getValue(numTopicsOpt));
    double alpha = Double.parseDouble((String) cmdLine.getValue(alphaOpt));
    double eta = Double.parseDouble((String) cmdLine.getValue(etaOpt));
    int maxIterations = Integer.parseInt((String) cmdLine.getValue(maxIterOpt));
    int burnInIterations = Integer.parseInt((String) cmdLine.getValue(burnInOpt));
    double minFractionalErrorChange = Double.parseDouble((String) cmdLine.getValue(convergenceOpt));
    int numTrainThreads = Integer.parseInt((String) cmdLine.getValue(numTrainThreadsOpt));
    int numUpdateThreads = Integer.parseInt((String) cmdLine.getValue(numUpdateThreadsOpt));
    String topicOutFile = (String) cmdLine.getValue(outputTopicFileOpt);
    String docOutFile = (String) cmdLine.getValue(outputDocFileOpt);
    // String reInferDocTopics = (String) cmdLine.getValue(reInferDocTopicsOpt);
    boolean verbose = Boolean.parseBoolean((String) cmdLine.getValue(verboseOpt));
    double modelCorpusFraction = Double.parseDouble((String) cmdLine.getValue(modelCorpusFractionOption));

    long start = System.nanoTime();

    if (conf.get("fs.default.name") == null) {
      String dfsNameNode = (String) cmdLine.getValue(dfsOpt);
      conf.set("fs.default.name", dfsNameNode);
    }
    String[] terms = loadDictionary(dictDirString, conf);
    logTime("dictionary loading", System.nanoTime() - start);
    start = System.nanoTime();
    Matrix corpus = loadVectors(inputDirString, conf);
    logTime("vector seqfile corpus loading", System.nanoTime() - start);
    start = System.nanoTime();
    InMemoryCollapsedVariationalBayes0 cvb0 = new InMemoryCollapsedVariationalBayes0(corpus, terms,
        numTopics, alpha, eta, numTrainThreads, numUpdateThreads, modelCorpusFraction);
    logTime("cvb0 init", System.nanoTime() - start);

    start = System.nanoTime();
    cvb0.setVerbose(verbose);
    cvb0.iterateUntilConvergence(minFractionalErrorChange, maxIterations, burnInIterations);
    logTime("total training time", System.nanoTime() - start);

    /*
    if ("randstart".equalsIgnoreCase(reInferDocTopics)) {
      cvb0.inferDocuments(0.0, 100, true);
    } else if ("continue".equalsIgnoreCase(reInferDocTopics)) {
      cvb0.inferDocuments(0.0, 100, false);
    }
    */

    start = System.nanoTime();
    cvb0.writeModel(new Path(topicOutFile));
    DistributedRowMatrixWriter.write(new Path(docOutFile), conf, cvb0.docTopicCounts);
    logTime("printTopics", System.nanoTime() - start);
  } catch (OptionException e) {
    log.error("Error while parsing options", e);
    CommandLineUtil.printHelp(group);
  }
  return 0;
}
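// --- Illustrative invocation sketch (not part of the original source) ----------------------
// main2(args, conf) is normally reached from a thin wrapper such as the hypothetical helper
// below. The HDFS paths are example values; the option long names are the ones registered
// above.
public static int exampleInvocation() throws Exception {
  String[] cvb0Args = {
      "--input", "hdfs:///data/corpus-seqfiles",
      "--numTopics", "20",
      "--topicOutputFile", "hdfs:///data/cvb0/topic-term",
      "--docOutputFile", "hdfs:///data/cvb0/doc-topic",
      "--dictionary", "hdfs:///data/dictionary.file-0",
      "--maxIterations", "30"
  };
  return main2(cvb0Args, new Configuration());
}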
@Override
public int run(String[] args) throws Exception {
  addInputOption();
  addOutputOption();
  addOption("numberOfColumns", "r", "Number of columns in the input matrix", false);
  addOption("similarityClassname", "s",
      "Name of distributed similarity class to instantiate, alternatively use "
          + "one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')');
  addOption("maxSimilaritiesPerRow", "m",
      "Number of maximum similarities per row (default: " + DEFAULT_MAX_SIMILARITIES_PER_ROW + ')',
      String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW));
  addOption("excludeSelfSimilarity", "ess", "compute similarity of rows to themselves?",
      String.valueOf(false));
  addOption("threshold", "tr", "discard row pairs with a similarity value below this", false);
  addOption(DefaultOptionCreator.overwriteOption().create());

  Map<String, List<String>> parsedArgs = parseArguments(args);
  if (parsedArgs == null) {
    return -1;
  }

  int numberOfColumns;
  if (hasOption("numberOfColumns")) {
    // Number of columns explicitly specified via CLI
    numberOfColumns = Integer.parseInt(getOption("numberOfColumns"));
  } else {
    // else get the number of columns by determining the cardinality of a vector in the input matrix
    numberOfColumns = getDimensions(getInputPath());
  }

  String similarityClassnameArg = getOption("similarityClassname");
  String similarityClassname;
  try {
    similarityClassname = VectorSimilarityMeasures.valueOf(similarityClassnameArg).getClassname();
  } catch (IllegalArgumentException iae) {
    similarityClassname = similarityClassnameArg;
  }

  // Clear the output and temp paths if the overwrite option has been set
  if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
    // Clear the temp path
    HadoopUtil.delete(getConf(), getTempPath());
    // Clear the output path
    HadoopUtil.delete(getConf(), getOutputPath());
  }

  int maxSimilaritiesPerRow = Integer.parseInt(getOption("maxSimilaritiesPerRow"));
  boolean excludeSelfSimilarity = Boolean.parseBoolean(getOption("excludeSelfSimilarity"));
  double threshold = hasOption("threshold")
      ? Double.parseDouble(getOption("threshold")) : NO_THRESHOLD;

  Path weightsPath = getTempPath("weights");
  Path normsPath = getTempPath("norms.bin");
  Path numNonZeroEntriesPath = getTempPath("numNonZeroEntries.bin");
  Path maxValuesPath = getTempPath("maxValues.bin");
  Path pairwiseSimilarityPath = getTempPath("pairwiseSimilarity");

  AtomicInteger currentPhase = new AtomicInteger();

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    Job normsAndTranspose = prepareJob(getInputPath(), weightsPath,
        VectorNormMapper.class, IntWritable.class, VectorWritable.class,
        MergeVectorsReducer.class, IntWritable.class, VectorWritable.class);
    normsAndTranspose.setCombinerClass(MergeVectorsCombiner.class);
    Configuration normsAndTransposeConf = normsAndTranspose.getConfiguration();
    normsAndTransposeConf.set(THRESHOLD, String.valueOf(threshold));
    normsAndTransposeConf.set(NORMS_PATH, normsPath.toString());
    normsAndTransposeConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
    normsAndTransposeConf.set(MAXVALUES_PATH, maxValuesPath.toString());
    normsAndTransposeConf.set(SIMILARITY_CLASSNAME, similarityClassname);
    boolean succeeded = normsAndTranspose.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }
  }

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    Job pairwiseSimilarity = prepareJob(weightsPath, pairwiseSimilarityPath,
        CooccurrencesMapper.class, IntWritable.class, VectorWritable.class,
        SimilarityReducer.class, IntWritable.class, VectorWritable.class);
    pairwiseSimilarity.setCombinerClass(VectorSumReducer.class);
    Configuration pairwiseConf = pairwiseSimilarity.getConfiguration();
    pairwiseConf.set(THRESHOLD, String.valueOf(threshold));
    pairwiseConf.set(NORMS_PATH, normsPath.toString());
    pairwiseConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
    pairwiseConf.set(MAXVALUES_PATH, maxValuesPath.toString());
    pairwiseConf.set(SIMILARITY_CLASSNAME, similarityClassname);
    pairwiseConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns);
    pairwiseConf.setBoolean(EXCLUDE_SELF_SIMILARITY, excludeSelfSimilarity);
    boolean succeeded = pairwiseSimilarity.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }
  }

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    Job asMatrix = prepareJob(pairwiseSimilarityPath, getOutputPath(),
        UnsymmetrifyMapper.class, IntWritable.class, VectorWritable.class,
        MergeToTopKSimilaritiesReducer.class, IntWritable.class, VectorWritable.class);
    asMatrix.setCombinerClass(MergeToTopKSimilaritiesReducer.class);
    asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);
    boolean succeeded = asMatrix.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }
  }

  return 0;
}
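// --- Illustrative launch sketch (not part of the original source) --------------------------
// Assuming the run() method above belongs to a Tool/AbstractJob subclass (RowSimilarityLikeJob
// is a placeholder name), the three-phase pipeline can be launched through ToolRunner. The
// paths and the similarity-measure value are hypothetical examples; "--similarityClassname",
// "--maxSimilaritiesPerRow" and "--excludeSelfSimilarity" are the long names registered above,
// while "--input", "--output" and "--overwrite" are assumed to come from DefaultOptionCreator.
public static void exampleLaunch() throws Exception {
  int exitCode = org.apache.hadoop.util.ToolRunner.run(
      new org.apache.hadoop.conf.Configuration(), new RowSimilarityLikeJob(), new String[] {
          "--input", "hdfs:///data/row-vectors",
          "--output", "hdfs:///data/row-similarities",
          "--similarityClassname", "SIMILARITY_COSINE", // or a fully-qualified measure class name
          "--maxSimilaritiesPerRow", "100",
          "--excludeSelfSimilarity", "true",
          "--overwrite"});
  System.exit(exitCode);
}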
/**
 * Takes in two arguments:
 *
 * <ol>
 *   <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live
 *   <li>The output {@link org.apache.hadoop.fs.Path} to which the classifier is written as a
 *       {@link org.apache.hadoop.io.SequenceFile}
 * </ol>
 */
public static void main(String[] args) throws IOException, InterruptedException {
  DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
  ArgumentBuilder abuilder = new ArgumentBuilder();
  GroupBuilder gbuilder = new GroupBuilder();

  Option dirInputPathOpt = DefaultOptionCreator.inputOption().create();
  Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create();
  Option categoriesOpt = obuilder.withLongName("categories").withRequired(true)
      .withArgument(abuilder.withName("categories").withMinimum(1).withMaximum(1).create())
      .withDescription("Location of the categories file. One entry per line. "
          + "Will be used to make a string match in Wikipedia Category field")
      .withShortName("c")
      .create();
  Option exactMatchOpt = obuilder.withLongName("exactMatch")
      .withDescription("If set, then the category name must exactly match the "
          + "entry in the categories file. Default is false")
      .withShortName("e")
      .create();
  Option analyzerOpt = obuilder.withLongName("analyzer").withRequired(false)
      .withArgument(abuilder.withName("analyzer").withMinimum(1).withMaximum(1).create())
      .withDescription("The analyzer to use, must have a no argument constructor")
      .withShortName("a")
      .create();
  Option helpOpt = DefaultOptionCreator.helpOption();

  Group group = gbuilder.withName("Options")
      .withOption(categoriesOpt)
      .withOption(dirInputPathOpt)
      .withOption(dirOutputPathOpt)
      .withOption(exactMatchOpt)
      .withOption(analyzerOpt)
      .withOption(helpOpt)
      .create();

  Parser parser = new Parser();
  parser.setGroup(group);
  try {
    CommandLine cmdLine = parser.parse(args);
    if (cmdLine.hasOption(helpOpt)) {
      CommandLineUtil.printHelp(group);
      return;
    }

    String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
    String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);
    String catFile = (String) cmdLine.getValue(categoriesOpt);
    Class<? extends Analyzer> analyzerClass = WikipediaAnalyzer.class;
    if (cmdLine.hasOption(analyzerOpt)) {
      String className = cmdLine.getValue(analyzerOpt).toString();
      analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
      // try instantiating it, b/c there isn't any point in setting it if
      // you can't instantiate it
      ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
    }
    runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), analyzerClass);
  } catch (OptionException e) {
    log.error("Exception", e);
    CommandLineUtil.printHelp(group);
  } catch (ClassNotFoundException e) {
    log.error("Exception", e);
    CommandLineUtil.printHelp(group);
  }
}
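// --- Illustrative invocation sketch (not part of the original source) ----------------------
// Example arguments for the main() above. Paths and the category-file location are
// hypothetical; "--input"/"--output" are assumed to be the long names created by
// DefaultOptionCreator, while "--categories" and "--exactMatch" are registered explicitly in
// the method.
public static void exampleInvocation() throws IOException, InterruptedException {
  main(new String[] {
      "--input", "hdfs:///data/wikipedia-chunks",
      "--output", "hdfs:///data/wikipedia-dataset",
      "--categories", "/local/path/categories.txt",
      "--exactMatch"});
}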