/**
 * Run the job
 *
 * @param params the job parameters containing the gramSize, input/output folders, defaultCat,
 *     and encoding
 */
public static void runJob(Parameters params) throws IOException {
  Configurable client = new JobClient();
  JobConf conf = new JobConf(BayesClassifierDriver.class);
  conf.setJobName("Bayes Classifier Driver running over input: " + params.get("testDirPath"));
  conf.setOutputKeyClass(StringTuple.class);
  conf.setOutputValueClass(DoubleWritable.class);

  FileInputFormat.setInputPaths(conf, new Path(params.get("testDirPath")));
  Path outPath = new Path(params.get("testDirPath") + "-output");
  FileOutputFormat.setOutputPath(conf, outPath);

  conf.setInputFormat(KeyValueTextInputFormat.class);
  conf.setMapperClass(BayesClassifierMapper.class);
  conf.setCombinerClass(BayesClassifierReducer.class);
  conf.setReducerClass(BayesClassifierReducer.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);
  conf.set("io.serializations",
      "org.apache.hadoop.io.serializer.JavaSerialization,"
      + "org.apache.hadoop.io.serializer.WritableSerialization");

  HadoopUtil.overwriteOutput(outPath);
  conf.set("bayes.parameters", params.toString());

  client.setConf(conf);
  JobClient.runJob(conf);

  Path outputFiles = new Path(outPath, "part*");
  FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
  ConfusionMatrix matrix = readResult(dfs, outputFiles, conf, params);
  log.info("{}", matrix.summarize());
}
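A minimal caller sketch for the driver above, not taken from the original source: it assumes Mahout's Parameters class exposes set(String, String), and the path and parameter values are placeholders for whatever the classifier mapper actually expects.

// Hedged usage sketch: populate the Parameters consumed by runJob (all values are placeholders)
Parameters params = new Parameters();
params.set("testDirPath", "/user/mahout/20news-test");  // hypothetical test-set location
params.set("defaultCat", "unknown");
params.set("encoding", "UTF-8");
params.set("gramSize", "1");
BayesClassifierDriver.runJob(params);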
/**
 * @param args
 * @throws Exception
 */
public static void main(String[] args) throws Exception {
  log.setLevel(Level.INFO);
  Configuration v_HadoopConf = new Configuration();
  s_HadoopConf = v_HadoopConf;
  s_Config = Config.getInstance();
  String v_PathPrefix = s_Config.getBaseDir() + s_Config.getDataDir() + "mahout/";
  String v_TextDir = s_Config.getTextPath();

  Path m_DocumentDir = new Path(v_TextDir);
  Path m_SequenceDir = new Path(v_PathPrefix, "sequence/");
  Path m_TokensDir = new Path(v_PathPrefix, "tokens");
  Path m_TF = new Path(v_PathPrefix, "termfreq/");
  String m_VecFolder = "Vectors";
  Path m_tf_idf = new Path(v_PathPrefix, "tfidf/");
  boolean m_Sequential = true;

  HadoopUtil.delete(v_HadoopConf, new Path(v_PathPrefix, "clusters/"));
  if (!s_Config.getReuseTFIDF()) {
    createTFIDF(v_HadoopConf, m_DocumentDir, m_SequenceDir, m_TokensDir, m_TF, m_VecFolder, m_tf_idf);
  }

  HierarchicalKMeansClusterer v_Hkmc = new HierarchicalKMeansClusterer();
  SetTree<ClusterDescriptor> v_Tree = v_Hkmc.run(s_HadoopConf, m_Sequential);
  saveAsTree(v_Tree);
  saveAsXml(v_Tree);
}
@Override
public int run(String[] args) throws Exception {
  addInputOption();
  addOutputOption();
  addOption(DefaultOptionCreator.distanceMeasureOption().create());
  addOption(DefaultOptionCreator.t1Option().create());
  addOption(DefaultOptionCreator.t2Option().create());
  addOption(DefaultOptionCreator.overwriteOption().create());

  Map<String, List<String>> argMap = parseArguments(args);
  if (argMap == null) {
    return -1;
  }

  Path input = getInputPath();
  Path output = getOutputPath();
  if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
    HadoopUtil.delete(new Configuration(), output);
  }
  String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
  double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
  double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));

  DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
  run(input, output, measure, t1, t2);
  return 0;
}
/**
 * Count the document frequencies of features in parallel using Map/Reduce. The input documents
 * have to be in {@link SequenceFile} format.
 */
private static void startDFCounting(Path input, Path output, Configuration baseConf)
    throws IOException, InterruptedException, ClassNotFoundException {
  Configuration conf = new Configuration(baseConf);
  // this conf parameter needs to be set to enable serialisation of conf values
  conf.set("io.serializations",
      "org.apache.hadoop.io.serializer.JavaSerialization,"
      + "org.apache.hadoop.io.serializer.WritableSerialization");

  Job job = new Job(conf);
  job.setJobName("VectorTfIdf Document Frequency Count running over input: " + input);
  job.setJarByClass(TFIDFConverter.class);

  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(LongWritable.class);

  FileInputFormat.setInputPaths(job, input);
  FileOutputFormat.setOutputPath(job, output);

  job.setMapperClass(TermDocumentCountMapper.class);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setCombinerClass(TermDocumentCountReducer.class);
  job.setReducerClass(TermDocumentCountReducer.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  HadoopUtil.delete(conf, output);

  boolean succeeded = job.waitForCompletion(true);
  if (!succeeded) {
    throw new IllegalStateException("Job failed!");
  }
}
@Override
protected void setup(Context ctx) throws IOException, InterruptedException {
  try {
    Configuration conf = ctx.getConfiguration();
    Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);
    Preconditions.checkArgument(localFiles != null && localFiles.length >= 1,
        "missing paths from the DistributedCache");

    Path inputVectorPath = HadoopUtil.getSingleCachedFile(conf);

    SequenceFileValueIterator<VectorWritable> iterator =
        new SequenceFileValueIterator<VectorWritable>(inputVectorPath, true, conf);
    try {
      inputVector = iterator.next().get();
    } finally {
      Closeables.close(iterator, true);
    }

    int outDim = conf.getInt(OUTPUT_VECTOR_DIMENSION, Integer.MAX_VALUE);
    outputVector = conf.getBoolean(IS_SPARSE_OUTPUT, false)
        ? new RandomAccessSparseVector(outDim, 10)
        : new DenseVector(outDim);
  } catch (IOException ioe) {
    throw new IllegalStateException(ioe);
  }
}
/**
 * Create Term Frequency-Inverse Document Frequency (Tf-Idf) vectors from the input set of vectors
 * in {@link SequenceFile} format. This job uses a fixed limit on the maximum memory used by the
 * feature chunk per node, thereby splitting the process across multiple map/reduce jobs. Before
 * using this method, calculateDF should be called.
 *
 * @param input input directory of the vectors in {@link SequenceFile} format
 * @param output output directory where the
 *     {@link org.apache.mahout.math.RandomAccessSparseVector}s of the documents are generated
 * @param datasetFeatures document frequency information calculated by calculateDF
 * @param minDf the minimum document frequency. Default 1
 * @param maxDF the max percentage of vectors for the DF. Can be used to remove really high
 *     frequency features. Expressed as an integer between 0 and 100. Default 99
 * @param numReducers the number of reducers to spawn. This also affects the possible parallelism,
 *     since each reducer will typically produce a single output file containing tf-idf vectors
 *     for a subset of the documents in the corpus.
 */
public static void processTfIdf(Path input,
                                Path output,
                                Configuration baseConf,
                                Pair<Long[], List<Path>> datasetFeatures,
                                int minDf,
                                long maxDF,
                                float normPower,
                                boolean logNormalize,
                                boolean sequentialAccessOutput,
                                boolean namedVector,
                                int numReducers)
    throws IOException, InterruptedException, ClassNotFoundException {
  Preconditions.checkArgument(normPower == PartialVectorMerger.NO_NORMALIZING || normPower >= 0,
      "If specified, normPower must be nonnegative", normPower);
  Preconditions.checkArgument(normPower == PartialVectorMerger.NO_NORMALIZING
          || (normPower > 1 && !Double.isInfinite(normPower))
          || !logNormalize,
      "normPower must be > 1 and not infinite if log normalization is chosen", normPower);

  int partialVectorIndex = 0;
  List<Path> partialVectorPaths = Lists.newArrayList();
  List<Path> dictionaryChunks = datasetFeatures.getSecond();
  for (Path dictionaryChunk : dictionaryChunks) {
    Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
    partialVectorPaths.add(partialVectorOutputPath);
    makePartialVectors(input,
                       baseConf,
                       datasetFeatures.getFirst()[0],
                       datasetFeatures.getFirst()[1],
                       minDf,
                       maxDF,
                       dictionaryChunk,
                       partialVectorOutputPath,
                       sequentialAccessOutput,
                       namedVector);
  }

  Configuration conf = new Configuration(baseConf);

  Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
  PartialVectorMerger.mergePartialVectors(partialVectorPaths,
                                          outputDir,
                                          baseConf,
                                          normPower,
                                          logNormalize,
                                          datasetFeatures.getFirst()[0].intValue(),
                                          sequentialAccessOutput,
                                          namedVector,
                                          numReducers);
  HadoopUtil.delete(conf, partialVectorPaths);
}
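A hedged usage sketch, not part of the original class: it assumes TFIDFConverter.calculateDF exists with the signature used below (returning the Pair<Long[], List<Path>> consumed above), and that the tf vectors already exist under the placeholder tfDir path.

// Hypothetical call order: calculateDF first, then processTfIdf on the same tf vectors
Configuration conf = new Configuration();
Path tfDir = new Path("tfidf/tf-vectors");   // placeholder input of tf vectors
Path outDir = new Path("tfidf/out");         // placeholder output directory
Pair<Long[], List<Path>> datasetFeatures =
    TFIDFConverter.calculateDF(tfDir, outDir, conf, 100 /* dictionary chunk size in MB */);
TFIDFConverter.processTfIdf(tfDir, outDir, conf, datasetFeatures,
    1 /* minDf */, 99 /* maxDF */, PartialVectorMerger.NO_NORMALIZING,
    false /* logNormalize */, true /* sequentialAccessOutput */,
    false /* namedVector */, 1 /* numReducers */);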
/**
 * Merge all the partial {@link org.apache.mahout.math.RandomAccessSparseVector}s into the
 * complete document {@link org.apache.mahout.math.RandomAccessSparseVector}.
 *
 * @param partialVectorPaths input directory of the vectors in
 *     {@link org.apache.hadoop.io.SequenceFile} format
 * @param output output directory where the merged vectors have to be created
 * @param baseConf job configuration
 * @param normPower the normalization value. Must be greater than or equal to 0 or equal to
 *     {@link #NO_NORMALIZING}
 * @param logNormalize whether to use log normalization
 * @param dimension the dimension of the output vectors
 * @param sequentialAccess output vectors should be optimized for sequential access
 * @param namedVector output vectors should be named, retaining key (doc id) as a label
 * @param numReducers the number of reducers to spawn
 */
public static void mergePartialVectors(Iterable<Path> partialVectorPaths,
                                       Path output,
                                       Configuration baseConf,
                                       float normPower,
                                       boolean logNormalize,
                                       int dimension,
                                       boolean sequentialAccess,
                                       boolean namedVector,
                                       int numReducers)
    throws IOException, InterruptedException, ClassNotFoundException {
  Preconditions.checkArgument(normPower == NO_NORMALIZING || normPower >= 0,
      "If specified, normPower must be nonnegative", normPower);
  Preconditions.checkArgument(normPower == NO_NORMALIZING
          || (normPower > 1 && !Double.isInfinite(normPower))
          || !logNormalize,
      "normPower must be > 1 and not infinite if log normalization is chosen", normPower);

  Configuration conf = new Configuration(baseConf);
  // this conf parameter needs to be set to enable serialisation of conf values
  conf.set("io.serializations",
      "org.apache.hadoop.io.serializer.JavaSerialization,"
      + "org.apache.hadoop.io.serializer.WritableSerialization");
  conf.setBoolean(SEQUENTIAL_ACCESS, sequentialAccess);
  conf.setBoolean(NAMED_VECTOR, namedVector);
  conf.setInt(DIMENSION, dimension);
  conf.setFloat(NORMALIZATION_POWER, normPower);
  conf.setBoolean(LOG_NORMALIZE, logNormalize);

  Job job = new Job(conf);
  job.setJobName("PartialVectorMerger::MergePartialVectors");
  job.setJarByClass(PartialVectorMerger.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(VectorWritable.class);

  FileInputFormat.setInputPaths(job, getCommaSeparatedPaths(partialVectorPaths));
  FileOutputFormat.setOutputPath(job, output);

  job.setMapperClass(Mapper.class);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setReducerClass(PartialVectorMergeReducer.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setNumReduceTasks(numReducers);

  HadoopUtil.delete(conf, output);

  boolean succeeded = job.waitForCompletion(true);
  if (!succeeded) {
    throw new IllegalStateException("Job failed!");
  }
}
public static void main(String[] args) throws Exception {
  if (args.length > 0) {
    log.info("Running with only user-supplied arguments");
    ToolRunner.run(new Configuration(), new Job(), args);
  } else {
    log.info("Running with default arguments");
    Path output = new Path("output");
    HadoopUtil.delete(new Configuration(), output);
    run(new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55);
  }
}
public static Job createTimesSquaredJob(Configuration initialConf,
                                        Vector v,
                                        int outputVectorDim,
                                        Path matrixInputPath,
                                        Path outputVectorPathBase,
                                        Class<? extends TimesSquaredMapper> mapClass,
                                        Class<? extends VectorSummingReducer> redClass)
    throws IOException {
  FileSystem fs = FileSystem.get(matrixInputPath.toUri(), initialConf);
  matrixInputPath = fs.makeQualified(matrixInputPath);
  outputVectorPathBase = fs.makeQualified(outputVectorPathBase);

  long now = System.nanoTime();
  Path inputVectorPath = new Path(outputVectorPathBase, INPUT_VECTOR + '/' + now);

  SequenceFile.Writer inputVectorPathWriter = null;
  try {
    inputVectorPathWriter = new SequenceFile.Writer(fs, initialConf, inputVectorPath,
        NullWritable.class, VectorWritable.class);
    inputVectorPathWriter.append(NullWritable.get(), new VectorWritable(v));
  } finally {
    Closeables.close(inputVectorPathWriter, false);
  }

  URI ivpURI = inputVectorPath.toUri();
  DistributedCache.setCacheFiles(new URI[] {ivpURI}, initialConf);

  Job job = HadoopUtil.prepareJob(matrixInputPath,
      new Path(outputVectorPathBase, OUTPUT_VECTOR_FILENAME),
      SequenceFileInputFormat.class,
      mapClass,
      NullWritable.class,
      VectorWritable.class,
      redClass,
      NullWritable.class,
      VectorWritable.class,
      SequenceFileOutputFormat.class,
      initialConf);
  job.setCombinerClass(redClass);
  job.setJobName("TimesSquaredJob: " + matrixInputPath);

  Configuration conf = job.getConfiguration();
  conf.set(INPUT_VECTOR, ivpURI.toString());
  conf.setBoolean(IS_SPARSE_OUTPUT, !v.isDense());
  conf.setInt(OUTPUT_VECTOR_DIMENSION, outputVectorDim);

  return job;
}
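A hedged driver sketch, not from the original file: it submits the job built above and reads back the single output vector with SequenceFileValueIterator, mirroring how the mapper setup loads its cached input vector. The output file name is an assumption about the job's layout.

// Hypothetical caller: v, matrixPath and outputBase are placeholders defined elsewhere
Job job = createTimesSquaredJob(new Configuration(), v, v.size(), matrixPath, outputBase,
    TimesSquaredMapper.class, VectorSummingReducer.class);
if (!job.waitForCompletion(true)) {
  throw new IllegalStateException("TimesSquaredJob failed!");
}
// assumed layout of the reducer output under OUTPUT_VECTOR_FILENAME
Path outputFile = new Path(outputBase, OUTPUT_VECTOR_FILENAME + "/part-r-00000");
SequenceFileValueIterator<VectorWritable> it =
    new SequenceFileValueIterator<VectorWritable>(outputFile, true, job.getConfiguration());
try {
  Vector result = it.next().get();
} finally {
  Closeables.close(it, true);
}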
/**
 * Run the job
 *
 * @param input the input pathname String
 * @param output the output pathname String
 * @param catFile the file containing the Wikipedia categories
 * @param exactMatchOnly if true, then the Wikipedia category must match exactly instead of simply
 *     containing the category string
 * @param analyzerClass the analyzer class used to tokenize the Wikipedia text
 */
public static void runJob(String input,
                          String output,
                          String catFile,
                          boolean exactMatchOnly,
                          Class<? extends Analyzer> analyzerClass)
    throws IOException, InterruptedException, ClassNotFoundException {
  Configuration conf = new Configuration();
  conf.set("key.value.separator.in.input.line", " ");
  conf.set("xmlinput.start", "<page>");
  conf.set("xmlinput.end", "</page>");
  conf.setBoolean("exact.match.only", exactMatchOnly);
  conf.set("analyzer.class", analyzerClass.getName());
  conf.set("io.serializations",
      "org.apache.hadoop.io.serializer.JavaSerialization,"
      + "org.apache.hadoop.io.serializer.WritableSerialization");
  // Don't ever forget this: Hadoop conf parameters can make or break a piece of code.

  Set<String> categories = Sets.newHashSet();
  for (String line : new FileLineIterable(new File(catFile))) {
    categories.add(line.trim().toLowerCase(Locale.ENGLISH));
  }

  Stringifier<Set<String>> setStringifier =
      new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));

  String categoriesStr = setStringifier.toString(categories);
  conf.set("wikipedia.categories", categoriesStr);

  Job job = new Job(conf);
  log.info("Input: {} Out: {} Categories: {}", input, output, catFile);
  job.setJarByClass(WikipediaDatasetCreatorDriver.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setMapperClass(WikipediaDatasetCreatorMapper.class);
  // TODO: job.setNumMapTasks(100);
  job.setInputFormatClass(XmlInputFormat.class);
  job.setReducerClass(WikipediaDatasetCreatorReducer.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  FileInputFormat.setInputPaths(job, new Path(input));
  Path outPath = new Path(output);
  FileOutputFormat.setOutputPath(job, outPath);
  HadoopUtil.delete(conf, outPath);

  boolean succeeded = job.waitForCompletion(true);
  if (!succeeded) {
    throw new IllegalStateException("Job failed!");
  }
}
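A hedged invocation sketch, not in the original source: the paths and the category file are placeholders, and WikipediaAnalyzer is assumed to be the analyzer typically paired with this driver in Mahout's examples.

// Hypothetical call of the dataset creator (all paths are placeholders)
WikipediaDatasetCreatorDriver.runJob(
    "wikipedia/chunks",    // input directory of Wikipedia XML chunks
    "wikipedia/dataset",   // output directory for the labeled dataset
    "categories.txt",      // one category per line
    false,                 // allow substring matches of the category string
    WikipediaAnalyzer.class);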
/**
 * Run the job
 *
 * @param input the input pathname String
 * @param output the output pathname String
 * @param catFile the file containing the Wikipedia categories
 * @param exactMatchOnly if true, then the Wikipedia category must match exactly instead of simply
 *     containing the category string
 * @param all if true, select all categories
 */
public static void runJob(String input, String output, String catFile,
                          boolean exactMatchOnly, boolean all)
    throws IOException, InterruptedException, ClassNotFoundException {
  Configuration conf = new Configuration();
  conf.set("xmlinput.start", "<page>");
  conf.set("xmlinput.end", "</page>");
  conf.setBoolean("exact.match.only", exactMatchOnly);
  conf.setBoolean("all.files", all);
  conf.set("io.serializations",
      "org.apache.hadoop.io.serializer.JavaSerialization,"
      + "org.apache.hadoop.io.serializer.WritableSerialization");

  Job job = new Job(conf);
  if (WikipediaToSequenceFile.log.isInfoEnabled()) {
    log.info("Input: " + input + " Out: " + output + " Categories: " + catFile
        + " All Files: " + all);
  }
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  FileInputFormat.setInputPaths(job, new Path(input));
  Path outPath = new Path(output);
  FileOutputFormat.setOutputPath(job, outPath);
  job.setMapperClass(WikipediaMapper.class);
  job.setInputFormatClass(XmlInputFormat.class);
  job.setReducerClass(Reducer.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setJarByClass(WikipediaToSequenceFile.class);

  /*
   * conf.set("mapred.compress.map.output", "true");
   * conf.set("mapred.map.output.compression.type", "BLOCK");
   * conf.set("mapred.output.compress", "true");
   * conf.set("mapred.output.compression.type", "BLOCK");
   * conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
   */
  HadoopUtil.overwriteOutput(outPath);

  Set<String> categories = new HashSet<String>();
  if (catFile.length() > 0) {
    for (String line : new FileLineIterable(new File(catFile))) {
      categories.add(line.trim().toLowerCase(Locale.ENGLISH));
    }
  }

  DefaultStringifier<Set<String>> setStringifier =
      new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));

  String categoriesStr = setStringifier.toString(categories);
  // set the categories on the job's own configuration: the Job copies the Configuration it was
  // created with, so mutating the original conf at this point would not reach the tasks
  job.getConfiguration().set("wikipedia.categories", categoriesStr);

  job.waitForCompletion(true);
}
/**
 * Create a partial tfidf vector using a chunk of features from the input vectors. The input
 * vectors have to be in the {@link SequenceFile} format.
 *
 * @param input input directory of the vectors in {@link SequenceFile} format
 * @param featureCount number of unique features in the dataset
 * @param vectorCount number of vectors in the dataset
 * @param minDf the minimum document frequency. Default 1
 * @param maxDF the max percentage of vectors for the DF. Can be used to remove really high
 *     frequency features. Expressed as an integer between 0 and 100. Default 99
 * @param dictionaryFilePath location of the chunk of features and the id's
 * @param output output directory where the partial vectors have to be created
 * @param sequentialAccess output vectors should be optimized for sequential access
 * @param namedVector output vectors should be named, retaining key (doc id) as a label
 */
private static void makePartialVectors(Path input,
                                       Configuration baseConf,
                                       Long featureCount,
                                       Long vectorCount,
                                       int minDf,
                                       long maxDF,
                                       Path dictionaryFilePath,
                                       Path output,
                                       boolean sequentialAccess,
                                       boolean namedVector)
    throws IOException, InterruptedException, ClassNotFoundException {
  Configuration conf = new Configuration(baseConf);
  // this conf parameter needs to be set to enable serialisation of conf values
  conf.set("io.serializations",
      "org.apache.hadoop.io.serializer.JavaSerialization,"
      + "org.apache.hadoop.io.serializer.WritableSerialization");
  conf.setLong(FEATURE_COUNT, featureCount);
  conf.setLong(VECTOR_COUNT, vectorCount);
  conf.setInt(MIN_DF, minDf);
  conf.setLong(MAX_DF, maxDF);
  conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
  conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVector);
  DistributedCache.addCacheFile(dictionaryFilePath.toUri(), conf);

  Job job = new Job(conf);
  job.setJobName(": MakePartialVectors: input-folder: " + input
      + ", dictionary-file: " + dictionaryFilePath.toString());
  job.setJarByClass(TFIDFConverter.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(VectorWritable.class);
  FileInputFormat.setInputPaths(job, input);
  FileOutputFormat.setOutputPath(job, output);

  job.setMapperClass(Mapper.class);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setReducerClass(TFIDFPartialVectorReducer.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  HadoopUtil.delete(conf, output);

  boolean succeeded = job.waitForCompletion(true);
  if (!succeeded) {
    throw new IllegalStateException("Job failed!");
  }
}
private static void createTFIDF(Configuration p_HadoopConf,
                                Path p_DocumentDir,
                                Path p_SequenceDir,
                                Path p_TokensDir,
                                Path p_TF,
                                String p_VecFolder,
                                Path p_tf_idf) throws Exception {
  log.info(new Date() + " - Creating TF-IDF vectors started.");
  HadoopUtil.delete(p_HadoopConf, p_SequenceDir);
  HadoopUtil.delete(p_HadoopConf, p_TokensDir);
  HadoopUtil.delete(p_HadoopConf, p_TF);

  createSequence(p_DocumentDir.toString(), p_SequenceDir.toString());
  createTokens(p_SequenceDir, p_TokensDir, s_HadoopConf);
  createTFVectors(p_TokensDir, p_TF, p_VecFolder);
  createTFIDFVectors(new Path(p_TF, p_VecFolder), p_tf_idf, s_HadoopConf);
  log.info("Creating TF-IDF vectors finished.");
}
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
  addInputOption();
  addOutputOption();
  addOption(MinhashOptionCreator.minClusterSizeOption().create());
  addOption(MinhashOptionCreator.minVectorSizeOption().create());
  addOption(MinhashOptionCreator.hashTypeOption().create());
  addOption(MinhashOptionCreator.numHashFunctionsOption().create());
  addOption(MinhashOptionCreator.keyGroupsOption().create());
  addOption(MinhashOptionCreator.numReducersOption().create());
  addOption(MinhashOptionCreator.debugOutputOption().create());
  addOption(DefaultOptionCreator.overwriteOption().create());

  if (parseArguments(args) == null) {
    return -1;
  }

  Path input = getInputPath();
  Path output = getOutputPath();
  if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
    HadoopUtil.delete(getConf(), output);
  }
  int minClusterSize = Integer.valueOf(getOption(MinhashOptionCreator.MIN_CLUSTER_SIZE));
  int minVectorSize = Integer.valueOf(getOption(MinhashOptionCreator.MIN_VECTOR_SIZE));
  String hashType = getOption(MinhashOptionCreator.HASH_TYPE);
  int numHashFunctions = Integer.valueOf(getOption(MinhashOptionCreator.NUM_HASH_FUNCTIONS));
  int keyGroups = Integer.valueOf(getOption(MinhashOptionCreator.KEY_GROUPS));
  int numReduceTasks = Integer.parseInt(getOption(MinhashOptionCreator.NUM_REDUCERS));
  boolean debugOutput = hasOption(MinhashOptionCreator.DEBUG_OUTPUT);

  runJob(input,
         output,
         minClusterSize,
         minVectorSize,
         hashType,
         numHashFunctions,
         keyGroups,
         numReduceTasks,
         debugOutput);
  return 0;
}
@Override
public int run(String[] args) throws Exception {
  String path = System.getProperty("user.dir");

  addInputOption();
  addOutputOption();
  addOption(ALPHA_I, "a", "smoothing parameter", String.valueOf(1.0f));
  addOption(buildOption(TRAIN_COMPLEMENTARY, "c", "train complementary?",
      false, false, String.valueOf(false)));
  addOption(LABEL_INDEX, "li", "The path to store the label index in", false);
  addOption(DefaultOptionCreator.overwriteOption().create());

  Path labPath = new Path(path + "/../out/labelindex/");
  long labelSize = createLabelIndex(labPath);
  float alphaI = 1.0F;
  boolean trainComplementary = true;

  HadoopUtil.setSerializations(getConf());
  HadoopUtil.cacheFiles(labPath, getConf());
  HadoopUtil.delete(getConf(), new Path("/tmp/summedObservations"));
  HadoopUtil.delete(getConf(), new Path("/tmp/weights"));
  HadoopUtil.delete(getConf(), new Path("/tmp/thetas"));

  // Add up all the vectors with the same labels, while mapping the labels into our index
  Job indexInstances = prepareJob(new Path(path + "/../out/training"),
      new Path("/tmp/summedObservations"),
      SequenceFileInputFormat.class,
      IndexInstancesMapper.class,
      IntWritable.class,
      VectorWritable.class,
      VectorSumReducer.class,
      IntWritable.class,
      VectorWritable.class,
      SequenceFileOutputFormat.class);
  indexInstances.setCombinerClass(VectorSumReducer.class);
  boolean succeeded = indexInstances.waitForCompletion(true);
  if (!succeeded) {
    return -1;
  }

  // Sum up all the weights from the previous step, per label and per feature
  Job weightSummer = prepareJob(new Path("/tmp/summedObservations"),
      new Path("/tmp/weights"),
      SequenceFileInputFormat.class,
      WeightsMapper.class,
      Text.class,
      VectorWritable.class,
      VectorSumReducer.class,
      Text.class,
      VectorWritable.class,
      SequenceFileOutputFormat.class);
  weightSummer.getConfiguration().set(WeightsMapper.NUM_LABELS, String.valueOf(labelSize));
  weightSummer.setCombinerClass(VectorSumReducer.class);
  succeeded = weightSummer.waitForCompletion(true);
  if (!succeeded) {
    return -1;
  }

  // Put the per label and per feature vectors into the cache
  HadoopUtil.cacheFiles(new Path("/tmp/weights"), getConf());

  if (trainComplementary) {
    // Calculate the per label theta normalizers, write out to LABEL_THETA_NORMALIZER vector;
    // see http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf - Section 3.2, Weight Magnitude Errors
    Job thetaSummer = prepareJob(new Path("/tmp/summedObservations"),
        new Path("/tmp/thetas"),
        SequenceFileInputFormat.class,
        ThetaMapper.class,
        Text.class,
        VectorWritable.class,
        VectorSumReducer.class,
        Text.class,
        VectorWritable.class,
        SequenceFileOutputFormat.class);
    thetaSummer.setCombinerClass(VectorSumReducer.class);
    thetaSummer.getConfiguration().setFloat(ThetaMapper.ALPHA_I, alphaI);
    thetaSummer.getConfiguration().setBoolean(ThetaMapper.TRAIN_COMPLEMENTARY, trainComplementary);
    succeeded = thetaSummer.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }
  }

  // Put the per label theta normalizers into the cache
  HadoopUtil.cacheFiles(new Path("/tmp/thetas"), getConf());

  // Validate our model and then write it out to the official output
  getConf().setFloat(ThetaMapper.ALPHA_I, alphaI);
  getConf().setBoolean(NaiveBayesModel.COMPLEMENTARY_MODEL, trainComplementary);
  NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(new Path("/tmp/"), getConf());
  naiveBayesModel.validate();
  naiveBayesModel.serialize(new Path(path + "/../out/model"), getConf());

  return 0;
}
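A hedged follow-up sketch, not in the original: it assumes Mahout's NaiveBayesModel.materialize and StandardNaiveBayesClassifier are available, and that the model was serialized to the same path used above; someTfIdfVector is a placeholder for a document vector produced by the tf-idf pipeline.

// Hypothetical classification step after training
Configuration conf = new Configuration();
NaiveBayesModel model =
    NaiveBayesModel.materialize(new Path(path + "/../out/model"), conf);
StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);
Vector scores = classifier.classifyFull(someTfIdfVector);  // placeholder input vector
int bestLabel = scores.maxValueIndex();                    // index into the label dictionary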
@Override
public int run(String[] args) throws Exception {
  addInputOption();
  addOutputOption();
  addOption("numberOfColumns", "r", "Number of columns in the input matrix", false);
  addOption("similarityClassname", "s",
      "Name of distributed similarity class to instantiate, alternatively use "
      + "one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')');
  addOption("maxSimilaritiesPerRow", "m",
      "Number of maximum similarities per row (default: " + DEFAULT_MAX_SIMILARITIES_PER_ROW + ')',
      String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW));
  addOption("excludeSelfSimilarity", "ess", "compute similarity of rows to themselves?",
      String.valueOf(false));
  addOption("threshold", "tr", "discard row pairs with a similarity value below this", false);
  addOption(DefaultOptionCreator.overwriteOption().create());

  Map<String, List<String>> parsedArgs = parseArguments(args);
  if (parsedArgs == null) {
    return -1;
  }

  int numberOfColumns;
  if (hasOption("numberOfColumns")) {
    // Number of columns explicitly specified via CLI
    numberOfColumns = Integer.parseInt(getOption("numberOfColumns"));
  } else {
    // else get the number of columns by determining the cardinality of a vector in the input matrix
    numberOfColumns = getDimensions(getInputPath());
  }

  String similarityClassnameArg = getOption("similarityClassname");
  String similarityClassname;
  try {
    similarityClassname = VectorSimilarityMeasures.valueOf(similarityClassnameArg).getClassname();
  } catch (IllegalArgumentException iae) {
    similarityClassname = similarityClassnameArg;
  }

  // Clear the output and temp paths if the overwrite option has been set
  if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
    // Clear the temp path
    HadoopUtil.delete(getConf(), getTempPath());
    // Clear the output path
    HadoopUtil.delete(getConf(), getOutputPath());
  }

  int maxSimilaritiesPerRow = Integer.parseInt(getOption("maxSimilaritiesPerRow"));
  boolean excludeSelfSimilarity = Boolean.parseBoolean(getOption("excludeSelfSimilarity"));
  double threshold = hasOption("threshold")
      ? Double.parseDouble(getOption("threshold"))
      : NO_THRESHOLD;

  Path weightsPath = getTempPath("weights");
  Path normsPath = getTempPath("norms.bin");
  Path numNonZeroEntriesPath = getTempPath("numNonZeroEntries.bin");
  Path maxValuesPath = getTempPath("maxValues.bin");
  Path pairwiseSimilarityPath = getTempPath("pairwiseSimilarity");

  AtomicInteger currentPhase = new AtomicInteger();

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    Job normsAndTranspose = prepareJob(getInputPath(),
        weightsPath,
        VectorNormMapper.class,
        IntWritable.class,
        VectorWritable.class,
        MergeVectorsReducer.class,
        IntWritable.class,
        VectorWritable.class);
    normsAndTranspose.setCombinerClass(MergeVectorsCombiner.class);
    Configuration normsAndTransposeConf = normsAndTranspose.getConfiguration();
    normsAndTransposeConf.set(THRESHOLD, String.valueOf(threshold));
    normsAndTransposeConf.set(NORMS_PATH, normsPath.toString());
    normsAndTransposeConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
    normsAndTransposeConf.set(MAXVALUES_PATH, maxValuesPath.toString());
    normsAndTransposeConf.set(SIMILARITY_CLASSNAME, similarityClassname);
    boolean succeeded = normsAndTranspose.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }
  }

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    Job pairwiseSimilarity = prepareJob(weightsPath,
        pairwiseSimilarityPath,
        CooccurrencesMapper.class,
        IntWritable.class,
        VectorWritable.class,
        SimilarityReducer.class,
        IntWritable.class,
        VectorWritable.class);
    pairwiseSimilarity.setCombinerClass(VectorSumReducer.class);
    Configuration pairwiseConf = pairwiseSimilarity.getConfiguration();
    pairwiseConf.set(THRESHOLD, String.valueOf(threshold));
    pairwiseConf.set(NORMS_PATH, normsPath.toString());
    pairwiseConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
    pairwiseConf.set(MAXVALUES_PATH, maxValuesPath.toString());
    pairwiseConf.set(SIMILARITY_CLASSNAME, similarityClassname);
    pairwiseConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns);
    pairwiseConf.setBoolean(EXCLUDE_SELF_SIMILARITY, excludeSelfSimilarity);
    boolean succeeded = pairwiseSimilarity.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }
  }

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    Job asMatrix = prepareJob(pairwiseSimilarityPath,
        getOutputPath(),
        UnsymmetrifyMapper.class,
        IntWritable.class,
        VectorWritable.class,
        MergeToTopKSimilaritiesReducer.class,
        IntWritable.class,
        VectorWritable.class);
    asMatrix.setCombinerClass(MergeToTopKSimilaritiesReducer.class);
    asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);
    boolean succeeded = asMatrix.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }
  }

  return 0;
}