public int run(String[] args) throws Exception {
  if (args.length != 5) {
    printUsage();
    return -1;
  }

  String inputPath = args[0];
  String outputPath = args[1];
  int mapTasks = Integer.parseInt(args[2]);
  int reduceTasks = Integer.parseInt(args[3]);
  String stoplistPath = args[4];

  sLogger.info("Tool: AFormatter");
  sLogger.info(" - input path: " + inputPath);
  sLogger.info(" - output path: " + outputPath);
  sLogger.info(" - number of mappers: " + mapTasks);
  sLogger.info(" - number of reducers: " + reduceTasks);

  JobConf conf = new JobConf(AFormatterWG.class);
  conf.setJobName("Authority Formatter -- Web Graph");

  conf.setNumMapTasks(mapTasks);
  conf.setNumReduceTasks(reduceTasks);

  FileInputFormat.setInputPaths(conf, new Path(inputPath));
  FileOutputFormat.setOutputPath(conf, new Path(outputPath));
  FileOutputFormat.setCompressOutput(conf, false);

  // conf.setInputFormat(SequenceFileInputFormat.class);
  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputValueClass(HITSNode.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);
  conf.setCompressMapOutput(true);
  conf.setSpeculativeExecution(false);

  // InputSampler.Sampler<IntWritable, Text> sampler = new
  // InputSampler.RandomSampler<IntWritable, Text>(0.1, 10, 10);
  // InputSampler.writePartitionFile(conf, sampler);
  // conf.setPartitionerClass(TotalOrderPartitioner.class);

  conf.setMapperClass(AFormatMapperIMC.class);
  conf.setCombinerClass(AFormatReducer.class);
  conf.setReducerClass(AFormatReducer.class);

  // Delete the output directory if it exists already
  Path outputDir = new Path(outputPath);
  Path stopList = new Path(stoplistPath);
  FileSystem.get(conf).delete(outputDir, true);

  long startTime = System.currentTimeMillis();
  sLogger.info("Starting job");
  DistributedCache.addCacheFile(stopList.toUri(), conf);
  JobClient.runJob(conf);
  sLogger.info("Job Finished in "
      + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
  return 0;
}
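// The stoplist above is only registered with DistributedCache, so the mapper
// must load it back on the task side. The class below is a hedged, hypothetical
// sketch of how a mapper such as AFormatMapperIMC could do that in configure();
// the class name, field, and parsing are assumptions, while
// DistributedCache.getLocalCacheFiles() is part of the old mapred API.
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;

public class StoplistLoadingMapperBase extends MapReduceBase {
  private final Set<String> stopwords = new HashSet<String>();

  @Override
  public void configure(JobConf conf) {
    try {
      // Files added with DistributedCache.addCacheFile() in the driver show up
      // on each task node as local paths.
      Path[] cached = DistributedCache.getLocalCacheFiles(conf);
      if (cached != null && cached.length > 0) {
        BufferedReader in = new BufferedReader(new FileReader(cached[0].toString()));
        try {
          String line;
          while ((line = in.readLine()) != null) {
            stopwords.add(line.trim());  // assumes one stopword per line
          }
        } finally {
          in.close();
        }
      }
    } catch (IOException e) {
      throw new RuntimeException("Failed to load stoplist from DistributedCache", e);
    }
  }
}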
public JobBuilder speculativeExecution(boolean enabled) throws IOException {
  _jobConf.setSpeculativeExecution(enabled);
  return this;
}
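// Hedged usage sketch for the fluent setter above. Returning `this` lets
// configuration calls be chained; everything other than
// speculativeExecution(boolean) (the builder's constructor, the WordCount
// placeholder class, and build()) is hypothetical and shown only to
// illustrate the chaining pattern.
JobConf conf = new JobBuilder(WordCount.class)  // hypothetical constructor
    .speculativeExecution(false)                // setter from the snippet above
    .build();                                   // hypothetical terminal method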
@Override
public int run(String[] args) throws IOException {
  OptionParser p = new OptionParser();
  OptionSpec<String> maxwiOpt = p
      .accepts(maxwiOptName, "location of maxWi map file (HDFS) REQUIRED")
      .withRequiredArg()
      .ofType(String.class);
  OptionSpec<Float> thresholdOpt = p
      .accepts(thresholdOptName, "similarity threshold")
      .withRequiredArg()
      .ofType(Float.class)
      .defaultsTo(DEFAULT_THRESHOLD);
  OptionSpec<Integer> stripesOpt = p
      .accepts(stripesOptName, "number of stripes to divide the similarity matrix")
      .withRequiredArg()
      .ofType(Integer.class)
      .defaultsTo(1);
  OptionSpec<Integer> spreadOpt = p
      .accepts(spreadOptName, "number of reducers per stripe")
      .withRequiredArg()
      .ofType(Integer.class)
      .defaultsTo(DEFAULT_SPREAD);
  OptionSpec<Integer> factorOpt = p
      .accepts(factorOptName, "number of mappers per reducer")
      .withRequiredArg()
      .ofType(Integer.class)
      .defaultsTo(DEFAULT_FACTOR);
  OptionSpec<Integer> maxVectorIDOpt = p
      .accepts(maxVectorIDOptName, "maximum vector ID")
      .withRequiredArg()
      .ofType(Integer.class);
  p.acceptsAll(Arrays.asList("h", "?"), "show help");

  OptionSet options = parseOptions(p, args);

  // to distinguish indexes built in successive runs
  DateFormat df = new SimpleDateFormat("yyyyMMdd-HHmmss");
  Date date = new Date();

  float threshold = options.valueOf(thresholdOpt); // threshold
  if (threshold < 0 || threshold >= 1) {
    System.err.println(thresholdOptName + " should be between 0 and 1");
    System.exit(1);
  }

  int numStripes = options.valueOf(stripesOpt); // number of stripes
  if (numStripes < 1) {
    System.err.println(stripesOptName + " should be > 0");
    System.exit(1);
  }

  // MapReduce parameters
  int spread = options.valueOf(spreadOpt); // how many reducers per stripe
  if (spread < 1) {
    System.err.println(spreadOptName + " should be > 0");
    System.exit(1);
  }

  int factor = options.valueOf(factorOpt); // how many mappers per reducer
  if (factor < 1) {
    System.err.println(factorOptName + " should be > 0");
    System.exit(1);
  }

  int maxKey = 0;
  if (options.has(maxVectorIDOpt)) {
    maxKey = options.valueOf(maxVectorIDOpt); // maximum value of the vector ID
    if (maxKey < 1) {
      System.err.println(maxVectorIDOptName + " should be > 0");
      System.exit(1);
    }
  }

  int numReducers = GenericKey.StripePartitioner.numReducers(numStripes, spread);
  int numMappers = numReducers * factor;
  int numBuckets = numMappers;

  // pick the file with max weights from command line
  String maxWiDir = options.valueOf(maxwiOpt);
  List<String> nonOptArgs = options.nonOptionArguments();

  LOG.info("Threshold set to " + threshold);
  LOG.info(String.format(
      "Buckets: %1$-10s Factor: %2$-10s Stripes: %3$-10s Spread: %4$-10s Reducers: %5$-10s",
      numBuckets, factor, numStripes, spread, numReducers));

  // start building the jobs
  JobConf conf1 = new JobConf(getConf(), Similarity.class);
  conf1.setFloat(PARAM_APS_THRESHOLD, threshold);
  conf1.setInt(PARAM_APS_STRIPES, numStripes);
  DistributedCache.addCacheFile(URI.create(maxWiDir), conf1);

  Path inputPath = new Path(nonOptArgs.get(0));
  Path indexPath = new Path(nonOptArgs.get(0) + "-index-" + threshold
      + "-s" + numStripes + "_" + df.format(date));
  // index filtering pruned nested directory
  Path indexOnlyPath = new Path(indexPath, "part*");
  Path outputPath = new Path(nonOptArgs.get(1) + "-" + threshold + "-s" + numStripes);

  FileInputFormat.setInputPaths(conf1, inputPath);
  FileOutputFormat.setOutputPath(conf1, indexPath);
  conf1.setInputFormat(SequenceFileInputFormat.class);
  conf1.setOutputFormat(SequenceFileOutputFormat.class);
  conf1.setMapOutputKeyClass(LongWritable.class);
  conf1.setMapOutputValueClass(IndexItem.class);
  conf1.setOutputKeyClass(LongWritable.class);
  conf1.setOutputValueClass(IndexItemArrayWritable.class);
  conf1.setMapperClass(IndexerMapper.class);
  conf1.setReducerClass(IndexerReducer.class);

  // assuming input is sorted according to the key (vectorID) so that the
  // part files are locally sorted
  MultipleOutputs.addNamedOutput(conf1, PRUNED, SequenceFileOutputFormat.class,
      IntWritable.class, VectorComponentArrayWritable.class);

  // remove the stuff we added from the job name
  conf1.set("mapred.job.name",
      "APS-" + indexPath.getName().substring(0, indexPath.getName().length() - 16));
  conf1.setNumTasksToExecutePerJvm(-1); // JVM reuse
  conf1.setSpeculativeExecution(false);
  conf1.setCompressMapOutput(true);
  // hash the posting lists in different buckets to distribute the load
  conf1.setNumReduceTasks(numBuckets);

  RunningJob job1 = JobClient.runJob(conf1);

  // part 2
  JobConf conf2 = new JobConf(getConf(), Similarity.class);
  if (numStripes > 0) {
    FileUtils.mergeRestFile(conf2, indexPath, PRUNED, INDEX_INTERVAL);
  }

  MultipleInputs.addInputPath(conf2, indexOnlyPath,
      SequenceFileInputFormat.class, SimilarityMapperIndex.class);
  MultipleInputs.addInputPath(conf2, inputPath,
      SequenceFileInputFormat.class, SimilarityMapperInput.class);
  FileOutputFormat.setOutputPath(conf2, outputPath);

  conf2.setCombinerClass(SimilarityCombiner.class);
  conf2.setReducerClass(SimilarityReducer.class);
  conf2.setPartitionerClass(GenericKey.StripePartitioner.class);
  conf2.setOutputKeyComparatorClass(GenericKey.Comparator.class);
  conf2.setOutputValueGroupingComparator(GenericKey.PrimaryComparator.class);
  conf2.setMapOutputKeyClass(GenericKey.class);
  conf2.setMapOutputValueClass(GenericValue.class);
  conf2.setOutputKeyClass(VectorPair.class);
  conf2.setOutputValueClass(NullWritable.class);

  Counter numDocs = job1.getCounters()
      .findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS");
  maxKey = maxKey > 0 ? maxKey : (int) numDocs.getValue();
  LOG.info("Setting max key value in input to " + maxKey);

  conf2.setInt(PARAM_APS_MAXKEY, maxKey);
  conf2.setInt(PARAM_APS_STRIPES, numStripes);
  conf2.setFloat(PARAM_APS_THRESHOLD, threshold);
  conf2.setInt(PARAM_APS_REDUCER_PER_STRIPE, spread);
  conf2.set("mapred.job.name", "APS-" + outputPath.getName());
  conf2.setNumTasksToExecutePerJvm(-1); // JVM reuse
  conf2.setSpeculativeExecution(false);
  conf2.setCompressMapOutput(true);
  conf2.setNumReduceTasks(numReducers);

  JobClient.runJob(conf2);
  return 0;
}
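// Hedged sketch of a driver entry point for the two-job pipeline above,
// assuming run() lives in the Similarity class and that Similarity implements
// org.apache.hadoop.util.Tool (it already uses getConf() and overrides run()).
public static void main(String[] args) throws Exception {
  // ToolRunner parses generic Hadoop options and hands the rest to run()
  int exitCode = ToolRunner.run(new Configuration(), new Similarity(), args);
  System.exit(exitCode);
}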
/**
 * Run a map/reduce job for estimating Pi.
 *
 * @return the estimated value of Pi
 */
public static BigDecimal estimate(int numMaps, long numPoints, JobConf jobConf)
    throws IOException {
  // setup job conf
  jobConf.setJobName(PiEstimator.class.getSimpleName());

  jobConf.setInputFormat(SequenceFileInputFormat.class);

  jobConf.setOutputKeyClass(BooleanWritable.class);
  jobConf.setOutputValueClass(LongWritable.class);
  jobConf.setOutputFormat(SequenceFileOutputFormat.class);

  jobConf.setMapperClass(PiMapper.class);
  jobConf.setNumMapTasks(numMaps);

  jobConf.setReducerClass(PiReducer.class);
  jobConf.setNumReduceTasks(1);

  // turn off speculative execution, because DFS doesn't handle
  // multiple writers to the same file.
  jobConf.setSpeculativeExecution(false);

  // setup input/output directories
  final Path inDir = new Path(TMP_DIR, "in");
  final Path outDir = new Path(TMP_DIR, "out");
  FileInputFormat.setInputPaths(jobConf, inDir);
  FileOutputFormat.setOutputPath(jobConf, outDir);

  final FileSystem fs = FileSystem.get(jobConf);
  if (fs.exists(TMP_DIR)) {
    throw new IOException("Tmp directory " + fs.makeQualified(TMP_DIR)
        + " already exists. Please remove it first.");
  }
  if (!fs.mkdirs(inDir)) {
    throw new IOException("Cannot create input directory " + inDir);
  }

  try {
    // generate an input file for each map task
    for (int i = 0; i < numMaps; ++i) {
      final Path file = new Path(inDir, "part" + i);
      final LongWritable offset = new LongWritable(i * numPoints);
      final LongWritable size = new LongWritable(numPoints);
      final SequenceFile.Writer writer = SequenceFile.createWriter(
          fs, jobConf, file, LongWritable.class, LongWritable.class, CompressionType.NONE);
      try {
        writer.append(offset, size);
      } finally {
        writer.close();
      }
      System.out.println("Wrote input for Map #" + i);
    }

    // start a map/reduce job
    System.out.println("Starting Job");
    final long startTime = System.currentTimeMillis();
    JobClient.runJob(jobConf);
    final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
    System.out.println("Job Finished in " + duration + " seconds");

    // read outputs
    Path inFile = new Path(outDir, "reduce-out");
    LongWritable numInside = new LongWritable();
    LongWritable numOutside = new LongWritable();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, jobConf);
    try {
      reader.next(numInside, numOutside);
    } finally {
      reader.close();
    }

    // compute estimated value
    return BigDecimal.valueOf(4)
        .setScale(20)
        .multiply(BigDecimal.valueOf(numInside.get()))
        .divide(BigDecimal.valueOf(numMaps))
        .divide(BigDecimal.valueOf(numPoints));
  } finally {
    fs.delete(TMP_DIR, true);
  }
}
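// Hedged usage sketch for estimate() above; the map and point counts are
// illustrative only, and it assumes this method sits in the PiEstimator class
// so that TMP_DIR, PiMapper, and PiReducer resolve.
public static void main(String[] args) throws IOException {
  final int numMaps = 10;           // one input file / map task each
  final long numPoints = 1000000L;  // samples generated per map
  final JobConf jobConf = new JobConf(PiEstimator.class);
  System.out.println("Estimated value of Pi is "
      + estimate(numMaps, numPoints, jobConf));
}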