@Override
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.printf(
        "Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
    ToolRunner.printGenericCommandUsage(System.err);
    return -1;
  }

  JobConf conf = new JobConf(getConf(), getClass());
  conf.setJobName("Max temperature");

  FileInputFormat.addInputPath(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(IntWritable.class);

  conf.setMapperClass(MaxTemperatureMapper.class);
  conf.setCombinerClass(MaxTemperatureReducer.class);
  conf.setReducerClass(MaxTemperatureReducer.class);

  // vv MaxTemperatureDriverV6
  conf.setProfileEnabled(true);
  conf.setProfileParams(
      "-agentlib:hprof=cpu=samples,heap=sites,depth=6,"
          + "force=n,thread=y,verbose=n,file=%s");
  conf.setProfileTaskRange(true, "0-2");
  // ^^ MaxTemperatureDriverV6

  JobClient.runJob(conf);
  return 0;
}
public void testComplexNameWithRegex() throws Exception {
  OutputStream os = getFileSystem().create(new Path(getInputDir(), "text.txt"));
  Writer wr = new OutputStreamWriter(os);
  wr.write("b a\n");
  wr.close();

  JobConf conf = createJobConf();
  conf.setJobName("name \\Evalue]");

  conf.setInputFormat(TextInputFormat.class);

  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);

  conf.setMapperClass(IdentityMapper.class);

  FileInputFormat.setInputPaths(conf, getInputDir());
  FileOutputFormat.setOutputPath(conf, getOutputDir());

  JobClient.runJob(conf);

  Path[] outputFiles =
      FileUtil.stat2Paths(getFileSystem().listStatus(getOutputDir(), new OutputLogFilter()));
  assertEquals(1, outputFiles.length);

  InputStream is = getFileSystem().open(outputFiles[0]);
  BufferedReader reader = new BufferedReader(new InputStreamReader(is));
  assertEquals("0\tb a", reader.readLine());
  assertNull(reader.readLine());
  reader.close();
}
@Override
public int run(String[] args) throws Exception {
  JobConf conf = JobBuilder.parseInputAndOutput(this, getConf(), args);
  if (conf == null) {
    return -1;
  }

  conf.setInputFormat(SequenceFileInputFormat.class);
  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);
  SequenceFileOutputFormat.setCompressOutput(conf, true);
  SequenceFileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
  SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK);

  conf.setPartitionerClass(TotalOrderPartitioner.class);

  InputSampler.Sampler<IntWritable, Text> sampler =
      new InputSampler.RandomSampler<IntWritable, Text>(0.1, 10000, 10);

  Path input = FileInputFormat.getInputPaths(conf)[0];
  input = input.makeQualified(input.getFileSystem(conf));

  Path partitionFile = new Path(input, "_partitions");
  TotalOrderPartitioner.setPartitionFile(conf, partitionFile);
  InputSampler.writePartitionFile(conf, sampler);

  // Add to DistributedCache
  URI partitionUri = new URI(partitionFile.toString() + "#_partitions");
  DistributedCache.addCacheFile(partitionUri, conf);
  DistributedCache.createSymlink(conf);

  JobClient.runJob(conf);
  return 0;
}
public static void runJob(String[] args) {
  JobConf conf = new JobConf(CassandraBulkLoader.class);

  if (args.length >= 4) {
    conf.setNumReduceTasks(Integer.parseInt(args[3]));
  }

  try {
    // We store the cassandra storage-conf.xml on the HDFS cluster
    DistributedCache.addCacheFile(new URI("/cassandra/storage-conf.xml#storage-conf.xml"), conf);
  } catch (URISyntaxException e) {
    throw new RuntimeException(e);
  }
  conf.setInputFormat(KeyValueTextInputFormat.class);
  conf.setJobName("CassandraBulkLoader_v2");
  conf.setMapperClass(Map.class);
  conf.setReducerClass(Reduce.class);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(conf, new Path(args[1]));
  FileOutputFormat.setOutputPath(conf, new Path(args[2]));
  try {
    JobClient.runJob(conf);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
private String runJob() throws Exception {
  OutputStream os = getFileSystem().create(new Path(getInputDir(), "text.txt"));
  Writer wr = new OutputStreamWriter(os);
  wr.write("hello1\n");
  wr.write("hello2\n");
  wr.write("hello3\n");
  wr.close();

  JobConf conf = createJobConf();
  conf.setJobName("mr");
  conf.setJobPriority(JobPriority.HIGH);

  conf.setInputFormat(TextInputFormat.class);

  conf.setMapOutputKeyClass(LongWritable.class);
  conf.setMapOutputValueClass(Text.class);

  conf.setOutputFormat(TextOutputFormat.class);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);

  conf.setMapperClass(org.apache.hadoop.mapred.lib.IdentityMapper.class);
  conf.setReducerClass(org.apache.hadoop.mapred.lib.IdentityReducer.class);

  FileInputFormat.setInputPaths(conf, getInputDir());
  FileOutputFormat.setOutputPath(conf, getOutputDir());

  return JobClient.runJob(conf).getID().toString();
}
public void inject(Path crawlDb, Path urlDir) throws IOException {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  if (LOG.isInfoEnabled()) {
    LOG.info("Injector: starting at " + sdf.format(start));
    LOG.info("Injector: crawlDb: " + crawlDb);
    LOG.info("Injector: urlDir: " + urlDir);
  }

  Path tempDir =
      new Path(
          getConf().get("mapred.temp.dir", ".")
              + "/inject-temp-"
              + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  // map text input file to a <url,CrawlDatum> file
  if (LOG.isInfoEnabled()) {
    LOG.info("Injector: Converting injected urls to crawl db entries.");
  }
  JobConf sortJob = new NutchJob(getConf());
  sortJob.setJobName("inject " + urlDir);
  FileInputFormat.addInputPath(sortJob, urlDir);
  sortJob.setMapperClass(InjectMapper.class);

  FileOutputFormat.setOutputPath(sortJob, tempDir);
  sortJob.setOutputFormat(SequenceFileOutputFormat.class);
  sortJob.setOutputKeyClass(Text.class);
  sortJob.setOutputValueClass(CrawlDatum.class);
  sortJob.setLong("injector.current.time", System.currentTimeMillis());
  RunningJob mapJob = JobClient.runJob(sortJob);

  long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue();
  long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue();
  LOG.info("Injector: total number of urls rejected by filters: " + urlsFiltered);
  LOG.info(
      "Injector: total number of urls injected after normalization and filtering: "
          + urlsInjected);

  // merge with existing crawl db
  if (LOG.isInfoEnabled()) {
    LOG.info("Injector: Merging injected urls into crawl db.");
  }
  JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
  FileInputFormat.addInputPath(mergeJob, tempDir);
  mergeJob.setReducerClass(InjectReducer.class);
  JobClient.runJob(mergeJob);
  CrawlDb.install(mergeJob, crawlDb);

  // clean up
  FileSystem fs = FileSystem.get(getConf());
  fs.delete(tempDir, true);

  long end = System.currentTimeMillis();
  LOG.info(
      "Injector: finished at "
          + sdf.format(end)
          + ", elapsed: "
          + TimingUtil.elapsedTime(start, end));
}
public static void main(String[] args) throws IOException {
  /*JobConf conf = new JobConf();
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(Text.class);

  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  conf.setMapperClass(IpCounterMapper.class);
  conf.setCombinerClass(IpCounterReducer.class);
  conf.setReducerClass(IpCounterReducer.class);

  String inputDir = args[0];
  String outputDir = args[1];

  FileInputFormat.setInputPaths(conf, inputDir);
  FileOutputFormat.setOutputPath(conf, new Path(outputDir));
  boolean flag = JobClient.runJob(conf).isSuccessful();

  System.out.println(args.length);*/

  if (args.length < 2) {
    System.out.println("args not right!");
    return;
  }

  JobConf conf = new JobConf(IpCount1.class);

  // set output key class
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(Text.class);

  // set mapper & reducer class
  conf.setMapperClass(IpCounterMapper.class);
  conf.setCombinerClass(IpCounterReducer.class);
  conf.setReducerClass(IpCounterReducer.class);

  // set format
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  String inputDir = args[0];
  String outputDir = args[1];

  // FileInputFormat.setInputPaths(conf, "/user/hadoop/rongxin/locationinput/");
  FileInputFormat.setInputPaths(conf, inputDir);
  FileOutputFormat.setOutputPath(conf, new Path(outputDir));

  boolean flag = JobClient.runJob(conf).isSuccessful();
}
/** Configure a job given argv. */
public static boolean parseArgs(String[] argv, JobConf job) throws IOException {
  if (argv.length < 1) {
    return 0 == printUsage();
  }
  for (int i = 0; i < argv.length; ++i) {
    if (argv.length == i + 1) {
      System.out.println("ERROR: Required parameter missing from " + argv[i]);
      return 0 == printUsage();
    }
    try {
      if ("-m".equals(argv[i])) {
        job.setNumMapTasks(Integer.parseInt(argv[++i]));
      } else if ("-r".equals(argv[i])) {
        job.setNumReduceTasks(Integer.parseInt(argv[++i]));
      } else if ("-inFormat".equals(argv[i])) {
        job.setInputFormat(Class.forName(argv[++i]).asSubclass(InputFormat.class));
      } else if ("-outFormat".equals(argv[i])) {
        job.setOutputFormat(Class.forName(argv[++i]).asSubclass(OutputFormat.class));
      } else if ("-outKey".equals(argv[i])) {
        job.setOutputKeyClass(Class.forName(argv[++i]).asSubclass(WritableComparable.class));
      } else if ("-outValue".equals(argv[i])) {
        job.setOutputValueClass(Class.forName(argv[++i]).asSubclass(Writable.class));
      } else if ("-keepmap".equals(argv[i])) {
        job.set(
            org.apache.hadoop.mapreduce.GenericMRLoadGenerator.MAP_PRESERVE_PERCENT, argv[++i]);
      } else if ("-keepred".equals(argv[i])) {
        job.set(
            org.apache.hadoop.mapreduce.GenericMRLoadGenerator.REDUCE_PRESERVE_PERCENT,
            argv[++i]);
      } else if ("-outdir".equals(argv[i])) {
        FileOutputFormat.setOutputPath(job, new Path(argv[++i]));
      } else if ("-indir".equals(argv[i])) {
        FileInputFormat.addInputPaths(job, argv[++i]);
      } else if ("-inFormatIndirect".equals(argv[i])) {
        job.setClass(
            org.apache.hadoop.mapreduce.GenericMRLoadGenerator.INDIRECT_INPUT_FORMAT,
            Class.forName(argv[++i]).asSubclass(InputFormat.class),
            InputFormat.class);
        job.setInputFormat(IndirectInputFormat.class);
      } else {
        System.out.println("Unexpected argument: " + argv[i]);
        return 0 == printUsage();
      }
    } catch (NumberFormatException except) {
      System.out.println("ERROR: Integer expected instead of " + argv[i]);
      return 0 == printUsage();
    } catch (Exception e) {
      throw (IOException) new IOException().initCause(e);
    }
  }
  return true;
}
// Job configuration
private static JobConf createJobConf(Configuration conf) {
  JobConf jobconf = new JobConf(conf, DistCp.class);
  jobconf.setJobName(NAME);

  // turn off speculative execution, because DFS doesn't handle
  // multiple writers to the same file.
  jobconf.setMapSpeculativeExecution(false);

  jobconf.setInputFormat(CopyInputFormat.class);
  jobconf.setOutputKeyClass(Text.class);
  jobconf.setOutputValueClass(Text.class);

  jobconf.setMapperClass(CopyFilesMapper.class);
  jobconf.setNumReduceTasks(0);
  return jobconf;
}
private static void runIOTest(Class<? extends Mapper> mapperClass, Path outputDir)
    throws IOException {
  JobConf job = new JobConf(fsConfig, DFSCIOTest.class);

  FileInputFormat.setInputPaths(job, CONTROL_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(mapperClass);
  job.setReducerClass(AccumulatingReducer.class);

  FileOutputFormat.setOutputPath(job, outputDir);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
public static void main(String[] args) throws Exception {
  JobConf conf = new JobConf(Add1.class);
  conf.setJobName("sumar1");

  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputValueClass(IntWritable.class);

  conf.setMapperClass(Map.class);

  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  FileInputFormat.setInputPaths(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  JobClient.runJob(conf);
}
// Main function
public static void main(String[] args) throws Exception {
  JobConf conf = new JobConf(ProcessUnits.class);
  conf.setJobName("max_eletricityunits");

  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(IntWritable.class);

  conf.setMapperClass(EE_Mapper.class);
  conf.setCombinerClass(EE_Reducer.class);
  conf.setReducerClass(EE_Reducer.class);

  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  FileInputFormat.setInputPaths(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  JobClient.runJob(conf);
}
@Override
public int run(String[] args) throws IOException {
  JobConf conf = JobBuilder.parseInputAndOutput(this, getConf(), args);
  if (conf == null) {
    return -1;
  }

  conf.setMapperClass(MaxTemperatureMapper.class);
  /*[*/ conf.setPartitionerClass(FirstPartitioner.class); /*]*/
  /*[*/ conf.setOutputKeyComparatorClass(KeyComparator.class); /*]*/
  /*[*/ conf.setOutputValueGroupingComparator(GroupComparator.class); /*]*/
  conf.setReducerClass(MaxTemperatureReducer.class);
  conf.setOutputKeyClass(IntPair.class);
  conf.setOutputValueClass(NullWritable.class);

  JobClient.runJob(conf);
  return 0;
}
@Override
public int run(String[] args) throws IOException {
  JobConf conf = JobBuilder.parseInputAndOutput(this, getConf(), args);
  if (conf == null) {
    return -1;
  }

  conf.setMapperClass(StationMapper.class);
  conf.setMapOutputKeyClass(Text.class);
  conf.setReducerClass(MultipleOutputsReducer.class);
  conf.setOutputKeyClass(NullWritable.class);
  conf.setOutputFormat(NullOutputFormat.class); // suppress empty part file

  MultipleOutputs.addMultiNamedOutput(
      conf, "station", TextOutputFormat.class, NullWritable.class, Text.class);

  JobClient.runJob(conf);
  return 0;
}
public static void seekTest(FileSystem fs, boolean fastCheck) throws Exception {
  fs.delete(READ_DIR, true);

  JobConf job = new JobConf(conf, TestFileSystem.class);
  job.setBoolean("fs.test.fastCheck", fastCheck);

  FileInputFormat.setInputPaths(job, CONTROL_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setMapperClass(SeekMapper.class);
  job.setReducerClass(LongSumReducer.class);

  FileOutputFormat.setOutputPath(job, READ_DIR);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LongWritable.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
public int run(String[] args) throws Exception {
  if (args.length < 1) {
    args = new String[] {DateStringUtils.now()};
    System.out.println(
        "ERROR: Please enter a date, e.g. 20101010! Using default => " + DateStringUtils.now());
  }

  JobConf config = new JobConf(getConf(), getClass());
  config.set("user.args", Utils.asString(args));

  config.setJobName(getClass() + "-" + System.currentTimeMillis());
  config.setNumReduceTasks(100);

  config.setMapperClass(getClass());
  config.setReducerClass(getClass());
  config.setInputFormat(getInputFormat());

  config.setMapOutputKeyClass(Text.class);
  config.setMapOutputValueClass(Text.class);

  // add input paths
  for (String path : getInputPath(args)) {
    if (TextInputFormat.class.equals(getInputFormat())) {
      TextInputFormat.addInputPath(config, new Path(path));
    } else if (SequenceFileInputFormat.class.equals(getInputFormat())) {
      SequenceFileInputFormat.addInputPath(config, new Path(path));
    }
  }

  config.setOutputKeyClass(Text.class);
  config.setOutputValueClass(Text.class);

  // if the output path already exists, skip the job
  FileSystem fs = FileSystem.get(config);
  Path outputPath = new Path(getOutputPath(args));
  FileOutputFormat.setOutputPath(config, outputPath);

  if (!fs.exists(outputPath)) {
    JobClient.runJob(config);
  } else {
    System.out.println("You have already finished this job today! " + outputPath);
  }

  return JobClient.SUCCESS;
}
public static void main(String args[]) throws IOException {
  JobConf job = new JobConf(WordCountJob.class);
  job.setJobName("Word Count Example");

  FileInputFormat.setInputPaths(job, args[0]);
  job.setInputFormat(TextInputFormat.class);

  job.setMapperClass(MapTask.class);
  job.setCombinerClass(ReduceTask.class);
  job.setReducerClass(ReduceTask.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  job.setOutputFormat(TextOutputFormat.class);

  FileOutputFormat.setOutputPath(job, new Path(args[1]));

  JobClient.runJob(job);
}
public int run(String[] args) throws Exception {
  if (args.length < 4) {
    System.out.println("ERROR: Please enter args: input output type(text|seq) splitChar(9=\t)");
    return JobClient.SUCCESS;
  }
  String input = args[0];
  String output = args[1];
  String type = args[2];
  String splitChar = args[3];

  JobConf config = new JobConf(getConf(), getClass());
  config.set("user.split", splitChar);

  config.setJobName("File Filter -" + System.currentTimeMillis());
  config.setNumReduceTasks(10);
  config.setReducerClass(IdentityReducer.class);
  config.setMapperClass(FileTestMapper.class);

  if ("text".equals(type)) {
    config.setInputFormat(TextInputFormat.class);
    TextInputFormat.addInputPath(config, new Path(input));
  } else {
    config.setInputFormat(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(config, new Path(input));
  }

  config.setMapOutputKeyClass(Text.class);
  config.setMapOutputValueClass(Text.class);

  config.setOutputKeyClass(Text.class);
  config.setOutputValueClass(Text.class);

  // if the output path already exists, skip the job
  FileSystem fs = FileSystem.get(config);
  Path outputPath = new Path(output);
  FileOutputFormat.setOutputPath(config, outputPath);

  if (!fs.exists(outputPath)) {
    JobClient.runJob(config);
  } else {
    System.out.println("You have already finished this job today! " + outputPath);
  }

  return JobClient.SUCCESS;
}
public static void main(String[] args) throws Exception {
  JobConf conf = new JobConf(WordCount.class);
  conf.setJobName("wordcount");

  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(IntWritable.class);

  conf.setMapperClass(Map.class);
  conf.setCombinerClass(Reduce.class);
  conf.setReducerClass(Reduce.class);

  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  FileInputFormat.setInputPaths(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  JobClient.runJob(conf);
}
public static void main(String[] args) throws IOException {
  if (args.length != 2) {
    System.err.println("Usage: OldMaxTemperature <input path> <output path>");
    System.exit(-1);
  }

  /*[*/ JobConf conf = new JobConf(OldMaxTemperature.class); /*]*/
  /*[*/ conf /*]*/.setJobName("Max temperature");

  FileInputFormat.addInputPath(/*[*/ conf /*]*/, new Path(args[0]));
  FileOutputFormat.setOutputPath(/*[*/ conf /*]*/, new Path(args[1]));

  /*[*/ conf /*]*/.setMapperClass(OldMaxTemperatureMapper.class);
  /*[*/ conf /*]*/.setReducerClass(OldMaxTemperatureReducer.class);

  /*[*/ conf /*]*/.setOutputKeyClass(Text.class);
  /*[*/ conf /*]*/.setOutputValueClass(IntWritable.class);

  /*[*/ JobClient.runJob(conf); /*]*/
}
public int run(String[] args) throws Exception {
  JobConf conf = new JobConf(getConf(), XiangLi1_exercise3.class);
  conf.setJobName("xiangli1_exercise3");
  conf.setNumReduceTasks(0);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(Text.class);
  conf.setMapperClass(Map.class);
  // conf.setCombinerClass(Reduce.class);
  // conf.setReducerClass(Reduce.class);
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  FileInputFormat.setInputPaths(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  JobClient.runJob(conf);
  return 0;
}
public static void main(String[] args) throws Exception {
  JobConf conf = new JobConf(Main.class);
  conf.setJobName("feels-analysis");

  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(TheOutputClass.class);

  conf.setMapperClass(Map.class);
  conf.setCombinerClass(Reduce.class);
  conf.setReducerClass(Reduce.class);
  conf.setNumReduceTasks(1);
  conf.setInputFormat(CSVTextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);
  // TODO: determine whether we need extra output
  MultipleOutputs.addMultiNamedOutput(
      conf, SECOND_OUTPUT, TextOutputFormat.class, Text.class, TheOutputClass.class);

  FileInputFormat.setInputPaths(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  JobClient.runJob(conf);
}
@Override
public int run(String[] args) throws Exception {
  JobConf conf = new JobConf(getConf(), Sqrt2.class);
  conf.setJobName("sqrt2");
  conf.setOutputKeyClass(DoubleWritable.class);
  conf.setOutputValueClass(DoubleWritable.class);
  conf.setMapperClass(Map.class);
  /*conf.setCombinerClass(Reduce.class);*/
  conf.setReducerClass(Reduce.class);
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  FileInputFormat.setInputPaths(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  JobClient.runJob(conf);
  return 0;
}
@Override
public int run(String[] args) throws Exception {
  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);
  JobConf job = new JobConf(conf);
  job.setJarByClass(Jacobi.class);

  fs.delete(new Path("curX"), true);
  job.setInputFormat(TextInputFormat.class);
  job.setOutputFormat(TextOutputFormat.class);
  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(DoubleWritable.class);

  FileInputFormat.setInputPaths(job, new Path("preX"));
  FileOutputFormat.setOutputPath(job, new Path("curX"));

  JobClient.runJob(job);
  return 1;
}
public static void main(String[] args) throws Exception {
  JobConf conf = new JobConf(NeighborSearch.class);
  conf.setJobName("star searching");

  conf.setOutputKeyClass(BlockIDWritable.class);
  conf.setOutputValueClass(PairWritable.class);

  conf.setMapperClass(Map.class);
  // conf.setCombinerClass(Reduce.class);
  conf.setReducerClass(Reduce.class);
  // conf.setPartitionerClass(BlockPartitioner.class);
  // conf.setFloat("mapred.reduce.slowstart.completed.maps", (float) 1.0);

  conf.setInputFormat(StarInputFormat.class);
  conf.setOutputFormat(StarOutputFormat.class);

  FileInputFormat.setInputPaths(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  JobClient.runJob(conf);
}
/**
 * Sets the Reducer class for the chain job's JobConf.
 *
 * <p>It has to be specified how key and values are passed from one element of the chain to the
 * next: by value or by reference. If a Reducer relies on the assumed semantics that the key and
 * values are not modified by the collector, 'by value' must be used. If the Reducer does not
 * expect this semantics, 'by reference' can be used as an optimization to avoid serialization
 * and deserialization.
 *
 * <p>For the added Reducer, the configuration given for it, <code>reducerConf</code>, takes
 * precedence over the job's JobConf. This precedence is in effect when the task is running.
 *
 * <p>IMPORTANT: There is no need to specify the output key/value classes for the ChainReducer;
 * this is done by the setReducer or the addMapper for the last element in the chain.
 *
 * @param job job's JobConf to add the Reducer class.
 * @param klass the Reducer class to add.
 * @param inputKeyClass reducer input key class.
 * @param inputValueClass reducer input value class.
 * @param outputKeyClass reducer output key class.
 * @param outputValueClass reducer output value class.
 * @param byValue indicates if key/values should be passed by value to the next Mapper in the
 *     chain, if any.
 * @param reducerConf a JobConf with the configuration for the Reducer class. It is recommended
 *     to use a JobConf without default values, using the <code>JobConf(boolean loadDefaults)
 *     </code> constructor with FALSE.
 */
public static <K1, V1, K2, V2> void setReducer(
    JobConf job,
    Class<? extends Reducer<K1, V1, K2, V2>> klass,
    Class<? extends K1> inputKeyClass,
    Class<? extends V1> inputValueClass,
    Class<? extends K2> outputKeyClass,
    Class<? extends V2> outputValueClass,
    boolean byValue,
    JobConf reducerConf) {
  job.setReducerClass(ChainReducer.class);
  job.setOutputKeyClass(outputKeyClass);
  job.setOutputValueClass(outputValueClass);
  Chain.setReducer(
      job,
      klass,
      inputKeyClass,
      inputValueClass,
      outputKeyClass,
      outputValueClass,
      byValue,
      reducerConf);
}
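For context, a minimal driver sketch of how setReducer is typically combined with ChainMapper.addMapper and ChainReducer.addMapper to assemble a [MAP+ / REDUCE MAP*] chain. ChainExample, AMap, BMap, and AReduce are assumed illustrative classes, not code from the original source.

public static void main(String[] args) throws Exception {
  // Hypothetical driver class; AMap/BMap implement Mapper and AReduce implements Reducer.
  JobConf job = new JobConf(ChainExample.class);
  job.setJobName("chain-example");
  job.setInputFormat(TextInputFormat.class);
  job.setOutputFormat(TextOutputFormat.class);
  FileInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));

  // Map stage of the chain; the per-element conf uses JobConf(false), as the javadoc recommends.
  ChainMapper.addMapper(
      job, AMap.class, LongWritable.class, Text.class, Text.class, Text.class, true,
      new JobConf(false));

  // The single reducer of the chain.
  ChainReducer.setReducer(
      job, AReduce.class, Text.class, Text.class, Text.class, Text.class, true,
      new JobConf(false));

  // Optional map stage appended after the reducer; the last element in the chain
  // determines the job's output key/value classes.
  ChainReducer.addMapper(
      job, BMap.class, Text.class, Text.class, Text.class, Text.class, false, null);

  JobClient.runJob(job);
}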
/**
 * Adds a Mapper class to the chain job's JobConf.
 *
 * <p>It has to be specified how key and values are passed from one element of the chain to the
 * next: by value or by reference. If a Mapper relies on the assumed semantics that the key and
 * values are not modified by the collector, 'by value' must be used. If the Mapper does not
 * expect this semantics, 'by reference' can be used as an optimization to avoid serialization
 * and deserialization.
 *
 * <p>For the added Mapper, the configuration given for it, <code>mapperConf</code>, takes
 * precedence over the job's JobConf. This precedence is in effect when the task is running.
 *
 * <p>IMPORTANT: There is no need to specify the output key/value classes for the ChainMapper;
 * this is done by the addMapper for the last mapper in the chain.
 *
 * @param job chain job's JobConf to add the Mapper class.
 * @param klass the Mapper class to add.
 * @param inputKeyClass mapper input key class.
 * @param inputValueClass mapper input value class.
 * @param outputKeyClass mapper output key class.
 * @param outputValueClass mapper output value class.
 * @param byValue indicates if key/values should be passed by value to the next Mapper in the
 *     chain, if any.
 * @param mapperConf a JobConf with the configuration for the Mapper class. It is recommended to
 *     use a JobConf without default values, using the <code>JobConf(boolean loadDefaults)</code>
 *     constructor with FALSE.
 */
public static <K1, V1, K2, V2> void addMapper(
    JobConf job,
    Class<? extends Mapper<K1, V1, K2, V2>> klass,
    Class<? extends K1> inputKeyClass,
    Class<? extends V1> inputValueClass,
    Class<? extends K2> outputKeyClass,
    Class<? extends V2> outputValueClass,
    boolean byValue,
    JobConf mapperConf) {
  job.setOutputKeyClass(outputKeyClass);
  job.setOutputValueClass(outputValueClass);
  Chain.addMapper(
      false,
      job,
      klass,
      inputKeyClass,
      inputValueClass,
      outputKeyClass,
      outputValueClass,
      byValue,
      mapperConf);
}
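To illustrate the precedence rule described in the javadoc above, a hedged sketch follows: UpperCaseMapper and the "uppercase.enabled" key are assumed names, and the point is that a value set only on the per-mapper JobConf passed to addMapper is what that mapper's configure() observes at task time.

// Hypothetical chain element; names are illustrative, not from the original source.
public static class UpperCaseMapper extends MapReduceBase
    implements Mapper<LongWritable, Text, Text, Text> {

  private boolean upperCase;

  @Override
  public void configure(JobConf conf) {
    // At task time, the per-mapper conf handed to addMapper() takes precedence
    // over the job-level JobConf for this key.
    upperCase = conf.getBoolean("uppercase.enabled", false);
  }

  @Override
  public void map(
      LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
      throws IOException {
    String line = upperCase ? value.toString().toUpperCase() : value.toString();
    output.collect(new Text(line), value);
  }
}

// Driver fragment adding the mapper to the chain with its private configuration.
public static void addUpperCaseStage(JobConf job) {
  JobConf mapperConf = new JobConf(false); // no defaults, as the javadoc recommends
  mapperConf.setBoolean("uppercase.enabled", true); // visible only to this chain element
  ChainMapper.addMapper(
      job, UpperCaseMapper.class, LongWritable.class, Text.class, Text.class, Text.class,
      true, mapperConf);
}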
static void checkRecords(Configuration defaults, Path sortInput, Path sortOutput)
    throws IOException {
  FileSystem inputfs = sortInput.getFileSystem(defaults);
  FileSystem outputfs = sortOutput.getFileSystem(defaults);
  FileSystem defaultfs = FileSystem.get(defaults);
  JobConf jobConf = new JobConf(defaults, RecordStatsChecker.class);
  jobConf.setJobName("sortvalidate-recordstats-checker");

  int noSortReduceTasks = outputfs.listStatus(sortOutput, sortPathsFilter).length;
  jobConf.setInt("sortvalidate.sort.reduce.tasks", noSortReduceTasks);
  int noSortInputpaths = inputfs.listStatus(sortInput).length;

  jobConf.setInputFormat(NonSplitableSequenceFileInputFormat.class);
  jobConf.setOutputFormat(SequenceFileOutputFormat.class);

  jobConf.setOutputKeyClass(IntWritable.class);
  jobConf.setOutputValueClass(RecordStatsChecker.RecordStatsWritable.class);

  jobConf.setMapperClass(Map.class);
  jobConf.setCombinerClass(Reduce.class);
  jobConf.setReducerClass(Reduce.class);

  jobConf.setNumMapTasks(noSortReduceTasks);
  jobConf.setNumReduceTasks(1);

  FileInputFormat.setInputPaths(jobConf, sortInput);
  FileInputFormat.addInputPath(jobConf, sortOutput);
  Path outputPath = new Path("/tmp/sortvalidate/recordstatschecker");
  if (defaultfs.exists(outputPath)) {
    defaultfs.delete(outputPath, true);
  }
  FileOutputFormat.setOutputPath(jobConf, outputPath);

  // Uncomment to run locally in a single process
  // jobConf.set("mapred.job.tracker", "local");
  Path[] inputPaths = FileInputFormat.getInputPaths(jobConf);
  System.out.println(
      "\nSortValidator.RecordStatsChecker: Validate sort "
          + "from "
          + inputPaths[0]
          + " ("
          + noSortInputpaths
          + " files), "
          + inputPaths[1]
          + " ("
          + noSortReduceTasks
          + " files) into "
          + FileOutputFormat.getOutputPath(jobConf)
          + " with 1 reducer.");
  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  JobClient.runJob(jobConf);
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println(
      "The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");

  // Check to ensure that the statistics of the
  // framework's sort-input and sort-output match
  SequenceFile.Reader stats =
      new SequenceFile.Reader(defaultfs, new Path(outputPath, "part-00000"), defaults);
  IntWritable k1 = new IntWritable();
  IntWritable k2 = new IntWritable();
  RecordStatsWritable v1 = new RecordStatsWritable();
  RecordStatsWritable v2 = new RecordStatsWritable();
  if (!stats.next(k1, v1)) {
    throw new IOException("Failed to read record #1 from reduce's output");
  }
  if (!stats.next(k2, v2)) {
    throw new IOException("Failed to read record #2 from reduce's output");
  }

  if ((v1.getBytes() != v2.getBytes())
      || (v1.getRecords() != v2.getRecords())
      || v1.getChecksum() != v2.getChecksum()) {
    throw new IOException(
        "("
            + v1.getBytes()
            + ", "
            + v1.getRecords()
            + ", "
            + v1.getChecksum()
            + ") v/s ("
            + v2.getBytes()
            + ", "
            + v2.getRecords()
            + ", "
            + v2.getChecksum()
            + ")");
  }
}
/**
 * The main driver for the word count map/reduce program. Invoke this method to submit the
 * map/reduce job.
 *
 * @throws IOException When there are communication problems with the job tracker.
 */
public void testOldJobWithMapAndReducers() throws Exception {
  JobConf conf = new JobConf(TestJobCounters.class);
  conf.setJobName("wordcount-map-reducers");

  // the keys are words (strings)
  conf.setOutputKeyClass(Text.class);
  // the values are counts (ints)
  conf.setOutputValueClass(IntWritable.class);

  conf.setMapperClass(WordCount.MapClass.class);
  conf.setCombinerClass(WordCount.Reduce.class);
  conf.setReducerClass(WordCount.Reduce.class);

  conf.setNumMapTasks(3);
  conf.setNumReduceTasks(1);
  conf.setInt("io.sort.mb", 1);
  conf.setInt("io.sort.factor", 2);
  conf.set("io.sort.record.percent", "0.05");
  conf.set("io.sort.spill.percent", "0.80");

  FileSystem fs = FileSystem.get(conf);
  Path testDir = new Path(TEST_ROOT_DIR, "countertest");
  conf.set("test.build.data", testDir.toString());
  try {
    if (fs.exists(testDir)) {
      fs.delete(testDir, true);
    }
    if (!fs.mkdirs(testDir)) {
      throw new IOException("Mkdirs failed to create " + testDir.toString());
    }

    String inDir = testDir + File.separator + "genins" + File.separator;
    String outDir = testDir + File.separator;
    Path wordsIns = new Path(inDir);
    if (!fs.mkdirs(wordsIns)) {
      throw new IOException("Mkdirs failed to create " + wordsIns.toString());
    }

    long inputSize = 0;
    // create 3 input files, each with 5*2k words
    File inpFile = new File(inDir + "input5_2k_1");
    createWordsFile(inpFile);
    inputSize += inpFile.length();
    inpFile = new File(inDir + "input5_2k_2");
    createWordsFile(inpFile);
    inputSize += inpFile.length();
    inpFile = new File(inDir + "input5_2k_3");
    createWordsFile(inpFile);
    inputSize += inpFile.length();

    FileInputFormat.setInputPaths(conf, inDir);

    Path outputPath1 = new Path(outDir, "output5_2k_3");
    FileOutputFormat.setOutputPath(conf, outputPath1);

    RunningJob myJob = JobClient.runJob(conf);
    Counters c1 = myJob.getCounters();

    // 3 maps & in each map 4 first-level spills --- so 12 in total.
    // spilled records count:
    // Each map: 1st level: 2k+2k+2k+2k = 8k; 2nd level: 4k+4k = 8k;
    //   3rd level: 2k (4k from 1st level & 4k from 2nd level & combineAndSpill)
    //   So total 8k+8k+2k = 18k
    // For 3 maps, total = 3*18 = 54k
    // Reduce: each of the 3 map outputs (2k each) will be spilled in shuffleToDisk()
    //   So 3*2k = 6k in 1st level; 2nd level: 4k (2k+2k);
    //   3rd level directly given to reduce (4k+2k --- combineAndSpill => 2k,
    //   so 0 records spilled to disk in 3rd level)
    //   So total of 6k+4k = 10k
    // Total job counter will be 54k+10k = 64k
    //
    // 3 maps and 2.5k lines each --- so 7.5k map input records in total
    // 3 maps and 10k words each --- so 30k map output records in total
    validateMapredCounters(c1, 64000, 7500, 30000);
    validateMapredFileCounters(c1, inputSize, inputSize, 0, 0, 0);

    // create a 4th input file, also with 5*2k words, and test with 4 maps
    inpFile = new File(inDir + "input5_2k_4");
    createWordsFile(inpFile);
    inputSize += inpFile.length();
    conf.setNumMapTasks(4);
    Path outputPath2 = new Path(outDir, "output5_2k_4");
    FileOutputFormat.setOutputPath(conf, outputPath2);

    myJob = JobClient.runJob(conf);
    c1 = myJob.getCounters();

    // 4 maps & in each map 4 first-level spills --- so 16 in total.
    // spilled records count:
    // Each map: 1st level: 2k+2k+2k+2k = 8k; 2nd level: 4k+4k = 8k;
    //   3rd level: 2k (4k from 1st level & 4k from 2nd level & combineAndSpill)
    //   So total 8k+8k+2k = 18k
    // For 4 maps, total = 4*18 = 72k
    // Reduce: each of the 4 map outputs (2k each) will be spilled in shuffleToDisk()
    //   So 4*2k = 8k in 1st level; 2nd level: 4k+4k = 8k;
    //   3rd level directly given to reduce (4k+4k --- combineAndSpill => 2k,
    //   so 0 records spilled to disk in 3rd level)
    //   So total of 8k+8k = 16k
    // Total job counter will be 72k+16k = 88k
    //
    // 4 maps and 2.5k lines each --- so 10k map input records
    // 4 maps and 10k words each --- so 40k map output records
    validateMapredCounters(c1, 88000, 10000, 40000);
    validateMapredFileCounters(c1, inputSize, inputSize, 0, 0, 0);

    // check for a map-only job
    conf.setNumReduceTasks(0);
    Path outputPath3 = new Path(outDir, "output5_2k_5");
    FileOutputFormat.setOutputPath(conf, outputPath3);

    myJob = JobClient.runJob(conf);
    c1 = myJob.getCounters();
    // 4 maps and 2.5k lines each --- so 10k map input records
    // 4 maps and 10k words each --- so 40k map output records
    validateMapredCounters(c1, 0, 10000, 40000);
    validateMapredFileCounters(c1, inputSize, inputSize, 0, -1, -1);
  } finally {
    // clean up the input and output files
    if (fs.exists(testDir)) {
      fs.delete(testDir, true);
    }
  }
}
static void checkRecords(
    Configuration defaults, int noMaps, int noReduces, Path sortInput, Path sortOutput)
    throws IOException {
  JobConf jobConf = new JobConf(defaults, RecordChecker.class);
  jobConf.setJobName("sortvalidate-record-checker");

  jobConf.setInputFormat(SequenceFileInputFormat.class);
  jobConf.setOutputFormat(SequenceFileOutputFormat.class);

  jobConf.setOutputKeyClass(BytesWritable.class);
  jobConf.setOutputValueClass(IntWritable.class);

  jobConf.setMapperClass(Map.class);
  jobConf.setReducerClass(Reduce.class);

  JobClient client = new JobClient(jobConf);
  ClusterStatus cluster = client.getClusterStatus();
  if (noMaps == -1) {
    noMaps = cluster.getTaskTrackers() * jobConf.getInt("test.sortvalidate.maps_per_host", 10);
  }
  if (noReduces == -1) {
    noReduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sortReduces = jobConf.get("test.sortvalidate.reduces_per_host");
    if (sortReduces != null) {
      noReduces = cluster.getTaskTrackers() * Integer.parseInt(sortReduces);
    }
  }
  jobConf.setNumMapTasks(noMaps);
  jobConf.setNumReduceTasks(noReduces);

  FileInputFormat.setInputPaths(jobConf, sortInput);
  FileInputFormat.addInputPath(jobConf, sortOutput);
  Path outputPath = new Path("/tmp/sortvalidate/recordchecker");
  FileSystem fs = FileSystem.get(defaults);
  if (fs.exists(outputPath)) {
    fs.delete(outputPath, true);
  }
  FileOutputFormat.setOutputPath(jobConf, outputPath);

  // Uncomment to run locally in a single process
  // jobConf.set("mapred.job.tracker", "local");
  Path[] inputPaths = FileInputFormat.getInputPaths(jobConf);
  System.out.println(
      "\nSortValidator.RecordChecker: Running on "
          + cluster.getTaskTrackers()
          + " nodes to validate sort from "
          + inputPaths[0]
          + ", "
          + inputPaths[1]
          + " into "
          + FileOutputFormat.getOutputPath(jobConf)
          + " with "
          + noReduces
          + " reduces.");
  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  JobClient.runJob(jobConf);
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println(
      "The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");
}