public int run(String[] args) throws Exception {
  if (args.length < 2) {
    printUsage();
    return 1;
  }
  JobConf job = new JobConf(getConf(), MultiFileWordCount.class);
  job.setJobName("MultiFileWordCount");

  // set the InputFormat of the job to our InputFormat
  job.setInputFormat(MyInputFormat.class);

  // the keys are words (strings)
  job.setOutputKeyClass(Text.class);
  // the values are counts (longs); LongSumReducer reduces LongWritable values,
  // so declaring IntWritable here would cause a runtime type mismatch
  job.setOutputValueClass(LongWritable.class);

  // use the defined mapper
  job.setMapperClass(MapClass.class);
  // use the WordCount Reducer
  job.setCombinerClass(LongSumReducer.class);
  job.setReducerClass(LongSumReducer.class);

  FileInputFormat.addInputPaths(job, args[0]);
  FileOutputFormat.setOutputPath(job, new Path(args[1]));

  JobClient.runJob(job);
  return 0;
}
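// A minimal driver sketch (not part of the original snippet) showing how this run()
// method would typically be launched; it assumes the enclosing MultiFileWordCount class
// extends Configured and implements Tool, as the getConf() call above suggests.
// ToolRunner strips generic Hadoop options from the command line before delegating
// the remaining arguments to run().
public static void main(String[] args) throws Exception {
  int ret = ToolRunner.run(new MultiFileWordCount(), args);
  System.exit(ret);
}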
/**
 * Create an Aggregate based map/reduce job.
 *
 * @param args the arguments used for job creation. Generic hadoop arguments are accepted.
 * @return a JobConf object ready for submission.
 * @throws IOException
 * @see GenericOptionsParser
 */
public static JobConf createValueAggregatorJob(String[] args) throws IOException {
  Configuration conf = new Configuration();
  GenericOptionsParser genericParser = new GenericOptionsParser(conf, args);
  args = genericParser.getRemainingArgs();
  if (args.length < 2) {
    System.out.println("usage: inputDirs outDir "
        + "[numOfReducer [textinputformat|seq [specfile [jobName]]]]");
    GenericOptionsParser.printGenericCommandUsage(System.out);
    System.exit(1);
  }
  String inputDir = args[0];
  String outputDir = args[1];
  int numOfReducers = 1;
  if (args.length > 2) {
    numOfReducers = Integer.parseInt(args[2]);
  }

  // SequenceFileInputFormat is the default ("seq" in the usage string); plain-text
  // input must be requested explicitly with "textinputformat"
  Class<? extends InputFormat> theInputFormat;
  if (args.length > 3 && args[3].compareToIgnoreCase("textinputformat") == 0) {
    theInputFormat = TextInputFormat.class;
  } else {
    theInputFormat = SequenceFileInputFormat.class;
  }

  Path specFile = null;
  if (args.length > 4) {
    specFile = new Path(args[4]);
  }
  String jobName = "";
  if (args.length > 5) {
    jobName = args[5];
  }

  JobConf theJob = new JobConf(conf);
  if (specFile != null) {
    theJob.addResource(specFile);
  }
  String userJarFile = theJob.get("user.jar.file");
  if (userJarFile == null) {
    theJob.setJarByClass(ValueAggregator.class);
  } else {
    theJob.setJar(userJarFile);
  }
  theJob.setJobName("ValueAggregatorJob: " + jobName);

  FileInputFormat.addInputPaths(theJob, inputDir);
  theJob.setInputFormat(theInputFormat);
  theJob.setMapperClass(ValueAggregatorMapper.class);
  FileOutputFormat.setOutputPath(theJob, new Path(outputDir));
  theJob.setOutputFormat(TextOutputFormat.class);
  theJob.setMapOutputKeyClass(Text.class);
  theJob.setMapOutputValueClass(Text.class);
  theJob.setOutputKeyClass(Text.class);
  theJob.setOutputValueClass(Text.class);
  theJob.setReducerClass(ValueAggregatorReducer.class);
  theJob.setCombinerClass(ValueAggregatorCombiner.class);
  theJob.setNumMapTasks(1);
  theJob.setNumReduceTasks(numOfReducers);
  return theJob;
}
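// A hedged usage sketch for the factory method above: since createValueAggregatorJob()
// returns a fully configured JobConf, a driver only needs to submit it. This mirrors
// the main() of Hadoop's ValueAggregatorJob driver, but is included here purely for
// illustration.
public static void main(String[] args) throws IOException {
  JobConf job = createValueAggregatorJob(args);
  JobClient.runJob(job);
}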
/** Runs this tool. */
@SuppressWarnings("deprecation")
public int run(String[] args) throws Exception {
  JobConf job = new JobConf(getConf(), Docnos2Titles.class);

  // Read command-line arguments
  CommandLine cmdline = parseArgs(args);
  if (cmdline == null) {
    printUsage();
    return -1;   // bail out here; dereferencing a null cmdline below would throw an NPE
  }

  String eCollectionPath = cmdline.getOptionValue(ECOLLECTION_OPTION);
  String fCollectionPath = cmdline.getOptionValue(FCOLLECTION_OPTION);
  String pwsimOutputPath = cmdline.getOptionValue(PWSIM_OPTION);
  String titlePairsPath = cmdline.getOptionValue(OUTPUT_PATH_OPTION);
  String eLang = cmdline.getOptionValue(ELANG_OPTION);
  String fLang = cmdline.getOptionValue(FLANG_OPTION);
  String samplesFile = cmdline.getOptionValue(SAMPLEDOCNOS_OPTION);

  job.setJobName("Docnos2Titles_" + fLang + "-" + eLang);

  FileInputFormat.addInputPaths(job, eCollectionPath);
  FileInputFormat.addInputPaths(job, fCollectionPath);
  FileOutputFormat.setOutputPath(job, new Path(titlePairsPath));
  DistributedCache.addCacheFile(new URI(pwsimOutputPath), job);

  job.set("eLang", eLang);
  job.set("fLang", fLang);
  job.set("PwsimPairs", pwsimOutputPath);

  // the sample file is optional (see the "Sample file?" log line below);
  // an unguarded new URI(null) or job.set(..., null) would throw an NPE
  if (samplesFile != null) {
    DistributedCache.addCacheFile(new URI(samplesFile), job);
    job.set("Ivory.SampleFile", samplesFile);
  }

  job.setInt("mapred.task.timeout", 60000000);
  job.set("mapreduce.map.memory.mb", "3000");
  job.set("mapreduce.map.java.opts", "-Xmx3000m");
  job.setBoolean("mapred.map.tasks.speculative.execution", false);
  job.setBoolean("mapred.reduce.tasks.speculative.execution", false);

  job.setNumMapTasks(100);
  job.setNumReduceTasks(1);
  job.setInt("mapred.min.split.size", 2000000000);
  job.setFloat("mapred.reduce.slowstart.completed.maps", 0.9f);

  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(TextOutputFormat.class);
  job.setMapOutputKeyClass(PairOfInts.class);
  job.setMapOutputValueClass(PairOfIntString.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setMapperClass(MyMapper.class);
  job.setReducerClass(MyReducer.class);

  sLogger.info("Running job " + job.getJobName() + "...");
  sLogger.info("E-collection path: " + eCollectionPath);
  sLogger.info("F-collection path: " + fCollectionPath);
  sLogger.info("Pwsim output path: " + pwsimOutputPath);
  sLogger.info("Output path: " + titlePairsPath);
  sLogger.info("Sample file?: " + ((samplesFile != null) ? samplesFile : "none"));

  long startTime = System.currentTimeMillis();
  JobClient.runJob(job);
  System.out.println("Job finished in "
      + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
  return 0;
}
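// A hedged sketch of the parseArgs() helper referenced above, which the snippet does not
// show. It assumes Apache Commons CLI and reuses the option constants read in run(); the
// option descriptions are illustrative, not the project's actual implementation.
private static CommandLine parseArgs(String[] args) {
  Options options = new Options();
  options.addOption(ECOLLECTION_OPTION, true, "path to the e-side (English) collection");
  options.addOption(FCOLLECTION_OPTION, true, "path to the f-side (foreign) collection");
  options.addOption(PWSIM_OPTION, true, "path to pairwise-similarity output");
  options.addOption(OUTPUT_PATH_OPTION, true, "output path for title pairs");
  options.addOption(ELANG_OPTION, true, "language code of the e-side collection");
  options.addOption(FLANG_OPTION, true, "language code of the f-side collection");
  options.addOption(SAMPLEDOCNOS_OPTION, true, "optional file of sampled docnos");
  try {
    return new GnuParser().parse(options, args);
  } catch (ParseException e) {
    System.err.println("Error parsing command line: " + e.getMessage());
    return null;   // run() prints usage and returns when parsing fails
  }
}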