@SuppressWarnings("unchecked") @Override public void setStoreLocation(String location, Job job) throws IOException { log.debug("setStoreLocation({}, {})", location, job); job.getConfiguration().set("mapred.textoutputformat.separator", ""); FileOutputFormat.setOutputPath(job, new Path(location)); if ("true".equals(job.getConfiguration().get("output.compression.enabled"))) { FileOutputFormat.setCompressOutput(job, true); String codec = job.getConfiguration().get("output.compression.codec"); try { FileOutputFormat.setOutputCompressorClass( job, (Class<? extends CompressionCodec>) Class.forName(codec)); } catch (ClassNotFoundException e) { throw new RuntimeException("Class not found: " + codec); } } else { if (location.endsWith(".bz2") || location.endsWith(".bz")) { FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class); } else if (location.endsWith(".gz")) { FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); } else { FileOutputFormat.setCompressOutput(job, false); } } }
public Job getJob(Configuration conf) throws IOException {
  Job job = new Job(conf, "pivoting");
  job.setJarByClass(PivotingReducer.class);
  job.setMapperClass(Mapper.class);
  job.setReducerClass(PivotingReducer.class);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setMapOutputKeyClass(RuleWritable.class);
  job.setMapOutputValueClass(MapWritable.class);
  job.setOutputKeyClass(RuleWritable.class);
  job.setOutputValueClass(MapWritable.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setPartitionerClass(RuleWritable.SourcePartitioner.class);

  FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.work-dir") + "collected"));
  int maxSplitSize = conf.getInt("thrax.max-split-size", 0);
  if (maxSplitSize != 0) {
    FileInputFormat.setMaxInputSplitSize(job, maxSplitSize);
  }

  int numReducers = conf.getInt("thrax.reducers", 4);
  job.setNumReduceTasks(numReducers);

  FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.work-dir") + "pivoted"));
  FileOutputFormat.setCompressOutput(job, true);
  return job;
}
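// A minimal driver sketch, assuming a Configuration where "thrax.work-dir" is set
// (note the trailing slash, since the paths above are built by plain concatenation);
// the directory value here is an illustrative assumption.
Configuration conf = new Configuration();
conf.set("thrax.work-dir", "thrax-run/");
Job job = getJob(conf);
boolean success = job.waitForCompletion(true);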
/** Runs this tool. */
public int run(String[] args) throws IOException {
  DocnoMapping.DefaultBuilderOptions options =
      DocnoMapping.BuilderUtils.parseDefaultOptions(args);
  if (options == null) {
    return -1;
  }

  // Temp directory.
  String tmpDir =
      "tmp-" + TrecDocnoMappingBuilder.class.getSimpleName() + "-" + random.nextInt(10000);

  LOG.info("Tool name: " + TrecDocnoMappingBuilder.class.getCanonicalName());
  LOG.info(" - input path: " + options.collection);
  LOG.info(" - output file: " + options.docnoMapping);

  Job job = new Job(
      getConf(), TrecDocnoMappingBuilder.class.getSimpleName() + ":" + options.collection);
  FileSystem fs = FileSystem.get(job.getConfiguration());

  job.setJarByClass(TrecDocnoMappingBuilder.class);
  job.setNumReduceTasks(1);

  FileInputFormat.setInputPaths(job, new Path(options.collection));
  FileOutputFormat.setOutputPath(job, new Path(tmpDir));
  FileOutputFormat.setCompressOutput(job, false);

  job.setInputFormatClass(TrecDocumentInputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  job.setMapperClass(MyMapper.class);
  job.setReducerClass(MyReducer.class);

  // Delete the output directory if it exists already.
  fs.delete(new Path(tmpDir), true);

  try {
    job.waitForCompletion(true);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }

  String input = tmpDir + (tmpDir.endsWith("/") ? "" : "/") + "part-r-00000";
  TrecDocnoMapping.writeMappingData(
      new Path(input), new Path(options.docnoMapping), FileSystem.get(getConf()));

  fs.delete(new Path(tmpDir), true);

  return 0;
}
/** Runs this tool. */
public int run(String[] args) throws Exception {
  if (args.length != 3) {
    printUsage();
    return -1;
  }

  String inputPath = args[0];
  String outputPath = args[1];
  String outputFile = args[2];

  LOG.info("Tool: " + Aquaint2DocnoMappingBuilder.class.getCanonicalName());
  LOG.info(" - Input path: " + inputPath);
  LOG.info(" - Output path: " + outputPath);
  LOG.info(" - Output file: " + outputFile);

  Job job = new Job(getConf(), Aquaint2DocnoMappingBuilder.class.getSimpleName());
  job.setJarByClass(Aquaint2DocnoMappingBuilder.class);
  job.setNumReduceTasks(1);

  FileInputFormat.setInputPaths(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, new Path(outputPath));
  FileOutputFormat.setCompressOutput(job, false);

  job.setInputFormatClass(Aquaint2DocumentInputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  job.setMapperClass(MyMapper.class);
  job.setReducerClass(MyReducer.class);

  // Delete the output directory if it exists already.
  FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

  job.waitForCompletion(true);

  String input = outputPath + (outputPath.endsWith("/") ? "" : "/") + "part-r-00000";
  Aquaint2DocnoMapping.writeDocnoData(
      new Path(input), new Path(outputFile), FileSystem.get(getConf()));

  return 0;
}
public static void main(String[] args)
    throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {
  Job job = new Job();
  job.setJarByClass(MaxTemperature.class);

  String inputPath =
      "/home/cloudera/hd/data/ncdc_tmp/ftp.ncdc.noaa.gov/pub/data/noaa/2000/719043-99999-2000.gz";
  String outputPath = "/home/cloudera/hd/data/output";

  FileInputFormat.addInputPath(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);

  FileOutputFormat.setCompressOutput(job, true);
  FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

  job.setMapperClass(MaxTemperatureMapper.class);
  job.setReducerClass(MaxTemperatureReducer.class);
  job.setCombinerClass(MaxTemperatureReducer.class);

  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
@SuppressWarnings("rawtypes") public void afterPropertiesSet() throws Exception { final Configuration cfg = ConfigurationUtils.createFrom(configuration, properties); buildGenericOptions(cfg); if (StringUtils.hasText(user)) { UserGroupInformation ugi = UserGroupInformation.createProxyUser(user, UserGroupInformation.getLoginUser()); ugi.doAs( new PrivilegedExceptionAction<Void>() { @Override public Void run() throws Exception { job = new Job(cfg); return null; } }); } else { job = new Job(cfg); } ClassLoader loader = (beanClassLoader != null ? beanClassLoader : org.springframework.util.ClassUtils.getDefaultClassLoader()); if (jar != null) { JobConf conf = (JobConf) job.getConfiguration(); conf.setJar(jar.getURI().toString()); loader = ExecutionUtils.createParentLastClassLoader(jar, beanClassLoader, cfg); conf.setClassLoader(loader); } // set first to enable auto-detection of K/V to skip the key/value types to be specified if (mapper != null) { Class<? extends Mapper> mapperClass = resolveClass(mapper, loader, Mapper.class); job.setMapperClass(mapperClass); configureMapperTypesIfPossible(job, mapperClass); } if (reducer != null) { Class<? extends Reducer> reducerClass = resolveClass(reducer, loader, Reducer.class); job.setReducerClass(reducerClass); configureReducerTypesIfPossible(job, reducerClass); } if (StringUtils.hasText(name)) { job.setJobName(name); } if (combiner != null) { job.setCombinerClass(resolveClass(combiner, loader, Reducer.class)); } if (groupingComparator != null) { job.setGroupingComparatorClass(resolveClass(groupingComparator, loader, RawComparator.class)); } if (inputFormat != null) { job.setInputFormatClass(resolveClass(inputFormat, loader, InputFormat.class)); } if (mapKey != null) { job.setMapOutputKeyClass(resolveClass(mapKey, loader, Object.class)); } if (mapValue != null) { job.setMapOutputValueClass(resolveClass(mapValue, loader, Object.class)); } if (numReduceTasks != null) { job.setNumReduceTasks(numReduceTasks); } if (key != null) { job.setOutputKeyClass(resolveClass(key, loader, Object.class)); } if (value != null) { job.setOutputValueClass(resolveClass(value, loader, Object.class)); } if (outputFormat != null) { job.setOutputFormatClass(resolveClass(outputFormat, loader, OutputFormat.class)); } if (partitioner != null) { job.setPartitionerClass(resolveClass(partitioner, loader, Partitioner.class)); } if (sortComparator != null) { job.setSortComparatorClass(resolveClass(sortComparator, loader, RawComparator.class)); } if (StringUtils.hasText(workingDir)) { job.setWorkingDirectory(new Path(workingDir)); } if (jarClass != null) { job.setJarByClass(jarClass); } if (!CollectionUtils.isEmpty(inputPaths)) { for (String path : inputPaths) { FileInputFormat.addInputPath(job, new Path(path)); } } if (StringUtils.hasText(outputPath)) { FileOutputFormat.setOutputPath(job, new Path(outputPath)); } if (compressOutput != null) { FileOutputFormat.setCompressOutput(job, compressOutput); } if (codecClass != null) { FileOutputFormat.setOutputCompressorClass( job, resolveClass(codecClass, loader, CompressionCodec.class)); } processJob(job); }
/**
 * Implementation of the Tool.run() method, which builds and runs the Hadoop job.
 *
 * @param args command line parameters, less common Hadoop job parameters stripped out and
 *     interpreted by the Tool class.
 * @return 0 if the Hadoop job completes successfully, 1 if not.
 */
@Override
public int run(String[] args) throws Exception {
  String inputPath = null;
  String outputPath = null;
  String configFile = null;
  boolean overwrite = false;
  int numReducers = 60;

  // Read the command line arguments. We're not using GenericOptionsParser
  // to prevent having to include commons.cli as a dependency.
  for (int i = 0; i < args.length; i++) {
    try {
      if (args[i].equals(ARGNAME_INPATH)) {
        inputPath = args[++i];
      } else if (args[i].equals(ARGNAME_OUTPATH)) {
        outputPath = args[++i];
      } else if (args[i].equals(ARGNAME_CONF)) {
        configFile = args[++i];
      } else if (args[i].equals(ARGNAME_MAXFILES)) {
        SampleFilter.setMax(Long.parseLong(args[++i]));
      } else if (args[i].equals(ARGNAME_OVERWRITE)) {
        overwrite = true;
      } else if (args[i].equals(ARGNAME_NUMREDUCE)) {
        numReducers = Integer.parseInt(args[++i]);
      } else {
        LOG.warn("Unsupported argument: " + args[i]);
      }
    } catch (ArrayIndexOutOfBoundsException e) {
      usage();
      throw new IllegalArgumentException();
    }
  }

  if (inputPath == null || outputPath == null) {
    usage();
    throw new IllegalArgumentException();
  }

  // Read in any additional config parameters.
  if (configFile != null) {
    LOG.info("adding config parameters from '" + configFile + "'");
    this.getConf().addResource(configFile);
  }

  // Create the Hadoop job.
  Configuration conf = getConf();
  Job job = new Job(conf);
  job.setJarByClass(BigramFinder.class);
  job.setNumReduceTasks(numReducers);

  // Scan the provided input path for ARC files.
  LOG.info("setting input path to '" + inputPath + "'");
  SampleFilter.setFilter(FILEFILTER);
  FileInputFormat.addInputPath(job, new Path(inputPath));
  FileInputFormat.setInputPathFilter(job, SampleFilter.class);

  // Delete the output path directory if it already exists and user wants
  // to overwrite it.
  if (overwrite) {
    LOG.info("clearing the output path at '" + outputPath + "'");
    FileSystem fs = FileSystem.get(new URI(outputPath), conf);
    if (fs.exists(new Path(outputPath))) {
      fs.delete(new Path(outputPath), true);
    }
  }

  // Set the path where final output 'part' files will be saved.
  LOG.info("setting output path to '" + outputPath + "'");
  FileOutputFormat.setOutputPath(job, new Path(outputPath));
  FileOutputFormat.setCompressOutput(job, false);

  // Set which InputFormat class to use.
  job.setInputFormatClass(ArcInputFormat.class); // SequenceFileInputFormat.class

  // Set which OutputFormat class to use.
  job.setOutputFormatClass(TextOutputFormat.class);

  // Set the output data types.
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LongWritable.class);

  // Set which Mapper and Reducer classes to use.
  job.setMapperClass(BigramFinderMapper.class);
  job.setReducerClass(PairGeneratorSumReducer.class);
  job.setCombinerClass(LongSumReducer.class);

  // Set the name of the job.
  job.setJobName("Norvig Award - Evil Bigram Finder");

  if (job.waitForCompletion(true)) {
    return 0;
  } else {
    return 1;
  }
}
public static void enableCompression(Job j, SequenceFile.CompressionType type) {
  Configuration conf = j.getConfiguration();
  conf.setBoolean("mapred.compress.map.output", true);
  FileOutputFormat.setCompressOutput(j, true);
  SequenceFileOutputFormat.setOutputCompressionType(j, type);
}
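// A minimal usage sketch, assuming a job whose output goes through
// SequenceFileOutputFormat; the job name here is an illustrative assumption.
Job job = Job.getInstance(new Configuration(), "compressed-output");
job.setOutputFormatClass(SequenceFileOutputFormat.class);
enableCompression(job, SequenceFile.CompressionType.BLOCK);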
@Override
public int run(String[] args) throws Exception {
  Job job1 = Job.getInstance(getConf(), "TermWordCountPerDocument");
  job1.setJarByClass(getClass());
  Configuration conf1 = job1.getConfiguration();
  FileInputFormat.setInputPaths(job1, new Path("enron/mann.avro"));
  Path out1 = new Path("tfidf/step1");
  out1.getFileSystem(conf1).delete(out1, true);
  FileOutputFormat.setOutputPath(job1, out1);
  FileOutputFormat.setOutputCompressorClass(job1, SnappyCodec.class);
  FileOutputFormat.setCompressOutput(job1, true);
  job1.setMapperClass(TermWordCountPerDocumentMapper.class);
  job1.setReducerClass(IntSumReducer.class);
  job1.setInputFormatClass(AvroKeyInputFormat.class);
  job1.setOutputFormatClass(SequenceFileOutputFormat.class);
  job1.setOutputKeyClass(Text.class);
  job1.setOutputValueClass(IntWritable.class);

  Job job2 = Job.getInstance(getConf(), "DocumentWordCount");
  job2.setJarByClass(getClass());
  Configuration conf2 = job2.getConfiguration();
  FileInputFormat.setInputPaths(job2, new Path("tfidf/step1"));
  Path out2 = new Path("tfidf/step2");
  out2.getFileSystem(conf2).delete(out2, true);
  FileOutputFormat.setOutputPath(job2, out2);
  FileOutputFormat.setOutputCompressorClass(job2, SnappyCodec.class);
  FileOutputFormat.setCompressOutput(job2, true);
  job2.setMapperClass(DocumentWordCountMapper.class);
  job2.setReducerClass(DocumentWordCountReducer.class);
  job2.setInputFormatClass(SequenceFileInputFormat.class);
  job2.setOutputFormatClass(SequenceFileOutputFormat.class);
  job2.setOutputKeyClass(Text.class);
  job2.setOutputValueClass(Text.class);

  Job job3 = Job.getInstance(getConf(), "DocumentCountAndTfIdf");
  job3.setJarByClass(getClass());
  Configuration conf3 = job3.getConfiguration();
  FileInputFormat.setInputPaths(job3, new Path("tfidf/step2"));
  Path out3 = new Path("tfidf/final");
  out3.getFileSystem(conf3).delete(out3, true);
  FileOutputFormat.setOutputPath(job3, out3);
  FileOutputFormat.setOutputCompressorClass(job3, SnappyCodec.class);
  FileOutputFormat.setCompressOutput(job3, true);

  // Get the total document count from the Avro file metadata.
  DataFileReader<Object> reader = new DataFileReader<Object>(
      new FsInput(new Path("enron/mann.avro"), conf3), new GenericDatumReader<Object>());
  conf3.setLong("totalDocs", reader.getMetaLong("recordCount"));
  reader.close();

  job3.setMapperClass(TermDocumentCountMapper.class);
  job3.setReducerClass(TfIdfReducer.class);
  job3.setInputFormatClass(SequenceFileInputFormat.class);
  job3.setOutputFormatClass(SequenceFileOutputFormat.class);
  job3.setOutputKeyClass(Text.class);
  job3.setOutputValueClass(Text.class);

  // Run the three stages sequentially; each job consumes the previous job's output.
  if (!job1.waitForCompletion(true)) {
    return 1;
  }
  if (!job2.waitForCompletion(true)) {
    return 1;
  }
  if (!job3.waitForCompletion(true)) {
    return 1;
  }

  return 0;
}
@SuppressWarnings("unchecked") public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String inputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.InputPath", conf); String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.CorpusPath", conf); String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.CorpusClass", conf); String extractorClass = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorClass", conf); String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorArgs", conf); String extractorTarget = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorTarget", conf) .toLowerCase(); String outputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.OutputPath", conf); // split examples conf.set("Mavuno.Split.InputPath", inputPath); conf.set("Mavuno.Split.OutputPath", outputPath + "/../split"); conf.set("Mavuno.Split.SplitKey", extractorTarget); new Split(conf).run(); // get splits FileStatus[] files = MavunoUtils.getDirectoryListing(conf, outputPath + "/../split"); int split = 0; for (FileStatus file : files) { if (!file.getPath().getName().endsWith(".examples")) { continue; } conf.set("Mavuno.ExtractGlobalStats.ExamplesPath", file.getPath().toString()); sLogger.info("Tool name: ExtractGlobalStats"); sLogger.info(" - Input path: " + inputPath); sLogger.info(" - Examples path: " + file.getPath()); sLogger.info(" - Example split: " + split); sLogger.info(" - Corpus path: " + corpusPath); sLogger.info(" - Corpus class: " + corpusClass); sLogger.info(" - Extractor class: " + extractorClass); sLogger.info(" - Extractor class: " + extractorArgs); sLogger.info(" - Extractor target: " + extractorTarget); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("ExtractGlobalStats"); job.setJarByClass(ExtractGlobalStats.class); MavunoUtils.recursivelyAddInputPaths(job, corpusPath); FileOutputFormat.setOutputPath(job, new Path(outputPath + "/../split/" + split)); job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass)); job.setOutputFormatClass(SequenceFileOutputFormat.class); FileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); job.setMapOutputKeyClass(ContextPatternWritable.class); job.setSortComparatorClass(ContextPatternWritable.Comparator.class); job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class); job.setMapOutputValueClass(ContextPatternStatsWritable.class); job.setOutputKeyClass(ContextPatternWritable.class); job.setOutputValueClass(ContextPatternStatsWritable.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.waitForCompletion(true); split++; } // combine splits conf.setInt("Mavuno.CombineGlobalStats.TotalSplits", split); conf.set("Mavuno.CombineGlobalStats.InputPath", outputPath + "/../split/"); conf.set("Mavuno.CombineGlobalStats.OutputPath", outputPath); new CombineGlobalStats(conf).run(); MavunoUtils.removeDirectory(conf, outputPath + "/../split"); return 0; }