/**
 * Configures the job's output location and output compression.
 *
 * <p>Compression is chosen in two ways: if {@code output.compression.enabled} is
 * {@code "true"} in the job configuration, the codec class named by
 * {@code output.compression.codec} is used; otherwise the codec is inferred from
 * the output location's file extension ({@code .bz2}/{@code .bz} → BZip2,
 * {@code .gz} → Gzip, anything else → no compression).
 *
 * @param location output path (its extension may select a codec)
 * @param job the job whose output format is being configured
 * @throws IOException declared by the storage contract; not thrown directly here
 * @throws RuntimeException if the configured codec class cannot be loaded
 */
@SuppressWarnings("unchecked")
@Override
public void setStoreLocation(String location, Job job) throws IOException {
  log.debug("setStoreLocation({}, {})", location, job);
  // Emit key and value with no separator between them.
  job.getConfiguration().set("mapred.textoutputformat.separator", "");
  FileOutputFormat.setOutputPath(job, new Path(location));
  if ("true".equals(job.getConfiguration().get("output.compression.enabled"))) {
    // Compression explicitly enabled: use the codec class named in the config.
    FileOutputFormat.setCompressOutput(job, true);
    String codec = job.getConfiguration().get("output.compression.codec");
    try {
      FileOutputFormat.setOutputCompressorClass(
          job, (Class<? extends CompressionCodec>) Class.forName(codec));
    } catch (ClassNotFoundException e) {
      // FIX: preserve the original exception as the cause instead of dropping it.
      throw new RuntimeException("Class not found: " + codec, e);
    }
  } else {
    // No explicit setting: infer the codec from the output path's extension.
    if (location.endsWith(".bz2") || location.endsWith(".bz")) {
      FileOutputFormat.setCompressOutput(job, true);
      FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    } else if (location.endsWith(".gz")) {
      FileOutputFormat.setCompressOutput(job, true);
      FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    } else {
      FileOutputFormat.setCompressOutput(job, false);
    }
  }
}
/**
 * Driver for the MaxTemperature MapReduce job.
 *
 * <p>Reads a single gzipped NCDC weather file from a hard-coded local path,
 * runs the max-temperature mapper/reducer (with the reducer reused as a
 * combiner), and writes gzip-compressed output. Exits with status 0 on job
 * success, 1 on failure.
 *
 * <p>NOTE(review): input/output paths are hard-coded to a developer machine;
 * consider taking them from {@code args} instead.
 *
 * @param args unused (paths are hard-coded)
 */
public static void main(String[] args)
    throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {
  // FIX: Job.getInstance() replaces the deprecated new Job() constructor,
  // consistent with the other drivers in this codebase.
  Job job = Job.getInstance();
  job.setJarByClass(MaxTemperature.class);
  String inputPath =
      "/home/cloudera/hd/data/ncdc_tmp/ftp.ncdc.noaa.gov/pub/data/noaa/2000/719043-99999-2000.gz";
  String outputPath = "/home/cloudera/hd/data/output";
  FileInputFormat.addInputPath(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, new Path(outputPath));
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  // Compress the job output with gzip.
  FileOutputFormat.setCompressOutput(job, true);
  FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
  job.setMapperClass(MaxTemperatureMapper.class);
  job.setReducerClass(MaxTemperatureReducer.class);
  // The reducer is associative/commutative here, so it doubles as a combiner.
  job.setCombinerClass(MaxTemperatureReducer.class);
  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
/**
 * Builds and configures the Hadoop {@code Job} from this factory bean's properties.
 *
 * <p>Order matters here: the job is created first (optionally as a proxy user),
 * the class loader is swapped to a parent-last loader when a job jar is set, and
 * the mapper/reducer classes are resolved and set before everything else so that
 * their key/value types can be auto-detected.
 *
 * @throws Exception if job creation, class resolution, or configuration fails
 */
@SuppressWarnings("rawtypes")
public void afterPropertiesSet() throws Exception {
  final Configuration cfg = ConfigurationUtils.createFrom(configuration, properties);
  buildGenericOptions(cfg);
  if (StringUtils.hasText(user)) {
    // Create the job as a proxy user so it runs with that user's permissions.
    UserGroupInformation ugi =
        UserGroupInformation.createProxyUser(user, UserGroupInformation.getLoginUser());
    ugi.doAs(
        new PrivilegedExceptionAction<Void>() {
          @Override
          public Void run() throws Exception {
            job = new Job(cfg);
            return null;
          }
        });
  } else {
    job = new Job(cfg);
  }
  // Prefer the bean class loader; fall back to Spring's default.
  ClassLoader loader =
      (beanClassLoader != null
          ? beanClassLoader
          : org.springframework.util.ClassUtils.getDefaultClassLoader());
  if (jar != null) {
    // A job jar was supplied: point the config at it and load user classes from
    // it first (parent-last) so the jar's versions win over the classpath.
    JobConf conf = (JobConf) job.getConfiguration();
    conf.setJar(jar.getURI().toString());
    loader = ExecutionUtils.createParentLastClassLoader(jar, beanClassLoader, cfg);
    conf.setClassLoader(loader);
  }
  // set first to enable auto-detection of K/V to skip the key/value types to be specified
  if (mapper != null) {
    Class<? extends Mapper> mapperClass = resolveClass(mapper, loader, Mapper.class);
    job.setMapperClass(mapperClass);
    configureMapperTypesIfPossible(job, mapperClass);
  }
  if (reducer != null) {
    Class<? extends Reducer> reducerClass = resolveClass(reducer, loader, Reducer.class);
    job.setReducerClass(reducerClass);
    configureReducerTypesIfPossible(job, reducerClass);
  }
  if (StringUtils.hasText(name)) {
    job.setJobName(name);
  }
  if (combiner != null) {
    job.setCombinerClass(resolveClass(combiner, loader, Reducer.class));
  }
  if (groupingComparator != null) {
    job.setGroupingComparatorClass(resolveClass(groupingComparator, loader, RawComparator.class));
  }
  if (inputFormat != null) {
    job.setInputFormatClass(resolveClass(inputFormat, loader, InputFormat.class));
  }
  if (mapKey != null) {
    job.setMapOutputKeyClass(resolveClass(mapKey, loader, Object.class));
  }
  if (mapValue != null) {
    job.setMapOutputValueClass(resolveClass(mapValue, loader, Object.class));
  }
  if (numReduceTasks != null) {
    job.setNumReduceTasks(numReduceTasks);
  }
  if (key != null) {
    job.setOutputKeyClass(resolveClass(key, loader, Object.class));
  }
  if (value != null) {
    job.setOutputValueClass(resolveClass(value, loader, Object.class));
  }
  if (outputFormat != null) {
    job.setOutputFormatClass(resolveClass(outputFormat, loader, OutputFormat.class));
  }
  if (partitioner != null) {
    job.setPartitionerClass(resolveClass(partitioner, loader, Partitioner.class));
  }
  if (sortComparator != null) {
    job.setSortComparatorClass(resolveClass(sortComparator, loader, RawComparator.class));
  }
  if (StringUtils.hasText(workingDir)) {
    job.setWorkingDirectory(new Path(workingDir));
  }
  if (jarClass != null) {
    job.setJarByClass(jarClass);
  }
  if (!CollectionUtils.isEmpty(inputPaths)) {
    for (String path : inputPaths) {
      FileInputFormat.addInputPath(job, new Path(path));
    }
  }
  if (StringUtils.hasText(outputPath)) {
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
  }
  if (compressOutput != null) {
    FileOutputFormat.setCompressOutput(job, compressOutput);
  }
  if (codecClass != null) {
    FileOutputFormat.setOutputCompressorClass(
        job, resolveClass(codecClass, loader, CompressionCodec.class));
  }
  // Hand the fully-configured job to the subclass/template hook.
  processJob(job);
}
@Override public int run(String[] args) throws Exception { Job job1 = Job.getInstance(getConf(), "TermWordCountPerDocument"); job1.setJarByClass(getClass()); Configuration conf1 = job1.getConfiguration(); FileInputFormat.setInputPaths(job1, new Path("enron/mann.avro")); Path out1 = new Path("tfidf/step1"); out1.getFileSystem(conf1).delete(out1, true); FileOutputFormat.setOutputPath(job1, out1); FileOutputFormat.setOutputCompressorClass(job1, SnappyCodec.class); FileOutputFormat.setCompressOutput(job1, true); job1.setMapperClass(TermWordCountPerDocumentMapper.class); job1.setReducerClass(IntSumReducer.class); job1.setInputFormatClass(AvroKeyInputFormat.class); job1.setOutputFormatClass(SequenceFileOutputFormat.class); job1.setOutputKeyClass(Text.class); job1.setOutputValueClass(IntWritable.class); Job job2 = Job.getInstance(getConf(), "DocumentWordCount"); job2.setJarByClass(getClass()); Configuration conf2 = job2.getConfiguration(); FileInputFormat.setInputPaths(job2, new Path("tfidf/step1")); Path out2 = new Path("tfidf/step2"); out2.getFileSystem(conf2).delete(out2, true); FileOutputFormat.setOutputPath(job2, out2); FileOutputFormat.setOutputCompressorClass(job2, SnappyCodec.class); FileOutputFormat.setCompressOutput(job2, true); job2.setMapperClass(DocumentWordCountMapper.class); job2.setReducerClass(DocumentWordCountReducer.class); job2.setInputFormatClass(SequenceFileInputFormat.class); job2.setOutputFormatClass(SequenceFileOutputFormat.class); job2.setOutputKeyClass(Text.class); job2.setOutputValueClass(Text.class); Job job3 = Job.getInstance(getConf(), "DocumentCountAndTfIdf"); job3.setJarByClass(getClass()); Configuration conf3 = job3.getConfiguration(); FileInputFormat.setInputPaths(job3, new Path("tfidf/step2")); Path out3 = new Path("tfidf/final"); out3.getFileSystem(conf3).delete(out3, true); FileOutputFormat.setOutputPath(job3, out3); FileOutputFormat.setOutputCompressorClass(job3, SnappyCodec.class); FileOutputFormat.setCompressOutput(job3, true); // 
Get the total document count from the Avro file metadata DataFileReader<Object> reader = new DataFileReader<Object>( new FsInput(new Path("enron/mann.avro"), conf3), new GenericDatumReader<Object>()); conf3.setLong("totalDocs", reader.getMetaLong("recordCount")); reader.close(); job3.setMapperClass(TermDocumentCountMapper.class); job3.setReducerClass(TfIdfReducer.class); job3.setInputFormatClass(SequenceFileInputFormat.class); job3.setOutputFormatClass(SequenceFileOutputFormat.class); job3.setOutputKeyClass(Text.class); job3.setOutputValueClass(Text.class); return 0; }