/**
 * Configures, samples for, and submits a single sorting job over one input
 * file. The job is only submitted, not awaited: the caller is responsible
 * for waiting on the returned {@link Job}.
 *
 * <p>Side effects on {@code conf}: {@code Utils.WORK_FILENAME_PROPERTY} is
 * set to the input file's name and {@code Utils.configureSampling} is
 * applied, both before the {@link Job} is constructed from it.
 *
 * @param conf         configuration the job is built from; modified as
 *                     described above
 * @param inputFile    the single input path of the job
 * @param outputDir    the job's output directory, also handed to
 *                     {@code Utils.configureSampling}
 * @param commandName  prefix used in the progress messages printed to
 *                     standard output
 * @param samplingInfo text inserted directly after the word "Sampling" in
 *                     the first progress message; may be empty
 * @return the submitted job
 * @throws IOException            propagated from sampling or job submission
 * @throws ClassNotFoundException propagated from sampling or job submission
 * @throws InterruptedException   propagated from sampling or job submission
 */
/*package*/ static Job sortOne(
        Configuration conf,
        Path inputFile, Path outputDir,
        String commandName, String samplingInfo)
    throws IOException, ClassNotFoundException, InterruptedException
{
    // These settings must land in conf before the Job is created from it.
    final String inputName = inputFile.getName();
    conf.set(Utils.WORK_FILENAME_PROPERTY, inputName);
    Utils.configureSampling(outputDir, inputName, conf);

    final Job sortJob = new Job(conf);
    sortJob.setJarByClass(Summarize.class);

    // Processing classes: the base Mapper class is used as-is; ordering
    // across reducers is delegated to the TotalOrderPartitioner.
    sortJob.setMapperClass(Mapper.class);
    sortJob.setPartitionerClass(TotalOrderPartitioner.class);
    sortJob.setReducerClass(SortReducer.class);

    // Key/value types.
    sortJob.setMapOutputKeyClass(LongWritable.class);
    sortJob.setOutputKeyClass(NullWritable.class);
    sortJob.setOutputValueClass(Text.class);

    // Input and output formats and locations.
    sortJob.setInputFormatClass(SortInputFormat.class);
    FileInputFormat.setInputPaths(sortJob, inputFile);
    sortJob.setOutputFormatClass(SortOutputFormat.class);
    FileOutputFormat.setOutputPath(sortJob, outputDir);

    final Timer samplingTimer = new Timer();
    System.out.printf("%s :: Sampling%s...\n", commandName, samplingInfo);
    samplingTimer.start();

    // First sampler argument: at least 65536, or the configured reduce task
    // count if that is larger. Second sampler argument is fixed at 10.
    final int reduceTasks = conf.getInt("mapred.reduce.tasks", 1);
    final InputSampler.SplitSampler<LongWritable, Text> sampler =
        new InputSampler.SplitSampler<LongWritable, Text>(
            Math.max(1 << 16, reduceTasks), 10);

    InputSampler.<LongWritable, Text>writePartitionFile(sortJob, sampler);

    System.out.printf(
        "%s :: Sampling complete in %d.%03d s.\n",
        commandName, samplingTimer.stopS(), samplingTimer.fms());

    sortJob.submit();
    return sortJob;
}
@Override protected int run(CmdLineParser parser) { final List<String> args = parser.getRemainingArgs(); if (args.isEmpty()) { System.err.println("summarysort :: WORKDIR not given."); return 3; } if (args.size() == 1) { System.err.println("summarysort :: INPATH not given."); return 3; } if (!cacheAndSetProperties(parser)) return 3; final Path wrkDir = new Path(args.get(0)), in = new Path(args.get(1)); final Configuration conf = getConf(); final Timer t = new Timer(); try { final Job job = sortOne(conf, in, wrkDir, "summarysort", ""); System.out.printf("summarysort :: Waiting for job completion...\n"); t.start(); if (!job.waitForCompletion(verbose)) { System.err.println("summarysort :: Job failed."); return 4; } System.out.printf("summarysort :: Job complete in %d.%03d s.\n", t.stopS(), t.fms()); } catch (IOException e) { System.err.printf("summarysort :: Hadoop error: %s\n", e); return 4; } catch (ClassNotFoundException e) { throw new RuntimeException(e); } catch (InterruptedException e) { throw new RuntimeException(e); } if (outPath != null) try { System.out.println("summarysort :: Merging output..."); t.start(); final FileSystem dstFS = outPath.getFileSystem(conf); final OutputStream outs = dstFS.create(outPath); Utils.mergeInto(outs, wrkDir, "", "", conf, "summarysort"); // Remember the BGZF terminator. outs.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK); outs.close(); System.out.printf("summarysort :: Merging complete in %d.%03d s.\n", t.stopS(), t.fms()); } catch (IOException e) { System.err.printf("summarysort :: Output merging failed: %s\n", e); return 5; } return 0; }