/**
 * Driver for InputSampler from the command line. Configures a {@link Job} from the parsed
 * arguments and calls {@link #writePartitionFile}.
 *
 * <p>Recognized options:
 *
 * <ul>
 *   <li>{@code -r <reduces>}: number of reduce tasks (must end up greater than one)
 *   <li>{@code -inFormat <class>}: input format class name
 *   <li>{@code -keyClass <class>}: map output key class name
 *   <li>{@code -splitSample <numSamples> <maxSplits>}: use a SplitSampler
 *   <li>{@code -splitRandom <pcnt> <numSamples> <maxSplits>}: use a RandomSampler
 *   <li>{@code -splitInterval <pcnt> <maxSplits>}: use an IntervalSampler
 * </ul>
 *
 * <p>A non-positive {@code maxSplits} is treated as {@code Integer.MAX_VALUE}. All remaining
 * arguments are input paths, except the last one, which is the partition file to write. When no
 * sampler option is given, {@code RandomSampler(0.1, 10000, 10)} is used.
 *
 * @param args command-line arguments
 * @return 0 on success; otherwise whatever {@code printUsage()} returns
 * @throws Exception if a named class cannot be loaded or sampling/writing fails
 */
public int run(String[] args) throws Exception {
  Job job = new Job(getConf());
  ArrayList<String> otherArgs = new ArrayList<String>();
  Sampler<K, V> sampler = null;
  for (int i = 0; i < args.length; ++i) {
    try {
      if ("-r".equals(args[i])) {
        job.setNumReduceTasks(Integer.parseInt(args[++i]));
      } else if ("-inFormat".equals(args[i])) {
        job.setInputFormatClass(Class.forName(args[++i]).asSubclass(InputFormat.class));
      } else if ("-keyClass".equals(args[i])) {
        job.setMapOutputKeyClass(Class.forName(args[++i]).asSubclass(WritableComparable.class));
      } else if ("-splitSample".equals(args[i])) {
        int numSamples = Integer.parseInt(args[++i]);
        int maxSplits = Integer.parseInt(args[++i]);
        if (0 >= maxSplits) {
          maxSplits = Integer.MAX_VALUE;
        }
        sampler = new SplitSampler<K, V>(numSamples, maxSplits);
      } else if ("-splitRandom".equals(args[i])) {
        double pcnt = Double.parseDouble(args[++i]);
        int numSamples = Integer.parseInt(args[++i]);
        int maxSplits = Integer.parseInt(args[++i]);
        if (0 >= maxSplits) {
          maxSplits = Integer.MAX_VALUE;
        }
        sampler = new RandomSampler<K, V>(pcnt, numSamples, maxSplits);
      } else if ("-splitInterval".equals(args[i])) {
        double pcnt = Double.parseDouble(args[++i]);
        int maxSplits = Integer.parseInt(args[++i]);
        if (0 >= maxSplits) {
          maxSplits = Integer.MAX_VALUE;
        }
        sampler = new IntervalSampler<K, V>(pcnt, maxSplits);
      } else {
        // Anything that is not a recognized option is an input path or the output file.
        otherArgs.add(args[i]);
      }
    } catch (NumberFormatException except) {
      System.out.println("ERROR: Integer expected instead of " + args[i]);
      return printUsage();
    } catch (ArrayIndexOutOfBoundsException except) {
      // A "++i" ran off the end of args, so the option at the tail is missing a value.
      System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
      return printUsage();
    }
  }
  if (job.getNumReduceTasks() <= 1) {
    System.err.println("Sampler requires more than one reducer");
    return printUsage();
  }
  if (otherArgs.size() < 2) {
    System.out.println("ERROR: Wrong number of parameters: ");
    return printUsage();
  }
  if (null == sampler) {
    sampler = new RandomSampler<K, V>(0.1, 10000, 10);
  }

  // The last positional argument is the partition file; everything before it is input.
  Path outf = new Path(otherArgs.remove(otherArgs.size() - 1));
  // writePartitionFile reads the destination from the job's own configuration. Per the Hadoop
  // Job API docs the Job works on a copy of the Configuration it was created from, so the path
  // must be set on job.getConfiguration() or the requested destination is silently ignored.
  TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), outf);
  // Kept for backward compatibility: callers inspecting getConf() afterwards still see the path.
  TotalOrderPartitioner.setPartitionFile(getConf(), outf);
  for (String s : otherArgs) {
    FileInputFormat.addInputPath(job, new Path(s));
  }
  InputSampler.<K, V>writePartitionFile(job, sampler);

  return 0;
}
/** * Write a partition file for the given job, using the Sampler provided. Queries the sampler for a * sample keyset, sorts by the output key comparator, selects the keys for each rank, and writes * to the destination returned from {@link TotalOrderPartitioner#getPartitionFile}. */ @SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator public static <K, V> void writePartitionFile(Job job, Sampler<K, V> sampler) throws IOException, ClassNotFoundException, InterruptedException { Configuration conf = job.getConfiguration(); final InputFormat inf = ReflectionUtils.newInstance(job.getInputFormatClass(), conf); int numPartitions = job.getNumReduceTasks(); K[] samples = sampler.getSample(inf, job); RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator(); Arrays.sort(samples, comparator); Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf)); FileSystem fs = dst.getFileSystem(conf); if (fs.exists(dst)) { fs.delete(dst, false); } SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, dst, job.getMapOutputKeyClass(), NullWritable.class); NullWritable nullValue = NullWritable.get(); float stepSize = samples.length / (float) numPartitions; int last = -1; for (int i = 1; i < numPartitions; ++i) { int k = Math.round(stepSize * i); while (last >= k && comparator.compare(samples[last], samples[k]) == 0) { ++k; } writer.append(samples[k], nullValue); last = k; } writer.close(); }
/**
 * Writes the given split keys to a partition file on the local file system and points the
 * supplied configuration at it.
 *
 * <p>The file is placed at {@code <test.build.data>/<testname>/_partition.lst}, with
 * {@code /tmp} used when the {@code test.build.data} system property is unset. The number of
 * reduce tasks on {@code conf} is set to {@code splits.length + 1}.
 *
 * @param testname sub-directory name used to keep test files apart
 * @param conf configuration that receives the partition file path and reducer count
 * @param splits split keys to write, in the order given; the key class is taken from the first
 * @return path of the partition file that was written
 * @throws IOException if the file cannot be created or written
 */
private static <T extends WritableComparable> Path writePartitionFile(
    String testname, JobConf conf, T[] splits) throws IOException {
  final FileSystem localFs = FileSystem.getLocal(conf);
  final String baseDir = System.getProperty("test.build.data", "/tmp");
  final Path qualifiedDir = new Path(baseDir).makeQualified(localFs);
  final Path partitionFile = new Path(qualifiedDir, testname + "/_partition.lst");

  TotalOrderPartitioner.setPartitionFile(conf, partitionFile);
  conf.setNumReduceTasks(splits.length + 1);

  SequenceFile.Writer out = null;
  try {
    out =
        SequenceFile.createWriter(
            localFs,
            conf,
            partitionFile,
            splits[0].getClass(),
            NullWritable.class,
            SequenceFile.CompressionType.NONE);
    final NullWritable nothing = NullWritable.get();
    for (T split : splits) {
      out.append(split, nothing);
    }
  } finally {
    if (out != null) {
      out.close();
    }
  }
  return partitionFile;
}
@Override public int run(String[] args) throws Exception { JobConf conf = JobBuilder.parseInputAndOutput(this, getConf(), args); if (conf == null) { return -1; } conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setCompressOutput(conf, true); SequenceFileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class); SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); conf.setPartitionerClass(TotalOrderPartitioner.class); InputSampler.Sampler<IntWritable, Text> sampler = new InputSampler.RandomSampler<IntWritable, Text>(0.1, 10000, 10); Path input = FileInputFormat.getInputPaths(conf)[0]; input = input.makeQualified(input.getFileSystem(conf)); Path partitionFile = new Path(input, "_partitions"); TotalOrderPartitioner.setPartitionFile(conf, partitionFile); InputSampler.writePartitionFile(conf, sampler); // Add to DistributedCache URI partitionUri = new URI(partitionFile.toString() + "#_partitions"); DistributedCache.addCacheFile(partitionUri, conf); DistributedCache.createSymlink(conf); JobClient.runJob(conf); return 0; }
/**
 * Writes {@code splitStrings} as the partition file, configures a {@link TotalOrderPartitioner}
 * with {@link Text} map output keys, and checks that every entry of {@code testStrings} is
 * assigned to its expected partition. The partition file is removed afterwards.
 */
public void testTotalOrderMemCmp() throws Exception {
  JobConf conf = new JobConf();
  TotalOrderPartitioner<Text, NullWritable> totalOrder =
      new TotalOrderPartitioner<Text, NullWritable>();
  Path partitionFile =
      TestTotalOrderPartitioner.<Text>writePartitionFile("totalordermemcmp", conf, splitStrings);
  conf.setMapOutputKeyClass(Text.class);
  try {
    totalOrder.configure(conf);
    final NullWritable noValue = NullWritable.get();
    for (Check<Text> expected : testStrings) {
      String label = expected.data.toString();
      int wanted = expected.part;
      int actual = totalOrder.getPartition(expected.data, noValue, splitStrings.length + 1);
      assertEquals(label, wanted, actual);
    }
  } finally {
    // Always clean up the partition file, pass or fail.
    FileSystem partitionFs = partitionFile.getFileSystem(conf);
    partitionFs.delete(partitionFile);
  }
}
/**
 * Checks partition assignment when a custom sort comparator is in effect.
 *
 * <p>A copy of {@code splitStrings} is sorted with {@code ReverseStringComparator} and written
 * as the partition file. The job is configured with
 * {@code total.order.partitioner.natural.order=false}, {@link Text} map output keys and
 * {@code ReverseStringComparator} as the output key comparator. Each key listed below must land
 * in the partition listed alongside it. The partition file is removed afterwards.
 */
public void testTotalOrderCustomComparator() throws Exception {
  JobConf conf = new JobConf();
  TotalOrderPartitioner<Text, NullWritable> totalOrder =
      new TotalOrderPartitioner<Text, NullWritable>();

  Text[] reversedSplits = Arrays.copyOf(splitStrings, splitStrings.length);
  Arrays.sort(reversedSplits, new ReverseStringComparator());
  Path partitionFile =
      TestTotalOrderPartitioner.<Text>writePartitionFile(
          "totalordercustomcomparator", conf, reversedSplits);

  conf.setBoolean("total.order.partitioner.natural.order", false);
  conf.setMapOutputKeyClass(Text.class);
  conf.setOutputKeyComparatorClass(ReverseStringComparator.class);

  // Keys and the partitions they are expected to fall into, position by position.
  final String[] keys = {
    "aaaaa", "aaabb", "aabbb", "aaaaa", "babbb", "baabb", "yai", "yak", "z", "ddngo", "hi"
  };
  final int[] partitions = {9, 9, 9, 9, 8, 8, 1, 1, 0, 4, 3};
  ArrayList<Check<Text>> expectations = new ArrayList<Check<Text>>();
  for (int idx = 0; idx < keys.length; ++idx) {
    expectations.add(new Check<Text>(new Text(keys[idx]), partitions[idx]));
  }

  try {
    totalOrder.configure(conf);
    final NullWritable noValue = NullWritable.get();
    for (Check<Text> expected : expectations) {
      String label = expected.data.toString();
      int wanted = expected.part;
      int actual = totalOrder.getPartition(expected.data, noValue, splitStrings.length + 1);
      assertEquals(label, wanted, actual);
    }
  } finally {
    // Always clean up the partition file, pass or fail.
    FileSystem partitionFs = partitionFile.getFileSystem(conf);
    partitionFs.delete(partitionFile);
  }
}