Beispiel #1
0
  /**
   * Driver for InputSampler from the command line. Configures a {@link Job} from the arguments
   * and calls {@link #writePartitionFile}.
   *
   * <p>Recognized options:
   *
   * <ul>
   *   <li>{@code -r <reduces>}: number of reduce tasks (must end up greater than one)
   *   <li>{@code -inFormat <class>}: input format class
   *   <li>{@code -keyClass <class>}: map output key class
   *   <li>{@code -splitSample <numSamples> <maxSplits>}: use a SplitSampler
   *   <li>{@code -splitRandom <pcnt> <numSamples> <maxSplits>}: use a RandomSampler
   *   <li>{@code -splitInterval <pcnt> <maxSplits>}: use an IntervalSampler
   * </ul>
   *
   * <p>A {@code maxSplits} value of zero or less is replaced by {@link Integer#MAX_VALUE}. Every
   * other argument is positional: one or more input paths followed by the partition file path,
   * which must come last. When no sampler option is given, a RandomSampler(0.1, 10000, 10) is
   * used.
   *
   * @param args command-line arguments as described above
   * @return 0 on success, otherwise the result of {@code printUsage()}
   * @throws Exception if a named class cannot be loaded or sampling/writing fails
   */
  public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    ArrayList<String> otherArgs = new ArrayList<String>();
    Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
      // Remember the token that started this iteration: operands are consumed with ++i, so by
      // the time a missing operand is detected args[i - 1] may be an operand, not the option.
      final String option = args[i];
      try {
        if ("-r".equals(option)) {
          job.setNumReduceTasks(Integer.parseInt(args[++i]));
        } else if ("-inFormat".equals(option)) {
          job.setInputFormatClass(Class.forName(args[++i]).asSubclass(InputFormat.class));
        } else if ("-keyClass".equals(option)) {
          job.setMapOutputKeyClass(Class.forName(args[++i]).asSubclass(WritableComparable.class));
        } else if ("-splitSample".equals(option)) {
          int numSamples = Integer.parseInt(args[++i]);
          int maxSplits = Integer.parseInt(args[++i]);
          if (0 >= maxSplits) maxSplits = Integer.MAX_VALUE;
          sampler = new SplitSampler<K, V>(numSamples, maxSplits);
        } else if ("-splitRandom".equals(option)) {
          double pcnt = Double.parseDouble(args[++i]);
          int numSamples = Integer.parseInt(args[++i]);
          int maxSplits = Integer.parseInt(args[++i]);
          if (0 >= maxSplits) maxSplits = Integer.MAX_VALUE;
          sampler = new RandomSampler<K, V>(pcnt, numSamples, maxSplits);
        } else if ("-splitInterval".equals(option)) {
          double pcnt = Double.parseDouble(args[++i]);
          int maxSplits = Integer.parseInt(args[++i]);
          if (0 >= maxSplits) maxSplits = Integer.MAX_VALUE;
          sampler = new IntervalSampler<K, V>(pcnt, maxSplits);
        } else {
          otherArgs.add(option);
        }
      } catch (NumberFormatException except) {
        // i still points at the operand that failed to parse.
        System.out.println("ERROR: Integer expected instead of " + args[i]);
        return printUsage();
      } catch (ArrayIndexOutOfBoundsException except) {
        System.out.println("ERROR: Required parameter missing from " + option);
        return printUsage();
      }
    }
    if (job.getNumReduceTasks() <= 1) {
      System.err.println("Sampler requires more than one reducer");
      return printUsage();
    }
    if (otherArgs.size() < 2) {
      System.out.println("ERROR: Wrong number of parameters: ");
      return printUsage();
    }
    if (null == sampler) {
      sampler = new RandomSampler<K, V>(0.1, 10000, 10);
    }

    // The last positional argument is the partition file; the rest are input paths.
    Path outf = new Path(otherArgs.remove(otherArgs.size() - 1));
    // The sampling step is handed only the Job, and Job operates on its own copy of the
    // configuration given to its constructor, so the destination has to be recorded on the
    // job's configuration to take effect.
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), outf);
    // Also kept on the tool's configuration for callers that read the setting back from it.
    TotalOrderPartitioner.setPartitionFile(getConf(), outf);
    for (String s : otherArgs) {
      FileInputFormat.addInputPath(job, new Path(s));
    }
    InputSampler.<K, V>writePartitionFile(job, sampler);

    return 0;
  }
Beispiel #2
0
 /**
  * Write a partition file for the given job, using the Sampler provided. Queries the sampler for a
  * sample keyset, sorts by the job's sort comparator, selects the keys for each rank, and writes
  * to the destination returned from {@link TotalOrderPartitioner#getPartitionFile}.
  *
  * <p>Any existing file at the destination is deleted first. One key is written per partition
  * boundary, i.e. {@code job.getNumReduceTasks() - 1} keys, each paired with a NullWritable value.
  *
  * @param job job supplying the configuration, input format, reducer count, sort comparator and
  *     map output key class
  * @param sampler sampler used to obtain the candidate keys
  * @throws IOException if the partition file cannot be replaced or written
  * @throws ClassNotFoundException declared for the job/sampler lookups used here
  * @throws InterruptedException declared for the sampling call
  */
 @SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
 public static <K, V> void writePartitionFile(Job job, Sampler<K, V> sampler)
     throws IOException, ClassNotFoundException, InterruptedException {
   Configuration conf = job.getConfiguration();
   final InputFormat inf = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
   int numPartitions = job.getNumReduceTasks();
   K[] samples = sampler.getSample(inf, job);
   RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
   Arrays.sort(samples, comparator);
   Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf));
   FileSystem fs = dst.getFileSystem(conf);
   if (fs.exists(dst)) {
     fs.delete(dst, false);
   }
   SequenceFile.Writer writer =
       SequenceFile.createWriter(fs, conf, dst, job.getMapOutputKeyClass(), NullWritable.class);
   try {
     NullWritable nullValue = NullWritable.get();
     // Fractional distance between consecutive split points in the sorted sample.
     float stepSize = samples.length / (float) numPartitions;
     int last = -1;
     for (int i = 1; i < numPartitions; ++i) {
       int k = Math.round(stepSize * i);
       // Skip forward past keys equal to the previously written split point so the same key
       // is not emitted twice.
       // NOTE(review): k is advanced without a bounds check; a sample with too few distinct
       // keys (or an empty sample) makes samples[k] throw ArrayIndexOutOfBoundsException.
       while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
         ++k;
       }
       writer.append(samples[k], nullValue);
       last = k;
     }
   } finally {
     // Close even when selection or append fails so the file handle is not leaked.
     writer.close();
   }
 }
 /**
  * Writes the given split keys to {@code <test.build.data>/<testname>/_partition.lst} on the
  * local file system (the base directory defaults to {@code /tmp}), registers that path as the
  * partition file on {@code conf}, and sets the reducer count to {@code splits.length + 1}.
  *
  * @param testname name of the sub-directory that holds the partition file
  * @param conf job configuration to update
  * @param splits split keys, written in array order; the key class is taken from the first one
  * @return the path of the partition file that was written
  * @throws IOException if the file cannot be created or written
  */
 private static <T extends WritableComparable> Path writePartitionFile(
     String testname, JobConf conf, T[] splits) throws IOException {
   final FileSystem localFs = FileSystem.getLocal(conf);
   final Path baseDir =
       new Path(System.getProperty("test.build.data", "/tmp")).makeQualified(localFs);
   final Path partitionFile = new Path(baseDir, testname + "/_partition.lst");
   TotalOrderPartitioner.setPartitionFile(conf, partitionFile);
   conf.setNumReduceTasks(splits.length + 1);

   SequenceFile.Writer writer = null;
   try {
     final NullWritable noValue = NullWritable.get();
     writer =
         SequenceFile.createWriter(
             localFs,
             conf,
             partitionFile,
             splits[0].getClass(),
             NullWritable.class,
             SequenceFile.CompressionType.NONE);
     for (T split : splits) {
       writer.append(split, noValue);
     }
   } finally {
     // The writer is still null when creation itself failed.
     if (writer != null) {
       writer.close();
     }
   }
   return partitionFile;
 }
  /**
   * Runs a job that reads sequence files, partitions with TotalOrderPartitioner, and writes
   * block-compressed (gzip) sequence files keyed by IntWritable.
   *
   * <p>Before submission a RandomSampler(0.1, 10000, 10) is used to write a partition file named
   * {@code _partitions} under the first input path, and that file is added to the distributed
   * cache with a symlink of the same name.
   *
   * @param args command-line arguments handed to {@code JobBuilder.parseInputAndOutput}
   * @return -1 when the arguments could not be turned into a job configuration, 0 otherwise
   * @throws Exception if sampling, cache registration or the job itself fails
   */
  @Override
  public int run(String[] args) throws Exception {
    final JobConf jobConf = JobBuilder.parseInputAndOutput(this, getConf(), args);
    if (null == jobConf) {
      return -1;
    }

    // Input and output formats.
    jobConf.setInputFormat(SequenceFileInputFormat.class);
    jobConf.setOutputKeyClass(IntWritable.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);

    // Output compression.
    SequenceFileOutputFormat.setCompressOutput(jobConf, true);
    SequenceFileOutputFormat.setOutputCompressorClass(jobConf, GzipCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(jobConf, CompressionType.BLOCK);

    jobConf.setPartitionerClass(TotalOrderPartitioner.class);

    final InputSampler.Sampler<IntWritable, Text> keySampler =
        new InputSampler.RandomSampler<IntWritable, Text>(0.1, 10000, 10);

    // The partition file lives under the fully qualified first input path.
    final Path firstInput = FileInputFormat.getInputPaths(jobConf)[0];
    final Path qualifiedInput = firstInput.makeQualified(firstInput.getFileSystem(jobConf));
    final Path partitionsPath = new Path(qualifiedInput, "_partitions");

    TotalOrderPartitioner.setPartitionFile(jobConf, partitionsPath);
    InputSampler.writePartitionFile(jobConf, keySampler);

    // Register the partition file in the DistributedCache; the URI fragment names the symlink.
    final URI cacheUri = new URI(partitionsPath.toString() + "#_partitions");
    DistributedCache.addCacheFile(cacheUri, jobConf);
    DistributedCache.createSymlink(jobConf);

    JobClient.runJob(jobConf);
    return 0;
  }
 /**
  * Writes {@code splitStrings} as the partition file, configures a TotalOrderPartitioner with
  * Text map output keys, and checks that every entry of {@code testStrings} is assigned its
  * expected partition. The partition file is deleted afterwards.
  */
 public void testTotalOrderMemCmp() throws Exception {
   final JobConf conf = new JobConf();
   final TotalOrderPartitioner<Text, NullWritable> totalOrder =
       new TotalOrderPartitioner<Text, NullWritable>();
   final Path partitionFile =
       TestTotalOrderPartitioner.<Text>writePartitionFile("totalordermemcmp", conf, splitStrings);
   conf.setMapOutputKeyClass(Text.class);
   try {
     totalOrder.configure(conf);
     final NullWritable noValue = NullWritable.get();
     final int numPartitions = splitStrings.length + 1;
     for (Check<Text> expected : testStrings) {
       final int actualPart = totalOrder.getPartition(expected.data, noValue, numPartitions);
       assertEquals(expected.data.toString(), expected.part, actualPart);
     }
   } finally {
     partitionFile.getFileSystem(conf).delete(partitionFile);
   }
 }
 /**
  * Checks TotalOrderPartitioner with a custom key comparator: the split keys are re-sorted with
  * ReverseStringComparator, written as the partition file, and the job is configured with
  * natural ordering disabled and the same comparator as output key comparator. Each sample key
  * must then land in the expected partition. The partition file is deleted afterwards.
  */
 public void testTotalOrderCustomComparator() throws Exception {
   final JobConf conf = new JobConf();
   final TotalOrderPartitioner<Text, NullWritable> totalOrder =
       new TotalOrderPartitioner<Text, NullWritable>();

   // Sort a copy so the shared splitStrings array keeps its original order.
   final Text[] reversedSplits = Arrays.copyOf(splitStrings, splitStrings.length);
   Arrays.sort(reversedSplits, new ReverseStringComparator());
   final Path partitionFile =
       TestTotalOrderPartitioner.<Text>writePartitionFile(
           "totalordercustomcomparator", conf, reversedSplits);

   conf.setBoolean("total.order.partitioner.natural.order", false);
   conf.setMapOutputKeyClass(Text.class);
   conf.setOutputKeyComparatorClass(ReverseStringComparator.class);

   // Expected partition for each key; the two arrays are read in parallel.
   final String[] keys = {
     "aaaaa", "aaabb", "aabbb", "aaaaa", "babbb", "baabb", "yai", "yak", "z", "ddngo", "hi"
   };
   final int[] parts = {9, 9, 9, 9, 8, 8, 1, 1, 0, 4, 3};
   final ArrayList<Check<Text>> expectations = new ArrayList<Check<Text>>();
   for (int idx = 0; idx < keys.length; ++idx) {
     expectations.add(new Check<Text>(new Text(keys[idx]), parts[idx]));
   }

   try {
     totalOrder.configure(conf);
     final NullWritable noValue = NullWritable.get();
     final int numPartitions = splitStrings.length + 1;
     for (Check<Text> expected : expectations) {
       final int actualPart = totalOrder.getPartition(expected.data, noValue, numPartitions);
       assertEquals(expected.data.toString(), expected.part, actualPart);
     }
   } finally {
     partitionFile.getFileSystem(conf).delete(partitionFile);
   }
 }