Example #1
0
  /**
   * Configures a sort job for a single input file, samples the input to write the partition file
   * used by {@link TotalOrderPartitioner}, and submits the job without waiting for it to finish.
   *
   * <p>Progress messages, including the time spent sampling, are printed to standard output.
   *
   * @param conf configuration the job is created from; this method sets the work filename
   *     property on it and passes it to {@code Utils.configureSampling}
   * @param inputFile the single file used as the job's input path
   * @param outputDir directory used as the job's output path
   * @param commandName prefix used in the progress messages
   * @param samplingInfo text appended directly after the word "Sampling" in the first message
   * @return the job, already submitted but not necessarily complete
   * @throws IOException propagated from job setup, sampling or submission
   * @throws ClassNotFoundException propagated from sampling or submission
   * @throws InterruptedException propagated from sampling or submission
   */
  /*package*/ static Job sortOne(
      Configuration conf, Path inputFile, Path outputDir, String commandName, String samplingInfo)
      throws IOException, ClassNotFoundException, InterruptedException {
    // These configuration updates are made before the Job is constructed so that the job is
    // created from the already-updated configuration.
    final String inputName = inputFile.getName();
    conf.set(Utils.WORK_FILENAME_PROPERTY, inputName);
    Utils.configureSampling(outputDir, inputName, conf);

    final Job sortJob = new Job(conf);
    sortJob.setJarByClass(Summarize.class);

    // Input and output locations and formats.
    sortJob.setInputFormatClass(SortInputFormat.class);
    FileInputFormat.setInputPaths(sortJob, inputFile);
    sortJob.setOutputFormatClass(SortOutputFormat.class);
    FileOutputFormat.setOutputPath(sortJob, outputDir);

    // Map side: the base Mapper class is used as-is, with records partitioned in total order.
    sortJob.setMapperClass(Mapper.class);
    sortJob.setMapOutputKeyClass(LongWritable.class);
    sortJob.setPartitionerClass(TotalOrderPartitioner.class);

    // Reduce side.
    sortJob.setReducerClass(SortReducer.class);
    sortJob.setOutputKeyClass(NullWritable.class);
    sortJob.setOutputValueClass(Text.class);

    final Timer samplingTimer = new Timer();

    System.out.printf("%s :: Sampling%s...\n", commandName, samplingInfo);
    samplingTimer.start();

    // Use at least 65536 as the first sampler argument, or the configured reduce task count if
    // that is larger; the second sampler argument is fixed at 10.
    final int configuredReducers = conf.getInt("mapred.reduce.tasks", 1);
    final InputSampler.SplitSampler<LongWritable, Text> sampler =
        new InputSampler.SplitSampler<LongWritable, Text>(
            Math.max(1 << 16, configuredReducers), 10);
    InputSampler.<LongWritable, Text>writePartitionFile(sortJob, sampler);

    System.out.printf(
        "%s :: Sampling complete in %d.%03d s.\n",
        commandName,
        samplingTimer.stopS(),
        samplingTimer.fms());

    sortJob.submit();
    return sortJob;
  }
Example #2
0
  /**
   * Runs the "summarysort" command: sorts INPATH into WORKDIR with a MapReduce job and, when an
   * output path has been configured, merges the job output into that single file followed by an
   * empty BGZF block.
   *
   * <p>The first remaining argument is WORKDIR and the second is INPATH.
   *
   * @param parser parsed command line; its remaining arguments supply WORKDIR and INPATH
   * @return 0 on success; 3 if an argument is missing or {@code cacheAndSetProperties} fails; 4 if
   *     the job fails or a Hadoop I/O error occurs; 5 if merging the output fails
   * @throws RuntimeException wrapping a {@link ClassNotFoundException} or an
   *     {@link InterruptedException} raised while setting up or waiting for the job
   */
  @Override
  protected int run(CmdLineParser parser) {

    final List<String> args = parser.getRemainingArgs();
    if (args.isEmpty()) {
      System.err.println("summarysort :: WORKDIR not given.");
      return 3;
    }
    if (args.size() == 1) {
      System.err.println("summarysort :: INPATH not given.");
      return 3;
    }
    if (!cacheAndSetProperties(parser)) {
      return 3;
    }

    final Path wrkDir = new Path(args.get(0));
    final Path in = new Path(args.get(1));

    final Configuration conf = getConf();
    final Timer t = new Timer();

    try {
      final Job job = sortOne(conf, in, wrkDir, "summarysort", "");

      System.out.printf("summarysort :: Waiting for job completion...\n");
      t.start();

      if (!job.waitForCompletion(verbose)) {
        System.err.println("summarysort :: Job failed.");
        return 4;
      }
      System.out.printf("summarysort :: Job complete in %d.%03d s.\n", t.stopS(), t.fms());

    } catch (IOException e) {
      System.err.printf("summarysort :: Hadoop error: %s\n", e);
      return 4;
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e);
    } catch (InterruptedException e) {
      // Restore the interrupt status so code further up the stack can still observe it; the
      // exception type seen by callers is unchanged.
      Thread.currentThread().interrupt();
      throw new RuntimeException(e);
    }

    if (outPath != null) {
      try {
        System.out.println("summarysort :: Merging output...");
        t.start();

        final FileSystem dstFS = outPath.getFileSystem(conf);

        final OutputStream outs = dstFS.create(outPath);
        try {
          Utils.mergeInto(outs, wrkDir, "", "", conf, "summarysort");

          // Remember the BGZF terminator.
          outs.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
        } finally {
          // Always release the stream, even if merging or writing the terminator failed. A
          // failure in close() is still reported by the enclosing catch as a merge failure.
          outs.close();
        }

        System.out.printf("summarysort :: Merging complete in %d.%03d s.\n", t.stopS(), t.fms());

      } catch (IOException e) {
        System.err.printf("summarysort :: Output merging failed: %s\n", e);
        return 5;
      }
    }

    return 0;
  }