/**
  * Use the input splits to take samples of the input and generate sample keys. By default reads
  * 100,000 keys from 10 locations in the input, sorts them and picks N-1 keys to generate N
  * equally sized partitions.
  *
  * @param conf the job to sample
  * @param partFile where to write the output file to
  * @throws IOException if something goes wrong
  */
 public static void writePartitionFile(JobConf conf, Path partFile) throws IOException {
   TeraInputFormat inFormat = new TeraInputFormat();
   TextSampler sampler = new TextSampler();
   Text key = new Text();
   Text value = new Text();
   int partitions = conf.getNumReduceTasks();
   long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
   InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
   int samples = Math.min(10, splits.length);
   long recordsPerSample = sampleSize / samples;
   int sampleStep = splits.length / samples;
   long records = 0;
   // take N samples from different parts of the input
   for (int i = 0; i < samples; ++i) {
     RecordReader<Text, Text> reader =
         inFormat.getRecordReader(splits[sampleStep * i], conf, null);
     while (reader.next(key, value)) {
       sampler.addKey(key);
       records += 1;
       if ((i + 1) * recordsPerSample <= records) {
         break;
       }
     }
   }
   FileSystem outFs = partFile.getFileSystem(conf);
   if (outFs.exists(partFile)) {
     outFs.delete(partFile, false);
   }
   SequenceFile.Writer writer =
       SequenceFile.createWriter(outFs, conf, partFile, Text.class, NullWritable.class);
   NullWritable nullValue = NullWritable.get();
   for (Text split : sampler.createPartitions(partitions)) {
     writer.append(split, nullValue);
   }
   writer.close();
 }
Example #2
0
  /**
   * Use the input splits to take samples of the input and generate sample keys. By default reads
   * 100,000 keys from 10 locations in the input, sorts them and picks N-1 keys to generate N
   * equally sized partitions.
   *
   * @param job the job to sample
   * @param partFile where to write the output file to
   * @throws IOException if something goes wrong
   */
  public static void writePartitionFile(final JobContext job, Path partFile)
      throws IOException, InterruptedException {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
      final int idx = i;
      samplerReader[i] =
          new Thread("Sampler Reader " + idx) {
            {
              setDaemon(true);
            }

            public void run() {
              long records = 0;
              try {
                TaskAttemptContext context =
                    new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
                RecordReader<Text, Text> reader =
                    inFormat.createRecordReader(splits.get(sampleStep * idx), context);
                reader.initialize(splits.get(sampleStep * idx), context);
                while (reader.nextKeyValue()) {
                  sampler.addKey(new Text(reader.getCurrentKey()));
                  records += 1;
                  if (recordsPerSample <= records) {
                    break;
                  }
                }
              } catch (IOException ie) {
                System.err.println(
                    "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                System.exit(-1);
              } catch (InterruptedException e) {

              }
            }
          };
      samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer =
        outFs.create(partFile, true, 64 * 1024, (short) 10, outFs.getDefaultBlockSize());
    for (int i = 0; i < samples; i++) {
      try {
        samplerReader[i].join();
      } catch (InterruptedException e) {
      }
    }
    for (Text split : sampler.createPartitions(partitions)) {
      split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing parititions took " + (t3 - t2) + "ms");
  }