/**
 * Constructs the DelegatingRecordReader, instantiating the wrapped InputFormat by reflection.
 *
 * @param split TaggedInputSplit object
 * @param context TaskAttemptContext object
 * @throws IOException
 * @throws InterruptedException
 */
@SuppressWarnings("unchecked")
 public DelegatingRecordReader(InputSplit split, TaskAttemptContext context)
     throws IOException, InterruptedException {
   // Find the InputFormat and then the RecordReader from the TaggedInputSplit.
   TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
   InputFormat<K, V> inputFormat =
       (InputFormat<K, V>)
           ReflectionUtils.newInstance(
               taggedInputSplit.getInputFormatClass(), context.getConfiguration());
   originalRR = inputFormat.createRecordReader(taggedInputSplit.getInputSplit(), context);
 }
 /**
  * Constructs the DelegatingRecordReader.
  *
   * @param split TaggedInputSplit object
  * @param context TaskAttemptContext object
  * @throws IOException
  * @throws InterruptedException
  */
 @SuppressWarnings("unchecked")
 public DelegatingRecordReader(InputSplit split, TaskAttemptContext context)
     throws IOException, InterruptedException {
   // Find the InputFormat and then the RecordReader from the
   // TaggedInputSplit.
   TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
   InputFormat<K, V> inputFormat =
       (InputFormat<K, V>)
           DCUtils.loadSerializedObjectInDC(
               context.getConfiguration(),
               InputFormat.class,
               taggedInputSplit.getInputFormatFile(),
               true);
   originalRR = inputFormat.createRecordReader(taggedInputSplit.getInputSplit(), context);
 }
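These constructors are only ever handed TaggedInputSplit instances, which is what MultipleInputs produces once it installs a DelegatingInputFormat on the job. A minimal driver-side sketch of that wiring, with hypothetical paths and mapper classes, assuming the stock org.apache.hadoop.mapreduce.lib.input.MultipleInputs API:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class MultiInputDriverSketch {

  // Hypothetical mappers; only their class literals matter for the registration below.
  static class TextEventMapper extends Mapper<LongWritable, Text, Text, Text> {}
  static class SeqEventMapper extends Mapper<Text, Text, Text, Text> {}

  public static Job configure(Configuration conf) throws IOException {
    Job job = Job.getInstance(conf, "multi-input sketch");
    // MultipleInputs remembers one InputFormat (and optionally one Mapper) per path and
    // installs DelegatingInputFormat on the job. At runtime each split then arrives as a
    // TaggedInputSplit, and DelegatingRecordReader (above) re-creates the per-path
    // InputFormat to obtain the real RecordReader.
    MultipleInputs.addInputPath(job, new Path("/data/text"),
        TextInputFormat.class, TextEventMapper.class);
    MultipleInputs.addInputPath(job, new Path("/data/seq"),
        SequenceFileInputFormat.class, SeqEventMapper.class);
    return job;
  }
}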
Example #3
    /**
     * Randomize the split order, then take the specified number of keys from each split sampled,
     * where each key is selected with the specified probability and possibly replaced by a
     * subsequently selected key when the quota of keys from that split is satisfied.
     */
    @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
    public K[] getSample(InputFormat<K, V> inf, Job job) throws IOException, InterruptedException {
      List<InputSplit> splits = inf.getSplits(job);
      ArrayList<K> samples = new ArrayList<K>(numSamples);
      int splitsToSample = Math.min(maxSplitsSampled, splits.size());

      Random r = new Random();
      long seed = r.nextLong();
      r.setSeed(seed);
      // shuffle splits
      for (int i = 0; i < splits.size(); ++i) {
        InputSplit tmp = splits.get(i);
        int j = r.nextInt(splits.size());
        splits.set(i, splits.get(j));
        splits.set(j, tmp);
      }
      // our target rate is in terms of the maximum number of sample splits,
      // but we accept the possibility of sampling additional splits to hit
      // the target sample keyset
      for (int i = 0;
          i < splitsToSample || (i < splits.size() && samples.size() < numSamples);
          ++i) {
        RecordReader<K, V> reader =
            inf.createRecordReader(
                splits.get(i), new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID()));
        while (reader.nextKeyValue()) {
          if (r.nextDouble() <= freq) {
            if (samples.size() < numSamples) {
              samples.add(reader.getCurrentKey());
            } else {
              // When exceeding the maximum number of samples, replace a
              // random element with this one, then adjust the frequency
              // to reflect the possibility of existing elements being
              // pushed out
              int ind = r.nextInt(numSamples);
              if (ind != numSamples) {
                samples.set(ind, reader.getCurrentKey());
              }
              freq *= (numSamples - 1) / (double) numSamples;
            }
          }
        }
        reader.close();
      }
      return (K[]) samples.toArray();
    }
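In stock Hadoop this sampler lives in InputSampler, and the usual consumer of the returned keys is writePartitionFile, which seeds a TotalOrderPartitioner. A minimal sketch of that wiring, assuming the standard mapreduce.lib.partition classes and a job whose input keys are Text; the partition file path is supplied by the caller:

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderWiringSketch {

  /**
   * Samples roughly 10% of the input keys (at most 10000 keys from at most 10 splits),
   * writes the resulting partition boundaries, and points TotalOrderPartitioner at them.
   */
  public static void configureTotalOrder(Job job, Path partitionFile)
      throws IOException, ClassNotFoundException, InterruptedException {
    InputSampler.Sampler<Text, NullWritable> sampler =
        new InputSampler.RandomSampler<Text, NullWritable>(0.1, 10000, 10);
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
    InputSampler.writePartitionFile(job, sampler);
    job.setPartitionerClass(TotalOrderPartitioner.class);
  }
}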
  /**
   * Builds a standalone RecordReader over a local data file described by a COBOL copybook:
   * configures the local file system, wraps the whole file in a single FileSplit, and
   * initializes the reader outside of a running MapReduce job.
   */
  public static RecordReader getRecordReader(
      String cobolLocation, String datafileLocation, String delimiter, int fileFormat)
      throws IOException, InterruptedException {
    Configuration conf = new Configuration(false);
    conf.set("fs.default.name", "file:///");
    conf.set(Constants.COPYBOOK_INPUTFORMAT_CBL_HDFS_PATH_CONF, cobolLocation);
    conf.set(Constants.COPYBOOK_INPUTFORMAT_FIELD_DELIMITER, delimiter);
    conf.set(Constants.COPYBOOK_INPUTFORMAT_FILE_STRUCTURE, Integer.toString(fileFormat));

    File testFile = new File(datafileLocation);
    Path path = new Path(testFile.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile.length(), null);

    InputFormat inputFormat = ReflectionUtils.newInstance(CopybookInputFormat.class, conf);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

    RecordReader reader = inputFormat.createRecordReader(split, context);
    reader.initialize(split, context);

    return reader;
  }
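Callers of this helper only rely on the generic RecordReader iteration contract. A small usage sketch with hypothetical copybook/data paths and a hypothetical fileFormat value:

  /** Dumps every record produced by the reader; the paths and fileFormat (0) are hypothetical. */
  public static void dumpRecords() throws IOException, InterruptedException {
    RecordReader reader = getRecordReader("/tmp/test.cbl", "/tmp/test.data", "\t", 0);
    try {
      // nextKeyValue()/getCurrentKey()/getCurrentValue() are the standard RecordReader API.
      while (reader.nextKeyValue()) {
        System.out.println(reader.getCurrentKey() + "\t" + reader.getCurrentValue());
      }
    } finally {
      reader.close();
    }
  }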
Example #5
 /**
  * For each split sampled, emit when the ratio of the number of records retained to the total
  * record count is less than the specified frequency.
  */
 @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
 public K[] getSample(InputFormat<K, V> inf, Job job) throws IOException, InterruptedException {
   List<InputSplit> splits = inf.getSplits(job);
   ArrayList<K> samples = new ArrayList<K>();
   int splitsToSample = Math.min(maxSplitsSampled, splits.size());
   int splitStep = splits.size() / splitsToSample;
   long records = 0;
   long kept = 0;
   for (int i = 0; i < splitsToSample; ++i) {
     RecordReader<K, V> reader =
         inf.createRecordReader(
             splits.get(i * splitStep),
             new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID()));
     while (reader.nextKeyValue()) {
       ++records;
       if ((double) kept / records < freq) {
         ++kept;
         samples.add(reader.getCurrentKey());
       }
     }
     reader.close();
   }
   return (K[]) samples.toArray();
 }
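Both getSample variants hand their K[] back to partition-file writing code; the essence of that post-processing is to sort the samples and keep evenly spaced cut points. A hedged sketch of just the boundary selection (the real InputSampler.writePartitionFile additionally uses the job's sort comparator and writes the boundaries to a SequenceFile):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

public class BoundarySelectionSketch {

  /** Picks (numPartitions - 1) evenly spaced boundary keys from the sorted samples. */
  public static <K> List<K> pickBoundaries(K[] samples, int numPartitions, Comparator<K> cmp) {
    Arrays.sort(samples, cmp);
    List<K> boundaries = new ArrayList<K>(numPartitions - 1);
    float stepSize = samples.length / (float) numPartitions;
    for (int i = 1; i < numPartitions; ++i) {
      // Clamp to guard against rounding past the last sample for tiny sample sets.
      int k = Math.min(samples.length - 1, Math.round(stepSize * i));
      boundaries.add(samples[k]);
    }
    return boundaries;
  }
}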
  /**
   * Writes random int/double pairs through SequenceFileAsBinaryOutputFormat as raw bytes,
   * then re-reads the output with SequenceFileInputFormat and, replaying the same random
   * seed, verifies that every key and value round-trips.
   */
  public void testBinary() throws IOException, InterruptedException {
    Configuration conf = new Configuration();
    Job job = new Job(conf);

    Path outdir = new Path(System.getProperty("test.build.data", "/tmp"), "outseq");
    Random r = new Random();
    long seed = r.nextLong();
    r.setSeed(seed);

    FileOutputFormat.setOutputPath(job, outdir);

    SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, IntWritable.class);
    SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, DoubleWritable.class);

    SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
    SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    BytesWritable bkey = new BytesWritable();
    BytesWritable bval = new BytesWritable();

    TaskAttemptContext context =
        MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
    OutputFormat<BytesWritable, BytesWritable> outputFormat =
        new SequenceFileAsBinaryOutputFormat();
    OutputCommitter committer = outputFormat.getOutputCommitter(context);
    committer.setupJob(job);
    RecordWriter<BytesWritable, BytesWritable> writer = outputFormat.getRecordWriter(context);

    IntWritable iwritable = new IntWritable();
    DoubleWritable dwritable = new DoubleWritable();
    DataOutputBuffer outbuf = new DataOutputBuffer();
    LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
    try {
      for (int i = 0; i < RECORDS; ++i) {
        iwritable = new IntWritable(r.nextInt());
        iwritable.write(outbuf);
        bkey.set(outbuf.getData(), 0, outbuf.getLength());
        outbuf.reset();
        dwritable = new DoubleWritable(r.nextDouble());
        dwritable.write(outbuf);
        bval.set(outbuf.getData(), 0, outbuf.getLength());
        outbuf.reset();
        writer.write(bkey, bval);
      }
    } finally {
      writer.close(context);
    }
    committer.commitTask(context);
    committer.commitJob(job);

    InputFormat<IntWritable, DoubleWritable> iformat =
        new SequenceFileInputFormat<IntWritable, DoubleWritable>();
    int count = 0;
    r.setSeed(seed);
    SequenceFileInputFormat.setInputPaths(job, outdir);
    LOG.info("Reading data by SequenceFileInputFormat");
    for (InputSplit split : iformat.getSplits(job)) {
      RecordReader<IntWritable, DoubleWritable> reader = iformat.createRecordReader(split, context);
      MapContext<IntWritable, DoubleWritable, BytesWritable, BytesWritable> mcontext =
          new MapContextImpl<IntWritable, DoubleWritable, BytesWritable, BytesWritable>(
              job.getConfiguration(),
              context.getTaskAttemptID(),
              reader,
              null,
              null,
              MapReduceTestUtil.createDummyReporter(),
              split);
      reader.initialize(split, mcontext);
      try {
        int sourceInt;
        double sourceDouble;
        while (reader.nextKeyValue()) {
          sourceInt = r.nextInt();
          sourceDouble = r.nextDouble();
          iwritable = reader.getCurrentKey();
          dwritable = reader.getCurrentValue();
          assertEquals(
              "Keys don't match: " + "*" + iwritable.get() + ":" + sourceInt + "*",
              sourceInt,
              iwritable.get());
          assertTrue(
              "Vals don't match: " + "*" + dwritable.get() + ":" + sourceDouble + "*",
              Double.compare(dwritable.get(), sourceDouble) == 0);
          ++count;
        }
      } finally {
        reader.close();
      }
    }
    assertEquals("Some records not found", RECORDS, count);
  }