@SuppressWarnings("unchecked")
 public DelegatingRecordReader(InputSplit split, TaskAttemptContext context)
     throws IOException, InterruptedException {
   // Find the InputFormat and then the RecordReader from the TaggedInputSplit.
   TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
   InputFormat<K, V> inputFormat =
       (InputFormat<K, V>)
           ReflectionUtils.newInstance(
               taggedInputSplit.getInputFormatClass(), context.getConfiguration());
   originalRR = inputFormat.createRecordReader(taggedInputSplit.getInputSplit(), context);
 }
 /**
  * Constructs the DelegatingRecordReader.
  *
   * @param split TaggedInputSplit object
  * @param context TaskAttemptContext object
  * @throws IOException
  * @throws InterruptedException
  */
 @SuppressWarnings("unchecked")
 public DelegatingRecordReader(InputSplit split, TaskAttemptContext context)
     throws IOException, InterruptedException {
   // Find the InputFormat and then the RecordReader from the
   // TaggedInputSplit.
   TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
   InputFormat<K, V> inputFormat =
       (InputFormat<K, V>)
           DCUtils.loadSerializedObjectInDC(
               context.getConfiguration(),
               InputFormat.class,
               taggedInputSplit.getInputFormatFile(),
               true);
   originalRR = inputFormat.createRecordReader(taggedInputSplit.getInputSplit(), context);
 }
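Both constructors unwrap a TaggedInputSplit that was produced on the job-submission side. For context, here is a minimal driver sketch (paths and the identity Mapper are placeholder assumptions) showing how MultipleInputs registers one InputFormat per path, which is what causes splits to arrive tagged:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MultiInputDriver {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "multi-input-sketch");
    job.setJarByClass(MultiInputDriver.class);
    // Each call records a per-path InputFormat/Mapper pair and switches the job
    // to a delegating input format; paths and the identity Mapper are placeholders.
    MultipleInputs.addInputPath(job, new Path("/data/text"),
        TextInputFormat.class, Mapper.class);
    MultipleInputs.addInputPath(job, new Path("/data/seq"),
        SequenceFileInputFormat.class, Mapper.class);
    FileOutputFormat.setOutputPath(job, new Path("/data/out"));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}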
Example #3
    /**
     * Randomize the split order, then take the specified number of keys from each split sampled,
     * where each key is selected with the specified probability and possibly replaced by a
     * subsequently selected key when the quota of keys from that split is satisfied.
     */
    @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
    public K[] getSample(InputFormat<K, V> inf, Job job) throws IOException, InterruptedException {
      List<InputSplit> splits = inf.getSplits(job);
      ArrayList<K> samples = new ArrayList<K>(numSamples);
      int splitsToSample = Math.min(maxSplitsSampled, splits.size());

      Random r = new Random();
      long seed = r.nextLong();
      r.setSeed(seed);
      // shuffle splits
      for (int i = 0; i < splits.size(); ++i) {
        InputSplit tmp = splits.get(i);
        int j = r.nextInt(splits.size());
        splits.set(i, splits.get(j));
        splits.set(j, tmp);
      }
      // our target rate is in terms of the maximum number of sample splits,
      // but we accept the possibility of sampling additional splits to hit
      // the target sample keyset
      for (int i = 0;
          i < splitsToSample || (i < splits.size() && samples.size() < numSamples);
          ++i) {
         TaskAttemptContext samplingContext =
             new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID());
         RecordReader<K, V> reader = inf.createRecordReader(splits.get(i), samplingContext);
         // readers created via createRecordReader must be initialized before use
         reader.initialize(splits.get(i), samplingContext);
        while (reader.nextKeyValue()) {
          if (r.nextDouble() <= freq) {
            if (samples.size() < numSamples) {
              samples.add(reader.getCurrentKey());
            } else {
              // When exceeding the maximum number of samples, replace a
              // random element with this one, then adjust the frequency
              // to reflect the possibility of existing elements being
              // pushed out
              int ind = r.nextInt(numSamples);
              if (ind != numSamples) {
                samples.set(ind, reader.getCurrentKey());
              }
              freq *= (numSamples - 1) / (double) numSamples;
            }
          }
        }
        reader.close();
      }
      return (K[]) samples.toArray();
    }
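The sampler above is normally not called directly. A minimal usage sketch (Text keys and the partition-file path are assumptions) where InputSampler.writePartitionFile drives getSample and TotalOrderPartitioner consumes the result:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class SamplerUsageSketch {
  // Samples the job's input (freq 0.1, up to 10000 keys, at most 10 splits) and
  // writes the chosen split points to a partition file for total-order sorting.
  static void configureTotalOrder(Job job) throws Exception {
    InputSampler.RandomSampler<Text, Text> sampler =
        new InputSampler.RandomSampler<Text, Text>(0.1, 10000, 10);
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
        new Path("/tmp/_partitions.lst")); // placeholder location
    InputSampler.writePartitionFile(job, sampler); // invokes the sampler's getSample
    job.setPartitionerClass(TotalOrderPartitioner.class);
  }
}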
  public static RecordReader getRecordReader(
      String cobolLocation, String datafileLocation, String delimiter, int fileFormat)
      throws IOException, InterruptedException {
    Configuration conf = new Configuration(false);
    conf.set("fs.default.name", "file:///");
    conf.set(Constants.COPYBOOK_INPUTFORMAT_CBL_HDFS_PATH_CONF, cobolLocation);
    conf.set(Constants.COPYBOOK_INPUTFORMAT_FIELD_DELIMITER, delimiter);
    conf.set(Constants.COPYBOOK_INPUTFORMAT_FILE_STRUCTURE, Integer.toString(fileFormat));

    File testFile = new File(datafileLocation);
    Path path = new Path(testFile.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile.length(), null);

    InputFormat inputFormat = ReflectionUtils.newInstance(CopybookInputFormat.class, conf);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

    RecordReader reader = inputFormat.createRecordReader(split, context);
    reader.initialize(split, context);

    return reader;
  }
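A hypothetical caller of the helper above; the copybook and data file paths and the file-format code are placeholders, and only the loop over nextKeyValue() reflects how the returned reader is meant to be consumed:

  public static void countRecords() throws IOException, InterruptedException {
    // Placeholder resource paths and file-format code.
    RecordReader reader =
        getRecordReader("src/test/resources/sample.cbl", "src/test/resources/sample.dat", ",", 0);
    int records = 0;
    while (reader.nextKeyValue()) {
      records++;
    }
    reader.close();
    System.out.println("records read: " + records);
  }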
Example #5
  /**
   * Validates map phase progress after each record is processed by map task using custom task
   * reporter.
   */
  public void testMapProgress() throws Exception {
    JobConf job = new JobConf();
    fs = FileSystem.getLocal(job);
    Path rootDir = new Path(TEST_ROOT_DIR);
    createInputFile(rootDir);

    job.setNumReduceTasks(0);
    TaskAttemptID taskId = TaskAttemptID.forName("attempt_200907082313_0424_m_000000_0");
    job.setClass("mapreduce.job.outputformat.class", NullOutputFormat.class, OutputFormat.class);
    job.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, TEST_ROOT_DIR);
    jobId = taskId.getJobID();

    JobContext jContext = new JobContextImpl(job, jobId);
    InputFormat<?, ?> input = ReflectionUtils.newInstance(jContext.getInputFormatClass(), job);

    List<InputSplit> splits = input.getSplits(jContext);
    JobSplitWriter.createSplitFiles(
        new Path(TEST_ROOT_DIR), job, new Path(TEST_ROOT_DIR).getFileSystem(job), splits);
    TaskSplitMetaInfo[] splitMetaInfo =
        SplitMetaInfoReader.readSplitMetaInfo(jobId, fs, job, new Path(TEST_ROOT_DIR));
    job.setUseNewMapper(true); // use new api
    for (int i = 0; i < splitMetaInfo.length; i++) { // rawSplits.length is 1
      map =
          new TestMapTask(
              job.get(JTConfig.JT_SYSTEM_DIR, "/tmp/hadoop/mapred/system") + jobId + "job.xml",
              taskId,
              i,
              splitMetaInfo[i].getSplitIndex(),
              1);

      JobConf localConf = new JobConf(job);
      map.localizeConfiguration(localConf);
      map.setConf(localConf);
      map.run(localConf, fakeUmbilical);
    }
    // clean up
    fs.delete(rootDir, true);
  }
Example #6
 /**
  * For each split sampled, emit when the ratio of the number of records retained to the total
  * record count is less than the specified frequency.
  */
 @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
 public K[] getSample(InputFormat<K, V> inf, Job job) throws IOException, InterruptedException {
   List<InputSplit> splits = inf.getSplits(job);
   ArrayList<K> samples = new ArrayList<K>();
   int splitsToSample = Math.min(maxSplitsSampled, splits.size());
   int splitStep = splits.size() / splitsToSample;
   long records = 0;
   long kept = 0;
   for (int i = 0; i < splitsToSample; ++i) {
      TaskAttemptContext samplingContext =
          new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID());
      RecordReader<K, V> reader =
          inf.createRecordReader(splits.get(i * splitStep), samplingContext);
      // readers created via createRecordReader must be initialized before use
      reader.initialize(splits.get(i * splitStep), samplingContext);
     while (reader.nextKeyValue()) {
       ++records;
       if ((double) kept / records < freq) {
         ++kept;
         samples.add(reader.getCurrentKey());
       }
     }
     reader.close();
   }
   return (K[]) samples.toArray();
 }
  public void testBinary() throws IOException, InterruptedException {
    Configuration conf = new Configuration();
    Job job = new Job(conf);

    Path outdir = new Path(System.getProperty("test.build.data", "/tmp"), "outseq");
    Random r = new Random();
    long seed = r.nextLong();
    r.setSeed(seed);

    FileOutputFormat.setOutputPath(job, outdir);

    SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, IntWritable.class);
    SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, DoubleWritable.class);

    SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
    SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    BytesWritable bkey = new BytesWritable();
    BytesWritable bval = new BytesWritable();

    TaskAttemptContext context =
        MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
    OutputFormat<BytesWritable, BytesWritable> outputFormat =
        new SequenceFileAsBinaryOutputFormat();
    OutputCommitter committer = outputFormat.getOutputCommitter(context);
    committer.setupJob(job);
    RecordWriter<BytesWritable, BytesWritable> writer = outputFormat.getRecordWriter(context);

    IntWritable iwritable = new IntWritable();
    DoubleWritable dwritable = new DoubleWritable();
    DataOutputBuffer outbuf = new DataOutputBuffer();
    LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
    try {
      for (int i = 0; i < RECORDS; ++i) {
        iwritable = new IntWritable(r.nextInt());
        iwritable.write(outbuf);
        bkey.set(outbuf.getData(), 0, outbuf.getLength());
        outbuf.reset();
        dwritable = new DoubleWritable(r.nextDouble());
        dwritable.write(outbuf);
        bval.set(outbuf.getData(), 0, outbuf.getLength());
        outbuf.reset();
        writer.write(bkey, bval);
      }
    } finally {
      writer.close(context);
    }
    committer.commitTask(context);
    committer.commitJob(job);

    InputFormat<IntWritable, DoubleWritable> iformat =
        new SequenceFileInputFormat<IntWritable, DoubleWritable>();
    int count = 0;
    r.setSeed(seed);
    SequenceFileInputFormat.setInputPaths(job, outdir);
    LOG.info("Reading data by SequenceFileInputFormat");
    for (InputSplit split : iformat.getSplits(job)) {
      RecordReader<IntWritable, DoubleWritable> reader = iformat.createRecordReader(split, context);
      MapContext<IntWritable, DoubleWritable, BytesWritable, BytesWritable> mcontext =
          new MapContextImpl<IntWritable, DoubleWritable, BytesWritable, BytesWritable>(
              job.getConfiguration(),
              context.getTaskAttemptID(),
              reader,
              null,
              null,
              MapReduceTestUtil.createDummyReporter(),
              split);
      reader.initialize(split, mcontext);
      try {
        int sourceInt;
        double sourceDouble;
        while (reader.nextKeyValue()) {
          sourceInt = r.nextInt();
          sourceDouble = r.nextDouble();
          iwritable = reader.getCurrentKey();
          dwritable = reader.getCurrentValue();
          assertEquals(
              "Keys don't match: " + "*" + iwritable.get() + ":" + sourceInt + "*",
              sourceInt,
              iwritable.get());
          assertTrue(
              "Vals don't match: " + "*" + dwritable.get() + ":" + sourceDouble + "*",
              Double.compare(dwritable.get(), sourceDouble) == 0);
          ++count;
        }
      } finally {
        reader.close();
      }
    }
    assertEquals("Some records not found", RECORDS, count);
  }
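The test serializes each IntWritable/DoubleWritable into a BytesWritable via a DataOutputBuffer. The inverse direction, sketched below with a DataInputBuffer, is how a consumer of such binary SequenceFiles would recover the typed key (the helper name is illustrative):

  public static IntWritable decodeKey(BytesWritable bkey) throws IOException {
    // Reverse of the DataOutputBuffer trick above: read the IntWritable back
    // out of the raw bytes held by the BytesWritable.
    DataInputBuffer inbuf = new DataInputBuffer();
    inbuf.reset(bkey.getBytes(), bkey.getLength());
    IntWritable key = new IntWritable();
    key.readFields(inbuf);
    return key;
  }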
  @SuppressWarnings("unchecked")
  public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    Job jobCopy = new Job(conf);
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Map<Path, InputFormat> formatMap = MultipleInputs.getInputFormatMap(job);
    Map<Path, Class<? extends Mapper>> mapperMap = MultipleInputs.getMapperTypeMap(job);
    Map<Class<? extends InputFormat>, List<Path>> formatPaths =
        new HashMap<Class<? extends InputFormat>, List<Path>>();

    // First, build a map of InputFormats to Paths
    for (Entry<Path, InputFormat> entry : formatMap.entrySet()) {
      if (!formatPaths.containsKey(entry.getValue().getClass())) {
        formatPaths.put(entry.getValue().getClass(), new LinkedList<Path>());
      }

      formatPaths.get(entry.getValue().getClass()).add(entry.getKey());
    }

    for (Entry<Class<? extends InputFormat>, List<Path>> formatEntry : formatPaths.entrySet()) {
      Class<? extends InputFormat> formatClass = formatEntry.getKey();
      InputFormat format = (InputFormat) ReflectionUtils.newInstance(formatClass, conf);
      List<Path> paths = formatEntry.getValue();

      Map<Class<? extends Mapper>, List<Path>> mapperPaths =
          new HashMap<Class<? extends Mapper>, List<Path>>();

      // Now, for each set of paths that have a common InputFormat, build
      // a map of Mappers to the paths they're used for
      for (Path path : paths) {
        Class<? extends Mapper> mapperClass = mapperMap.get(path);
        if (!mapperPaths.containsKey(mapperClass)) {
          mapperPaths.put(mapperClass, new LinkedList<Path>());
        }

        mapperPaths.get(mapperClass).add(path);
      }

      // Now each set of paths that has a common InputFormat and Mapper can
      // be added to the same job, and split together.
      for (Entry<Class<? extends Mapper>, List<Path>> mapEntry : mapperPaths.entrySet()) {
        paths = mapEntry.getValue();
        Class<? extends Mapper> mapperClass = mapEntry.getKey();

        if (mapperClass == null) {
          try {
            mapperClass = job.getMapperClass();
          } catch (ClassNotFoundException e) {
            throw new IOException("Mapper class is not found", e);
          }
        }

        FileInputFormat.setInputPaths(jobCopy, paths.toArray(new Path[paths.size()]));

        // Get splits for each input path and tag with InputFormat
        // and Mapper types by wrapping in a TaggedInputSplit.
        List<InputSplit> pathSplits = format.getSplits(jobCopy);
        for (InputSplit pathSplit : pathSplits) {
          splits.add(new TaggedInputSplit(pathSplit, conf, format.getClass(), mapperClass));
        }
      }
    }

    return splits;
  }
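To close the loop with the DelegatingRecordReader constructors at the top: each TaggedInputSplit built here is later handed back by the framework, and the delegating input format's createRecordReader is essentially a one-line hand-off, roughly:

  @Override
  public RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException {
    // The split is a TaggedInputSplit; the DelegatingRecordReader shown earlier
    // unwraps it and instantiates the real reader for the tagged InputFormat.
    return new DelegatingRecordReader<K, V>(split, context);
  }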