@SuppressWarnings("unchecked") public DelegatingRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { // Find the InputFormat and then the RecordReader from the TaggedInputSplit. TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split; InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils.newInstance( taggedInputSplit.getInputFormatClass(), context.getConfiguration()); originalRR = inputFormat.createRecordReader(taggedInputSplit.getInputSplit(), context); }
/**
 * Constructs the DelegatingRecordReader.
 *
 * @param split TaggedInputSplit object
 * @param context TaskAttemptContext object
 * @throws IOException
 * @throws InterruptedException
 */
@SuppressWarnings("unchecked")
public DelegatingRecordReader(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  // Find the InputFormat and then the RecordReader from the TaggedInputSplit.
  TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
  InputFormat<K, V> inputFormat = (InputFormat<K, V>) DCUtils.loadSerializedObjectInDC(
      context.getConfiguration(), InputFormat.class,
      taggedInputSplit.getInputFormatFile(), true);
  originalRR = inputFormat.createRecordReader(taggedInputSplit.getInputSplit(), context);
}
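Both constructors only resolve the wrapped reader; the rest of a delegating reader typically just forwards each RecordReader call to originalRR. A minimal forwarding sketch, assuming the generic field private RecordReader<K, V> originalRR set by the constructors above (hedged illustration, not necessarily the exact library code):

@Override
public void initialize(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  // Unwrap the TaggedInputSplit before handing the split to the real reader.
  originalRR.initialize(((TaggedInputSplit) split).getInputSplit(), context);
}

@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
  return originalRR.nextKeyValue();
}

@Override
public K getCurrentKey() throws IOException, InterruptedException {
  return originalRR.getCurrentKey();
}

@Override
public V getCurrentValue() throws IOException, InterruptedException {
  return originalRR.getCurrentValue();
}

@Override
public float getProgress() throws IOException, InterruptedException {
  return originalRR.getProgress();
}

@Override
public void close() throws IOException {
  originalRR.close();
}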
/**
 * Randomize the split order, then take the specified number of keys from each split sampled,
 * where each key is selected with the specified probability and possibly replaced by a
 * subsequently selected key when the quota of keys from that split is satisfied.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K, V> inf, Job job) throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());

  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);
  // Shuffle the splits so the sampled ones are chosen at random.
  for (int i = 0; i < splits.size(); ++i) {
    InputSplit tmp = splits.get(i);
    int j = r.nextInt(splits.size());
    splits.set(i, splits.get(j));
    splits.set(j, tmp);
  }
  // Our target rate is in terms of the maximum number of sample splits,
  // but we accept the possibility of sampling additional splits to hit
  // the target sample keyset.
  for (int i = 0;
       i < splitsToSample || (i < splits.size() && samples.size() < numSamples);
       ++i) {
    RecordReader<K, V> reader = inf.createRecordReader(
        splits.get(i), new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID()));
    while (reader.nextKeyValue()) {
      if (r.nextDouble() <= freq) {
        if (samples.size() < numSamples) {
          samples.add(reader.getCurrentKey());
        } else {
          // When exceeding the maximum number of samples, replace a
          // random element with this one, then adjust the frequency
          // to reflect the possibility of existing elements being
          // pushed out.
          int ind = r.nextInt(numSamples);
          // Note: nextInt(numSamples) is always < numSamples, so this check
          // always passes and the replacement always happens.
          if (ind != numSamples) {
            samples.set(ind, reader.getCurrentKey());
          }
          freq *= (numSamples - 1) / (double) numSamples;
        }
      }
    }
    reader.close();
  }
  return (K[]) samples.toArray();
}
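For context, this sampler is normally wired into a total-order sort job rather than called directly. A hedged driver-side sketch using Hadoop's org.apache.hadoop.mapreduce.lib.partition.InputSampler and TotalOrderPartitioner; the key/value types, sampling parameters, and partition-file path are placeholders:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class SamplerDriver {
  public static void configureSampling(Job job) throws Exception {
    // Placeholder parameters: select keys with probability 0.1, keep at most
    // 10,000 samples, and read no more than 10 splits.
    InputSampler.Sampler<Text, Text> sampler =
        new InputSampler.RandomSampler<Text, Text>(0.1, 10000, 10);

    // Illustrative partition file location; must be set before sampling so
    // writePartitionFile knows where to put the split points.
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
        new Path("/tmp/partitions.lst"));
    job.setPartitionerClass(TotalOrderPartitioner.class);

    // Invokes getSample(...) above and writes the chosen split points.
    InputSampler.writePartitionFile(job, sampler);
  }
}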
public static RecordReader getRecordReader(
    String cobolLocation, String datafileLocation, String delimiter, int fileFormat)
    throws IOException, InterruptedException {
  Configuration conf = new Configuration(false);
  conf.set("fs.default.name", "file:///");
  conf.set(Constants.COPYBOOK_INPUTFORMAT_CBL_HDFS_PATH_CONF, cobolLocation);
  conf.set(Constants.COPYBOOK_INPUTFORMAT_FIELD_DELIMITER, delimiter);
  conf.set(Constants.COPYBOOK_INPUTFORMAT_FILE_STRUCTURE, Integer.toString(fileFormat));
  File testFile = new File(datafileLocation);
  Path path = new Path(testFile.getAbsoluteFile().toURI());
  FileSplit split = new FileSplit(path, 0, testFile.length(), null);
  InputFormat inputFormat = ReflectionUtils.newInstance(CopybookInputFormat.class, conf);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  RecordReader reader = inputFormat.createRecordReader(split, context);
  reader.initialize(split, context);
  return reader;
}
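A short usage sketch of the helper above; the copybook path, data file path, and file-structure code are hypothetical placeholders, not values defined by the source:

// Usage sketch; file locations and the format code are placeholders.
public static void dumpRecords() throws IOException, InterruptedException {
  RecordReader reader = getRecordReader(
      "src/test/resources/example.cbl",  // copybook layout (placeholder path)
      "src/test/resources/example.dat",  // data file (placeholder path)
      "\t",                              // field delimiter
      0);                                // file structure code (placeholder value)
  while (reader.nextKeyValue()) {
    System.out.println(reader.getCurrentKey() + "\t" + reader.getCurrentValue());
  }
  reader.close();
}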
/**
 * Validates map phase progress after each record is processed by map task using custom task
 * reporter.
 */
public void testMapProgress() throws Exception {
  JobConf job = new JobConf();
  fs = FileSystem.getLocal(job);
  Path rootDir = new Path(TEST_ROOT_DIR);
  createInputFile(rootDir);

  job.setNumReduceTasks(0);
  TaskAttemptID taskId = TaskAttemptID.forName("attempt_200907082313_0424_m_000000_0");
  job.setClass("mapreduce.job.outputformat.class", NullOutputFormat.class, OutputFormat.class);
  job.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, TEST_ROOT_DIR);
  jobId = taskId.getJobID();
  JobContext jContext = new JobContextImpl(job, jobId);
  InputFormat<?, ?> input = ReflectionUtils.newInstance(jContext.getInputFormatClass(), job);

  List<InputSplit> splits = input.getSplits(jContext);
  JobSplitWriter.createSplitFiles(
      new Path(TEST_ROOT_DIR), job, new Path(TEST_ROOT_DIR).getFileSystem(job), splits);
  TaskSplitMetaInfo[] splitMetaInfo =
      SplitMetaInfoReader.readSplitMetaInfo(jobId, fs, job, new Path(TEST_ROOT_DIR));
  job.setUseNewMapper(true); // use new api
  for (int i = 0; i < splitMetaInfo.length; i++) { // rawSplits.length is 1
    map = new TestMapTask(
        job.get(JTConfig.JT_SYSTEM_DIR, "/tmp/hadoop/mapred/system") + jobId + "job.xml",
        taskId, i, splitMetaInfo[i].getSplitIndex(), 1);
    JobConf localConf = new JobConf(job);
    map.localizeConfiguration(localConf);
    map.setConf(localConf);
    map.run(localConf, fakeUmbilical);
  }
  // clean up
  fs.delete(rootDir, true);
}
/**
 * For each split sampled, emit when the ratio of the number of records retained to the total
 * record count is less than the specified frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K, V> inf, Job job) throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  int splitStep = splits.size() / splitsToSample;
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K, V> reader = inf.createRecordReader(
        splits.get(i * splitStep),
        new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID()));
    while (reader.nextKeyValue()) {
      ++records;
      if ((double) kept / records < freq) {
        ++kept;
        samples.add(reader.getCurrentKey());
      }
    }
    reader.close();
  }
  return (K[]) samples.toArray();
}
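Both samplers return an untyped key array that is typically consumed by partition-file generation. The following is a simplified, hedged sketch (not the library code) of how such a sample can be reduced to evenly spaced split points for numPartitions reducers, assuming the keys are mutually Comparable; the real InputSampler.writePartitionFile additionally honors the job's sort comparator and skips duplicate boundaries:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public final class SplitPoints {
  /** Sort the sampled keys and pick numPartitions - 1 evenly spaced boundaries. */
  static <K extends Comparable<K>> List<K> pick(K[] samples, int numPartitions) {
    K[] sorted = samples.clone();
    Arrays.sort(sorted);
    List<K> splitPoints = new ArrayList<K>(numPartitions - 1);
    float stepSize = sorted.length / (float) numPartitions;
    for (int i = 1; i < numPartitions; ++i) {
      // Clamp so rounding can never run past the last sample.
      int idx = Math.min(sorted.length - 1, Math.round(stepSize * i));
      splitPoints.add(sorted[idx]);
    }
    return splitPoints;
  }
}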
public void testBinary() throws IOException, InterruptedException {
  Configuration conf = new Configuration();
  Job job = new Job(conf);

  Path outdir = new Path(System.getProperty("test.build.data", "/tmp"), "outseq");
  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);

  FileOutputFormat.setOutputPath(job, outdir);

  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, IntWritable.class);
  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, DoubleWritable.class);
  SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
  SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

  BytesWritable bkey = new BytesWritable();
  BytesWritable bval = new BytesWritable();

  TaskAttemptContext context =
      MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
  OutputFormat<BytesWritable, BytesWritable> outputFormat =
      new SequenceFileAsBinaryOutputFormat();
  OutputCommitter committer = outputFormat.getOutputCommitter(context);
  committer.setupJob(job);
  RecordWriter<BytesWritable, BytesWritable> writer = outputFormat.getRecordWriter(context);

  IntWritable iwritable = new IntWritable();
  DoubleWritable dwritable = new DoubleWritable();
  DataOutputBuffer outbuf = new DataOutputBuffer();
  LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
  try {
    for (int i = 0; i < RECORDS; ++i) {
      iwritable = new IntWritable(r.nextInt());
      iwritable.write(outbuf);
      bkey.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      dwritable = new DoubleWritable(r.nextDouble());
      dwritable.write(outbuf);
      bval.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      writer.write(bkey, bval);
    }
  } finally {
    writer.close(context);
  }
  committer.commitTask(context);
  committer.commitJob(job);

  InputFormat<IntWritable, DoubleWritable> iformat =
      new SequenceFileInputFormat<IntWritable, DoubleWritable>();
  int count = 0;
  r.setSeed(seed);
  SequenceFileInputFormat.setInputPaths(job, outdir);
  LOG.info("Reading data by SequenceFileInputFormat");
  for (InputSplit split : iformat.getSplits(job)) {
    RecordReader<IntWritable, DoubleWritable> reader =
        iformat.createRecordReader(split, context);
    MapContext<IntWritable, DoubleWritable, BytesWritable, BytesWritable> mcontext =
        new MapContextImpl<IntWritable, DoubleWritable, BytesWritable, BytesWritable>(
            job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
            MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    try {
      int sourceInt;
      double sourceDouble;
      while (reader.nextKeyValue()) {
        sourceInt = r.nextInt();
        sourceDouble = r.nextDouble();
        iwritable = reader.getCurrentKey();
        dwritable = reader.getCurrentValue();
        assertEquals(
            "Keys don't match: " + "*" + iwritable.get() + ":" + sourceInt + "*",
            sourceInt, iwritable.get());
        assertTrue(
            "Vals don't match: " + "*" + dwritable.get() + ":" + sourceDouble + "*",
            Double.compare(dwritable.get(), sourceDouble) == 0);
        ++count;
      }
    } finally {
      reader.close();
    }
  }
  assertEquals("Some records not found", RECORDS, count);
}
@SuppressWarnings("unchecked") public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException { Configuration conf = job.getConfiguration(); Job jobCopy = new Job(conf); List<InputSplit> splits = new ArrayList<InputSplit>(); Map<Path, InputFormat> formatMap = MultipleInputs.getInputFormatMap(job); Map<Path, Class<? extends Mapper>> mapperMap = MultipleInputs.getMapperTypeMap(job); Map<Class<? extends InputFormat>, List<Path>> formatPaths = new HashMap<Class<? extends InputFormat>, List<Path>>(); // First, build a map of InputFormats to Paths for (Entry<Path, InputFormat> entry : formatMap.entrySet()) { if (!formatPaths.containsKey(entry.getValue().getClass())) { formatPaths.put(entry.getValue().getClass(), new LinkedList<Path>()); } formatPaths.get(entry.getValue().getClass()).add(entry.getKey()); } for (Entry<Class<? extends InputFormat>, List<Path>> formatEntry : formatPaths.entrySet()) { Class<? extends InputFormat> formatClass = formatEntry.getKey(); InputFormat format = (InputFormat) ReflectionUtils.newInstance(formatClass, conf); List<Path> paths = formatEntry.getValue(); Map<Class<? extends Mapper>, List<Path>> mapperPaths = new HashMap<Class<? extends Mapper>, List<Path>>(); // Now, for each set of paths that have a common InputFormat, build // a map of Mappers to the paths they're used for for (Path path : paths) { Class<? extends Mapper> mapperClass = mapperMap.get(path); if (!mapperPaths.containsKey(mapperClass)) { mapperPaths.put(mapperClass, new LinkedList<Path>()); } mapperPaths.get(mapperClass).add(path); } // Now each set of paths that has a common InputFormat and Mapper can // be added to the same job, and split together. for (Entry<Class<? extends Mapper>, List<Path>> mapEntry : mapperPaths.entrySet()) { paths = mapEntry.getValue(); Class<? extends Mapper> mapperClass = mapEntry.getKey(); if (mapperClass == null) { try { mapperClass = job.getMapperClass(); } catch (ClassNotFoundException e) { throw new IOException("Mapper class is not found", e); } } FileInputFormat.setInputPaths(jobCopy, paths.toArray(new Path[paths.size()])); // Get splits for each input path and tag with InputFormat // and Mapper types by wrapping in a TaggedInputSplit. List<InputSplit> pathSplits = format.getSplits(jobCopy); for (InputSplit pathSplit : pathSplits) { splits.add(new TaggedInputSplit(pathSplit, conf, format.getClass(), mapperClass)); } } } return splits; }