private org.apache.hadoop.mapreduce.OutputCommitter createOutputCommitter(
    boolean newApiCommitter, JobID jobId, Configuration conf) throws Exception {
  org.apache.hadoop.mapreduce.OutputCommitter committer = null;

  LOG.info("OutputCommitter set in config "
      + conf.get("mapred.output.committer.class"));

  if (newApiCommitter) {
    // New API: the committer is supplied by the job's OutputFormat.
    org.apache.hadoop.mapreduce.TaskID taskId =
        new org.apache.hadoop.mapreduce.TaskID(jobId, true, 0);
    org.apache.hadoop.mapreduce.TaskAttemptID taskAttemptID =
        new org.apache.hadoop.mapreduce.TaskAttemptID(taskId, 0);
    org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
        new TaskAttemptContextImpl(conf, taskAttemptID);
    OutputFormat outputFormat =
        ReflectionUtils.newInstance(taskContext.getOutputFormatClass(), conf);
    committer = outputFormat.getOutputCommitter(taskContext);
  } else {
    // Old API: the committer class comes from the configuration,
    // defaulting to FileOutputCommitter.
    committer = ReflectionUtils.newInstance(conf.getClass(
        "mapred.output.committer.class", FileOutputCommitter.class,
        org.apache.hadoop.mapred.OutputCommitter.class), conf);
  }
  LOG.info("OutputCommitter is " + committer.getClass().getName());
  return committer;
}
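// Illustrative sketch, not part of the original sources: a minimal old-API committer that the
// reflection-based branch above could load if a job set
//   conf.set("mapred.output.committer.class", NoOpOutputCommitter.class.getName());
// The class name is hypothetical; only the overridden methods come from
// org.apache.hadoop.mapred.OutputCommitter.
import java.io.IOException;
import org.apache.hadoop.mapred.JobContext;
import org.apache.hadoop.mapred.OutputCommitter;
import org.apache.hadoop.mapred.TaskAttemptContext;

public class NoOpOutputCommitter extends OutputCommitter {
  @Override public void setupJob(JobContext jobContext) throws IOException { }
  @Override public void setupTask(TaskAttemptContext taskContext) throws IOException { }
  @Override public boolean needsTaskCommit(TaskAttemptContext taskContext) throws IOException {
    return false; // nothing to promote, so commitTask() is never invoked
  }
  @Override public void commitTask(TaskAttemptContext taskContext) throws IOException { }
  @Override public void abortTask(TaskAttemptContext taskContext) throws IOException { }
}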
@Override
public StoreFuncInterface createStoreFunc(POStore store) throws IOException {
  Configuration conf = ConfigurationUtil.toConfiguration(pc.getProperties());
  StoreFuncInterface storeFunc = store.getStoreFunc();
  JobContext jc = HadoopShims.createJobContext(conf, new JobID());

  OutputFormat<?, ?> outputFormat = storeFunc.getOutputFormat();
  PigOutputFormat.setLocation(jc, store);
  context = HadoopShims.createTaskAttemptContext(conf, HadoopShims.getNewTaskAttemptID());
  PigOutputFormat.setLocation(context, store);

  // Validate the output specification before doing any setup work.
  try {
    outputFormat.checkOutputSpecs(jc);
  } catch (InterruptedException e) {
    throw new IOException(e);
  }

  // Set up the job and task through the committer, then open the record writer.
  try {
    outputCommitter = outputFormat.getOutputCommitter(context);
    outputCommitter.setupJob(jc);
    outputCommitter.setupTask(context);
    writer = outputFormat.getRecordWriter(context);
  } catch (InterruptedException e) {
    throw new IOException(e);
  }
  storeFunc.prepareToWrite(writer);

  return storeFunc;
}
@Test
public void testDeleteMissing() {
  TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
  JobContext jobContext = new JobContextImpl(
      taskAttemptContext.getConfiguration(),
      taskAttemptContext.getTaskAttemptID().getJobID());
  Configuration conf = jobContext.getConfiguration();

  String sourceBase;
  String targetBase;
  FileSystem fs = null;
  try {
    OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
    fs = FileSystem.get(conf);
    sourceBase = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
    targetBase = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
    String targetBaseAdd = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
    fs.rename(new Path(targetBaseAdd), new Path(targetBase));

    DistCpOptions options =
        new DistCpOptions(Arrays.asList(new Path(sourceBase)), new Path("/out"));
    options.setSyncFolder(true);
    options.setDeleteMissing(true);
    options.appendToConf(conf);

    CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS);
    Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong()));
    listing.buildListing(listingFile, options);

    conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase);
    conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, targetBase);

    committer.commitJob(jobContext);
    if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
      Assert.fail("Source and target folders are not in sync");
    }
    if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, sourceBase, targetBase)) {
      Assert.fail("Source and target folders are not in sync");
    }

    // Test for idempotent commit
    committer.commitJob(jobContext);
    if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
      Assert.fail("Source and target folders are not in sync");
    }
    if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, sourceBase, targetBase)) {
      Assert.fail("Source and target folders are not in sync");
    }
  } catch (Throwable e) {
    LOG.error("Exception encountered while testing for delete missing", e);
    Assert.fail("Delete missing failure");
  } finally {
    TestDistCpUtils.delete(fs, "/tmp1");
    conf.set(DistCpConstants.CONF_LABEL_DELETE_MISSING, "false");
  }
}
@Override
public void tearDown() throws IOException {
  // Close the record writer opened in createStoreFunc().
  if (writer != null) {
    try {
      writer.close(context);
    } catch (InterruptedException e) {
      throw new IOException(e);
    }
    writer = null;
  }
  // Commit the task output if the committer has anything to promote,
  // then commit or clean up the job as a whole.
  if (outputCommitter.needsTaskCommit(context)) {
    outputCommitter.commitTask(context);
  }
  HadoopShims.commitOrCleanup(outputCommitter, context);
}
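// Illustrative sketch, not part of the original sources: how a caller might drive the store
// lifecycle formed by createStoreFunc() and tearDown() above. The class and variable names here
// are hypothetical; putNext(Tuple) comes from Pig's StoreFuncInterface, and POStoreImpl stands in
// for the concrete store implementation shown above.
import java.io.IOException;
import java.util.List;
import org.apache.pig.StoreFuncInterface;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStore;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStoreImpl;
import org.apache.pig.data.Tuple;

public class StoreLifecycleSketch {
  static void writeAll(POStoreImpl storeImpl, POStore store, List<Tuple> tuples)
      throws IOException {
    // setupJob/setupTask run and the RecordWriter is opened inside createStoreFunc().
    StoreFuncInterface storeFunc = storeImpl.createStoreFunc(store);
    for (Tuple t : tuples) {
      storeFunc.putNext(t); // each tuple goes through the writer opened above
    }
    // Closes the writer, commits the task, then commits or cleans up the job.
    storeImpl.tearDown();
  }
}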
@Test
public void testPreserveStatus() {
  TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
  JobContext jobContext = new JobContextImpl(
      taskAttemptContext.getConfiguration(),
      taskAttemptContext.getTaskAttemptID().getJobID());
  Configuration conf = jobContext.getConfiguration();

  String sourceBase;
  String targetBase;
  FileSystem fs = null;
  try {
    OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
    fs = FileSystem.get(conf);
    FsPermission sourcePerm = new FsPermission((short) 511);   // 0777
    FsPermission initialPerm = new FsPermission((short) 448);  // 0700
    sourceBase = TestDistCpUtils.createTestSetup(fs, sourcePerm);
    targetBase = TestDistCpUtils.createTestSetup(fs, initialPerm);

    DistCpOptions options =
        new DistCpOptions(Arrays.asList(new Path(sourceBase)), new Path("/out"));
    options.preserve(FileAttribute.PERMISSION);
    options.appendToConf(conf);

    CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS);
    Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong()));
    listing.buildListing(listingFile, options);

    conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase);

    committer.commitJob(jobContext);
    if (!checkDirectoryPermissions(fs, targetBase, sourcePerm)) {
      Assert.fail("Permissions don't match");
    }

    // Test for idempotent commit
    committer.commitJob(jobContext);
    if (!checkDirectoryPermissions(fs, targetBase, sourcePerm)) {
      Assert.fail("Permissions don't match");
    }
  } catch (IOException e) {
    LOG.error("Exception encountered while testing for preserve status", e);
    Assert.fail("Preserve status failure");
  } finally {
    TestDistCpUtils.delete(fs, "/tmp1");
  }
}
@Override
public void commitJob(JobContext jobContext) throws IOException {
  Configuration conf = jobContext.getConfiguration();
  Set<Path> handledPaths = Sets.newHashSet();
  for (Map.Entry<String, OutputCommitter> e : committers.entrySet()) {
    OutputCommitter oc = e.getValue();
    Job job = getJob(jobContext.getJobID(), e.getKey(), conf);
    configureJob(e.getKey(), job, outputs.get(e.getKey()));
    if (oc instanceof FileOutputCommitter) {
      // Several named outputs may share one output directory; commit each
      // directory only once.
      Path outputPath = ((FileOutputCommitter) oc).getWorkPath().getParent();
      if (handledPaths.contains(outputPath)) {
        continue;
      } else {
        handledPaths.add(outputPath);
      }
    }
    oc.commitJob(job);
  }
}
@Test
public void testNoCommitAction() {
  TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
  JobContext jobContext = new JobContextImpl(
      taskAttemptContext.getConfiguration(),
      taskAttemptContext.getTaskAttemptID().getJobID());
  try {
    OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
    committer.commitJob(jobContext);
    Assert.assertEquals("Commit Successful", taskAttemptContext.getStatus());

    // Test for idempotent commit
    committer.commitJob(jobContext);
    Assert.assertEquals("Commit Successful", taskAttemptContext.getStatus());
  } catch (IOException e) {
    LOG.error("Exception encountered ", e);
    Assert.fail("Commit failed");
  }
}
@Test
public void testAtomicCommitExistingFinal() {
  TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
  JobContext jobContext = new JobContextImpl(
      taskAttemptContext.getConfiguration(),
      taskAttemptContext.getTaskAttemptID().getJobID());
  Configuration conf = jobContext.getConfiguration();

  String workPath = "/tmp1/" + String.valueOf(rand.nextLong());
  String finalPath = "/tmp1/" + String.valueOf(rand.nextLong());
  FileSystem fs = null;
  try {
    OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
    fs = FileSystem.get(conf);
    fs.mkdirs(new Path(workPath));
    fs.mkdirs(new Path(finalPath));

    conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, workPath);
    conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, finalPath);
    conf.setBoolean(DistCpConstants.CONF_LABEL_ATOMIC_COPY, true);

    Assert.assertTrue(fs.exists(new Path(workPath)));
    Assert.assertTrue(fs.exists(new Path(finalPath)));
    try {
      committer.commitJob(jobContext);
      Assert.fail("Should not be able to atomic-commit to pre-existing path.");
    } catch (Exception exception) {
      Assert.assertTrue(fs.exists(new Path(workPath)));
      Assert.assertTrue(fs.exists(new Path(finalPath)));
      LOG.info("Atomic-commit Test pass.");
    }
  } catch (IOException e) {
    LOG.error("Exception encountered while testing for atomic commit.", e);
    Assert.fail("Atomic commit failure");
  } finally {
    TestDistCpUtils.delete(fs, workPath);
    TestDistCpUtils.delete(fs, finalPath);
  }
}
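// Illustrative sketch (assumption, not taken from the tests above): outside the tests, the
// CopyCommitter is normally obtained through CopyOutputFormat rather than constructed directly,
// and the work/commit directories correspond to the CONF_LABEL_TARGET_WORK_PATH and
// CONF_LABEL_TARGET_FINAL_PATH settings that the tests assign by hand. The paths below are
// hypothetical.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.tools.mapred.CopyOutputFormat;

public class CopyJobWiringSketch {
  static Job configureCopyJob(Configuration conf) throws IOException {
    Job job = Job.getInstance(conf, "distcp-sketch");
    job.setOutputFormatClass(CopyOutputFormat.class);
    // Data is staged under the working directory and promoted to the commit
    // directory when CopyCommitter.commitJob() runs.
    CopyOutputFormat.setWorkingDirectory(job, new Path("/tmp/distcp/.staging"));
    CopyOutputFormat.setCommitDirectory(job, new Path("/data/target"));
    return job;
  }
}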
/**
 * Creates an lzo file with random data.
 *
 * @param outputDir Output directory.
 * @param fs File system we're using.
 * @param attemptContext Task attempt context, contains task id etc.
 * @param charsToOutput Approximate number of characters to write.
 * @throws IOException
 * @throws InterruptedException
 */
private byte[] createTestInput(Path outputDir, FileSystem fs,
    TaskAttemptContext attemptContext, int charsToOutput)
    throws IOException, InterruptedException {
  TextOutputFormat<Text, Text> output = new TextOutputFormat<Text, Text>();
  RecordWriter<Text, Text> rw = null;
  md5_.reset();
  try {
    rw = output.getRecordWriter(attemptContext);
    char[] chars = "abcdefghijklmnopqrstuvwxyz\u00E5\u00E4\u00F6".toCharArray();
    Random r = new Random(System.currentTimeMillis());
    Text key = new Text();
    Text value = new Text();
    int charsMax = chars.length - 1;
    for (int i = 0; i < charsToOutput; ) {
      i += fillText(chars, r, charsMax, key);
      i += fillText(chars, r, charsMax, value);
      rw.write(key, value);
      md5_.update(key.getBytes(), 0, key.getLength());
      // text output format writes tab between the key and value
      md5_.update("\t".getBytes("UTF-8"));
      md5_.update(value.getBytes(), 0, value.getLength());
    }
  } finally {
    if (rw != null) {
      rw.close(attemptContext);
      OutputCommitter committer = output.getOutputCommitter(attemptContext);
      committer.commitTask(attemptContext);
      committer.cleanupJob(attemptContext);
    }
  }
  byte[] result = md5_.digest();
  md5_.reset();
  return result;
}
public void testBinary() throws IOException, InterruptedException {
  Configuration conf = new Configuration();
  Job job = new Job(conf);

  Path outdir = new Path(System.getProperty("test.build.data", "/tmp"), "outseq");
  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);

  FileOutputFormat.setOutputPath(job, outdir);

  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, IntWritable.class);
  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, DoubleWritable.class);

  SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
  SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

  BytesWritable bkey = new BytesWritable();
  BytesWritable bval = new BytesWritable();

  TaskAttemptContext context =
      MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
  OutputFormat<BytesWritable, BytesWritable> outputFormat =
      new SequenceFileAsBinaryOutputFormat();
  OutputCommitter committer = outputFormat.getOutputCommitter(context);
  committer.setupJob(job);
  RecordWriter<BytesWritable, BytesWritable> writer = outputFormat.getRecordWriter(context);

  IntWritable iwritable = new IntWritable();
  DoubleWritable dwritable = new DoubleWritable();
  DataOutputBuffer outbuf = new DataOutputBuffer();
  LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
  try {
    for (int i = 0; i < RECORDS; ++i) {
      iwritable = new IntWritable(r.nextInt());
      iwritable.write(outbuf);
      bkey.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      dwritable = new DoubleWritable(r.nextDouble());
      dwritable.write(outbuf);
      bval.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      writer.write(bkey, bval);
    }
  } finally {
    writer.close(context);
  }
  committer.commitTask(context);
  committer.commitJob(job);

  InputFormat<IntWritable, DoubleWritable> iformat =
      new SequenceFileInputFormat<IntWritable, DoubleWritable>();
  int count = 0;
  r.setSeed(seed);
  SequenceFileInputFormat.setInputPaths(job, outdir);
  LOG.info("Reading data by SequenceFileInputFormat");
  for (InputSplit split : iformat.getSplits(job)) {
    RecordReader<IntWritable, DoubleWritable> reader =
        iformat.createRecordReader(split, context);
    MapContext<IntWritable, DoubleWritable, BytesWritable, BytesWritable> mcontext =
        new MapContextImpl<IntWritable, DoubleWritable, BytesWritable, BytesWritable>(
            job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
            MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    try {
      int sourceInt;
      double sourceDouble;
      while (reader.nextKeyValue()) {
        sourceInt = r.nextInt();
        sourceDouble = r.nextDouble();
        iwritable = reader.getCurrentKey();
        dwritable = reader.getCurrentValue();
        assertEquals(
            "Keys don't match: " + "*" + iwritable.get() + ":" + sourceInt + "*",
            sourceInt, iwritable.get());
        assertTrue(
            "Vals don't match: " + "*" + dwritable.get() + ":" + sourceDouble + "*",
            Double.compare(dwritable.get(), sourceDouble) == 0);
        ++count;
      }
    } finally {
      reader.close();
    }
  }
  assertEquals("Some records not found", RECORDS, count);
}
@SuppressWarnings("unchecked") @Override public void run() { JobID jobId = profile.getJobID(); JobContext jContext = new JobContextImpl(conf, jobId); org.apache.hadoop.mapreduce.OutputCommitter outputCommitter = null; try { outputCommitter = createOutputCommitter(conf.getUseNewMapper(), jobId, conf); } catch (Exception e) { LOG.info("Failed to createOutputCommitter", e); return; } try { TaskSplitMetaInfo[] taskSplitMetaInfos = SplitMetaInfoReader.readSplitMetaInfo(jobId, localFs, conf, systemJobDir); int numReduceTasks = job.getNumReduceTasks(); if (numReduceTasks > 1 || numReduceTasks < 0) { // we only allow 0 or 1 reducer in local mode numReduceTasks = 1; job.setNumReduceTasks(1); } outputCommitter.setupJob(jContext); status.setSetupProgress(1.0f); Map<TaskAttemptID, MapOutputFile> mapOutputFiles = Collections.synchronizedMap(new HashMap<TaskAttemptID, MapOutputFile>()); List<MapTaskRunnable> taskRunnables = getMapTaskRunnables(taskSplitMetaInfos, jobId, mapOutputFiles); ExecutorService mapService = createMapExecutor(taskRunnables.size()); // Start populating the executor with work units. // They may begin running immediately (in other threads). for (Runnable r : taskRunnables) { mapService.submit(r); } try { mapService.shutdown(); // Instructs queue to drain. // Wait for tasks to finish; do not use a time-based timeout. // (See http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6179024) LOG.info("Waiting for map tasks"); mapService.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS); } catch (InterruptedException ie) { // Cancel all threads. mapService.shutdownNow(); throw ie; } LOG.info("Map task executor complete."); // After waiting for the map tasks to complete, if any of these // have thrown an exception, rethrow it now in the main thread context. 
for (MapTaskRunnable r : taskRunnables) { if (r.storedException != null) { throw new Exception(r.storedException); } } TaskAttemptID reduceId = new TaskAttemptID(new TaskID(jobId, false, 0), 0); try { if (numReduceTasks > 0) { ReduceTask reduce = new ReduceTask(systemJobFile.toString(), reduceId, 0, mapIds.size(), 1); reduce.setUser(UserGroupInformation.getCurrentUser().getShortUserName()); JobConf localConf = new JobConf(job); localConf.set("mapreduce.jobtracker.address", "local"); TaskRunner.setupChildMapredLocalDirs(reduce, localConf); // move map output to reduce input for (int i = 0; i < mapIds.size(); i++) { if (!this.isInterrupted()) { TaskAttemptID mapId = mapIds.get(i); Path mapOut = mapOutputFiles.get(mapId).getOutputFile(); MapOutputFile localOutputFile = new MapOutputFile(); localOutputFile.setConf(localConf); Path reduceIn = localOutputFile.getInputFileForWrite( mapId.getTaskID(), localFs.getFileStatus(mapOut).getLen()); if (!localFs.mkdirs(reduceIn.getParent())) { throw new IOException( "Mkdirs failed to create " + reduceIn.getParent().toString()); } if (!localFs.rename(mapOut, reduceIn)) throw new IOException("Couldn't rename " + mapOut); } else { throw new InterruptedException(); } } if (!this.isInterrupted()) { reduce.setJobFile(localJobFile.toString()); localConf.setUser(reduce.getUser()); reduce.localizeConfiguration(localConf); reduce.setConf(localConf); reduce_tasks += 1; myMetrics.launchReduce(reduce.getTaskID()); reduce.run(localConf, this); myMetrics.completeReduce(reduce.getTaskID()); reduce_tasks -= 1; } else { throw new InterruptedException(); } } } finally { for (MapOutputFile output : mapOutputFiles.values()) { output.removeAll(); } } // delete the temporary directory in output directory outputCommitter.commitJob(jContext); status.setCleanupProgress(1.0f); if (killed) { this.status.setRunState(JobStatus.KILLED); } else { this.status.setRunState(JobStatus.SUCCEEDED); } JobEndNotifier.localRunnerNotification(job, status); } catch (Throwable t) { try { outputCommitter.abortJob(jContext, org.apache.hadoop.mapreduce.JobStatus.State.FAILED); } catch (IOException ioe) { LOG.info("Error cleaning up job:" + id); } status.setCleanupProgress(1.0f); if (killed) { this.status.setRunState(JobStatus.KILLED); } else { this.status.setRunState(JobStatus.FAILED); } LOG.warn(id, t); JobEndNotifier.localRunnerNotification(job, status); } finally { try { fs.delete(systemJobFile.getParent(), true); // delete submit dir localFs.delete(localJobFile, true); // delete local copy // Cleanup distributed cache taskDistributedCacheManager.release(); trackerDistributedCacheManager.purgeCache(); } catch (IOException e) { LOG.warn("Error cleaning up " + id + ": " + e); } } }