/**
 * Creates a plugin descriptor for {@code ShuffleVertexManager} and attaches a user payload built
 * from this object's {@code conf}.
 *
 * @return the value returned by {@code setUserPayload} on the newly created descriptor
 * @throws TezUncheckedException if building the payload from the configuration throws an
 *     {@link IOException}
 */
public VertexManagerPluginDescriptor build() {
  try {
    return VertexManagerPluginDescriptor.create(ShuffleVertexManager.class.getName())
        .setUserPayload(TezUtils.createUserPayloadFromConf(this.conf));
  } catch (IOException ioe) {
    // Callers are not expected to handle payload creation failures; surface them unchecked.
    throw new TezUncheckedException(ioe);
  }
}
public static void main(String[] args) { try { Thread.setDefaultUncaughtExceptionHandler(new YarnUncaughtExceptionHandler()); String containerIdStr = System.getenv(Environment.CONTAINER_ID.name()); String nodeHostString = System.getenv(Environment.NM_HOST.name()); String nodePortString = System.getenv(Environment.NM_PORT.name()); String nodeHttpPortString = System.getenv(Environment.NM_HTTP_PORT.name()); String appSubmitTimeStr = System.getenv(ApplicationConstants.APP_SUBMIT_TIME_ENV); validateInputParam(appSubmitTimeStr, ApplicationConstants.APP_SUBMIT_TIME_ENV); ContainerId containerId = ConverterUtils.toContainerId(containerIdStr); ApplicationAttemptId applicationAttemptId = containerId.getApplicationAttemptId(); long appSubmitTime = Long.parseLong(appSubmitTimeStr); Configuration conf = new Configuration(new YarnConfiguration()); TezUtils.addUserSpecifiedTezConfiguration(conf); String jobUserName = System.getenv(ApplicationConstants.Environment.USER.name()); // Do not automatically close FileSystem objects so that in case of // SIGTERM I have a chance to write out the job history. I'll be closing // the objects myself. conf.setBoolean("fs.automatic.close", false); // Command line options Options opts = new Options(); opts.addOption( TezConstants.TEZ_SESSION_MODE_CLI_OPTION, false, "Run Tez Application Master in Session mode"); CommandLine cliParser = new GnuParser().parse(opts, args); DAGAppMaster appMaster = new DAGAppMaster( applicationAttemptId, containerId, nodeHostString, Integer.parseInt(nodePortString), Integer.parseInt(nodeHttpPortString), appSubmitTime, cliParser.hasOption(TezConstants.TEZ_SESSION_MODE_CLI_OPTION)); ShutdownHookManager.get() .addShutdownHook(new DAGAppMasterShutdownHook(appMaster), SHUTDOWN_HOOK_PRIORITY); initAndStartAppMaster(appMaster, conf, jobUserName); } catch (Throwable t) { LOG.fatal("Error starting DAGAppMaster", t); System.exit(1); } }
public ShuffleManager( TezInputContext inputContext, Configuration conf, int numInputs, int bufferSize, boolean ifileReadAheadEnabled, int ifileReadAheadLength, CompressionCodec codec, FetchedInputAllocator inputAllocator) throws IOException { this.inputContext = inputContext; this.numInputs = numInputs; this.shuffledInputsCounter = inputContext.getCounters().findCounter(TaskCounter.NUM_SHUFFLED_INPUTS); this.failedShufflesCounter = inputContext.getCounters().findCounter(TaskCounter.NUM_FAILED_SHUFFLE_INPUTS); this.bytesShuffledCounter = inputContext.getCounters().findCounter(TaskCounter.SHUFFLE_BYTES); this.decompressedDataSizeCounter = inputContext.getCounters().findCounter(TaskCounter.SHUFFLE_BYTES_DECOMPRESSED); this.bytesShuffledToDiskCounter = inputContext.getCounters().findCounter(TaskCounter.SHUFFLE_BYTES_TO_DISK); this.bytesShuffledToMemCounter = inputContext.getCounters().findCounter(TaskCounter.SHUFFLE_BYTES_TO_MEM); this.ifileBufferSize = bufferSize; this.ifileReadAhead = ifileReadAheadEnabled; this.ifileReadAheadLength = ifileReadAheadLength; this.codec = codec; this.inputManager = inputAllocator; this.srcNameTrimmed = TezUtils.cleanVertexName(inputContext.getSourceVertexName()); completedInputSet = Collections.newSetFromMap(new ConcurrentHashMap<InputIdentifier, Boolean>(numInputs)); completedInputs = new LinkedBlockingQueue<FetchedInput>(numInputs); knownSrcHosts = new ConcurrentHashMap<String, InputHost>(); pendingHosts = new LinkedBlockingQueue<InputHost>(); obsoletedInputs = Collections.newSetFromMap(new ConcurrentHashMap<InputAttemptIdentifier, Boolean>()); runningFetchers = Collections.newSetFromMap(new ConcurrentHashMap<Fetcher, Boolean>()); int maxConfiguredFetchers = conf.getInt( TezJobConfig.TEZ_RUNTIME_SHUFFLE_PARALLEL_COPIES, TezJobConfig.TEZ_RUNTIME_SHUFFLE_PARALLEL_COPIES_DEFAULT); this.numFetchers = Math.min(maxConfiguredFetchers, numInputs); ExecutorService fetcherRawExecutor = Executors.newFixedThreadPool( numFetchers, new 
ThreadFactoryBuilder() .setDaemon(true) .setNameFormat("Fetcher [" + srcNameTrimmed + "] #%d") .build()); this.fetcherExecutor = MoreExecutors.listeningDecorator(fetcherRawExecutor); ExecutorService schedulerRawExecutor = Executors.newFixedThreadPool( 1, new ThreadFactoryBuilder() .setDaemon(true) .setNameFormat("ShuffleRunner [" + srcNameTrimmed + "]") .build()); this.schedulerExecutor = MoreExecutors.listeningDecorator(schedulerRawExecutor); this.startTime = System.currentTimeMillis(); this.lastProgressTime = startTime; this.shuffleSecret = ShuffleUtils.getJobTokenSecretFromTokenBytes( inputContext.getServiceConsumerMetaData( TezConfiguration.TEZ_SHUFFLE_HANDLER_SERVICE_ID)); httpConnectionParams = ShuffleUtils.constructHttpShuffleConnectionParams(conf); LOG.info( this.getClass().getSimpleName() + " : numInputs=" + numInputs + ", compressionCodec=" + (codec == null ? "NoCompressionCodec" : codec.getClass().getName()) + ", numFetchers=" + numFetchers + ", ifileBufferSize=" + ifileBufferSize + ", ifileReadAheadEnabled=" + ifileReadAhead + ", ifileReadAheadLength=" + ifileReadAheadLength + ", " + httpConnectionParams.toString()); }
@Override public void initialize(VertexManagerPluginContext context) { Configuration conf; try { conf = TezUtils.createConfFromUserPayload(context.getUserPayload()); } catch (IOException e) { throw new TezUncheckedException(e); } this.context = context; this.slowStartMinSrcCompletionFraction = conf.getFloat( ShuffleVertexManager.TEZ_AM_SHUFFLE_VERTEX_MANAGER_MIN_SRC_FRACTION, ShuffleVertexManager.TEZ_AM_SHUFFLE_VERTEX_MANAGER_MIN_SRC_FRACTION_DEFAULT); this.slowStartMaxSrcCompletionFraction = conf.getFloat( ShuffleVertexManager.TEZ_AM_SHUFFLE_VERTEX_MANAGER_MAX_SRC_FRACTION, ShuffleVertexManager.TEZ_AM_SHUFFLE_VERTEX_MANAGER_MAX_SRC_FRACTION_DEFAULT); if (slowStartMinSrcCompletionFraction < 0 || slowStartMaxSrcCompletionFraction < slowStartMinSrcCompletionFraction) { throw new IllegalArgumentException( "Invalid values for slowStartMinSrcCompletionFraction" + "/slowStartMaxSrcCompletionFraction. Min cannot be < 0 and " + "max cannot be < min."); } enableAutoParallelism = conf.getBoolean( ShuffleVertexManager.TEZ_AM_SHUFFLE_VERTEX_MANAGER_ENABLE_AUTO_PARALLEL, ShuffleVertexManager.TEZ_AM_SHUFFLE_VERTEX_MANAGER_ENABLE_AUTO_PARALLEL_DEFAULT); desiredTaskInputDataSize = conf.getLong( ShuffleVertexManager.TEZ_AM_SHUFFLE_VERTEX_MANAGER_DESIRED_TASK_INPUT_SIZE, ShuffleVertexManager.TEZ_AM_SHUFFLE_VERTEX_MANAGER_DESIRED_TASK_INPUT_SIZE_DEFAULT); minTaskParallelism = conf.getInt( ShuffleVertexManager.TEZ_AM_SHUFFLE_VERTEX_MANAGER_MIN_TASK_PARALLELISM, ShuffleVertexManager.TEZ_AM_SHUFFLE_VERTEX_MANAGER_MIN_TASK_PARALLELISM_DEFAULT); LOG.info( "Shuffle Vertex Manager: settings" + " minFrac:" + slowStartMinSrcCompletionFraction + " maxFrac:" + slowStartMaxSrcCompletionFraction + " auto:" + enableAutoParallelism + " desiredTaskIput:" + desiredTaskInputDataSize + " minTasks:" + minTaskParallelism); Map<String, EdgeProperty> inputs = context.getInputVertexEdgeProperties(); for (Map.Entry<String, EdgeProperty> entry : inputs.entrySet()) { if (entry.getValue().getDataMovementType() 
== DataMovementType.SCATTER_GATHER) { String vertex = entry.getKey(); bipartiteSources.put(vertex, new HashSet<Integer>()); } } if (bipartiteSources.isEmpty()) { throw new TezUncheckedException("Atleast 1 bipartite source should exist"); } // dont track the source tasks here since those tasks may themselves be // dynamically changed as the DAG progresses. }
@Override public void initialize() { Configuration conf; try { conf = TezUtils.createConfFromUserPayload(getContext().getUserPayload()); } catch (IOException e) { throw new TezUncheckedException(e); } this.slowStartMinSrcCompletionFraction = conf.getFloat( ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_MIN_SRC_FRACTION, ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_MIN_SRC_FRACTION_DEFAULT); float defaultSlowStartMaxSrcFraction = ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_MAX_SRC_FRACTION_DEFAULT; if (slowStartMinSrcCompletionFraction > defaultSlowStartMaxSrcFraction) { defaultSlowStartMaxSrcFraction = slowStartMinSrcCompletionFraction; } this.slowStartMaxSrcCompletionFraction = conf.getFloat( ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_MAX_SRC_FRACTION, defaultSlowStartMaxSrcFraction); if (slowStartMinSrcCompletionFraction < 0 || slowStartMaxSrcCompletionFraction > 1 || slowStartMaxSrcCompletionFraction < slowStartMinSrcCompletionFraction) { throw new IllegalArgumentException( "Invalid values for slowStartMinSrcCompletionFraction" + "/slowStartMaxSrcCompletionFraction. 
Min cannot be < 0, max cannot be > 1," + " and max cannot be < min."); } enableAutoParallelism = conf.getBoolean( ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_ENABLE_AUTO_PARALLEL, ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_ENABLE_AUTO_PARALLEL_DEFAULT); desiredTaskInputDataSize = conf.getLong( ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_DESIRED_TASK_INPUT_SIZE, ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_DESIRED_TASK_INPUT_SIZE_DEFAULT); minTaskParallelism = Math.max( 1, conf.getInt( ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_MIN_TASK_PARALLELISM, ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_MIN_TASK_PARALLELISM_DEFAULT)); LOG.info( "Shuffle Vertex Manager: settings" + " minFrac:" + slowStartMinSrcCompletionFraction + " maxFrac:" + slowStartMaxSrcCompletionFraction + " auto:" + enableAutoParallelism + " desiredTaskIput:" + desiredTaskInputDataSize + " minTasks:" + minTaskParallelism); updatePendingTasks(); if (enableAutoParallelism) { getContext().vertexReconfigurationPlanned(); } // dont track the source tasks here since those tasks may themselves be // dynamically changed as the DAG progresses. }
@Test public void testReduceProcessor() throws Exception { final String dagName = "mrdag0"; String mapVertexName = MultiStageMRConfigUtil.getInitialMapVertexName(); String reduceVertexName = MultiStageMRConfigUtil.getFinalReduceVertexName(); JobConf jobConf = new JobConf(defaultConf); setUpJobConf(jobConf); MRHelpers.translateVertexConfToTez(jobConf); jobConf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0); jobConf.set( MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR, new Path(workDir, "localized-resources").toUri().toString()); jobConf.setBoolean(MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS, false); Path mapInput = new Path(workDir, "map0"); MapUtils.generateInputSplit(localFs, workDir, jobConf, mapInput); InputSpec mapInputSpec = new InputSpec( "NullSrcVertex", new InputDescriptor(MRInputLegacy.class.getName()) .setUserPayload(MRHelpers.createMRInputPayload(jobConf, null)), 1); OutputSpec mapOutputSpec = new OutputSpec( "NullDestVertex", new OutputDescriptor(LocalOnFileSorterOutput.class.getName()) .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)), 1); // Run a map LogicalIOProcessorRuntimeTask mapTask = MapUtils.createLogicalTask( localFs, workDir, jobConf, 0, mapInput, new TestUmbilical(), dagName, mapVertexName, Collections.singletonList(mapInputSpec), Collections.singletonList(mapOutputSpec)); mapTask.initialize(); mapTask.run(); mapTask.close(); LOG.info("Starting reduce..."); Token<JobTokenIdentifier> shuffleToken = new Token<JobTokenIdentifier>(); jobConf.setOutputFormat(SequenceFileOutputFormat.class); jobConf.set( MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR, new Path(workDir, "localized-resources").toUri().toString()); FileOutputFormat.setOutputPath(jobConf, new Path(workDir, "output")); ProcessorDescriptor reduceProcessorDesc = new ProcessorDescriptor(ReduceProcessor.class.getName()) .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)); InputSpec reduceInputSpec = new InputSpec( mapVertexName, new InputDescriptor(LocalMergedInput.class.getName()) 
.setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)), 1); OutputSpec reduceOutputSpec = new OutputSpec( "NullDestinationVertex", new OutputDescriptor(MROutputLegacy.class.getName()) .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)), 1); // Now run a reduce TaskSpec taskSpec = new TaskSpec( TezTestUtils.getMockTaskAttemptId(0, 1, 0, 0), dagName, reduceVertexName, reduceProcessorDesc, Collections.singletonList(reduceInputSpec), Collections.singletonList(reduceOutputSpec), null); Map<String, ByteBuffer> serviceConsumerMetadata = new HashMap<String, ByteBuffer>(); serviceConsumerMetadata.put( ShuffleUtils.SHUFFLE_HANDLER_SERVICE_ID, ShuffleUtils.convertJobTokenToBytes(shuffleToken)); LogicalIOProcessorRuntimeTask task = new LogicalIOProcessorRuntimeTask( taskSpec, 0, jobConf, new String[] {workDir.toString()}, new TestUmbilical(), serviceConsumerMetadata, HashMultimap.<String, String>create()); task.initialize(); task.run(); task.close(); // MRTask mrTask = (MRTask)t.getProcessor(); // TODO NEWTEZ Verify the partitioner has not been created // Likely not applicable anymore. // Assert.assertNull(mrTask.getPartitioner()); // Only a task commit happens, hence the data is still in the temporary directory. Path reduceOutputDir = new Path( new Path(workDir, "output"), "_temporary/0/" + IDConverter.toMRTaskIdForOutput(TezTestUtils.getMockTaskId(0, 1, 0))); Path reduceOutputFile = new Path(reduceOutputDir, "part-v001-o000-00000"); SequenceFile.Reader reader = new SequenceFile.Reader(localFs, reduceOutputFile, jobConf); LongWritable key = new LongWritable(); Text value = new Text(); long prev = Long.MIN_VALUE; while (reader.next(key, value)) { if (prev != Long.MIN_VALUE) { Assert.assertTrue(prev < key.get()); prev = key.get(); } } reader.close(); }
@Test(timeout = 5000) public void testTaskAttemptFailedKilled() throws IOException, TezException { ApplicationId appId = ApplicationId.newInstance(1000, 1); ApplicationAttemptId appAttemptId = ApplicationAttemptId.newInstance(appId, 1); Credentials credentials = new Credentials(); AppContext appContext = mock(AppContext.class); EventHandler eventHandler = mock(EventHandler.class); DAG dag = mock(DAG.class); AMContainerMap amContainerMap = mock(AMContainerMap.class); Map<ApplicationAccessType, String> appAcls = new HashMap<ApplicationAccessType, String>(); doReturn(eventHandler).when(appContext).getEventHandler(); doReturn(dag).when(appContext).getCurrentDAG(); doReturn(appAttemptId).when(appContext).getApplicationAttemptId(); doReturn(credentials).when(appContext).getAppCredentials(); doReturn(appAcls).when(appContext).getApplicationACLs(); doReturn(amContainerMap).when(appContext).getAllContainers(); NodeId nodeId = NodeId.newInstance("localhost", 0); AMContainer amContainer = mock(AMContainer.class); Container container = mock(Container.class); doReturn(nodeId).when(container).getNodeId(); doReturn(amContainer).when(amContainerMap).get(any(ContainerId.class)); doReturn(container).when(amContainer).getContainer(); Configuration conf = new TezConfiguration(); UserPayload userPayload = TezUtils.createUserPayloadFromConf(conf); TaskCommunicatorManager taskAttemptListener = new TaskCommunicatorManager( appContext, mock(TaskHeartbeatHandler.class), mock(ContainerHeartbeatHandler.class), Lists.newArrayList( new NamedEntityDescriptor(TezConstants.getTezYarnServicePluginName(), null) .setUserPayload(userPayload))); TaskSpec taskSpec1 = mock(TaskSpec.class); TezTaskAttemptID taskAttemptId1 = mock(TezTaskAttemptID.class); doReturn(taskAttemptId1).when(taskSpec1).getTaskAttemptID(); AMContainerTask amContainerTask1 = new AMContainerTask(taskSpec1, null, null, false, 10); TaskSpec taskSpec2 = mock(TaskSpec.class); TezTaskAttemptID taskAttemptId2 = 
mock(TezTaskAttemptID.class); doReturn(taskAttemptId2).when(taskSpec2).getTaskAttemptID(); AMContainerTask amContainerTask2 = new AMContainerTask(taskSpec2, null, null, false, 10); ContainerId containerId1 = createContainerId(appId, 1); taskAttemptListener.registerRunningContainer(containerId1, 0); taskAttemptListener.registerTaskAttempt(amContainerTask1, containerId1, 0); ContainerId containerId2 = createContainerId(appId, 2); taskAttemptListener.registerRunningContainer(containerId2, 0); taskAttemptListener.registerTaskAttempt(amContainerTask2, containerId2, 0); taskAttemptListener.taskFailed( taskAttemptId1, TaskAttemptEndReason.COMMUNICATION_ERROR, "Diagnostics1"); taskAttemptListener.taskKilled( taskAttemptId2, TaskAttemptEndReason.EXECUTOR_BUSY, "Diagnostics2"); ArgumentCaptor<Event> argumentCaptor = ArgumentCaptor.forClass(Event.class); verify(eventHandler, times(2)).handle(argumentCaptor.capture()); assertTrue(argumentCaptor.getAllValues().get(0) instanceof TaskAttemptEventAttemptFailed); assertTrue(argumentCaptor.getAllValues().get(1) instanceof TaskAttemptEventAttemptKilled); TaskAttemptEventAttemptFailed failedEvent = (TaskAttemptEventAttemptFailed) argumentCaptor.getAllValues().get(0); TaskAttemptEventAttemptKilled killedEvent = (TaskAttemptEventAttemptKilled) argumentCaptor.getAllValues().get(1); assertEquals("Diagnostics1", failedEvent.getDiagnosticInfo()); assertEquals( TaskAttemptTerminationCause.COMMUNICATION_ERROR, failedEvent.getTerminationCause()); assertEquals("Diagnostics2", killedEvent.getDiagnosticInfo()); assertEquals(TaskAttemptTerminationCause.SERVICE_BUSY, killedEvent.getTerminationCause()); // TODO TEZ-2003. Verify unregistration from the registered list }