/** * restoreFromTaskStartedEvent -> restoreFromTaskAttemptFinishedEvent (FAILED) -> RecoverTranstion */ @Test(timeout = 5000) public void testRecovery_OnlyTAFinishedEvent_FAILED() { restoreFromTaskStartEvent(); TezTaskAttemptID taId = getNewTaskAttemptID(task.getTaskId()); task.restoreFromEvent( new TaskAttemptFinishedEvent( taId, vertexName, 0L, 0L, TaskAttemptState.FAILED, TaskAttemptTerminationCause.CONTAINER_LAUNCH_FAILED, "", new TezCounters(), 0, null)); task.handle(new TaskEventRecoverTask(task.getTaskId())); // wait for the second task attempt is scheduled dispatcher.await(); assertEquals(TaskStateInternal.RUNNING, task.getInternalState()); // taskAttempt_1 is recovered to FAILED, and new task attempt is scheduled assertEquals(2, task.getAttempts().size()); assertEquals(1, task.getFinishedAttemptsCount()); assertEquals(1, task.failedAttempts); assertEquals(null, task.successfulAttempt); }
@Test(timeout = 10000) public void testFailAbortDoesntHang() throws IOException { Configuration conf = new Configuration(); conf.set(MRJobConfig.MR_AM_STAGING_DIR, stagingDir); conf.set(MRJobConfig.MR_AM_COMMITTER_CANCEL_TIMEOUT_MS, "1000"); DrainDispatcher dispatcher = new DrainDispatcher(); dispatcher.init(conf); dispatcher.start(); OutputCommitter committer = Mockito.mock(OutputCommitter.class); CommitterEventHandler commitHandler = createCommitterEventHandler(dispatcher, committer); commitHandler.init(conf); commitHandler.start(); // Job has only 1 mapper task. No reducers conf.setInt(MRJobConfig.NUM_REDUCES, 0); conf.setInt(MRJobConfig.MAP_MAX_ATTEMPTS, 1); JobImpl job = createRunningStubbedJob(conf, dispatcher, 1, null); // Fail / finish all the tasks. This should land the JobImpl directly in the // FAIL_ABORT state for (Task t : job.tasks.values()) { TaskImpl task = (TaskImpl) t; task.handle(new TaskEvent(task.getID(), TaskEventType.T_SCHEDULE)); for (TaskAttempt ta : task.getAttempts().values()) { task.handle(new TaskTAttemptEvent(ta.getID(), TaskEventType.T_ATTEMPT_FAILED)); } } assertJobState(job, JobStateInternal.FAIL_ABORT); dispatcher.await(); // Verify abortJob is called once and the job failed Mockito.verify(committer, Mockito.timeout(2000).times(1)) .abortJob((JobContext) Mockito.any(), (State) Mockito.any()); assertJobState(job, JobStateInternal.FAILED); dispatcher.stop(); }
/** restoreFromTaskStartedEvent -> restoreFromTaskAttemptStartedEvent -> RecoverTranstion */ @Test(timeout = 5000) public void testRecovery_OneTAStarted() { restoreFromTaskStartEvent(); TezTaskAttemptID taId = getNewTaskAttemptID(task.getTaskId()); restoreFromFirstTaskAttemptStartEvent(taId); task.handle(new TaskEventRecoverTask(task.getTaskId())); // wait for the second task attempt is scheduled dispatcher.await(); assertEquals(TaskStateInternal.RUNNING, task.getInternalState()); // taskAttempt_1 is recovered to KILLED, and new task attempt is scheduled assertEquals(2, task.getAttempts().size()); assertEquals(1, task.getFinishedAttemptsCount()); assertEquals(0, task.failedAttempts); assertEquals(null, task.successfulAttempt); }
/**
 * Builds the fixture used by each test: a started {@code DrainDispatcher} with handlers for DAG,
 * vertex, task and task-attempt events, a deep-stubbed vertex and app context, a history handler,
 * and the {@code TaskImpl} under test whose vertex reports a single output committer named
 * "out1".
 */
@Before
public void setUp() {
  // Event plumbing: DAG and vertex events go to throwaway mocks, task events to a real handler.
  dispatcher = new DrainDispatcher();
  dispatcher.register(DAGEventType.class, mock(EventHandler.class));
  dispatcher.register(VertexEventType.class, mock(EventHandler.class));
  dispatcher.register(TaskEventType.class, new TaskEventHandler());
  dispatcher.register(TaskAttemptEventType.class, taEventHandler);
  dispatcher.init(new Configuration());
  dispatcher.start();

  // Deep stubs let the chained getters below be stubbed in a single when(...) call.
  vertex = mock(Vertex.class, RETURNS_DEEP_STUBS);
  when(vertex.getProcessorDescriptor().getClassName()).thenReturn("");

  mockAppContext = mock(AppContext.class, RETURNS_DEEP_STUBS);
  when(mockAppContext.getCurrentDAG().getVertex(any(TezVertexID.class))).thenReturn(vertex);
  mockHistoryEventHandler = new MockHistoryEventHandler(mockAppContext);
  when(mockAppContext.getHistoryHandler()).thenReturn(mockHistoryEventHandler);

  Configuration taskConf = new Configuration();
  TaskCommunicatorManagerInterface communicatorManager =
      mock(TaskCommunicatorManagerInterface.class);
  SystemClock clock = new SystemClock();
  TaskHeartbeatHandler heartbeatHandler = mock(TaskHeartbeatHandler.class);
  Resource taskResource = Resource.newInstance(1, 1);
  ContainerContext containerContext = mock(ContainerContext.class);
  StateChangeNotifier stateChangeNotifier = mock(StateChangeNotifier.class);
  task =
      new TaskImpl(
          vertexId,
          0,
          dispatcher.getEventHandler(),
          taskConf,
          communicatorManager,
          clock,
          heartbeatHandler,
          mockAppContext,
          false,
          taskResource,
          containerContext,
          stateChangeNotifier,
          vertex);

  // Expose one committer, "out1", through the task's vertex.
  Map<String, OutputCommitter> outputCommitters = new HashMap<String, OutputCommitter>();
  OutputCommitter out1Committer =
      new TestOutputCommitter(mock(OutputCommitterContext.class), true, false);
  outputCommitters.put("out1", out1Committer);
  when(task.getVertex().getOutputCommitters()).thenReturn(outputCommitters);
}
@Test public void testReconnectNode() throws Exception { final DrainDispatcher dispatcher = new DrainDispatcher(); rm = new MockRM() { @Override protected EventHandler<SchedulerEvent> createSchedulerEventDispatcher() { return new SchedulerEventDispatcher(this.scheduler) { @Override public void handle(SchedulerEvent event) { scheduler.handle(event); } }; } @Override protected Dispatcher createDispatcher() { return dispatcher; } }; rm.start(); MockNM nm1 = rm.registerNode("host1:1234", 5120); MockNM nm2 = rm.registerNode("host2:5678", 5120); nm1.nodeHeartbeat(true); nm2.nodeHeartbeat(false); dispatcher.await(); checkUnealthyNMCount(rm, nm2, true, 1); final int expectedNMs = ClusterMetrics.getMetrics().getNumActiveNMs(); QueueMetrics metrics = rm.getResourceScheduler().getRootQueueMetrics(); // TODO Metrics incorrect in case of the FifoScheduler Assert.assertEquals(5120, metrics.getAvailableMB()); // reconnect of healthy node nm1 = rm.registerNode("host1:1234", 5120); NodeHeartbeatResponse response = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction())); dispatcher.await(); Assert.assertEquals(expectedNMs, ClusterMetrics.getMetrics().getNumActiveNMs()); checkUnealthyNMCount(rm, nm2, true, 1); // reconnect of unhealthy node nm2 = rm.registerNode("host2:5678", 5120); response = nm2.nodeHeartbeat(false); Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction())); dispatcher.await(); Assert.assertEquals(expectedNMs, ClusterMetrics.getMetrics().getNumActiveNMs()); checkUnealthyNMCount(rm, nm2, true, 1); // unhealthy node changed back to healthy nm2 = rm.registerNode("host2:5678", 5120); dispatcher.await(); response = nm2.nodeHeartbeat(true); response = nm2.nodeHeartbeat(true); dispatcher.await(); Assert.assertEquals(5120 + 5120, metrics.getAvailableMB()); // reconnect of node with changed capability nm1 = rm.registerNode("host2:5678", 10240); dispatcher.await(); response = nm1.nodeHeartbeat(true); dispatcher.await(); 
Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction())); Assert.assertEquals(5120 + 10240, metrics.getAvailableMB()); // reconnect of node with changed capability and running applications List<ApplicationId> runningApps = new ArrayList<ApplicationId>(); runningApps.add(ApplicationId.newInstance(1, 0)); nm1 = rm.registerNode("host2:5678", 15360, 2, runningApps); dispatcher.await(); response = nm1.nodeHeartbeat(true); dispatcher.await(); Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction())); Assert.assertEquals(5120 + 15360, metrics.getAvailableMB()); // reconnect healthy node changing http port nm1 = new MockNM("host1:1234", 5120, rm.getResourceTrackerService()); nm1.setHttpPort(3); nm1.registerNode(); dispatcher.await(); response = nm1.nodeHeartbeat(true); response = nm1.nodeHeartbeat(true); dispatcher.await(); RMNode rmNode = rm.getRMContext().getRMNodes().get(nm1.getNodeId()); Assert.assertEquals(3, rmNode.getHttpPort()); Assert.assertEquals(5120, rmNode.getTotalCapability().getMemory()); Assert.assertEquals(5120 + 15360, metrics.getAvailableMB()); }
/** Decommissioning using a pre-configured exclude hosts file */ @Test public void testDecommissionWithExcludeHosts() throws Exception { Configuration conf = new Configuration(); conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, hostFile.getAbsolutePath()); writeToHostsFile(""); final DrainDispatcher dispatcher = new DrainDispatcher(); rm = new MockRM(conf) { @Override protected Dispatcher createDispatcher() { return dispatcher; } }; rm.start(); MockNM nm1 = rm.registerNode("host1:1234", 5120); MockNM nm2 = rm.registerNode("host2:5678", 10240); MockNM nm3 = rm.registerNode("localhost:4433", 1024); dispatcher.await(); int metricCount = ClusterMetrics.getMetrics().getNumDecommisionedNMs(); NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); nodeHeartbeat = nm2.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); dispatcher.await(); // To test that IPs also work String ip = NetUtils.normalizeHostName("localhost"); writeToHostsFile("host2", ip); rm.getNodesListManager().refreshNodes(conf); checkDecommissionedNMCount(rm, metricCount + 2); nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); nodeHeartbeat = nm2.nodeHeartbeat(true); Assert.assertTrue( "The decommisioned metrics are not updated", NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction())); nodeHeartbeat = nm3.nodeHeartbeat(true); Assert.assertTrue( "The decommisioned metrics are not updated", NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction())); dispatcher.await(); writeToHostsFile(""); rm.getNodesListManager().refreshNodes(conf); nm3 = rm.registerNode("localhost:4433", 1024); dispatcher.await(); nodeHeartbeat = nm3.nodeHeartbeat(true); dispatcher.await(); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); // decommissined node is 1 since 1 node is rejoined after updating exclude 
// file checkDecommissionedNMCount(rm, metricCount + 1); }
/** * n = maxFailedAttempts, in the previous AM attempt, n-1 task attempts are killed. And last task * attempt is still in running state. When recovering, the last attempt should transit to killed * and task is still in running state and new task attempt is scheduled. */ @Test(timeout = 5000) public void testTaskRecovery_MultipleAttempts3() throws InterruptedException { int maxFailedAttempts = conf.getInt( TezConfiguration.TEZ_AM_TASK_MAX_FAILED_ATTEMPTS, TezConfiguration.TEZ_AM_TASK_MAX_FAILED_ATTEMPTS_DEFAULT); restoreFromTaskStartEvent(); for (int i = 0; i < maxFailedAttempts - 1; ++i) { TezTaskAttemptID taId = getNewTaskAttemptID(task.getTaskId()); task.restoreFromEvent( new TaskAttemptStartedEvent( taId, vertexName, 0L, mock(ContainerId.class), mock(NodeId.class), "", "", "", 0, null, 0)); task.restoreFromEvent( new TaskAttemptFinishedEvent( taId, vertexName, 0, 0, TaskAttemptState.FAILED, null, "", null, 0, null)); } assertEquals(maxFailedAttempts - 1, task.getAttempts().size()); assertEquals(maxFailedAttempts - 1, task.failedAttempts); TezTaskAttemptID newTaskAttemptId = getNewTaskAttemptID(task.getTaskId()); TaskState recoveredState = task.restoreFromEvent( new TaskAttemptStartedEvent( newTaskAttemptId, vertexName, 0, mock(ContainerId.class), mock(NodeId.class), "", "", "", 0, null, 0)); assertEquals(TaskState.RUNNING, recoveredState); assertEquals( TaskAttemptStateInternal.NEW, ((TaskAttemptImpl) task.getAttempt(newTaskAttemptId)).getInternalState()); assertEquals(maxFailedAttempts, task.getAttempts().size()); task.handle(new TaskEventRecoverTask(task.getTaskId())); // wait until task attempt receive the Recover event from task dispatcher.await(); assertEquals(TaskStateInternal.RUNNING, task.getInternalState()); assertEquals( TaskAttemptStateInternal.KILLED, ((TaskAttemptImpl) (task.getAttempt(newTaskAttemptId))).getInternalState()); assertEquals(maxFailedAttempts - 1, task.failedAttempts); // new task attempt is added assertEquals(maxFailedAttempts + 1, 
task.getAttempts().size()); }
/** * restoreFromTaskStartedEvent -> restoreFromTaskAttemptStartedEvent -> * restoreFromTaskAttemptFinishedEvent (KILLED) -> RecoverTransition */ @Test(timeout = 5000) public void testRecovery_OneTAStarted_Killed() { restoreFromTaskStartEvent(); long taStartTime = taskStartTime + 100L; TezTaskAttemptID taId = getNewTaskAttemptID(task.getTaskId()); TaskState recoveredState = task.restoreFromEvent( new TaskAttemptStartedEvent( taId, vertexName, taStartTime, mock(ContainerId.class), mock(NodeId.class), "", "", "", 0, null, 0)); assertEquals(TaskState.RUNNING, recoveredState); assertEquals( TaskAttemptStateInternal.NEW, ((TaskAttemptImpl) task.getAttempt(taId)).getInternalState()); assertEquals(1, task.getAttempts().size()); assertEquals(0, task.getFinishedAttemptsCount()); assertEquals(0, task.failedAttempts); assertEquals(1, task.getUncompletedAttemptsCount()); assertEquals(null, task.successfulAttempt); long taFinishTime = taStartTime + 100L; recoveredState = task.restoreFromEvent( new TaskAttemptFinishedEvent( taId, vertexName, taStartTime, taFinishTime, TaskAttemptState.KILLED, null, "", new TezCounters(), 0, null)); assertEquals(TaskState.RUNNING, recoveredState); assertEquals( TaskAttemptStateInternal.NEW, ((TaskAttemptImpl) task.getAttempt(taId)).getInternalState()); assertEquals(1, task.getAttempts().size()); assertEquals(1, task.getFinishedAttemptsCount()); assertEquals(0, task.failedAttempts); assertEquals(0, task.getUncompletedAttemptsCount()); assertEquals(null, task.successfulAttempt); task.handle(new TaskEventRecoverTask(task.getTaskId())); // wait for Task send TA_RECOVER to TA and TA complete the RecoverTransition dispatcher.await(); assertEquals(TaskStateInternal.RUNNING, task.getInternalState()); assertEquals( TaskAttemptStateInternal.KILLED, ((TaskAttemptImpl) task.getAttempt(taId)).getInternalState()); // new task attempt is scheduled assertEquals(2, task.getAttempts().size()); assertEquals(1, task.getFinishedAttemptsCount()); assertEquals(0, 
task.failedAttempts); assertEquals(1, task.getUncompletedAttemptsCount()); assertEquals(null, task.successfulAttempt); }
/**
 * Sends a node-started notification for {@code nm}, waits until the RM reports the node as
 * RUNNING, then sends a node-lost notification and blocks on {@code dispatcher.await()}.
 *
 * @param nm the mock node manager to start and then mark as lost
 * @throws Exception if any of the RM calls or the wait fails
 */
private void syncNodeLost(MockNM nm) throws Exception {
  rm.sendNodeStarted(nm);

  // The node must be RUNNING before it can be reported lost.
  NodeId startedNodeId = nm.getNodeId();
  rm.NMwaitForState(startedNodeId, NodeState.RUNNING);

  rm.sendNodeLost(nm);
  dispatcher.await();
}
/**
 * Sends one heartbeat from {@code nm} with the given health flag and then blocks on
 * {@code dispatcher.await()}. The heartbeat response is discarded.
 *
 * @param nm the mock node manager that heartbeats
 * @param health the health flag passed to {@code nodeHeartbeat}
 * @throws Exception if the heartbeat or the wait fails
 */
private void syncNodeHeartbeat(MockNM nm, boolean health) throws Exception {
  nm.nodeHeartbeat(health);

  // Block until the dispatcher returns from await().
  dispatcher.await();
}