Beispiel #1
0
 /**
  * Recovery path: restoreFromTaskStartedEvent -> restoreFromTaskAttemptFinishedEvent (FAILED) ->
  * RecoverTransition.
  *
  * <p>Replays only a FAILED finished event for the first attempt (no started event), triggers
  * task recovery, and checks that the task is RUNNING with one failed attempt and two attempts
  * in total.
  */
 @Test(timeout = 5000)
 public void testRecovery_OnlyTAFinishedEvent_FAILED() {
   restoreFromTaskStartEvent();

   // Replay a FAILED finished event for a brand-new attempt id.
   TezTaskAttemptID failedAttemptId = getNewTaskAttemptID(task.getTaskId());
   TaskAttemptFinishedEvent failedAttemptEvent =
       new TaskAttemptFinishedEvent(
           failedAttemptId,
           vertexName,
           0L,
           0L,
           TaskAttemptState.FAILED,
           TaskAttemptTerminationCause.CONTAINER_LAUNCH_FAILED,
           "",
           new TezCounters(),
           0,
           null);
   task.restoreFromEvent(failedAttemptEvent);

   TaskEventRecoverTask recoverEvent = new TaskEventRecoverTask(task.getTaskId());
   task.handle(recoverEvent);
   // Block until the dispatcher has drained so the follow-up attempt has been created.
   dispatcher.await();

   assertEquals(TaskStateInternal.RUNNING, task.getInternalState());
   // The first attempt is recovered as FAILED and a second attempt is added next to it.
   assertEquals(2, task.getAttempts().size());
   assertEquals(1, task.getFinishedAttemptsCount());
   assertEquals(1, task.failedAttempts);
   assertEquals(null, task.successfulAttempt);
 }
  /**
   * Verifies that a job whose single map task fails does not hang while aborting.
   *
   * <p>The job is configured with one mapper, no reducers and a single map attempt. Every task is
   * scheduled and each of its attempts is failed; the job is then expected to pass through
   * FAIL_ABORT, call {@code abortJob} on the committer exactly once, and end up FAILED.
   *
   * <p>The dispatcher and the committer event handler are stopped in a {@code finally} block so
   * that a failed assertion does not leave their services running for subsequent tests.
   *
   * @throws IOException declared because {@code OutputCommitter.abortJob} is invoked through the
   *     Mockito verification call
   */
  @Test(timeout = 10000)
  public void testFailAbortDoesntHang() throws IOException {
    Configuration conf = new Configuration();
    conf.set(MRJobConfig.MR_AM_STAGING_DIR, stagingDir);
    conf.set(MRJobConfig.MR_AM_COMMITTER_CANCEL_TIMEOUT_MS, "1000");

    DrainDispatcher dispatcher = new DrainDispatcher();
    dispatcher.init(conf);
    dispatcher.start();
    CommitterEventHandler commitHandler = null;
    try {
      OutputCommitter committer = Mockito.mock(OutputCommitter.class);
      commitHandler = createCommitterEventHandler(dispatcher, committer);
      commitHandler.init(conf);
      commitHandler.start();
      // Job has only 1 mapper task. No reducers
      conf.setInt(MRJobConfig.NUM_REDUCES, 0);
      conf.setInt(MRJobConfig.MAP_MAX_ATTEMPTS, 1);
      JobImpl job = createRunningStubbedJob(conf, dispatcher, 1, null);

      // Fail / finish all the tasks. This should land the JobImpl directly in the
      // FAIL_ABORT state
      for (Task t : job.tasks.values()) {
        TaskImpl task = (TaskImpl) t;
        task.handle(new TaskEvent(task.getID(), TaskEventType.T_SCHEDULE));
        for (TaskAttempt ta : task.getAttempts().values()) {
          task.handle(new TaskTAttemptEvent(ta.getID(), TaskEventType.T_ATTEMPT_FAILED));
        }
      }
      assertJobState(job, JobStateInternal.FAIL_ABORT);

      dispatcher.await();
      // Verify abortJob is called once and the job failed
      Mockito.verify(committer, Mockito.timeout(2000).times(1))
          .abortJob((JobContext) Mockito.any(), (State) Mockito.any());
      assertJobState(job, JobStateInternal.FAILED);
    } finally {
      // Always shut the services down, even when an assertion above fails; the nested
      // try/finally guarantees the committer handler is stopped if dispatcher.stop() throws.
      try {
        dispatcher.stop();
      } finally {
        if (commitHandler != null) {
          commitHandler.stop();
        }
      }
    }
  }
Beispiel #3
0
  /**
   * Recovery path: restoreFromTaskStartedEvent -> restoreFromTaskAttemptStartedEvent ->
   * RecoverTransition.
   *
   * <p>Only the started event of the first attempt is replayed. After recovery the task is
   * expected to be RUNNING with two attempts, one of them finished and none counted as failed.
   */
  @Test(timeout = 5000)
  public void testRecovery_OneTAStarted() {
    restoreFromTaskStartEvent();
    TezTaskAttemptID startedAttemptId = getNewTaskAttemptID(task.getTaskId());
    restoreFromFirstTaskAttemptStartEvent(startedAttemptId);

    TaskEventRecoverTask recoverEvent = new TaskEventRecoverTask(task.getTaskId());
    task.handle(recoverEvent);
    // Block until the dispatcher has drained so the follow-up attempt has been created.
    dispatcher.await();

    assertEquals(TaskStateInternal.RUNNING, task.getInternalState());
    // The first attempt is recovered as KILLED (finished, but not failed) and a second one is added.
    assertEquals(2, task.getAttempts().size());
    assertEquals(1, task.getFinishedAttemptsCount());
    assertEquals(0, task.failedAttempts);
    assertEquals(null, task.successfulAttempt);
  }
Beispiel #4
0
  /**
   * Builds the per-test fixture: a started {@code DrainDispatcher}, a deep-stubbed vertex and
   * application context, and the {@code TaskImpl} under test whose vertex reports a single output
   * committer registered under "out1".
   */
  @Before
  public void setUp() {
    // Dispatcher: DAG and vertex events go to mocks, task events to a TaskEventHandler and
    // task-attempt events to taEventHandler.
    dispatcher = new DrainDispatcher();
    dispatcher.register(DAGEventType.class, mock(EventHandler.class));
    dispatcher.register(VertexEventType.class, mock(EventHandler.class));
    dispatcher.register(TaskEventType.class, new TaskEventHandler());
    dispatcher.register(TaskAttemptEventType.class, taEventHandler);
    Configuration dispatcherConf = new Configuration();
    dispatcher.init(dispatcherConf);
    dispatcher.start();

    // Vertex mock with an empty processor class name.
    vertex = mock(Vertex.class, RETURNS_DEEP_STUBS);
    when(vertex.getProcessorDescriptor().getClassName()).thenReturn("");

    // App context that resolves every vertex id to the mock above and exposes the history handler.
    mockAppContext = mock(AppContext.class, RETURNS_DEEP_STUBS);
    when(mockAppContext.getCurrentDAG().getVertex(any(TezVertexID.class))).thenReturn(vertex);
    mockHistoryEventHandler = new MockHistoryEventHandler(mockAppContext);
    when(mockAppContext.getHistoryHandler()).thenReturn(mockHistoryEventHandler);

    // Collaborators of the task under test.
    TaskCommunicatorManagerInterface taskCommunicatorManager =
        mock(TaskCommunicatorManagerInterface.class);
    SystemClock taskClock = new SystemClock();
    TaskHeartbeatHandler taskHeartbeatHandler = mock(TaskHeartbeatHandler.class);
    Resource taskResource = Resource.newInstance(1, 1);
    ContainerContext taskContainerContext = mock(ContainerContext.class);
    StateChangeNotifier taskStateChangeNotifier = mock(StateChangeNotifier.class);
    task =
        new TaskImpl(
            vertexId,
            0,
            dispatcher.getEventHandler(),
            new Configuration(),
            taskCommunicatorManager,
            taskClock,
            taskHeartbeatHandler,
            mockAppContext,
            false,
            taskResource,
            taskContainerContext,
            taskStateChangeNotifier,
            vertex);

    // The task's vertex reports exactly one output committer, keyed by "out1".
    Map<String, OutputCommitter> committersByOutput = new HashMap<String, OutputCommitter>();
    TestOutputCommitter outCommitter =
        new TestOutputCommitter(mock(OutputCommitterContext.class), true, false);
    committersByOutput.put("out1", outCommitter);
    when(task.getVertex().getOutputCommitters()).thenReturn(committersByOutput);
  }
  /**
   * Exercises node re-registration ("reconnect") against a {@code MockRM} and checks the cluster
   * and root-queue metrics after each step.
   *
   * <p>Scenarios covered, in order: reconnect of a healthy node, reconnect of an unhealthy node,
   * an unhealthy node turning healthy, reconnect with a changed memory capability, reconnect with
   * a changed capability plus a running application, and reconnect with a changed HTTP port.
   *
   * <p>The RM is built with the test's {@code DrainDispatcher} and with a scheduler event
   * dispatcher whose {@code handle} forwards directly to {@code scheduler.handle(event)}.
   */
  @Test
  public void testReconnectNode() throws Exception {
    final DrainDispatcher dispatcher = new DrainDispatcher();
    rm =
        new MockRM() {
          @Override
          protected EventHandler<SchedulerEvent> createSchedulerEventDispatcher() {
            return new SchedulerEventDispatcher(this.scheduler) {
              @Override
              public void handle(SchedulerEvent event) {
                // Hand the event straight to the scheduler instead of going through the
                // parent dispatcher's own handling.
                scheduler.handle(event);
              }
            };
          }

          @Override
          protected Dispatcher createDispatcher() {
            // Use the drainable dispatcher so the test can wait for pending events.
            return dispatcher;
          }
        };
    rm.start();

    // Baseline: host1 heartbeats healthy, host2 heartbeats unhealthy.
    MockNM nm1 = rm.registerNode("host1:1234", 5120);
    MockNM nm2 = rm.registerNode("host2:5678", 5120);
    nm1.nodeHeartbeat(true);
    nm2.nodeHeartbeat(false);
    dispatcher.await();
    checkUnealthyNMCount(rm, nm2, true, 1);
    // Remember the active-NM count so later reconnects can be checked against it.
    final int expectedNMs = ClusterMetrics.getMetrics().getNumActiveNMs();
    QueueMetrics metrics = rm.getResourceScheduler().getRootQueueMetrics();
    // TODO Metrics incorrect in case of the FifoScheduler
    // Only the healthy node's 5120 MB is expected to be available.
    Assert.assertEquals(5120, metrics.getAvailableMB());

    // reconnect of healthy node
    nm1 = rm.registerNode("host1:1234", 5120);
    NodeHeartbeatResponse response = nm1.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction()));
    dispatcher.await();
    Assert.assertEquals(expectedNMs, ClusterMetrics.getMetrics().getNumActiveNMs());
    checkUnealthyNMCount(rm, nm2, true, 1);

    // reconnect of unhealthy node
    nm2 = rm.registerNode("host2:5678", 5120);
    response = nm2.nodeHeartbeat(false);
    Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction()));
    dispatcher.await();
    Assert.assertEquals(expectedNMs, ClusterMetrics.getMetrics().getNumActiveNMs());
    checkUnealthyNMCount(rm, nm2, true, 1);

    // unhealthy node changed back to healthy
    nm2 = rm.registerNode("host2:5678", 5120);
    dispatcher.await();
    // Two healthy heartbeats are sent back to back before draining.
    // NOTE(review): presumably the first one drives the state change - confirm.
    response = nm2.nodeHeartbeat(true);
    response = nm2.nodeHeartbeat(true);
    dispatcher.await();
    // Both nodes now contribute their memory.
    Assert.assertEquals(5120 + 5120, metrics.getAvailableMB());

    // reconnect of node with changed capability
    // Note: the host2 node is re-registered here but the handle is stored in nm1.
    nm1 = rm.registerNode("host2:5678", 10240);
    dispatcher.await();
    response = nm1.nodeHeartbeat(true);
    dispatcher.await();
    Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction()));
    Assert.assertEquals(5120 + 10240, metrics.getAvailableMB());

    // reconnect of node with changed capability and running applications
    List<ApplicationId> runningApps = new ArrayList<ApplicationId>();
    runningApps.add(ApplicationId.newInstance(1, 0));
    nm1 = rm.registerNode("host2:5678", 15360, 2, runningApps);
    dispatcher.await();
    response = nm1.nodeHeartbeat(true);
    dispatcher.await();
    Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction()));
    Assert.assertEquals(5120 + 15360, metrics.getAvailableMB());

    // reconnect healthy node changing http port
    nm1 = new MockNM("host1:1234", 5120, rm.getResourceTrackerService());
    nm1.setHttpPort(3);
    nm1.registerNode();
    dispatcher.await();
    response = nm1.nodeHeartbeat(true);
    response = nm1.nodeHeartbeat(true);
    dispatcher.await();
    // The RM's view of host1 must carry the new port while capability and metrics are unchanged.
    RMNode rmNode = rm.getRMContext().getRMNodes().get(nm1.getNodeId());
    Assert.assertEquals(3, rmNode.getHttpPort());
    Assert.assertEquals(5120, rmNode.getTotalCapability().getMemory());
    Assert.assertEquals(5120 + 15360, metrics.getAvailableMB());
  }
  /**
   * Decommissioning using a pre-configured exclude hosts file.
   *
   * <p>Three nodes register while the exclude file is empty. The file is then rewritten to list
   * "host2" and the normalized address of "localhost", the node list is refreshed, and the test
   * checks that the decommissioned-NM metric grew by two and that heartbeats from the excluded
   * nodes are answered with {@code SHUTDOWN}. Finally the exclude file is emptied again, the
   * localhost node re-registers, and the metric is expected to be one above its starting value.
   */
  @Test
  public void testDecommissionWithExcludeHosts() throws Exception {
    Configuration conf = new Configuration();
    conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, hostFile.getAbsolutePath());

    // Start with an empty exclude file so every node may register.
    writeToHostsFile("");
    final DrainDispatcher dispatcher = new DrainDispatcher();
    rm =
        new MockRM(conf) {
          @Override
          protected Dispatcher createDispatcher() {
            // Use the drainable dispatcher so the test can wait for pending events.
            return dispatcher;
          }
        };
    rm.start();

    MockNM nm1 = rm.registerNode("host1:1234", 5120);
    MockNM nm2 = rm.registerNode("host2:5678", 10240);
    MockNM nm3 = rm.registerNode("localhost:4433", 1024);

    dispatcher.await();

    // Baseline for the decommissioned-NM metric; later checks are relative to it.
    int metricCount = ClusterMetrics.getMetrics().getNumDecommisionedNMs();
    NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
    nodeHeartbeat = nm2.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
    dispatcher.await();

    // To test that IPs also work
    String ip = NetUtils.normalizeHostName("localhost");
    writeToHostsFile("host2", ip);

    rm.getNodesListManager().refreshNodes(conf);

    // host2 and the localhost node are now excluded.
    checkDecommissionedNMCount(rm, metricCount + 2);

    // The node that is not excluded keeps working; the excluded ones are told to shut down.
    nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
    nodeHeartbeat = nm2.nodeHeartbeat(true);
    Assert.assertTrue(
        "The decommisioned metrics are not updated",
        NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction()));

    nodeHeartbeat = nm3.nodeHeartbeat(true);
    Assert.assertTrue(
        "The decommisioned metrics are not updated",
        NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction()));
    dispatcher.await();

    // Clear the exclude file again and let the localhost node rejoin.
    writeToHostsFile("");
    rm.getNodesListManager().refreshNodes(conf);

    nm3 = rm.registerNode("localhost:4433", 1024);
    dispatcher.await();
    nodeHeartbeat = nm3.nodeHeartbeat(true);
    dispatcher.await();
    Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
    // The decommissioned-node count is baseline + 1, since one of the two excluded nodes
    // rejoined after the exclude file was updated.
    checkDecommissionedNMCount(rm, metricCount + 1);
  }
Beispiel #7
0
  /**
   * With n = maxFailedAttempts: replays n-1 attempts that each started and then finished as
   * FAILED, plus one last attempt for which only the started event exists.
   *
   * <p>After recovery the last attempt is expected to end up KILLED, the failed-attempt count to
   * stay at n-1, the task to remain RUNNING, and one additional attempt to be created.
   */
  @Test(timeout = 5000)
  public void testTaskRecovery_MultipleAttempts3() throws InterruptedException {
    int maxFailedAttempts =
        conf.getInt(
            TezConfiguration.TEZ_AM_TASK_MAX_FAILED_ATTEMPTS,
            TezConfiguration.TEZ_AM_TASK_MAX_FAILED_ATTEMPTS_DEFAULT);
    restoreFromTaskStartEvent();

    // Replay n-1 attempts, each with a started event followed by a FAILED finished event.
    int replayedFailures = maxFailedAttempts - 1;
    for (int replayed = 0; replayed < replayedFailures; replayed++) {
      TezTaskAttemptID failedAttemptId = getNewTaskAttemptID(task.getTaskId());
      TaskAttemptStartedEvent failedAttemptStarted =
          new TaskAttemptStartedEvent(
              failedAttemptId,
              vertexName,
              0L,
              mock(ContainerId.class),
              mock(NodeId.class),
              "",
              "",
              "",
              0,
              null,
              0);
      task.restoreFromEvent(failedAttemptStarted);
      TaskAttemptFinishedEvent failedAttemptFinished =
          new TaskAttemptFinishedEvent(
              failedAttemptId, vertexName, 0, 0, TaskAttemptState.FAILED, null, "", null, 0, null);
      task.restoreFromEvent(failedAttemptFinished);
    }
    assertEquals(maxFailedAttempts - 1, task.getAttempts().size());
    assertEquals(maxFailedAttempts - 1, task.failedAttempts);

    // Replay only the started event of the n-th attempt.
    TezTaskAttemptID lastAttemptId = getNewTaskAttemptID(task.getTaskId());
    TaskAttemptStartedEvent lastAttemptStarted =
        new TaskAttemptStartedEvent(
            lastAttemptId,
            vertexName,
            0,
            mock(ContainerId.class),
            mock(NodeId.class),
            "",
            "",
            "",
            0,
            null,
            0);
    TaskState stateAfterRestore = task.restoreFromEvent(lastAttemptStarted);

    assertEquals(TaskState.RUNNING, stateAfterRestore);
    TaskAttemptImpl lastAttemptBeforeRecover = (TaskAttemptImpl) task.getAttempt(lastAttemptId);
    assertEquals(TaskAttemptStateInternal.NEW, lastAttemptBeforeRecover.getInternalState());
    assertEquals(maxFailedAttempts, task.getAttempts().size());

    TaskEventRecoverTask recoverEvent = new TaskEventRecoverTask(task.getTaskId());
    task.handle(recoverEvent);
    // Block until the dispatcher has drained, i.e. the attempt has handled its recover event.
    dispatcher.await();

    assertEquals(TaskStateInternal.RUNNING, task.getInternalState());
    TaskAttemptImpl lastAttemptAfterRecover = (TaskAttemptImpl) (task.getAttempt(lastAttemptId));
    assertEquals(TaskAttemptStateInternal.KILLED, lastAttemptAfterRecover.getInternalState());
    assertEquals(maxFailedAttempts - 1, task.failedAttempts);

    // One more attempt has been added on top of the n replayed ones.
    assertEquals(maxFailedAttempts + 1, task.getAttempts().size());
  }
Beispiel #8
0
  /**
   * Recovery path: restoreFromTaskStartedEvent -> restoreFromTaskAttemptStartedEvent ->
   * restoreFromTaskAttemptFinishedEvent (KILLED) -> RecoverTransition.
   *
   * <p>The task's counters are checked after each replayed event and again after recovery, where
   * the attempt is expected to be KILLED and a second attempt to have been created.
   */
  @Test(timeout = 5000)
  public void testRecovery_OneTAStarted_Killed() {
    restoreFromTaskStartEvent();

    // Step 1: replay the attempt's started event.
    long attemptStartTime = taskStartTime + 100L;
    TezTaskAttemptID attemptId = getNewTaskAttemptID(task.getTaskId());
    TaskAttemptStartedEvent startedEvent =
        new TaskAttemptStartedEvent(
            attemptId,
            vertexName,
            attemptStartTime,
            mock(ContainerId.class),
            mock(NodeId.class),
            "",
            "",
            "",
            0,
            null,
            0);
    TaskState stateAfterRestore = task.restoreFromEvent(startedEvent);
    assertEquals(TaskState.RUNNING, stateAfterRestore);
    TaskAttemptImpl attemptAfterStart = (TaskAttemptImpl) task.getAttempt(attemptId);
    assertEquals(TaskAttemptStateInternal.NEW, attemptAfterStart.getInternalState());
    assertEquals(1, task.getAttempts().size());
    assertEquals(0, task.getFinishedAttemptsCount());
    assertEquals(0, task.failedAttempts);
    assertEquals(1, task.getUncompletedAttemptsCount());
    assertEquals(null, task.successfulAttempt);

    // Step 2: replay the matching KILLED finished event.
    long attemptFinishTime = attemptStartTime + 100L;
    TaskAttemptFinishedEvent killedEvent =
        new TaskAttemptFinishedEvent(
            attemptId,
            vertexName,
            attemptStartTime,
            attemptFinishTime,
            TaskAttemptState.KILLED,
            null,
            "",
            new TezCounters(),
            0,
            null);
    stateAfterRestore = task.restoreFromEvent(killedEvent);
    assertEquals(TaskState.RUNNING, stateAfterRestore);
    TaskAttemptImpl attemptAfterFinish = (TaskAttemptImpl) task.getAttempt(attemptId);
    // The attempt's internal state is still NEW until the recover event is processed.
    assertEquals(TaskAttemptStateInternal.NEW, attemptAfterFinish.getInternalState());
    assertEquals(1, task.getAttempts().size());
    assertEquals(1, task.getFinishedAttemptsCount());
    assertEquals(0, task.failedAttempts);
    assertEquals(0, task.getUncompletedAttemptsCount());
    assertEquals(null, task.successfulAttempt);

    // Step 3: trigger recovery and let the dispatcher drain (task -> attempt recover handling).
    TaskEventRecoverTask recoverEvent = new TaskEventRecoverTask(task.getTaskId());
    task.handle(recoverEvent);
    dispatcher.await();
    assertEquals(TaskStateInternal.RUNNING, task.getInternalState());
    TaskAttemptImpl attemptAfterRecover = (TaskAttemptImpl) task.getAttempt(attemptId);
    assertEquals(TaskAttemptStateInternal.KILLED, attemptAfterRecover.getInternalState());
    // A replacement attempt has been added and is the only uncompleted one.
    assertEquals(2, task.getAttempts().size());
    assertEquals(1, task.getFinishedAttemptsCount());
    assertEquals(0, task.failedAttempts);
    assertEquals(1, task.getUncompletedAttemptsCount());
    assertEquals(null, task.successfulAttempt);
  }
 /**
  * Drives the given node to RUNNING and then reports it as lost, returning only after the
  * dispatcher has processed the resulting events.
  *
  * @param nm the mock node manager to start and then mark as lost
  * @throws Exception if any of the RM or dispatcher calls fails
  */
 private void syncNodeLost(MockNM nm) throws Exception {
   rm.sendNodeStarted(nm);
   // Make sure the node is RUNNING before the lost event is sent.
   rm.NMwaitForState(nm.getNodeId(), NodeState.RUNNING);
   rm.sendNodeLost(nm);
   // Wait for the dispatcher to drain so callers observe the post-lost state.
   dispatcher.await();
 }
 /**
  * Sends one heartbeat from the given node and waits for the dispatcher to drain.
  *
  * @param nm the mock node manager that sends the heartbeat
  * @param health the health flag passed to {@code nodeHeartbeat}
  * @throws Exception if the heartbeat or the dispatcher wait fails
  */
 private void syncNodeHeartbeat(MockNM nm, boolean health) throws Exception {
   // The heartbeat response is intentionally ignored; only the side effects matter here.
   nm.nodeHeartbeat(health);
   dispatcher.await();
 }