private void killUnfinishedAttempt(TaskAttempt attempt, String logMsg) {
   if (commitAttempt != null && commitAttempt.equals(attempt)) {
     LOG.info("Removing commit attempt: " + commitAttempt);
     commitAttempt = null;
   }
   if (attempt != null && !attempt.isFinished()) {
     eventHandler.handle(new TaskAttemptEventKillRequest(attempt.getID(), logMsg));
   }
 }
 @Override
 public float getProgress() {
   readLock.lock();
   try {
     TaskAttempt bestAttempt = selectBestAttempt();
     if (bestAttempt == null) {
       return 0f;
     }
     return bestAttempt.getProgress();
   } finally {
     readLock.unlock();
   }
 }
 // this is always called in read/write lock
 // TODO Verify behaviour is Task is killed (no finished attempt)
 private long getFinishTime() {
   if (!isFinished()) {
     return 0;
   }
   long finishTime = 0;
   for (TaskAttempt at : attempts.values()) {
     // select the max finish time of all attempts
     // FIXME shouldnt this not count attempts killed after an attempt succeeds
     if (finishTime < at.getFinishTime()) {
       finishTime = at.getFinishTime();
     }
   }
   return finishTime;
 }
  @Override
  public boolean canCommit(TezTaskAttemptID taskAttemptID) {
    writeLock.lock();
    try {
      if (getState() != TaskState.RUNNING) {
        LOG.info("Task not running. Issuing kill to bad commit attempt " + taskAttemptID);
        eventHandler.handle(
            new TaskAttemptEventKillRequest(taskAttemptID, "Task not running. Bad attempt."));
        return false;
      }
      if (commitAttempt == null) {
        TaskAttempt ta = getAttempt(taskAttemptID);
        if (ta == null) {
          throw new TezUncheckedException("Unknown task for commit: " + taskAttemptID);
        }
        // Its ok to get a non-locked state snapshot since we handle changes of
        // state in the task attempt. Dont want to deadlock here.
        TaskAttemptState taState = ta.getStateNoLock();
        if (taState == TaskAttemptState.RUNNING) {
          commitAttempt = taskAttemptID;
          LOG.info(taskAttemptID + " given a go for committing the task output.");
          return true;
        } else {
          LOG.info(
              taskAttemptID
                  + " with state: "
                  + taState
                  + " given a no-go for commit because its not running.");
          return false;
        }
      } else {
        if (commitAttempt.equals(taskAttemptID)) {
          LOG.info(taskAttemptID + " given a go for committing the task output.");
          return true;
        }
        // Don't think this can be a pluggable decision, so simply raise an
        // event for the TaskAttempt to delete its output.
        // Wait for commit attempt to succeed. Dont kill this. If commit
        // attempt fails then choose a different committer. When commit attempt
        // succeeds then this and others will be killed
        LOG.info(commitAttempt + " is current committer. Commit waiting for:  " + taskAttemptID);
        return false;
      }

    } finally {
      writeLock.unlock();
    }
  }
 @Override
 public TezCounters getCounters() {
   TezCounters counters = null;
   readLock.lock();
   try {
     TaskAttempt bestAttempt = selectBestAttempt();
     if (bestAttempt != null) {
       counters = bestAttempt.getCounters();
     } else {
       counters = TaskAttemptImpl.EMPTY_COUNTERS;
       //        counters.groups = new HashMap<CharSequence, CounterGroup>();
     }
     return counters;
   } finally {
     readLock.unlock();
   }
 }
    @Override
    public void transition(TaskImpl task, TaskEvent event) {
      TezTaskAttemptID successTaId = ((TaskEventTAUpdate) event).getTaskAttemptID();

      if (task.commitAttempt != null && !task.commitAttempt.equals(successTaId)) {
        // The succeeded attempt is not the one that was selected to commit
        // This is impossible and has to be a bug
        throw new TezUncheckedException(
            "TA: "
                + successTaId
                + " succeeded but TA: "
                + task.commitAttempt
                + " was expected to commit and succeed");
      }

      task.handleTaskAttemptCompletion(successTaId, TaskAttemptStateInternal.SUCCEEDED);
      task.finishedAttempts++;
      --task.numberUncompletedAttempts;
      task.successfulAttempt = successTaId;
      task.eventHandler.handle(new VertexEventTaskCompleted(task.taskId, TaskState.SUCCEEDED));
      LOG.info("Task succeeded with attempt " + task.successfulAttempt);
      if (task.historyTaskStartGenerated) {
        task.logJobHistoryTaskFinishedEvent();
      }

      // issue kill to all other attempts
      for (TaskAttempt attempt : task.attempts.values()) {
        if (attempt.getID() != task.successfulAttempt
            &&
            // This is okay because it can only talk us out of sending a
            //  TA_KILL message to an attempt that doesn't need one for
            //  other reasons.
            !attempt.isFinished()) {
          LOG.info("Issuing kill to other attempt " + attempt.getID());
          task.eventHandler.handle(
              new TaskAttemptEventKillRequest(attempt.getID(), "Alternate attempt succeeded"));
        }
      }
      // send notification to DAG scheduler
      task.eventHandler.handle(
          new DAGEventSchedulerUpdate(
              DAGEventSchedulerUpdate.UpdateType.TA_SUCCEEDED,
              task.attempts.get(task.successfulAttempt)));
      task.finished(TaskStateInternal.SUCCEEDED);
    }
Example #7
0
  public void testBasicSpeculation(boolean withProgress) throws Exception {
    DAG dag = DAG.create("test");
    Vertex vA = Vertex.create("A", ProcessorDescriptor.create("Proc.class"), 5);
    dag.addVertex(vA);

    MockTezClient tezClient = createTezSession();

    DAGClient dagClient = tezClient.submitDAG(dag);
    DAGImpl dagImpl = (DAGImpl) mockApp.getContext().getCurrentDAG();
    TezVertexID vertexId = TezVertexID.getInstance(dagImpl.getID(), 0);
    // original attempt is killed and speculative one is successful
    TezTaskAttemptID killedTaId =
        TezTaskAttemptID.getInstance(TezTaskID.getInstance(vertexId, 0), 0);
    TezTaskAttemptID successTaId =
        TezTaskAttemptID.getInstance(TezTaskID.getInstance(vertexId, 0), 1);

    mockLauncher.updateProgress(withProgress);
    // cause speculation trigger
    mockLauncher.setStatusUpdatesForTask(killedTaId, 100);

    mockLauncher.startScheduling(true);
    dagClient.waitForCompletion();
    Assert.assertEquals(DAGStatus.State.SUCCEEDED, dagClient.getDAGStatus(null).getState());
    Task task = dagImpl.getTask(killedTaId.getTaskID());
    Assert.assertEquals(2, task.getAttempts().size());
    Assert.assertEquals(successTaId, task.getSuccessfulAttempt().getID());
    TaskAttempt killedAttempt = task.getAttempt(killedTaId);
    Joiner.on(",").join(killedAttempt.getDiagnostics()).contains("Killed as speculative attempt");
    Assert.assertEquals(
        TaskAttemptTerminationCause.TERMINATED_EFFECTIVE_SPECULATION,
        killedAttempt.getTerminationCause());
    if (withProgress) {
      // without progress updates occasionally more than 1 task speculates
      Assert.assertEquals(
          1, task.getCounters().findCounter(TaskCounter.NUM_SPECULATIONS).getValue());
      Assert.assertEquals(
          1, dagImpl.getAllCounters().findCounter(TaskCounter.NUM_SPECULATIONS).getValue());
      org.apache.tez.dag.app.dag.Vertex v = dagImpl.getVertex(killedTaId.getTaskID().getVertexID());
      Assert.assertEquals(
          1, v.getAllCounters().findCounter(TaskCounter.NUM_SPECULATIONS).getValue());
    }
    tezClient.stop();
  }
 // this is always called in read/write lock
 private long getLaunchTime() {
   long taskLaunchTime = 0;
   boolean launchTimeSet = false;
   for (TaskAttempt at : attempts.values()) {
     // select the least launch time of all attempts
     long attemptLaunchTime = at.getLaunchTime();
     if (attemptLaunchTime != 0 && !launchTimeSet) {
       // For the first non-zero launch time
       launchTimeSet = true;
       taskLaunchTime = attemptLaunchTime;
     } else if (attemptLaunchTime != 0 && taskLaunchTime > attemptLaunchTime) {
       taskLaunchTime = attemptLaunchTime;
     }
   }
   if (!launchTimeSet) {
     return this.scheduledTime;
   }
   return taskLaunchTime;
 }
  // This is always called in the Write Lock
  private void addAndScheduleAttempt() {
    TaskAttempt attempt = createAttempt(attempts.size());
    if (LOG.isDebugEnabled()) {
      LOG.debug("Created attempt " + attempt.getID());
    }
    switch (attempts.size()) {
      case 0:
        attempts = Collections.singletonMap(attempt.getID(), attempt);
        break;

      case 1:
        Map<TezTaskAttemptID, TaskAttempt> newAttempts =
            new LinkedHashMap<TezTaskAttemptID, TaskAttempt>(maxAttempts);
        newAttempts.putAll(attempts);
        attempts = newAttempts;
        attempts.put(attempt.getID(), attempt);
        break;

      default:
        attempts.put(attempt.getID(), attempt);
        break;
    }

    // TODO: Recovery
    /*
    // Update nextATtemptNumber
    if (taskAttemptsFromPreviousGeneration.isEmpty()) {
      ++nextAttemptNumber;
    } else {
      // There are still some TaskAttempts from previous generation, use them
      nextAttemptNumber =
          taskAttemptsFromPreviousGeneration.remove(0).getAttemptId().getId();
    }
    */

    ++numberUncompletedAttempts;
    // schedule the nextAttemptNumber
    // send event to DAG to assign priority and schedule the attempt with global
    // picture in mind
    eventHandler.handle(
        new DAGEventSchedulerUpdate(DAGEventSchedulerUpdate.UpdateType.TA_SCHEDULE, attempt));
  }
  @Test(timeout = 10000)
  public void testDAGSchedulerNaturalOrder() {
    DAG mockDag = mock(DAG.class);
    Vertex mockVertex = mock(Vertex.class);
    TaskAttempt mockAttempt = mock(TaskAttempt.class);
    when(mockDag.getVertex((TezVertexID) any())).thenReturn(mockVertex);
    when(mockVertex.getDistanceFromRoot()).thenReturn(0).thenReturn(1).thenReturn(2);
    when(mockAttempt.getIsRescheduled()).thenReturn(false);

    DAGEventSchedulerUpdate event =
        new DAGEventSchedulerUpdate(DAGEventSchedulerUpdate.UpdateType.TA_SCHEDULE, mockAttempt);

    DAGScheduler scheduler = new DAGSchedulerNaturalOrder(mockDag, mockEventHandler);
    scheduler.scheduleTask(event);
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 2);
    scheduler.scheduleTask(event);
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 4);
    scheduler.scheduleTask(event);
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 6);

    when(mockAttempt.getIsRescheduled()).thenReturn(true);
    scheduler.scheduleTask(event);
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 5);
  }
  // select the nextAttemptNumber with best progress
  // always called inside the Read Lock
  private TaskAttempt selectBestAttempt() {
    float progress = 0f;
    TaskAttempt result = null;
    for (TaskAttempt at : attempts.values()) {
      switch (at.getState()) {

          // ignore all failed task attempts
        case FAILED:
        case KILLED:
          continue;
        default:
      }
      if (result == null) {
        result = at; // The first time around
      }
      // calculate the best progress
      float attemptProgress = at.getProgress();
      if (attemptProgress > progress) {
        result = at;
        progress = attemptProgress;
      }
    }
    return result;
  }
  @Override
  public TaskReport getReport() {
    // TODO TEZPB This is broken. Records will not work without the PBImpl, which
    // is in a different package.
    TaskReport report = Records.newRecord(TaskReport.class);
    readLock.lock();
    try {
      report.setTaskId(taskId);
      report.setStartTime(getLaunchTime());
      report.setFinishTime(getFinishTime());
      report.setTaskState(getState());
      report.setProgress(getProgress());

      for (TaskAttempt attempt : attempts.values()) {
        if (TaskAttemptState.RUNNING.equals(attempt.getState())) {
          report.addRunningAttempt(attempt.getID());
        }
      }

      report.setSuccessfulAttempt(successfulAttempt);

      for (TaskAttempt att : attempts.values()) {
        String prefix = "AttemptID:" + att.getID() + " Info:";
        for (CharSequence cs : att.getDiagnostics()) {
          report.addDiagnostics(prefix + cs);
        }
      }

      // Add a copy of counters as the last step so that their lifetime on heap
      // is as small as possible.
      report.setCounters(getCounters());

      return report;
    } finally {
      readLock.unlock();
    }
  }
  @Ignore
  @Test(timeout = 10000)
  public void testDAGSchedulerMRR() {
    DAG mockDag = mock(DAG.class);
    TezDAGID dagId = TezDAGID.getInstance("1", 1, 1);

    TaskSchedulerEventHandler mockTaskScheduler = mock(TaskSchedulerEventHandler.class);

    Vertex mockVertex1 = mock(Vertex.class);
    TezVertexID mockVertexId1 = TezVertexID.getInstance(dagId, 1);
    when(mockVertex1.getVertexId()).thenReturn(mockVertexId1);
    when(mockVertex1.getDistanceFromRoot()).thenReturn(0);
    TaskAttempt mockAttempt1 = mock(TaskAttempt.class);
    when(mockAttempt1.getVertexID()).thenReturn(mockVertexId1);
    when(mockAttempt1.getIsRescheduled()).thenReturn(false);
    when(mockDag.getVertex(mockVertexId1)).thenReturn(mockVertex1);

    Vertex mockVertex2 = mock(Vertex.class);
    TezVertexID mockVertexId2 = TezVertexID.getInstance(dagId, 2);
    when(mockVertex2.getVertexId()).thenReturn(mockVertexId2);
    when(mockVertex2.getDistanceFromRoot()).thenReturn(1);
    TaskAttempt mockAttempt2 = mock(TaskAttempt.class);
    when(mockAttempt2.getVertexID()).thenReturn(mockVertexId2);
    when(mockAttempt2.getIsRescheduled()).thenReturn(false);
    when(mockDag.getVertex(mockVertexId2)).thenReturn(mockVertex2);
    TaskAttempt mockAttempt2f = mock(TaskAttempt.class);
    when(mockAttempt2f.getVertexID()).thenReturn(mockVertexId2);
    when(mockAttempt2f.getIsRescheduled()).thenReturn(true);

    Vertex mockVertex3 = mock(Vertex.class);
    TezVertexID mockVertexId3 = TezVertexID.getInstance(dagId, 3);
    when(mockVertex3.getVertexId()).thenReturn(mockVertexId3);
    when(mockVertex3.getDistanceFromRoot()).thenReturn(2);
    TaskAttempt mockAttempt3 = mock(TaskAttempt.class);
    when(mockAttempt3.getVertexID()).thenReturn(mockVertexId3);
    when(mockAttempt3.getIsRescheduled()).thenReturn(false);
    when(mockDag.getVertex(mockVertexId3)).thenReturn(mockVertex3);

    DAGEventSchedulerUpdate mockEvent1 = mock(DAGEventSchedulerUpdate.class);
    when(mockEvent1.getAttempt()).thenReturn(mockAttempt1);
    DAGEventSchedulerUpdate mockEvent2 = mock(DAGEventSchedulerUpdate.class);
    when(mockEvent2.getAttempt()).thenReturn(mockAttempt2);
    DAGEventSchedulerUpdate mockEvent2f = mock(DAGEventSchedulerUpdate.class);
    when(mockEvent2f.getAttempt()).thenReturn(mockAttempt2f);
    DAGEventSchedulerUpdate mockEvent3 = mock(DAGEventSchedulerUpdate.class);
    when(mockEvent3.getAttempt()).thenReturn(mockAttempt3);
    DAGScheduler scheduler =
        new DAGSchedulerMRR(mockDag, mockEventHandler, mockTaskScheduler, 0.5f);

    // M starts. M completes. R1 starts. R1 completes. R2 starts. R2 completes
    scheduler.scheduleTask(mockEvent1); // M starts
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 3);
    scheduler.scheduleTask(mockEvent1); // M runs another
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 3);
    scheduler.vertexCompleted(mockVertex1); // M completes
    scheduler.scheduleTask(mockEvent2); // R1 starts
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 6);
    scheduler.scheduleTask(mockEvent2); // R1 runs another
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 6);
    scheduler.scheduleTask(mockEvent2f); // R1 runs retry. Retry priority
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 4);
    scheduler.vertexCompleted(mockVertex2); // R1 completes
    scheduler.scheduleTask(mockEvent3); // R2 starts
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 9);
    scheduler.scheduleTask(mockEvent3); // R2 runs another
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 9);
    scheduler.vertexCompleted(mockVertex3); // R2 completes

    // M starts. R1 starts. M completes. R2 starts. R1 completes. R2 completes
    scheduler.scheduleTask(mockEvent1); // M starts
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 3);
    scheduler.scheduleTask(mockEvent2); // R1 starts. Reordered priority
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 2);
    scheduler.scheduleTask(mockEvent1); // M runs another
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 3);
    scheduler.scheduleTask(mockEvent2); // R1 runs another. Reordered priority
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 2);
    scheduler.scheduleTask(mockEvent2f); // R1 runs retry. Reordered priority
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 2);
    scheduler.vertexCompleted(mockVertex1); // M completes
    scheduler.scheduleTask(mockEvent3); // R2 starts. Reordered priority
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 5);
    scheduler.scheduleTask(mockEvent2); // R1 runs another. Normal priority
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 6);
    scheduler.scheduleTask(mockEvent2f); // R1 runs retry. Retry priority
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 4);
    scheduler.scheduleTask(mockEvent3); // R2 runs another. Reordered priority
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 5);
    scheduler.vertexCompleted(mockVertex2); // R1 completes
    scheduler.vertexCompleted(mockVertex3); // R2 completes

    // M starts. M completes. R1 starts. R2 starts. R1 completes. R2 completes
    scheduler.scheduleTask(mockEvent1); // M starts
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 3);
    scheduler.vertexCompleted(mockVertex1); // M completes
    scheduler.scheduleTask(mockEvent2); // R1 starts
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 6);
    scheduler.scheduleTask(mockEvent3); // R2 starts. Reordered priority
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 5);
    scheduler.scheduleTask(mockEvent2); // R1 runs another
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 6);
    scheduler.vertexCompleted(mockVertex2); // R1 completes
    scheduler.vertexCompleted(mockVertex3); // R2 completes

    // M starts. R1 starts. M completes. R1 completes. R2 starts. R2 completes
    scheduler.scheduleTask(mockEvent1); // M starts
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 3);
    scheduler.scheduleTask(mockEvent2); // R1 starts. Reordered priority
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 2);
    scheduler.vertexCompleted(mockVertex1); // M completes
    scheduler.scheduleTask(mockEvent2); // R1 starts. Normal priority
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 6);
    scheduler.vertexCompleted(mockVertex2); // R1 completes
    scheduler.scheduleTask(mockEvent3); // R2 starts
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 9);
    scheduler.vertexCompleted(mockVertex3); // R2 completes
  }