/**
 * Requests a kill for an attempt that has not finished yet, first releasing the
 * commit slot if that attempt is the current committer.
 *
 * @param attempt the attempt to kill; when null nothing is done
 * @param logMsg diagnostic message attached to the kill request
 */
private void killUnfinishedAttempt(TaskAttempt attempt, String logMsg) {
  if (attempt == null) {
    return;
  }
  // commitAttempt stores an attempt ID, so it has to be compared with the
  // attempt's ID. Comparing it with the TaskAttempt object itself can never
  // match, which would leave a killed committer holding the commit slot.
  if (commitAttempt != null && commitAttempt.equals(attempt.getID())) {
    LOG.info("Removing commit attempt: " + commitAttempt);
    commitAttempt = null;
  }
  if (!attempt.isFinished()) {
    eventHandler.handle(new TaskAttemptEventKillRequest(attempt.getID(), logMsg));
  }
}
/**
 * Returns the progress of the attempt picked by selectBestAttempt(), or 0 when
 * no attempt is selected. The lookup runs under the read lock.
 */
@Override
public float getProgress() {
  readLock.lock();
  try {
    TaskAttempt leader = selectBestAttempt();
    return (leader != null) ? leader.getProgress() : 0f;
  } finally {
    readLock.unlock();
  }
}
// this is always called in read/write lock // TODO Verify behaviour is Task is killed (no finished attempt) private long getFinishTime() { if (!isFinished()) { return 0; } long finishTime = 0; for (TaskAttempt at : attempts.values()) { // select the max finish time of all attempts // FIXME shouldnt this not count attempts killed after an attempt succeeds if (finishTime < at.getFinishTime()) { finishTime = at.getFinishTime(); } } return finishTime; }
@Override public boolean canCommit(TezTaskAttemptID taskAttemptID) { writeLock.lock(); try { if (getState() != TaskState.RUNNING) { LOG.info("Task not running. Issuing kill to bad commit attempt " + taskAttemptID); eventHandler.handle( new TaskAttemptEventKillRequest(taskAttemptID, "Task not running. Bad attempt.")); return false; } if (commitAttempt == null) { TaskAttempt ta = getAttempt(taskAttemptID); if (ta == null) { throw new TezUncheckedException("Unknown task for commit: " + taskAttemptID); } // Its ok to get a non-locked state snapshot since we handle changes of // state in the task attempt. Dont want to deadlock here. TaskAttemptState taState = ta.getStateNoLock(); if (taState == TaskAttemptState.RUNNING) { commitAttempt = taskAttemptID; LOG.info(taskAttemptID + " given a go for committing the task output."); return true; } else { LOG.info( taskAttemptID + " with state: " + taState + " given a no-go for commit because its not running."); return false; } } else { if (commitAttempt.equals(taskAttemptID)) { LOG.info(taskAttemptID + " given a go for committing the task output."); return true; } // Don't think this can be a pluggable decision, so simply raise an // event for the TaskAttempt to delete its output. // Wait for commit attempt to succeed. Dont kill this. If commit // attempt fails then choose a different committer. When commit attempt // succeeds then this and others will be killed LOG.info(commitAttempt + " is current committer. Commit waiting for: " + taskAttemptID); return false; } } finally { writeLock.unlock(); } }
@Override public TezCounters getCounters() { TezCounters counters = null; readLock.lock(); try { TaskAttempt bestAttempt = selectBestAttempt(); if (bestAttempt != null) { counters = bestAttempt.getCounters(); } else { counters = TaskAttemptImpl.EMPTY_COUNTERS; // counters.groups = new HashMap<CharSequence, CounterGroup>(); } return counters; } finally { readLock.unlock(); } }
@Override public void transition(TaskImpl task, TaskEvent event) { TezTaskAttemptID successTaId = ((TaskEventTAUpdate) event).getTaskAttemptID(); if (task.commitAttempt != null && !task.commitAttempt.equals(successTaId)) { // The succeeded attempt is not the one that was selected to commit // This is impossible and has to be a bug throw new TezUncheckedException( "TA: " + successTaId + " succeeded but TA: " + task.commitAttempt + " was expected to commit and succeed"); } task.handleTaskAttemptCompletion(successTaId, TaskAttemptStateInternal.SUCCEEDED); task.finishedAttempts++; --task.numberUncompletedAttempts; task.successfulAttempt = successTaId; task.eventHandler.handle(new VertexEventTaskCompleted(task.taskId, TaskState.SUCCEEDED)); LOG.info("Task succeeded with attempt " + task.successfulAttempt); if (task.historyTaskStartGenerated) { task.logJobHistoryTaskFinishedEvent(); } // issue kill to all other attempts for (TaskAttempt attempt : task.attempts.values()) { if (attempt.getID() != task.successfulAttempt && // This is okay because it can only talk us out of sending a // TA_KILL message to an attempt that doesn't need one for // other reasons. !attempt.isFinished()) { LOG.info("Issuing kill to other attempt " + attempt.getID()); task.eventHandler.handle( new TaskAttemptEventKillRequest(attempt.getID(), "Alternate attempt succeeded")); } } // send notification to DAG scheduler task.eventHandler.handle( new DAGEventSchedulerUpdate( DAGEventSchedulerUpdate.UpdateType.TA_SUCCEEDED, task.attempts.get(task.successfulAttempt))); task.finished(TaskStateInternal.SUCCEEDED); }
public void testBasicSpeculation(boolean withProgress) throws Exception { DAG dag = DAG.create("test"); Vertex vA = Vertex.create("A", ProcessorDescriptor.create("Proc.class"), 5); dag.addVertex(vA); MockTezClient tezClient = createTezSession(); DAGClient dagClient = tezClient.submitDAG(dag); DAGImpl dagImpl = (DAGImpl) mockApp.getContext().getCurrentDAG(); TezVertexID vertexId = TezVertexID.getInstance(dagImpl.getID(), 0); // original attempt is killed and speculative one is successful TezTaskAttemptID killedTaId = TezTaskAttemptID.getInstance(TezTaskID.getInstance(vertexId, 0), 0); TezTaskAttemptID successTaId = TezTaskAttemptID.getInstance(TezTaskID.getInstance(vertexId, 0), 1); mockLauncher.updateProgress(withProgress); // cause speculation trigger mockLauncher.setStatusUpdatesForTask(killedTaId, 100); mockLauncher.startScheduling(true); dagClient.waitForCompletion(); Assert.assertEquals(DAGStatus.State.SUCCEEDED, dagClient.getDAGStatus(null).getState()); Task task = dagImpl.getTask(killedTaId.getTaskID()); Assert.assertEquals(2, task.getAttempts().size()); Assert.assertEquals(successTaId, task.getSuccessfulAttempt().getID()); TaskAttempt killedAttempt = task.getAttempt(killedTaId); Joiner.on(",").join(killedAttempt.getDiagnostics()).contains("Killed as speculative attempt"); Assert.assertEquals( TaskAttemptTerminationCause.TERMINATED_EFFECTIVE_SPECULATION, killedAttempt.getTerminationCause()); if (withProgress) { // without progress updates occasionally more than 1 task speculates Assert.assertEquals( 1, task.getCounters().findCounter(TaskCounter.NUM_SPECULATIONS).getValue()); Assert.assertEquals( 1, dagImpl.getAllCounters().findCounter(TaskCounter.NUM_SPECULATIONS).getValue()); org.apache.tez.dag.app.dag.Vertex v = dagImpl.getVertex(killedTaId.getTaskID().getVertexID()); Assert.assertEquals( 1, v.getAllCounters().findCounter(TaskCounter.NUM_SPECULATIONS).getValue()); } tezClient.stop(); }
// this is always called in read/write lock private long getLaunchTime() { long taskLaunchTime = 0; boolean launchTimeSet = false; for (TaskAttempt at : attempts.values()) { // select the least launch time of all attempts long attemptLaunchTime = at.getLaunchTime(); if (attemptLaunchTime != 0 && !launchTimeSet) { // For the first non-zero launch time launchTimeSet = true; taskLaunchTime = attemptLaunchTime; } else if (attemptLaunchTime != 0 && taskLaunchTime > attemptLaunchTime) { taskLaunchTime = attemptLaunchTime; } } if (!launchTimeSet) { return this.scheduledTime; } return taskLaunchTime; }
// This is always called in the Write Lock private void addAndScheduleAttempt() { TaskAttempt attempt = createAttempt(attempts.size()); if (LOG.isDebugEnabled()) { LOG.debug("Created attempt " + attempt.getID()); } switch (attempts.size()) { case 0: attempts = Collections.singletonMap(attempt.getID(), attempt); break; case 1: Map<TezTaskAttemptID, TaskAttempt> newAttempts = new LinkedHashMap<TezTaskAttemptID, TaskAttempt>(maxAttempts); newAttempts.putAll(attempts); attempts = newAttempts; attempts.put(attempt.getID(), attempt); break; default: attempts.put(attempt.getID(), attempt); break; } // TODO: Recovery /* // Update nextATtemptNumber if (taskAttemptsFromPreviousGeneration.isEmpty()) { ++nextAttemptNumber; } else { // There are still some TaskAttempts from previous generation, use them nextAttemptNumber = taskAttemptsFromPreviousGeneration.remove(0).getAttemptId().getId(); } */ ++numberUncompletedAttempts; // schedule the nextAttemptNumber // send event to DAG to assign priority and schedule the attempt with global // picture in mind eventHandler.handle( new DAGEventSchedulerUpdate(DAGEventSchedulerUpdate.UpdateType.TA_SCHEDULE, attempt)); }
// Schedules the same mocked attempt several times while the mocked vertex
// reports distances 0, 1, 2 from the root, and checks the priority carried by
// the event captured in mockEventHandler after each call.
@Test(timeout = 10000)
public void testDAGSchedulerNaturalOrder() {
  DAG dag = mock(DAG.class);
  Vertex vertex = mock(Vertex.class);
  TaskAttempt attempt = mock(TaskAttempt.class);
  when(dag.getVertex((TezVertexID) any())).thenReturn(vertex);
  when(vertex.getDistanceFromRoot()).thenReturn(0).thenReturn(1).thenReturn(2);
  when(attempt.getIsRescheduled()).thenReturn(false);

  DAGEventSchedulerUpdate event =
      new DAGEventSchedulerUpdate(DAGEventSchedulerUpdate.UpdateType.TA_SCHEDULE, attempt);
  DAGScheduler scheduler = new DAGSchedulerNaturalOrder(dag, mockEventHandler);

  // One scheduling call per stubbed distance; the attempt is not rescheduled.
  int[] expectedPriorities = {2, 4, 6};
  for (int expected : expectedPriorities) {
    scheduler.scheduleTask(event);
    Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == expected);
  }

  // Same attempt again, now flagged as rescheduled.
  when(attempt.getIsRescheduled()).thenReturn(true);
  scheduler.scheduleTask(event);
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 5);
}
// select the nextAttemptNumber with best progress // always called inside the Read Lock private TaskAttempt selectBestAttempt() { float progress = 0f; TaskAttempt result = null; for (TaskAttempt at : attempts.values()) { switch (at.getState()) { // ignore all failed task attempts case FAILED: case KILLED: continue; default: } if (result == null) { result = at; // The first time around } // calculate the best progress float attemptProgress = at.getProgress(); if (attemptProgress > progress) { result = at; progress = attemptProgress; } } return result; }
@Override public TaskReport getReport() { // TODO TEZPB This is broken. Records will not work without the PBImpl, which // is in a different package. TaskReport report = Records.newRecord(TaskReport.class); readLock.lock(); try { report.setTaskId(taskId); report.setStartTime(getLaunchTime()); report.setFinishTime(getFinishTime()); report.setTaskState(getState()); report.setProgress(getProgress()); for (TaskAttempt attempt : attempts.values()) { if (TaskAttemptState.RUNNING.equals(attempt.getState())) { report.addRunningAttempt(attempt.getID()); } } report.setSuccessfulAttempt(successfulAttempt); for (TaskAttempt att : attempts.values()) { String prefix = "AttemptID:" + att.getID() + " Info:"; for (CharSequence cs : att.getDiagnostics()) { report.addDiagnostics(prefix + cs); } } // Add a copy of counters as the last step so that their lifetime on heap // is as small as possible. report.setCounters(getCounters()); return report; } finally { readLock.unlock(); } }
// Drives a DAGSchedulerMRR through four interleavings of a three-vertex chain
// (called M, R1 and R2 in the comments below) and checks the priority of the
// event captured in mockEventHandler after every scheduling call.
// The test is currently disabled via @Ignore.
@Ignore
@Test(timeout = 10000)
public void testDAGSchedulerMRR() {
  DAG mockDag = mock(DAG.class);
  TezDAGID dagId = TezDAGID.getInstance("1", 1, 1);
  TaskSchedulerEventHandler mockTaskScheduler = mock(TaskSchedulerEventHandler.class);

  // Vertex 1 ("M"): distance 0 from the root, with one non-rescheduled attempt.
  Vertex mockVertex1 = mock(Vertex.class);
  TezVertexID mockVertexId1 = TezVertexID.getInstance(dagId, 1);
  when(mockVertex1.getVertexId()).thenReturn(mockVertexId1);
  when(mockVertex1.getDistanceFromRoot()).thenReturn(0);
  TaskAttempt mockAttempt1 = mock(TaskAttempt.class);
  when(mockAttempt1.getVertexID()).thenReturn(mockVertexId1);
  when(mockAttempt1.getIsRescheduled()).thenReturn(false);
  when(mockDag.getVertex(mockVertexId1)).thenReturn(mockVertex1);

  // Vertex 2 ("R1"): distance 1 from the root, with a normal attempt and a
  // rescheduled one (mockAttempt2f).
  Vertex mockVertex2 = mock(Vertex.class);
  TezVertexID mockVertexId2 = TezVertexID.getInstance(dagId, 2);
  when(mockVertex2.getVertexId()).thenReturn(mockVertexId2);
  when(mockVertex2.getDistanceFromRoot()).thenReturn(1);
  TaskAttempt mockAttempt2 = mock(TaskAttempt.class);
  when(mockAttempt2.getVertexID()).thenReturn(mockVertexId2);
  when(mockAttempt2.getIsRescheduled()).thenReturn(false);
  when(mockDag.getVertex(mockVertexId2)).thenReturn(mockVertex2);
  TaskAttempt mockAttempt2f = mock(TaskAttempt.class);
  when(mockAttempt2f.getVertexID()).thenReturn(mockVertexId2);
  when(mockAttempt2f.getIsRescheduled()).thenReturn(true);

  // Vertex 3 ("R2"): distance 2 from the root, with one non-rescheduled attempt.
  Vertex mockVertex3 = mock(Vertex.class);
  TezVertexID mockVertexId3 = TezVertexID.getInstance(dagId, 3);
  when(mockVertex3.getVertexId()).thenReturn(mockVertexId3);
  when(mockVertex3.getDistanceFromRoot()).thenReturn(2);
  TaskAttempt mockAttempt3 = mock(TaskAttempt.class);
  when(mockAttempt3.getVertexID()).thenReturn(mockVertexId3);
  when(mockAttempt3.getIsRescheduled()).thenReturn(false);
  when(mockDag.getVertex(mockVertexId3)).thenReturn(mockVertex3);

  // One mocked scheduler-update event per attempt.
  DAGEventSchedulerUpdate mockEvent1 = mock(DAGEventSchedulerUpdate.class);
  when(mockEvent1.getAttempt()).thenReturn(mockAttempt1);
  DAGEventSchedulerUpdate mockEvent2 = mock(DAGEventSchedulerUpdate.class);
  when(mockEvent2.getAttempt()).thenReturn(mockAttempt2);
  DAGEventSchedulerUpdate mockEvent2f = mock(DAGEventSchedulerUpdate.class);
  when(mockEvent2f.getAttempt()).thenReturn(mockAttempt2f);
  DAGEventSchedulerUpdate mockEvent3 = mock(DAGEventSchedulerUpdate.class);
  when(mockEvent3.getAttempt()).thenReturn(mockAttempt3);

  // NOTE(review): the meaning of the 0.5f constructor argument is not visible
  // here - confirm against DAGSchedulerMRR.
  DAGScheduler scheduler =
      new DAGSchedulerMRR(mockDag, mockEventHandler, mockTaskScheduler, 0.5f);

  // Scenario 1:
  // M starts. M completes. R1 starts. R1 completes. R2 starts. R2 completes
  scheduler.scheduleTask(mockEvent1); // M starts
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 3);
  scheduler.scheduleTask(mockEvent1); // M runs another
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 3);
  scheduler.vertexCompleted(mockVertex1); // M completes
  scheduler.scheduleTask(mockEvent2); // R1 starts
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 6);
  scheduler.scheduleTask(mockEvent2); // R1 runs another
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 6);
  scheduler.scheduleTask(mockEvent2f); // R1 runs retry. Retry priority
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 4);
  scheduler.vertexCompleted(mockVertex2); // R1 completes
  scheduler.scheduleTask(mockEvent3); // R2 starts
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 9);
  scheduler.scheduleTask(mockEvent3); // R2 runs another
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 9);
  scheduler.vertexCompleted(mockVertex3); // R2 completes

  // Scenario 2:
  // M starts. R1 starts. M completes. R2 starts. R1 completes. R2 completes
  scheduler.scheduleTask(mockEvent1); // M starts
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 3);
  scheduler.scheduleTask(mockEvent2); // R1 starts. Reordered priority
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 2);
  scheduler.scheduleTask(mockEvent1); // M runs another
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 3);
  scheduler.scheduleTask(mockEvent2); // R1 runs another. Reordered priority
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 2);
  scheduler.scheduleTask(mockEvent2f); // R1 runs retry. Reordered priority
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 2);
  scheduler.vertexCompleted(mockVertex1); // M completes
  scheduler.scheduleTask(mockEvent3); // R2 starts. Reordered priority
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 5);
  scheduler.scheduleTask(mockEvent2); // R1 runs another. Normal priority
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 6);
  scheduler.scheduleTask(mockEvent2f); // R1 runs retry. Retry priority
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 4);
  scheduler.scheduleTask(mockEvent3); // R2 runs another. Reordered priority
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 5);
  scheduler.vertexCompleted(mockVertex2); // R1 completes
  scheduler.vertexCompleted(mockVertex3); // R2 completes

  // Scenario 3:
  // M starts. M completes. R1 starts. R2 starts. R1 completes. R2 completes
  scheduler.scheduleTask(mockEvent1); // M starts
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 3);
  scheduler.vertexCompleted(mockVertex1); // M completes
  scheduler.scheduleTask(mockEvent2); // R1 starts
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 6);
  scheduler.scheduleTask(mockEvent3); // R2 starts. Reordered priority
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 5);
  scheduler.scheduleTask(mockEvent2); // R1 runs another
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 6);
  scheduler.vertexCompleted(mockVertex2); // R1 completes
  scheduler.vertexCompleted(mockVertex3); // R2 completes

  // Scenario 4:
  // M starts. R1 starts. M completes. R1 completes. R2 starts. R2 completes
  scheduler.scheduleTask(mockEvent1); // M starts
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 3);
  scheduler.scheduleTask(mockEvent2); // R1 starts. Reordered priority
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 2);
  scheduler.vertexCompleted(mockVertex1); // M completes
  scheduler.scheduleTask(mockEvent2); // R1 starts. Normal priority
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 6);
  scheduler.vertexCompleted(mockVertex2); // R1 completes
  scheduler.scheduleTask(mockEvent3); // R2 starts
  Assert.assertTrue(mockEventHandler.event.getPriority().getPriority() == 9);
  scheduler.vertexCompleted(mockVertex3); // R2 completes
}