@Override public void analyze(DagInfo dagInfo) throws TezException { // get all attempts in the dag and find the last failed/succeeded attempt. // ignore killed attempt to handle kills that happen upon dag completion TaskAttemptInfo lastAttempt = null; long lastAttemptFinishTime = 0; for (VertexInfo vertex : dagInfo.getVertices()) { for (TaskInfo task : vertex.getTasks()) { for (TaskAttemptInfo attempt : task.getTaskAttempts()) { attempts.put(attempt.getTaskAttemptId(), attempt); if (attempt.getStatus().equals(succeededState) || attempt.getStatus().equals(failedState)) { if (lastAttemptFinishTime < attempt.getFinishTime()) { lastAttempt = attempt; lastAttemptFinishTime = attempt.getFinishTime(); } } } } } if (lastAttempt == null) { System.out.println("Cannot find last attempt to finish in DAG " + dagInfo.getDagId()); return; } createCriticalPath(dagInfo, lastAttempt, lastAttemptFinishTime, attempts); analyzeCriticalPath(dagInfo); if (getConf().getBoolean(DRAW_SVG, true)) { saveCriticalPathAsSVG(dagInfo); } }
private void createCriticalPath( DagInfo dagInfo, TaskAttemptInfo lastAttempt, long lastAttemptFinishTime, Map<String, TaskAttemptInfo> attempts) { List<CriticalPathStep> tempCP = Lists.newLinkedList(); if (lastAttempt != null) { TaskAttemptInfo currentAttempt = lastAttempt; CriticalPathStep currentStep = new CriticalPathStep(currentAttempt, EntityType.DAG_COMMIT); long currentAttemptStopCriticalPathTime = lastAttemptFinishTime; // add the commit step currentStep.stopCriticalPathTime = dagInfo.getFinishTime(); currentStep.startCriticalPathTime = currentAttemptStopCriticalPathTime; currentStep.reason = CriticalPathDependency.COMMIT_DEPENDENCY; tempCP.add(currentStep); while (true) { Preconditions.checkState(currentAttempt != null); Preconditions.checkState(currentAttemptStopCriticalPathTime > 0); System.out.println( "Step: " + tempCP.size() + " Attempt: " + currentAttempt.getTaskAttemptId()); currentStep = new CriticalPathStep(currentAttempt, EntityType.ATTEMPT); currentStep.stopCriticalPathTime = currentAttemptStopCriticalPathTime; // consider the last data event seen immediately preceding the current critical path // stop time for this attempt long currentStepLastDataEventTime = 0; String currentStepLastDataTA = null; DataDependencyEvent item = currentAttempt.getLastDataEventInfo(currentStep.stopCriticalPathTime); if (item != null) { currentStepLastDataEventTime = item.getTimestamp(); currentStepLastDataTA = item.getTaskAttemptId(); } // sanity check for (CriticalPathStep previousStep : tempCP) { if (previousStep.type == EntityType.ATTEMPT) { if (previousStep.attempt.getTaskAttemptId().equals(currentAttempt.getTaskAttemptId())) { // found loop. // this should only happen for read errors in currentAttempt List<DataDependencyEvent> dataEvents = currentAttempt.getLastDataEvents(); Preconditions.checkState(dataEvents.size() > 1); // received // original and // retry data events Preconditions.checkState( currentStepLastDataEventTime < dataEvents.get(dataEvents.size() - 1).getTimestamp()); // new event is // earlier than // last } } } tempCP.add(currentStep); // find the next attempt on the critical path boolean dataDependency = false; // find out predecessor dependency if (currentStepLastDataEventTime > currentAttempt.getCreationTime()) { dataDependency = true; } long startCriticalPathTime = 0; String nextAttemptId = null; CriticalPathDependency reason = null; if (dataDependency) { // last data event was produced after the attempt was scheduled. use // data dependency // typically the case when scheduling ahead of time System.out.println("Has data dependency"); if (!Strings.isNullOrEmpty(currentStepLastDataTA)) { // there is a valid data causal TA. Use it. nextAttemptId = currentStepLastDataTA; reason = CriticalPathDependency.DATA_DEPENDENCY; startCriticalPathTime = currentStepLastDataEventTime; System.out.println("Using data dependency " + nextAttemptId); } else { // there is no valid data causal TA. This means data event came from the same vertex VertexInfo vertex = currentAttempt.getTaskInfo().getVertexInfo(); Preconditions.checkState( !vertex.getAdditionalInputInfoList().isEmpty(), "Vertex: " + vertex.getVertexId() + " has no external inputs but the last data event " + "TA is null for " + currentAttempt.getTaskAttemptId()); nextAttemptId = null; reason = CriticalPathDependency.INIT_DEPENDENCY; System.out.println("Using init dependency"); } } else { // attempt was scheduled after last data event. use scheduling dependency // typically happens for retries System.out.println("Has scheduling dependency"); if (!Strings.isNullOrEmpty(currentAttempt.getCreationCausalTA())) { // there is a scheduling causal TA. Use it. nextAttemptId = currentAttempt.getCreationCausalTA(); reason = CriticalPathDependency.RETRY_DEPENDENCY; TaskAttemptInfo nextAttempt = attempts.get(nextAttemptId); if (nextAttemptId != null) { VertexInfo currentVertex = currentAttempt.getTaskInfo().getVertexInfo(); VertexInfo nextVertex = nextAttempt.getTaskInfo().getVertexInfo(); if (!nextVertex.getVertexName().equals(currentVertex.getVertexName())) { // cause from different vertex. Might be rerun to re-generate outputs for (VertexInfo outVertex : currentVertex.getOutputVertices()) { if (nextVertex.getVertexName().equals(outVertex.getVertexName())) { // next vertex is an output vertex reason = CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY; break; } } } } if (reason == CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY) { // rescheduled due to read error. start critical at read error report time. // for now proxy own creation time for read error report time startCriticalPathTime = currentAttempt.getCreationTime(); } else { // rescheduled due to own previous attempt failure // we are critical when the previous attempt fails Preconditions.checkState(nextAttempt != null); Preconditions.checkState( nextAttempt .getTaskInfo() .getTaskId() .equals(currentAttempt.getTaskInfo().getTaskId())); startCriticalPathTime = nextAttempt.getFinishTime(); } System.out.println("Using scheduling dependency " + nextAttemptId); } else { // there is no scheduling causal TA. if (!Strings.isNullOrEmpty(currentStepLastDataTA)) { // there is a data event going to the vertex. Count the time between data event and // creation time as Initializer/Manager overhead and follow data dependency nextAttemptId = currentStepLastDataTA; reason = CriticalPathDependency.DATA_DEPENDENCY; startCriticalPathTime = currentStepLastDataEventTime; long overhead = currentAttempt.getCreationTime() - currentStepLastDataEventTime; currentStep.notes.add( "Initializer/VertexManager scheduling overhead " + SVGUtils.getTimeStr(overhead)); System.out.println("Using data dependency " + nextAttemptId); } else { // there is no scheduling causal TA and no data event casual TA. // the vertex has external input that sent the last data events // or the vertex has external input but does not use events // or the vertex has no external inputs or edges nextAttemptId = null; reason = CriticalPathDependency.INIT_DEPENDENCY; System.out.println("Using init dependency"); } } } currentStep.startCriticalPathTime = startCriticalPathTime; currentStep.reason = reason; Preconditions.checkState( currentStep.stopCriticalPathTime >= currentStep.startCriticalPathTime); if (Strings.isNullOrEmpty(nextAttemptId)) { Preconditions.checkState(reason.equals(CriticalPathDependency.INIT_DEPENDENCY)); Preconditions.checkState(startCriticalPathTime == 0); // no predecessor attempt found. this is the last step in the critical path // assume attempts start critical path time is when its scheduled. before that is // vertex initialization time currentStep.startCriticalPathTime = currentStep.attempt.getCreationTime(); // add vertex init step long initStepStopCriticalTime = currentStep.startCriticalPathTime; currentStep = new CriticalPathStep(currentAttempt, EntityType.VERTEX_INIT); currentStep.stopCriticalPathTime = initStepStopCriticalTime; currentStep.startCriticalPathTime = dagInfo.getStartTime(); currentStep.reason = CriticalPathDependency.INIT_DEPENDENCY; tempCP.add(currentStep); if (!tempCP.isEmpty()) { for (int i = tempCP.size() - 1; i >= 0; --i) { criticalPath.add(tempCP.get(i)); } } return; } currentAttempt = attempts.get(nextAttemptId); currentAttemptStopCriticalPathTime = startCriticalPathTime; } } }
private void analyzeAllocationOverhead(DagInfo dag) { List<TaskAttemptInfo> preemptedAttempts = Lists.newArrayList(); for (VertexInfo v : dag.getVertices()) { for (TaskInfo t : v.getTasks()) { for (TaskAttemptInfo a : t.getTaskAttempts()) { if (a.getTerminationCause() .equals(TaskAttemptTerminationCause.INTERNAL_PREEMPTION.name())) { System.out.println("Found preempted attempt " + a.getTaskAttemptId()); preemptedAttempts.add(a); } } } } for (int i = 0; i < criticalPath.size(); ++i) { CriticalPathStep step = criticalPath.get(i); TaskAttemptInfo attempt = step.attempt; if (step.getType() != EntityType.ATTEMPT) { continue; } long creationTime = attempt.getCreationTime(); long allocationTime = attempt.getAllocationTime(); if (allocationTime < step.startCriticalPathTime) { // allocated before it became critical continue; } // the attempt is critical before allocation. So allocation overhead needs analysis Container container = attempt.getContainer(); if (container != null) { Collection<TaskAttemptInfo> attempts = dag.getContainerMapping().get(container); if (attempts != null && !attempts.isEmpty()) { // arrange attempts by allocation time List<TaskAttemptInfo> attemptsList = Lists.newArrayList(attempts); Collections.sort(attemptsList, TaskAttemptInfo.orderingOnAllocationTime()); // walk the list to record allocation time before the current attempt long containerPreviousAllocatedTime = 0; for (TaskAttemptInfo containerAttempt : attemptsList) { if (containerAttempt.getTaskAttemptId().equals(attempt.getTaskAttemptId())) { break; } System.out.println( "Container: " + container.getId() + " running att: " + containerAttempt.getTaskAttemptId() + " wait att: " + attempt.getTaskAttemptId()); containerPreviousAllocatedTime += containerAttempt.getAllocationToEndTimeInterval(); } if (containerPreviousAllocatedTime == 0) { step.notes.add("Container " + container.getId() + " newly allocated."); } else { if (containerPreviousAllocatedTime >= attempt.getCreationToAllocationTimeInterval()) { step.notes.add("Container " + container.getId() + " was fully allocated"); } else { step.notes.add( "Container " + container.getId() + " allocated for " + SVGUtils.getTimeStr(containerPreviousAllocatedTime) + " out of " + SVGUtils.getTimeStr(attempt.getCreationToAllocationTimeInterval()) + " of allocation wait time"); } } } // look for internal preemptions while attempt was waiting for allocation for (TaskAttemptInfo a : preemptedAttempts) { if (a.getFinishTime() > creationTime && a.getFinishTime() < allocationTime) { // found an attempt that was preempted within this time interval step.notes.add("Potentially waited for preemption of " + a.getShortName()); } } } } }