/**
 * Restarts the failed part of the flow.
 *
 * <p>Clears the {@code retryFailedJobs}, {@code flowKilled} and {@code flowFailed} flags, marks the
 * top-level flow as RUNNING, lets {@code resetFailedState} collect the nodes to restart from, and
 * then attempts to start each of them before calling {@code updateFlow}.
 *
 * @throws IOException propagated from the calls made here ({@code runReadyJob} declares it)
 */
private void retryAllFailures() throws IOException {
  logger.info("Restarting all failed jobs");

  this.retryFailedJobs = false;
  this.flowKilled = false;
  this.flowFailed = false;
  this.flow.setStatus(Status.RUNNING);

  List<ExecutableNode> restartPoints = new ArrayList<ExecutableNode>();
  resetFailedState(this.flow, restartPoints);

  for (ExecutableNode restartPoint : restartPoints) {
    Status status = restartPoint.getStatus();
    if (status == Status.SUCCEEDED) {
      // A succeeded node is only a restart anchor: try to start whatever hangs off it.
      for (String downstreamId : restartPoint.getOutNodes()) {
        ExecutableNode downstream = restartPoint.getParentFlow().getExecutableNode(downstreamId);
        runReadyJob(downstream);
      }
    } else if (status == Status.READY || status == Status.DISABLED) {
      runReadyJob(restartPoint);
    }

    // Attempted for every collected node, whatever branch was taken above. runReadyJob returns
    // false right away for nodes that are already finished or running.
    runReadyJob(restartPoint);
  }

  updateFlow();
}
/** * Determines what the state of the next node should be. Returns null if the node should not be * run. * * @param node * @return */ public Status getImpliedStatus(ExecutableNode node) { // If it's running or finished with 'SUCCEEDED', than don't even // bother starting this job. if (Status.isStatusRunning(node.getStatus()) || node.getStatus() == Status.SUCCEEDED) { return null; } // Go through the node's dependencies. If all of the previous job's // statuses is finished and not FAILED or KILLED, than we can safely // run this job. ExecutableFlowBase flow = node.getParentFlow(); boolean shouldKill = false; for (String dependency : node.getInNodes()) { ExecutableNode dependencyNode = flow.getExecutableNode(dependency); Status depStatus = dependencyNode.getStatus(); if (!Status.isStatusFinished(depStatus)) { return null; } else if (depStatus == Status.FAILED || depStatus == Status.CANCELLED || depStatus == Status.KILLED) { // We propagate failures as KILLED states. shouldKill = true; } } // If it's disabled but ready to run, we want to make sure it continues // being disabled. if (node.getStatus() == Status.DISABLED || node.getStatus() == Status.SKIPPED) { return Status.SKIPPED; } // If the flow has failed, and we want to finish only the currently running // jobs, we just // kill everything else. We also kill, if the flow has been cancelled. if (flowFailed && failureAction == ExecutionOptions.FailureAction.FINISH_CURRENTLY_RUNNING) { return Status.CANCELLED; } else if (shouldKill || isKilled()) { return Status.CANCELLED; } // All good to go, ready to run. return Status.READY; }
/**
 * Moves a node forward according to {@code getImpliedStatus}: cancels it, skips it, or starts it.
 * Starting an embedded flow marks the flow RUNNING and recursively starts its start nodes.
 *
 * @param node the candidate node
 * @return {@code false} if the node is already finished or running, or if
 *     {@code getImpliedStatus} returned {@code null}; {@code true} otherwise
 * @throws IOException propagated from the calls made while starting the node
 */
private boolean runReadyJob(ExecutableNode node) throws IOException {
  Status current = node.getStatus();
  if (Status.isStatusFinished(current) || Status.isStatusRunning(current)) {
    return false;
  }

  Status implied = getImpliedStatus(node);
  if (implied == null) {
    return false;
  }

  switch (implied) {
    case CANCELLED:
      logger.info("Cancelling '" + node.getNestedId() + "' due to prior errors.");
      node.cancelNode(System.currentTimeMillis());
      finishExecutableNode(node);
      break;

    case SKIPPED:
      logger.info("Skipping disabled job '" + node.getId() + "'.");
      node.skipNode(System.currentTimeMillis());
      finishExecutableNode(node);
      break;

    case READY:
      if (!(node instanceof ExecutableFlowBase)) {
        runExecutableNode(node);
        break;
      }
      // An embedded flow: mark it as started, then kick off its entry points.
      ExecutableFlowBase embedded = (ExecutableFlowBase) node;
      logger.info("Running flow '" + embedded.getNestedId() + "'.");
      embedded.setStatus(Status.RUNNING);
      embedded.setStartTime(System.currentTimeMillis());
      prepareJobProperties(embedded);
      for (String entryId : embedded.getStartNodes()) {
        runReadyJob(embedded.getExecutableNode(entryId));
      }
      break;

    default:
      // Any other implied status: nothing to do, but the node still counts as handled.
      break;
  }

  return true;
}
private void resetFailedState(ExecutableFlowBase flow, List<ExecutableNode> nodesToRetry) { // bottom up LinkedList<ExecutableNode> queue = new LinkedList<ExecutableNode>(); for (String id : flow.getEndNodes()) { ExecutableNode node = flow.getExecutableNode(id); queue.add(node); } long maxStartTime = -1; while (!queue.isEmpty()) { ExecutableNode node = queue.poll(); Status oldStatus = node.getStatus(); maxStartTime = Math.max(node.getStartTime(), maxStartTime); long currentTime = System.currentTimeMillis(); if (node.getStatus() == Status.SUCCEEDED) { // This is a candidate parent for restart nodesToRetry.add(node); continue; } else if (node.getStatus() == Status.RUNNING) { continue; } else if (node.getStatus() == Status.SKIPPED) { node.setStatus(Status.DISABLED); node.setEndTime(-1); node.setStartTime(-1); node.setUpdateTime(currentTime); } else if (node instanceof ExecutableFlowBase) { ExecutableFlowBase base = (ExecutableFlowBase) node; switch (base.getStatus()) { case CANCELLED: node.setStatus(Status.READY); node.setEndTime(-1); node.setStartTime(-1); node.setUpdateTime(currentTime); // Break out of the switch. We'll reset the flow just like a normal node break; case KILLED: case FAILED: case FAILED_FINISHING: resetFailedState(base, nodesToRetry); continue; default: // Continue the while loop. If the job is in a finished state that's not // a failure, we don't want to reset the job. 
continue; } } else if (node.getStatus() == Status.CANCELLED) { // Not a flow, but killed node.setStatus(Status.READY); node.setStartTime(-1); node.setEndTime(-1); node.setUpdateTime(currentTime); } else if (node.getStatus() == Status.FAILED || node.getStatus() == Status.KILLED) { node.resetForRetry(); nodesToRetry.add(node); } if (!(node instanceof ExecutableFlowBase) && node.getStatus() != oldStatus) { logger.info( "Resetting job '" + node.getNestedId() + "' from " + oldStatus + " to " + node.getStatus()); } for (String inId : node.getInNodes()) { ExecutableNode nodeUp = flow.getExecutableNode(inId); queue.add(nodeUp); } } // At this point, the following code will reset the flow Status oldFlowState = flow.getStatus(); if (maxStartTime == -1) { // Nothing has run inside the flow, so we assume the flow hasn't even started running yet. flow.setStatus(Status.READY); } else { flow.setStatus(Status.RUNNING); // Add any READY start nodes. Usually it means the flow started, but the start node has not. for (String id : flow.getStartNodes()) { ExecutableNode node = flow.getExecutableNode(id); if (node.getStatus() == Status.READY || node.getStatus() == Status.DISABLED) { nodesToRetry.add(node); } } } flow.setUpdateTime(System.currentTimeMillis()); flow.setEndTime(-1); logger.info( "Resetting flow '" + flow.getNestedId() + "' from " + oldFlowState + " to " + flow.getStatus()); }
private void finalizeFlow(ExecutableFlowBase flow) { String id = flow == this.flow ? "" : flow.getNestedId(); // If it's not the starting flow, we'll create set of output props // for the finished flow. boolean succeeded = true; Props previousOutput = null; for (String end : flow.getEndNodes()) { ExecutableNode node = flow.getExecutableNode(end); if (node.getStatus() == Status.KILLED || node.getStatus() == Status.FAILED || node.getStatus() == Status.CANCELLED) { succeeded = false; } Props output = node.getOutputProps(); if (output != null) { output = Props.clone(output); output.setParent(previousOutput); previousOutput = output; } } flow.setOutputProps(previousOutput); if (!succeeded && (flow.getStatus() == Status.RUNNING)) { flow.setStatus(Status.KILLED); } flow.setEndTime(System.currentTimeMillis()); flow.setUpdateTime(System.currentTimeMillis()); long durationSec = (flow.getEndTime() - flow.getStartTime()) / 1000; switch (flow.getStatus()) { case FAILED_FINISHING: logger.info("Setting flow '" + id + "' status to FAILED in " + durationSec + " seconds"); flow.setStatus(Status.FAILED); break; case FAILED: case KILLED: case CANCELLED: case FAILED_SUCCEEDED: logger.info( "Flow '" + id + "' is set to " + flow.getStatus().toString() + " in " + durationSec + " seconds"); break; default: flow.setStatus(Status.SUCCEEDED); logger.info( "Flow '" + id + "' is set to " + flow.getStatus().toString() + " in " + durationSec + " seconds"); } // If the finalized flow is actually the top level flow, than we finish // the main loop. if (flow instanceof ExecutableFlow) { flowFinished = true; } }
private boolean progressGraph() throws IOException { finishedNodes.swap(); // The following nodes are finished, so we'll collect a list of outnodes // that are candidates for running next. HashSet<ExecutableNode> nodesToCheck = new HashSet<ExecutableNode>(); for (ExecutableNode node : finishedNodes) { Set<String> outNodeIds = node.getOutNodes(); ExecutableFlowBase parentFlow = node.getParentFlow(); // If a job is seen as failed, then we set the parent flow to FAILED_FINISHING if (node.getStatus() == Status.FAILED) { // The job cannot be retried or has run out of retry attempts. We will // fail the job and its flow now. if (!retryJobIfPossible(node)) { propagateStatus(node.getParentFlow(), Status.FAILED_FINISHING); if (failureAction == FailureAction.CANCEL_ALL) { this.kill(); } this.flowFailed = true; } else { nodesToCheck.add(node); continue; } } if (outNodeIds.isEmpty()) { // There's no outnodes means it's the end of a flow, so we finalize // and fire an event. finalizeFlow(parentFlow); finishExecutableNode(parentFlow); // If the parent has a parent, then we process if (!(parentFlow instanceof ExecutableFlow)) { outNodeIds = parentFlow.getOutNodes(); parentFlow = parentFlow.getParentFlow(); } } // Add all out nodes from the finished job. We'll check against this set to // see if any are candidates for running. for (String nodeId : outNodeIds) { ExecutableNode outNode = parentFlow.getExecutableNode(nodeId); nodesToCheck.add(outNode); } } // Runs candidate jobs. The code will check to see if they are ready to run before // Instant kill or skip if necessary. boolean jobsRun = false; for (ExecutableNode node : nodesToCheck) { if (Status.isStatusFinished(node.getStatus()) || Status.isStatusRunning(node.getStatus())) { // Really shouldn't get in here. continue; } jobsRun |= runReadyJob(node); } if (jobsRun || finishedNodes.getSize() > 0) { updateFlow(); return true; } return false; }