@Override protected void render(Block html) { ContainerId containerID; try { containerID = ConverterUtils.toContainerId($(CONTAINER_ID)); } catch (IllegalArgumentException e) { html.p()._("Invalid containerId " + $(CONTAINER_ID))._(); return; } DIV<Hamlet> div = html.div("#content"); Container container = this.nmContext.getContainers().get(containerID); if (container == null) { div.h1("Unknown Container. Container might have completed, " + "please go back to the previous page and retry.")._(); return; } ContainerStatus containerData = container.cloneAndGetContainerStatus(); int exitCode = containerData.getExitStatus(); String exiStatus = (exitCode == YarnConfiguration.INVALID_CONTAINER_EXIT_STATUS) ? "N/A" : String.valueOf(exitCode); info("Container information") ._("ContainerID", $(CONTAINER_ID)) ._("ContainerState", container.getContainerState()) ._("ExitStatus", exiStatus) ._("Diagnostics", containerData.getDiagnostics()) ._("User", container.getUser()) ._("TotalMemoryNeeded", container.getLaunchContext().getResource().getMemory()) ._("logs", ujoin("containerlogs", $(CONTAINER_ID), container.getUser()), "Link to logs"); html._(InfoBlock.class); }
private ImmutableMap.Builder<String, String> buildContainerStatusEventMetadata( ContainerStatus containerStatus) { ImmutableMap.Builder<String, String> eventMetadataBuilder = new ImmutableMap.Builder<>(); eventMetadataBuilder.put( GobblinYarnMetricTagNames.CONTAINER_ID, containerStatus.getContainerId().toString()); eventMetadataBuilder.put( GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_CONTAINER_STATE, containerStatus.getState().toString()); if (ContainerExitStatus.INVALID != containerStatus.getExitStatus()) { eventMetadataBuilder.put( GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_EXIT_STATUS, containerStatus.getExitStatus() + ""); } if (!Strings.isNullOrEmpty(containerStatus.getDiagnostics())) { eventMetadataBuilder.put( GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_EXIT_DIAGNOSTICS, containerStatus.getDiagnostics()); } return eventMetadataBuilder; }
/** * Handle the completion of a container. A new container will be requested to replace the one that * just exited. Depending on the exit status and if container host affinity is enabled, the new * container may or may not try to be started on the same node. * * <p>A container completes in either of the following conditions: 1) some error happens in the * container and caused the container to exit, 2) the container gets killed due to some reason, * for example, if it runs over the allowed amount of virtual or physical memory, 3) the gets * preempted by the ResourceManager, or 4) the container gets stopped by the ApplicationMaster. A * replacement container is needed in all but the last case. */ private void handleContainerCompletion(ContainerStatus containerStatus) { Map.Entry<Container, String> completedContainerEntry = this.containerMap.remove(containerStatus.getContainerId()); String completedInstanceName = completedContainerEntry.getValue(); LOGGER.info( String.format( "Container %s running Helix instance %s has completed with exit status %d", containerStatus.getContainerId(), completedInstanceName, containerStatus.getExitStatus())); if (!Strings.isNullOrEmpty(containerStatus.getDiagnostics())) { LOGGER.info( String.format( "Received the following diagnostics information for container %s: %s", containerStatus.getContainerId(), containerStatus.getDiagnostics())); } if (this.shutdownInProgress) { return; } int retryCount = this.helixInstanceRetryCount .putIfAbsent(completedInstanceName, new AtomicInteger(0)) .incrementAndGet(); // Populate event metadata Optional<ImmutableMap.Builder<String, String>> eventMetadataBuilder = Optional.absent(); if (this.eventSubmitter.isPresent()) { eventMetadataBuilder = Optional.of(buildContainerStatusEventMetadata(containerStatus)); eventMetadataBuilder .get() .put(GobblinYarnEventConstants.EventMetadata.HELIX_INSTANCE_ID, completedInstanceName); eventMetadataBuilder .get() .put( GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_RETRY_ATTEMPT, retryCount + ""); } if (this.helixInstanceMaxRetries > 0 && retryCount > this.helixInstanceMaxRetries) { if (this.eventSubmitter.isPresent()) { this.eventSubmitter .get() .submit( GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION, eventMetadataBuilder.get().build()); } LOGGER.warn( "Maximum number of retries has been achieved for Helix instance " + completedInstanceName); return; } // Add the Helix instance name of the completed container to the queue of unused // instance names so they can be reused by a replacement container. this.unusedHelixInstanceNames.offer(completedInstanceName); if (this.eventSubmitter.isPresent()) { this.eventSubmitter .get() .submit( GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION, eventMetadataBuilder.get().build()); } LOGGER.info( String.format( "Requesting a new container to replace %s to run Helix instance %s", containerStatus.getContainerId(), completedInstanceName)); this.eventBus.post( new NewContainerRequest( shouldStickToTheSameNode(containerStatus.getExitStatus()) ? Optional.of(completedContainerEntry.getKey()) : Optional.<Container>absent())); }
@SuppressWarnings("unchecked") @Override public void onContainersCompleted(List<ContainerStatus> completedContainers) { LOG.info( "Got response from RM for container ask, completedCnt=" + completedContainers.size()); for (ContainerStatus containerStatus : completedContainers) { LOG.info( "Got container status for containerID=" + containerStatus.getContainerId() + ", state=" + containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus() + ", diagnostics=" + containerStatus.getDiagnostics()); // non complete containers should not be here assert (containerStatus.getState() == ContainerState.COMPLETE); // increment counters for completed/failed containers int exitStatus = containerStatus.getExitStatus(); if (0 != exitStatus) { // container failed if (ContainerExitStatus.ABORTED != exitStatus) { // shell script failed // counts as completed numCompletedContainers.incrementAndGet(); numFailedContainers.incrementAndGet(); } else { // container was killed by framework, possibly preempted // we should re-try as the container was lost for some reason numAllocatedContainers.decrementAndGet(); numRequestedContainers.decrementAndGet(); // we do not need to release the container as it would be done // by the RM } } else { // nothing to do // container completed successfully numCompletedContainers.incrementAndGet(); LOG.info( "Container completed successfully." + ", containerId=" + containerStatus.getContainerId()); } try { publishContainerEndEvent(timelineClient, containerStatus); } catch (Exception e) { LOG.error( "Container start event could not be pulished for " + containerStatus.getContainerId().toString(), e); } } // ask for more containers if any failed int askCount = numTotalContainers - numRequestedContainers.get(); numRequestedContainers.addAndGet(askCount); if (askCount > 0) { for (int i = 0; i < askCount; ++i) { ContainerRequest containerAsk = setupContainerAskForRM(); amRMClient.addContainerRequest(containerAsk); } } if (numCompletedContainers.get() == numTotalContainers) { done = true; } }