Exemplo n.º 1
0
    /**
     * Renders the details page for the container named by the
     * {@code CONTAINER_ID} request parameter.
     *
     * <p>An unparseable id produces a short error paragraph; an id that the
     * node manager context does not know about produces an "Unknown
     * Container" heading. Otherwise the container's state, exit status,
     * diagnostics, user, memory and a link to its logs are put into the info
     * table and the {@code InfoBlock} is rendered.
     *
     * @param html the block to render into
     */
    @Override
    protected void render(Block html) {
      ContainerId parsedId;
      try {
        parsedId = ConverterUtils.toContainerId($(CONTAINER_ID));
      } catch (IllegalArgumentException e) {
        // The id could not be parsed: report it and stop.
        html.p()._("Invalid containerId " + $(CONTAINER_ID))._();
        return;
      }

      DIV<Hamlet> contentDiv = html.div("#content");
      Container target = this.nmContext.getContainers().get(parsedId);
      if (target == null) {
        contentDiv.h1("Unknown Container. Container might have completed, "
                + "please go back to the previous page and retry.")._();
        return;
      }

      ContainerStatus status = target.cloneAndGetContainerStatus();
      int exitCode = status.getExitStatus();

      // The invalid-exit-status sentinel is displayed as "N/A".
      String exitStatusText;
      if (exitCode == YarnConfiguration.INVALID_CONTAINER_EXIT_STATUS) {
        exitStatusText = "N/A";
      } else {
        exitStatusText = String.valueOf(exitCode);
      }

      info("Container information")
        ._("ContainerID", $(CONTAINER_ID))
        ._("ContainerState", target.getContainerState())
        ._("ExitStatus", exitStatusText)
        ._("Diagnostics", status.getDiagnostics())
        ._("User", target.getUser())
        ._("TotalMemoryNeeded",
            target.getLaunchContext().getResource().getMemory())
        ._("logs", ujoin("containerlogs", $(CONTAINER_ID), target.getUser()),
            "Link to logs");
      html._(InfoBlock.class);
    }
Exemplo n.º 2
0
  /**
   * Creates an event-metadata builder pre-populated from a container status.
   *
   * <p>The container id and container state are always recorded. The exit
   * status is recorded only when it differs from
   * {@code ContainerExitStatus.INVALID}, and the diagnostics only when they
   * are neither {@code null} nor empty.
   *
   * @param containerStatus the status to extract metadata from
   * @return a builder (not yet built) so callers can add further entries
   */
  private ImmutableMap.Builder<String, String> buildContainerStatusEventMetadata(
      ContainerStatus containerStatus) {
    ImmutableMap.Builder<String, String> metadata = ImmutableMap.builder();

    metadata
        .put(GobblinYarnMetricTagNames.CONTAINER_ID, containerStatus.getContainerId().toString())
        .put(
            GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_CONTAINER_STATE,
            containerStatus.getState().toString());

    int exitStatus = containerStatus.getExitStatus();
    if (exitStatus != ContainerExitStatus.INVALID) {
      metadata.put(
          GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_EXIT_STATUS,
          String.valueOf(exitStatus));
    }

    String diagnostics = containerStatus.getDiagnostics();
    if (!Strings.isNullOrEmpty(diagnostics)) {
      metadata.put(
          GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_EXIT_DIAGNOSTICS, diagnostics);
    }

    return metadata;
  }
Exemplo n.º 3
0
  /**
   * Handle the completion of a container. A new container will be requested to replace the one that
   * just exited. Depending on the exit status and if container host affinity is enabled, the new
   * container may or may not try to be started on the same node.
   *
   * <p>A container completes in either of the following conditions: 1) some error happens in the
   * container and caused the container to exit, 2) the container gets killed due to some reason,
   * for example, if it runs over the allowed amount of virtual or physical memory, 3) the container
   * gets preempted by the ResourceManager, or 4) the container gets stopped by the
   * ApplicationMaster. A replacement container is needed in all but the last case.
   *
   * <p>No replacement is requested when a shutdown is in progress, or when a positive
   * {@code helixInstanceMaxRetries} has been exceeded for the Helix instance that was running in
   * the container. A completion reported for a container that is not present in
   * {@code containerMap} is logged and ignored.
   *
   * @param containerStatus the status reported for the completed container
   */
  private void handleContainerCompletion(ContainerStatus containerStatus) {
    Map.Entry<Container, String> completedContainerEntry =
        this.containerMap.remove(containerStatus.getContainerId());
    if (completedContainerEntry == null) {
      // Nothing is known about this container, so there is no Helix instance to retry.
      LOGGER.warn(
          String.format(
              "Container %s has completed with exit status %d but is not tracked; ignoring it",
              containerStatus.getContainerId(), containerStatus.getExitStatus()));
      return;
    }
    String completedInstanceName = completedContainerEntry.getValue();

    LOGGER.info(
        String.format(
            "Container %s running Helix instance %s has completed with exit status %d",
            containerStatus.getContainerId(),
            completedInstanceName,
            containerStatus.getExitStatus()));

    if (!Strings.isNullOrEmpty(containerStatus.getDiagnostics())) {
      LOGGER.info(
          String.format(
              "Received the following diagnostics information for container %s: %s",
              containerStatus.getContainerId(), containerStatus.getDiagnostics()));
    }

    if (this.shutdownInProgress) {
      return;
    }

    // putIfAbsent returns null when the key had no mapping, so the counter that was just
    // inserted must be used in that case instead of dereferencing the return value.
    AtomicInteger freshCounter = new AtomicInteger(0);
    AtomicInteger existingCounter =
        this.helixInstanceRetryCount.putIfAbsent(completedInstanceName, freshCounter);
    int retryCount =
        (existingCounter != null ? existingCounter : freshCounter).incrementAndGet();

    // Populate event metadata
    Optional<ImmutableMap.Builder<String, String>> eventMetadataBuilder = Optional.absent();
    if (this.eventSubmitter.isPresent()) {
      eventMetadataBuilder = Optional.of(buildContainerStatusEventMetadata(containerStatus));
      eventMetadataBuilder
          .get()
          .put(GobblinYarnEventConstants.EventMetadata.HELIX_INSTANCE_ID, completedInstanceName);
      eventMetadataBuilder
          .get()
          .put(
              GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_RETRY_ATTEMPT,
              retryCount + "");
    }

    if (this.helixInstanceMaxRetries > 0 && retryCount > this.helixInstanceMaxRetries) {
      if (this.eventSubmitter.isPresent()) {
        this.eventSubmitter
            .get()
            .submit(
                GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION,
                eventMetadataBuilder.get().build());
      }

      LOGGER.warn(
          "Maximum number of retries has been achieved for Helix instance "
              + completedInstanceName);
      return;
    }

    // Add the Helix instance name of the completed container to the queue of unused
    // instance names so they can be reused by a replacement container.
    this.unusedHelixInstanceNames.offer(completedInstanceName);

    if (this.eventSubmitter.isPresent()) {
      this.eventSubmitter
          .get()
          .submit(
              GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION,
              eventMetadataBuilder.get().build());
    }

    LOGGER.info(
        String.format(
            "Requesting a new container to replace %s to run Helix instance %s",
            containerStatus.getContainerId(), completedInstanceName));
    // The completed container is passed along only when the replacement should stick to the
    // same node, as decided by shouldStickToTheSameNode for this exit status.
    this.eventBus.post(
        new NewContainerRequest(
            shouldStickToTheSameNode(containerStatus.getExitStatus())
                ? Optional.of(completedContainerEntry.getKey())
                : Optional.<Container>absent()));
  }
    /**
     * Processes the statuses of completed containers.
     *
     * <p>For each status: a zero exit status counts the container as
     * completed; a non-zero status other than
     * {@code ContainerExitStatus.ABORTED} counts it as both completed and
     * failed; an {@code ABORTED} status decrements the allocated and
     * requested counters so that a replacement is asked for below. An end
     * event is then published for the container; a failure to publish is
     * logged and does not stop processing.
     *
     * <p>After all statuses are processed, the requested count is brought
     * back up to {@code numTotalContainers} by adding new container requests,
     * and {@code done} is set once the completed count equals the total.
     *
     * @param completedContainers statuses of the containers that completed
     */
    @SuppressWarnings("unchecked")
    @Override
    public void onContainersCompleted(List<ContainerStatus> completedContainers) {
      LOG.info(
          "Got response from RM for container ask, completedCnt=" + completedContainers.size());
      for (ContainerStatus containerStatus : completedContainers) {
        LOG.info(
            "Got container status for containerID="
                + containerStatus.getContainerId()
                + ", state="
                + containerStatus.getState()
                + ", exitStatus="
                + containerStatus.getExitStatus()
                + ", diagnostics="
                + containerStatus.getDiagnostics());

        // non complete containers should not be here
        assert (containerStatus.getState() == ContainerState.COMPLETE);

        // increment counters for completed/failed containers
        int exitStatus = containerStatus.getExitStatus();
        if (0 != exitStatus) {
          // container failed
          if (ContainerExitStatus.ABORTED != exitStatus) {
            // shell script failed
            // counts as completed
            numCompletedContainers.incrementAndGet();
            numFailedContainers.incrementAndGet();
          } else {
            // container was killed by framework, possibly preempted
            // we should re-try as the container was lost for some reason
            numAllocatedContainers.decrementAndGet();
            numRequestedContainers.decrementAndGet();
            // we do not need to release the container as it would be done
            // by the RM
          }
        } else {
          // nothing to do
          // container completed successfully
          numCompletedContainers.incrementAndGet();
          LOG.info(
              "Container completed successfully."
                  + ", containerId="
                  + containerStatus.getContainerId());
        }
        try {
          publishContainerEndEvent(timelineClient, containerStatus);
        } catch (Exception e) {
          // Publishing is best-effort: log and carry on with the next status.
          LOG.error(
              "Container end event could not be published for "
                  + containerStatus.getContainerId(),
              e);
        }
      }

      // ask for more containers if any failed
      int askCount = numTotalContainers - numRequestedContainers.get();
      numRequestedContainers.addAndGet(askCount);

      // The loop body does not run when askCount is zero or negative.
      for (int i = 0; i < askCount; ++i) {
        ContainerRequest containerAsk = setupContainerAskForRM();
        amRMClient.addContainerRequest(containerAsk);
      }

      if (numCompletedContainers.get() == numTotalContainers) {
        done = true;
      }
    }