예제 #1
0
    @Override
    protected void render(Block html) {
      ContainerId containerID;
      try {
        containerID = ConverterUtils.toContainerId($(CONTAINER_ID));
      } catch (IllegalArgumentException e) {
        html.p()._("Invalid containerId " + $(CONTAINER_ID))._();
        return;
      }

      DIV<Hamlet> div = html.div("#content");
      Container container = this.nmContext.getContainers().get(containerID);
      if (container == null) {
        div.h1("Unknown Container. Container might have completed, "
                + "please go back to the previous page and retry.")._();
        return;
      }
      ContainerStatus containerData = container.cloneAndGetContainerStatus();
      int exitCode = containerData.getExitStatus();
      String exiStatus = 
          (exitCode == YarnConfiguration.INVALID_CONTAINER_EXIT_STATUS) ? 
              "N/A" : String.valueOf(exitCode);
      info("Container information")
        ._("ContainerID", $(CONTAINER_ID))
        ._("ContainerState", container.getContainerState())
        ._("ExitStatus", exiStatus)
        ._("Diagnostics", containerData.getDiagnostics())
        ._("User", container.getUser())
        ._("TotalMemoryNeeded",
            container.getLaunchContext().getResource().getMemory())
        ._("logs", ujoin("containerlogs", $(CONTAINER_ID), container.getUser()),
            "Link to logs");
      html._(InfoBlock.class);
    }
예제 #2
0
  private ImmutableMap.Builder<String, String> buildContainerStatusEventMetadata(
      ContainerStatus containerStatus) {
    ImmutableMap.Builder<String, String> eventMetadataBuilder = new ImmutableMap.Builder<>();
    eventMetadataBuilder.put(
        GobblinYarnMetricTagNames.CONTAINER_ID, containerStatus.getContainerId().toString());
    eventMetadataBuilder.put(
        GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_CONTAINER_STATE,
        containerStatus.getState().toString());
    if (ContainerExitStatus.INVALID != containerStatus.getExitStatus()) {
      eventMetadataBuilder.put(
          GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_EXIT_STATUS,
          containerStatus.getExitStatus() + "");
    }
    if (!Strings.isNullOrEmpty(containerStatus.getDiagnostics())) {
      eventMetadataBuilder.put(
          GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_EXIT_DIAGNOSTICS,
          containerStatus.getDiagnostics());
    }

    return eventMetadataBuilder;
  }
  private void updateQueueWithNodeUpdate(NodeUpdateSchedulerEventWrapper eventWrapper) {
    RMNodeWrapper node = (RMNodeWrapper) eventWrapper.getRMNode();
    List<UpdatedContainerInfo> containerList = node.getContainerUpdates();
    for (UpdatedContainerInfo info : containerList) {
      for (ContainerStatus status : info.getCompletedContainers()) {
        ContainerId containerId = status.getContainerId();
        SchedulerAppReport app =
            scheduler.getSchedulerAppInfo(containerId.getApplicationAttemptId());

        if (app == null) {
          // this happens for the AM container
          // The app have already removed when the NM sends the release
          // information.
          continue;
        }

        String queue = appQueueMap.get(containerId.getApplicationAttemptId().getApplicationId());
        int releasedMemory = 0, releasedVCores = 0;
        if (status.getExitStatus() == ContainerExitStatus.SUCCESS) {
          for (RMContainer rmc : app.getLiveContainers()) {
            if (rmc.getContainerId() == containerId) {
              releasedMemory += rmc.getContainer().getResource().getMemory();
              releasedVCores += rmc.getContainer().getResource().getVirtualCores();
              break;
            }
          }
        } else if (status.getExitStatus() == ContainerExitStatus.ABORTED) {
          if (preemptionContainerMap.containsKey(containerId)) {
            Resource preResource = preemptionContainerMap.get(containerId);
            releasedMemory += preResource.getMemory();
            releasedVCores += preResource.getVirtualCores();
            preemptionContainerMap.remove(containerId);
          }
        }
        // update queue counters
        updateQueueMetrics(queue, releasedMemory, releasedVCores);
      }
    }
  }
  private static void publishContainerEndEvent(
      TimelineClient timelineClient, ContainerStatus container) throws IOException, YarnException {
    TimelineEntity entity = new TimelineEntity();
    entity.setEntityId(container.getContainerId().toString());
    entity.setEntityType(DSEntity.DS_CONTAINER.toString());
    entity.addPrimaryFilter("user", UserGroupInformation.getCurrentUser().toString());
    TimelineEvent event = new TimelineEvent();
    event.setTimestamp(System.currentTimeMillis());
    event.setEventType(DSEvent.DS_CONTAINER_END.toString());
    event.addEventInfo("State", container.getState().name());
    event.addEventInfo("Exit Status", container.getExitStatus());
    entity.addEvent(event);

    timelineClient.putEntities(entity);
  }
예제 #5
0
  /**
   * Handle the completion of a container. A new container will be requested to replace the one that
   * just exited. Depending on the exit status and if container host affinity is enabled, the new
   * container may or may not try to be started on the same node.
   *
   * <p>A container completes in either of the following conditions: 1) some error happens in the
   * container and caused the container to exit, 2) the container gets killed due to some reason,
   * for example, if it runs over the allowed amount of virtual or physical memory, 3) the gets
   * preempted by the ResourceManager, or 4) the container gets stopped by the ApplicationMaster. A
   * replacement container is needed in all but the last case.
   */
  private void handleContainerCompletion(ContainerStatus containerStatus) {
    Map.Entry<Container, String> completedContainerEntry =
        this.containerMap.remove(containerStatus.getContainerId());
    String completedInstanceName = completedContainerEntry.getValue();

    LOGGER.info(
        String.format(
            "Container %s running Helix instance %s has completed with exit status %d",
            containerStatus.getContainerId(),
            completedInstanceName,
            containerStatus.getExitStatus()));

    if (!Strings.isNullOrEmpty(containerStatus.getDiagnostics())) {
      LOGGER.info(
          String.format(
              "Received the following diagnostics information for container %s: %s",
              containerStatus.getContainerId(), containerStatus.getDiagnostics()));
    }

    if (this.shutdownInProgress) {
      return;
    }

    int retryCount =
        this.helixInstanceRetryCount
            .putIfAbsent(completedInstanceName, new AtomicInteger(0))
            .incrementAndGet();

    // Populate event metadata
    Optional<ImmutableMap.Builder<String, String>> eventMetadataBuilder = Optional.absent();
    if (this.eventSubmitter.isPresent()) {
      eventMetadataBuilder = Optional.of(buildContainerStatusEventMetadata(containerStatus));
      eventMetadataBuilder
          .get()
          .put(GobblinYarnEventConstants.EventMetadata.HELIX_INSTANCE_ID, completedInstanceName);
      eventMetadataBuilder
          .get()
          .put(
              GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_RETRY_ATTEMPT,
              retryCount + "");
    }

    if (this.helixInstanceMaxRetries > 0 && retryCount > this.helixInstanceMaxRetries) {
      if (this.eventSubmitter.isPresent()) {
        this.eventSubmitter
            .get()
            .submit(
                GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION,
                eventMetadataBuilder.get().build());
      }

      LOGGER.warn(
          "Maximum number of retries has been achieved for Helix instance "
              + completedInstanceName);
      return;
    }

    // Add the Helix instance name of the completed container to the queue of unused
    // instance names so they can be reused by a replacement container.
    this.unusedHelixInstanceNames.offer(completedInstanceName);

    if (this.eventSubmitter.isPresent()) {
      this.eventSubmitter
          .get()
          .submit(
              GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION,
              eventMetadataBuilder.get().build());
    }

    LOGGER.info(
        String.format(
            "Requesting a new container to replace %s to run Helix instance %s",
            containerStatus.getContainerId(), completedInstanceName));
    this.eventBus.post(
        new NewContainerRequest(
            shouldStickToTheSameNode(containerStatus.getExitStatus())
                ? Optional.of(completedContainerEntry.getKey())
                : Optional.<Container>absent()));
  }
    @SuppressWarnings("unchecked")
    @Override
    public void onContainersCompleted(List<ContainerStatus> completedContainers) {
      LOG.info(
          "Got response from RM for container ask, completedCnt=" + completedContainers.size());
      for (ContainerStatus containerStatus : completedContainers) {
        LOG.info(
            "Got container status for containerID="
                + containerStatus.getContainerId()
                + ", state="
                + containerStatus.getState()
                + ", exitStatus="
                + containerStatus.getExitStatus()
                + ", diagnostics="
                + containerStatus.getDiagnostics());

        // non complete containers should not be here
        assert (containerStatus.getState() == ContainerState.COMPLETE);

        // increment counters for completed/failed containers
        int exitStatus = containerStatus.getExitStatus();
        if (0 != exitStatus) {
          // container failed
          if (ContainerExitStatus.ABORTED != exitStatus) {
            // shell script failed
            // counts as completed
            numCompletedContainers.incrementAndGet();
            numFailedContainers.incrementAndGet();
          } else {
            // container was killed by framework, possibly preempted
            // we should re-try as the container was lost for some reason
            numAllocatedContainers.decrementAndGet();
            numRequestedContainers.decrementAndGet();
            // we do not need to release the container as it would be done
            // by the RM
          }
        } else {
          // nothing to do
          // container completed successfully
          numCompletedContainers.incrementAndGet();
          LOG.info(
              "Container completed successfully."
                  + ", containerId="
                  + containerStatus.getContainerId());
        }
        try {
          publishContainerEndEvent(timelineClient, containerStatus);
        } catch (Exception e) {
          LOG.error(
              "Container start event could not be pulished for "
                  + containerStatus.getContainerId().toString(),
              e);
        }
      }

      // ask for more containers if any failed
      int askCount = numTotalContainers - numRequestedContainers.get();
      numRequestedContainers.addAndGet(askCount);

      if (askCount > 0) {
        for (int i = 0; i < askCount; ++i) {
          ContainerRequest containerAsk = setupContainerAskForRM();
          amRMClient.addContainerRequest(containerAsk);
        }
      }

      if (numCompletedContainers.get() == numTotalContainers) {
        done = true;
      }
    }