Example #1
0
    /**
     * Renders the information page for the container named by the {@code CONTAINER_ID}
     * request parameter.
     *
     * <p>An unparsable id, or an id with no entry in the node manager context, produces a short
     * message instead of the info table.
     *
     * @param html the block the page content is written to
     */
    @Override
    protected void render(Block html) {
      ContainerId parsedId;
      try {
        parsedId = ConverterUtils.toContainerId($(CONTAINER_ID));
      } catch (IllegalArgumentException e) {
        // The request parameter is not a well-formed container id.
        html.p()._("Invalid containerId " + $(CONTAINER_ID))._();
        return;
      }

      DIV<Hamlet> contentDiv = html.div("#content");
      Container target = this.nmContext.getContainers().get(parsedId);
      if (target == null) {
        contentDiv.h1("Unknown Container. Container might have completed, "
            + "please go back to the previous page and retry.")._();
        return;
      }

      ContainerStatus status = target.cloneAndGetContainerStatus();
      int exitCode = status.getExitStatus();
      // Show a placeholder while the container has no valid exit status.
      String exitStatusText;
      if (exitCode == YarnConfiguration.INVALID_CONTAINER_EXIT_STATUS) {
        exitStatusText = "N/A";
      } else {
        exitStatusText = String.valueOf(exitCode);
      }

      info("Container information")
        ._("ContainerID", $(CONTAINER_ID))
        ._("ContainerState", target.getContainerState())
        ._("ExitStatus", exitStatusText)
        ._("Diagnostics", status.getDiagnostics())
        ._("User", target.getUser())
        ._("TotalMemoryNeeded",
            target.getLaunchContext().getResource().getMemory())
        ._("logs", ujoin("containerlogs", $(CONTAINER_ID), target.getUser()),
            "Link to logs");
      html._(InfoBlock.class);
    }
Example #2
0
 /**
  * Creates a {@link ContainerStatus} record via {@code recordFactory} and populates it with the
  * given values.
  *
  * @param containerId id stored on the new record
  * @param containerState state stored on the new record
  * @param diagnostics diagnostics text stored on the new record
  * @param exitStatus exit status stored on the new record
  * @return the populated record
  */
 public static ContainerStatus newContainerStatus(
     ContainerId containerId, ContainerState containerState, String diagnostics, int exitStatus) {
   final ContainerStatus record = recordFactory.newRecordInstance(ContainerStatus.class);
   record.setState(containerState);
   record.setContainerId(containerId);
   record.setDiagnostics(diagnostics);
   record.setExitStatus(exitStatus);
   return record;
 }
 /*
  * Logs every completed container status and bumps the shared completed-container counter
  * once per status, taking ApplicationMasterAsync.lock for each increment.
  *
  * (non-Javadoc)
  * @see org.apache.hadoop.yarn.client.api.async.AMRMClientAsync.CallbackHandler#onContainersCompleted(java.util.List)
  */
 @Override
 public void onContainersCompleted(List<ContainerStatus> statuses) {
   LOG.info("onContainersCompleted() called");
   for (ContainerStatus finished : statuses) {
     String message = "container '" + finished.getContainerId() + "' status is " + finished;
     LOG.info(message);
     // The counter is shared, so each increment happens under the class-level lock.
     synchronized (ApplicationMasterAsync.lock) {
       ApplicationMasterAsync.completedContainers++;
     }
   }
 }
Example #4
0
 /**
  * Creates a {@link ContainerStatus} record through {@code Records.newRecord} and fills in the
  * supplied fields.
  *
  * @param containerId id stored on the new record
  * @param containerState state stored on the new record
  * @param diagnostics diagnostics text stored on the new record
  * @param exitStatus exit status stored on the new record
  * @return the populated record
  */
 @Private
 @Unstable
 public static ContainerStatus newInstance(
     ContainerId containerId, ContainerState containerState, String diagnostics, int exitStatus) {
   final ContainerStatus record = Records.newRecord(ContainerStatus.class);
   record.setState(containerState);
   record.setContainerId(containerId);
   record.setDiagnostics(diagnostics);
   record.setExitStatus(exitStatus);
   return record;
 }
  /**
   * Publishes a {@code DS_CONTAINER_END} timeline event for the given container status.
   *
   * <p>The entity is keyed by the container id, carries the current user as the {@code "user"}
   * primary filter, and the event records the container's state name and exit status.
   *
   * @param timelineClient client the entity is put to
   * @param container status of the container that ended
   * @throws IOException declared by this method; which calls raise it is not visible here
   * @throws YarnException declared by this method; which calls raise it is not visible here
   */
  private static void publishContainerEndEvent(
      TimelineClient timelineClient, ContainerStatus container) throws IOException, YarnException {
    TimelineEntity containerEntity = new TimelineEntity();
    String entityId = container.getContainerId().toString();
    containerEntity.setEntityId(entityId);
    containerEntity.setEntityType(DSEntity.DS_CONTAINER.toString());
    String currentUser = UserGroupInformation.getCurrentUser().toString();
    containerEntity.addPrimaryFilter("user", currentUser);

    TimelineEvent endEvent = new TimelineEvent();
    endEvent.setTimestamp(System.currentTimeMillis());
    endEvent.setEventType(DSEvent.DS_CONTAINER_END.toString());
    endEvent.addEventInfo("State", container.getState().name());
    endEvent.addEventInfo("Exit Status", container.getExitStatus());
    containerEntity.addEvent(endEvent);

    timelineClient.putEntities(containerEntity);
  }
 /**
  * Forwards completed container statuses to the allocator listener, leaving out every status
  * whose container id is present in {@code garbageContainers}.
  *
  * @param containerStatuses statuses reported as completed
  */
 @Override
 protected void handleCompletedContainers(List<ContainerStatus> containerStatuses) {
   // Containers already flagged as garbage by the allocate tracker were never known to the
   // rest of the system. Both garbage and regular containers only surface here as "complete",
   // so they are filtered out before the listener (and the monitor behind it) sees them.
   List<ContainerStatus> retained = new ArrayList<ContainerStatus>();
   for (ContainerStatus candidate : containerStatuses) {
     if (garbageContainers.contains(candidate.getContainerId())) {
       continue;
     }
     retained.add(candidate);
   }
   allocatorListener.completed(retained);
 }
 /**
  * Sends a node heartbeat that reports a single container status, filed under the application
  * id the container belongs to.
  *
  * @param containerStatus the status to report
  * @throws Exception declared by this method; the source of failures is not visible here
  */
 public void containerStatus(ContainerStatus containerStatus) throws Exception {
   ApplicationId owningApp =
       containerStatus.getContainerId().getApplicationAttemptId().getApplicationId();
   List<ContainerStatus> singleStatus = Arrays.asList(containerStatus);

   Map<ApplicationId, List<ContainerStatus>> statusesByApp =
       new HashMap<ApplicationId, List<ContainerStatus>>();
   statusesByApp.put(owningApp, singleStatus);
   nodeHeartbeat(statusesByApp, true);
 }
  /**
   * Builds the {@link NodeStatus} for this node.
   *
   * <p>The status carries a cloned {@code ContainerStatus} for every container in the context
   * and the node's health status. Containers whose cloned status is {@code COMPLETE} are removed
   * from the context's container map while iterating.
   *
   * @return the assembled node status
   */
  private NodeStatus getNodeStatus() {

    NodeStatus status = recordFactory.newRecordInstance(NodeStatus.class);
    status.setNodeId(this.nodeId);

    int reportedCount = 0;
    List<ContainerStatus> reported = new ArrayList<ContainerStatus>();
    Iterator<Entry<ContainerId, Container>> entries =
        this.context.getContainers().entrySet().iterator();
    while (entries.hasNext()) {
      Entry<ContainerId, Container> entry = entries.next();
      ContainerId id = entry.getKey();
      Container tracked = entry.getValue();

      // Report a clone of the container's status rather than the live object.
      org.apache.hadoop.yarn.api.records.ContainerStatus cloned =
          tracked.cloneAndGetContainerStatus();
      reported.add(cloned);
      ++reportedCount;
      LOG.info("Sending out status for container: " + cloned);

      if (cloned.getState() == ContainerState.COMPLETE) {
        // Drop the finished container through the iterator so iteration stays valid.
        entries.remove();

        LOG.info("Removed completed container " + id);
      }
    }
    status.setContainersStatuses(reported);

    LOG.debug(this.nodeId + " sending out status for " + reportedCount + " containers");

    NodeHealthStatus health = this.context.getNodeHealthStatus();
    if (this.healthChecker != null) {
      this.healthChecker.setHealthStatus(health);
    }
    LOG.debug(
        "Node's health-status : "
            + health.getIsNodeHealthy()
            + ", "
            + health.getHealthReport());
    status.setNodeHealthStatus(health);

    return status;
  }
  /**
   * Updates per-queue metrics with the resources released by the containers that a node update
   * reports as completed.
   *
   * <p>For a container that exited with {@code SUCCESS}, the released amount is taken from the
   * matching live container of its application. For an {@code ABORTED} container it is taken
   * from (and removed from) {@code preemptionContainerMap}. Any other exit status contributes
   * zero memory and zero vcores.
   *
   * @param eventWrapper the node update event whose completed containers are processed
   */
  private void updateQueueWithNodeUpdate(NodeUpdateSchedulerEventWrapper eventWrapper) {
    RMNodeWrapper node = (RMNodeWrapper) eventWrapper.getRMNode();
    List<UpdatedContainerInfo> containerList = node.getContainerUpdates();
    for (UpdatedContainerInfo info : containerList) {
      for (ContainerStatus status : info.getCompletedContainers()) {
        ContainerId containerId = status.getContainerId();
        SchedulerAppReport app =
            scheduler.getSchedulerAppInfo(containerId.getApplicationAttemptId());

        if (app == null) {
          // this happens for the AM container
          // The app has already been removed when the NM sends the release
          // information.
          continue;
        }

        String queue = appQueueMap.get(containerId.getApplicationAttemptId().getApplicationId());
        int releasedMemory = 0;
        int releasedVCores = 0;
        if (status.getExitStatus() == ContainerExitStatus.SUCCESS) {
          for (RMContainer rmc : app.getLiveContainers()) {
            // Compare ids by value: the id on the status and the id held by the live
            // container are not guaranteed to be the same object instance.
            if (containerId.equals(rmc.getContainerId())) {
              releasedMemory += rmc.getContainer().getResource().getMemory();
              releasedVCores += rmc.getContainer().getResource().getVirtualCores();
              break;
            }
          }
        } else if (status.getExitStatus() == ContainerExitStatus.ABORTED) {
          // A single remove both looks up and clears the preempted resource record.
          Resource preResource = preemptionContainerMap.remove(containerId);
          if (preResource != null) {
            releasedMemory += preResource.getMemory();
            releasedVCores += preResource.getVirtualCores();
          }
        }
        // update queue counters
        updateQueueMetrics(queue, releasedMemory, releasedVCores);
      }
    }
  }
Example #10
0
  /**
   * Starts an event-metadata builder describing the given container status.
   *
   * <p>The container id and state are always added. The exit status is added unless it equals
   * {@code ContainerExitStatus.INVALID}, and the diagnostics are added only when non-empty.
   *
   * @param containerStatus status to describe
   * @return a builder that callers can extend before building the map
   */
  private ImmutableMap.Builder<String, String> buildContainerStatusEventMetadata(
      ContainerStatus containerStatus) {
    ImmutableMap.Builder<String, String> metadata = ImmutableMap.builder();

    String containerIdText = containerStatus.getContainerId().toString();
    metadata.put(GobblinYarnMetricTagNames.CONTAINER_ID, containerIdText);

    String stateText = containerStatus.getState().toString();
    metadata.put(
        GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_CONTAINER_STATE, stateText);

    boolean hasExitStatus = containerStatus.getExitStatus() != ContainerExitStatus.INVALID;
    if (hasExitStatus) {
      metadata.put(
          GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_EXIT_STATUS,
          Integer.toString(containerStatus.getExitStatus()));
    }

    boolean hasDiagnostics = !Strings.isNullOrEmpty(containerStatus.getDiagnostics());
    if (hasDiagnostics) {
      metadata.put(
          GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_EXIT_DIAGNOSTICS,
          containerStatus.getDiagnostics());
    }

    return metadata;
  }
 /**
  * Removes the container id of every given status from {@code pendingRelease}.
  *
  * @param completedContainersStatuses statuses of containers that have completed
  */
 protected void removePendingReleaseRequests(List<ContainerStatus> completedContainersStatuses) {
   for (ContainerStatus finished : completedContainersStatuses) {
     ContainerId finishedId = finished.getContainerId();
     pendingRelease.remove(finishedId);
   }
 }
Example #12
0
  /**
   * Handle the completion of a container. A new container will be requested to replace the one that
   * just exited. Depending on the exit status and if container host affinity is enabled, the new
   * container may or may not try to be started on the same node.
   *
   * <p>A container completes in either of the following conditions: 1) some error happens in the
   * container and caused the container to exit, 2) the container gets killed due to some reason,
   * for example, if it runs over the allowed amount of virtual or physical memory, 3) the container
   * gets preempted by the ResourceManager, or 4) the container gets stopped by the
   * ApplicationMaster. A replacement container is needed in all but the last case.
   *
   * <p>No replacement is requested when the container is not tracked in {@code containerMap},
   * when a shutdown is in progress, or when the Helix instance has exceeded
   * {@code helixInstanceMaxRetries} (only enforced when that limit is positive).
   *
   * @param containerStatus status of the container that completed
   */
  private void handleContainerCompletion(ContainerStatus containerStatus) {
    Map.Entry<Container, String> completedContainerEntry =
        this.containerMap.remove(containerStatus.getContainerId());
    if (completedContainerEntry == null) {
      // Nothing is tracked for this container id, so there is no Helix instance name to
      // recycle and no container to replace; report it instead of failing on a null entry.
      LOGGER.warn(
          String.format(
              "Container %s has completed with exit status %d but is not tracked; ignoring it",
              containerStatus.getContainerId(), containerStatus.getExitStatus()));
      return;
    }
    String completedInstanceName = completedContainerEntry.getValue();

    LOGGER.info(
        String.format(
            "Container %s running Helix instance %s has completed with exit status %d",
            containerStatus.getContainerId(),
            completedInstanceName,
            containerStatus.getExitStatus()));

    if (!Strings.isNullOrEmpty(containerStatus.getDiagnostics())) {
      LOGGER.info(
          String.format(
              "Received the following diagnostics information for container %s: %s",
              containerStatus.getContainerId(), containerStatus.getDiagnostics()));
    }

    if (this.shutdownInProgress) {
      return;
    }

    // putIfAbsent returns null when no counter was registered for this instance yet, so use
    // the counter that was just inserted in that case rather than dereferencing the result.
    AtomicInteger freshCounter = new AtomicInteger(0);
    AtomicInteger existingCounter =
        this.helixInstanceRetryCount.putIfAbsent(completedInstanceName, freshCounter);
    int retryCount =
        (existingCounter != null ? existingCounter : freshCounter).incrementAndGet();

    // Populate event metadata
    Optional<ImmutableMap.Builder<String, String>> eventMetadataBuilder = Optional.absent();
    if (this.eventSubmitter.isPresent()) {
      eventMetadataBuilder = Optional.of(buildContainerStatusEventMetadata(containerStatus));
      eventMetadataBuilder
          .get()
          .put(GobblinYarnEventConstants.EventMetadata.HELIX_INSTANCE_ID, completedInstanceName);
      eventMetadataBuilder
          .get()
          .put(
              GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_RETRY_ATTEMPT,
              retryCount + "");
    }

    if (this.helixInstanceMaxRetries > 0 && retryCount > this.helixInstanceMaxRetries) {
      if (this.eventSubmitter.isPresent()) {
        this.eventSubmitter
            .get()
            .submit(
                GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION,
                eventMetadataBuilder.get().build());
      }

      LOGGER.warn(
          "Maximum number of retries has been achieved for Helix instance "
              + completedInstanceName);
      return;
    }

    // Add the Helix instance name of the completed container to the queue of unused
    // instance names so they can be reused by a replacement container.
    this.unusedHelixInstanceNames.offer(completedInstanceName);

    if (this.eventSubmitter.isPresent()) {
      this.eventSubmitter
          .get()
          .submit(
              GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION,
              eventMetadataBuilder.get().build());
    }

    LOGGER.info(
        String.format(
            "Requesting a new container to replace %s to run Helix instance %s",
            containerStatus.getContainerId(), completedInstanceName));
    this.eventBus.post(
        new NewContainerRequest(
            shouldStickToTheSameNode(containerStatus.getExitStatus())
                ? Optional.of(completedContainerEntry.getKey())
                : Optional.<Container>absent()));
  }
    /**
     * Processes the completed-container statuses delivered by the ResourceManager.
     *
     * <p>For each status this logs the details, updates the completed/failed/allocated/requested
     * counters according to the exit status, and publishes a container-end timeline event. After
     * the loop it requests as many new containers as are needed to bring the requested count
     * back up to {@code numTotalContainers}, and sets {@code done} once the completed count
     * equals {@code numTotalContainers}.
     *
     * @param completedContainers statuses of the containers that completed
     */
    @SuppressWarnings("unchecked")
    @Override
    public void onContainersCompleted(List<ContainerStatus> completedContainers) {
      LOG.info(
          "Got response from RM for container ask, completedCnt=" + completedContainers.size());
      for (ContainerStatus containerStatus : completedContainers) {
        LOG.info(
            "Got container status for containerID="
                + containerStatus.getContainerId()
                + ", state="
                + containerStatus.getState()
                + ", exitStatus="
                + containerStatus.getExitStatus()
                + ", diagnostics="
                + containerStatus.getDiagnostics());

        // non complete containers should not be here
        assert (containerStatus.getState() == ContainerState.COMPLETE);

        // increment counters for completed/failed containers
        int exitStatus = containerStatus.getExitStatus();
        if (0 != exitStatus) {
          // container failed
          if (ContainerExitStatus.ABORTED != exitStatus) {
            // shell script failed
            // counts as completed
            numCompletedContainers.incrementAndGet();
            numFailedContainers.incrementAndGet();
          } else {
            // container was killed by framework, possibly preempted
            // we should re-try as the container was lost for some reason
            numAllocatedContainers.decrementAndGet();
            numRequestedContainers.decrementAndGet();
            // we do not need to release the container as it would be done
            // by the RM
          }
        } else {
          // nothing to do
          // container completed successfully
          numCompletedContainers.incrementAndGet();
          LOG.info(
              "Container completed successfully."
                  + ", containerId="
                  + containerStatus.getContainerId());
        }
        try {
          publishContainerEndEvent(timelineClient, containerStatus);
        } catch (Exception e) {
          // Publishing is best-effort: a failure is logged and processing continues.
          LOG.error(
              "Container end event could not be published for "
                  + containerStatus.getContainerId().toString(),
              e);
        }
      }

      // ask for more containers if any failed
      int askCount = numTotalContainers - numRequestedContainers.get();
      numRequestedContainers.addAndGet(askCount);

      // The loop is a no-op when askCount is zero or negative.
      for (int i = 0; i < askCount; ++i) {
        ContainerRequest containerAsk = setupContainerAskForRM();
        amRMClient.addContainerRequest(containerAsk);
      }

      if (numCompletedContainers.get() == numTotalContainers) {
        done = true;
      }
    }