// This is to test fetching AM container will be retried, if AM container is
  // not fetchable since DNS is unavailable causing container token/NMtoken
  // creation failure.
  @Test(timeout = 20000)
  public void testAMContainerAllocationWhenDNSUnavailable() throws Exception {
    final YarnConfiguration conf = new YarnConfiguration();
    MockRM rm1 =
        new MockRM(conf) {
          @Override
          protected RMSecretManagerService createRMSecretManagerService() {
            return new TestRMSecretManagerService(conf, rmContext);
          }
        };
    rm1.start();

    MockNM nm1 = rm1.registerNode("unknownhost:1234", 8000);
    SecurityUtilTestHelper.setTokenServiceUseIp(true);
    RMApp app1 = rm1.submitApp(200);
    RMAppAttempt attempt = app1.getCurrentAppAttempt();
    nm1.nodeHeartbeat(true);

    // fetching am container will fail, keep retrying 5 times.
    while (numRetries <= 5) {
      nm1.nodeHeartbeat(true);
      Thread.sleep(1000);
      Assert.assertEquals(RMAppAttemptState.SCHEDULED, attempt.getAppAttemptState());
      System.out.println("Waiting for am container to be allocated.");
    }

    SecurityUtilTestHelper.setTokenServiceUseIp(false);
    rm1.waitForState(attempt.getAppAttemptId(), RMAppAttemptState.ALLOCATED);
    MockRM.launchAndRegisterAM(app1, rm1, nm1);
  }
  @Test
  public void testPriorityWithPendingApplications() throws Exception {

    Configuration conf = new Configuration();
    conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class, ResourceScheduler.class);
    // Set Max Application Priority as 10
    conf.setInt(YarnConfiguration.MAX_CLUSTER_LEVEL_APPLICATION_PRIORITY, 10);
    MockRM rm = new MockRM(conf);
    rm.start();

    Priority appPriority1 = Priority.newInstance(5);
    MockNM nm1 = rm.registerNode("127.0.0.1:1234", 8 * GB);
    RMApp app1 = rm.submitApp(1 * GB, appPriority1);

    // kick the scheduler, 1 GB given to AM1, remaining 7GB on nm1
    MockAM am1 = MockRM.launchAM(app1, rm, nm1);
    am1.registerAppAttempt();

    // kick the scheduler, 7 containers will be allocated for App1
    List<Container> allocated1 = am1.allocateAndWaitForContainers("127.0.0.1", 7, 1 * GB, nm1);

    Assert.assertEquals(7, allocated1.size());
    Assert.assertEquals(1 * GB, allocated1.get(0).getResource().getMemory());

    // check node report, 8 GB used (1 AM and 7 containers) and 0 GB available
    SchedulerNodeReport report_nm1 = rm.getResourceScheduler().getNodeReport(nm1.getNodeId());
    Assert.assertEquals(8 * GB, report_nm1.getUsedResource().getMemory());
    Assert.assertEquals(0 * GB, report_nm1.getAvailableResource().getMemory());

    // Submit the second app App2 with priority 7
    Priority appPriority2 = Priority.newInstance(7);
    RMApp app2 = rm.submitApp(1 * GB, appPriority2);

    // Submit the third app App3 with priority 8
    Priority appPriority3 = Priority.newInstance(8);
    RMApp app3 = rm.submitApp(1 * GB, appPriority3);

    // Submit the second app App4 with priority 6
    Priority appPriority4 = Priority.newInstance(6);
    RMApp app4 = rm.submitApp(1 * GB, appPriority4);

    // Only one app can run as AM resource limit restricts it. Kill app1,
    // If app3 (highest priority among rest) gets active, it indicates that
    // priority is working with pendingApplications.
    rm.killApp(app1.getApplicationId());

    // kick the scheduler, app3 (high among pending) gets free space
    MockAM am3 = MockRM.launchAM(app3, rm, nm1);
    am3.registerAppAttempt();

    // check node report, 1 GB used and 7 GB available
    report_nm1 = rm.getResourceScheduler().getNodeReport(nm1.getNodeId());
    Assert.assertEquals(1 * GB, report_nm1.getUsedResource().getMemory());
    Assert.assertEquals(7 * GB, report_nm1.getAvailableResource().getMemory());

    rm.stop();
  }
  @Test
  public void testSetRMIdentifierInRegistration() throws Exception {

    Configuration conf = new Configuration();
    rm = new MockRM(conf);
    rm.start();

    MockNM nm = new MockNM("host1:1234", 5120, rm.getResourceTrackerService());
    RegisterNodeManagerResponse response = nm.registerNode();

    // Verify the RMIdentifier is correctly set in RegisterNodeManagerResponse
    Assert.assertEquals(ResourceManager.getClusterTimeStamp(), response.getRMIdentifier());
  }
  /**
   * Test RM read NM next heartBeat Interval correctly from Configuration file, and NM get next
   * heartBeat Interval from RM correctly
   */
  @Test(timeout = 50000)
  public void testGetNextHeartBeatInterval() throws Exception {
    Configuration conf = new Configuration();
    conf.set(YarnConfiguration.RM_NM_HEARTBEAT_INTERVAL_MS, "4000");

    rm = new MockRM(conf);
    rm.start();

    MockNM nm1 = rm.registerNode("host1:1234", 5120);
    MockNM nm2 = rm.registerNode("host2:5678", 10240);

    NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertEquals(4000, nodeHeartbeat.getNextHeartBeatInterval());

    NodeHeartbeatResponse nodeHeartbeat2 = nm2.nodeHeartbeat(true);
    Assert.assertEquals(4000, nodeHeartbeat2.getNextHeartBeatInterval());
  }
  private void checkUnealthyNMCount(MockRM rm, MockNM nm1, boolean health, int count)
      throws Exception {

    int waitCount = 0;
    while ((rm.getRMContext().getRMNodes().get(nm1.getNodeId()).getState() != NodeState.UNHEALTHY)
            == health
        && waitCount++ < 20) {
      synchronized (this) {
        wait(100);
      }
    }
    Assert.assertFalse(
        (rm.getRMContext().getRMNodes().get(nm1.getNodeId()).getState() != NodeState.UNHEALTHY)
            == health);
    Assert.assertEquals(
        "Unhealthy metrics not incremented", count, ClusterMetrics.getMetrics().getUnhealthyNMs());
  }
  @Test
  public void testUnhealthyNMUnregistration() throws Exception {
    Configuration conf = new Configuration();
    rm = new MockRM(conf);
    rm.start();

    ResourceTrackerService resourceTrackerService = rm.getResourceTrackerService();
    MockNM nm1 = rm.registerNode("host1:1234", 5120);
    Assert.assertEquals(0, ClusterMetrics.getMetrics().getUnhealthyNMs());
    // node healthy
    nm1.nodeHeartbeat(true);
    int shutdownNMsCount = ClusterMetrics.getMetrics().getNumShutdownNMs();

    // node unhealthy
    nm1.nodeHeartbeat(false);
    checkUnealthyNMCount(rm, nm1, true, 1);
    UnRegisterNodeManagerRequest request = Records.newRecord(UnRegisterNodeManagerRequest.class);
    request.setNodeId(nm1.getNodeId());
    resourceTrackerService.unRegisterNodeManager(request);
    checkShutdownNMCount(rm, ++shutdownNMsCount);
  }
  @Test
  public void testUnhealthyNodeStatus() throws Exception {
    Configuration conf = new Configuration();
    conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, hostFile.getAbsolutePath());

    rm = new MockRM(conf);
    rm.start();

    MockNM nm1 = rm.registerNode("host1:1234", 5120);
    Assert.assertEquals(0, ClusterMetrics.getMetrics().getUnhealthyNMs());
    // node healthy
    nm1.nodeHeartbeat(true);

    // node unhealthy
    nm1.nodeHeartbeat(false);
    checkUnealthyNMCount(rm, nm1, true, 1);

    // node healthy again
    nm1.nodeHeartbeat(true);
    checkUnealthyNMCount(rm, nm1, false, 0);
  }
  @Test
  public void testUnhealthyNodeStatus() throws Exception {
    Configuration conf = new Configuration();
    conf.set("yarn.resourcemanager.nodes.exclude-path", hostFile.getAbsolutePath());

    MockRM rm = new MockRM(conf);
    rm.start();

    MockNM nm1 = rm.registerNode("host1:1234", 5120);
    Assert.assertEquals(0, ClusterMetrics.getMetrics().getUnhealthyNMs());
    // node healthy
    nm1.nodeHeartbeat(true);

    // node unhealthy
    nm1.nodeHeartbeat(false);
    checkUnealthyNMCount(rm, nm1, true, 1);

    // node healthy again
    nm1.nodeHeartbeat(true);
    checkUnealthyNMCount(rm, nm1, false, 0);
  }
  @Test
  public void testReboot() throws Exception {
    Configuration conf = new Configuration();
    rm = new MockRM(conf);
    rm.start();

    MockNM nm1 = rm.registerNode("host1:1234", 5120);
    MockNM nm2 = new MockNM("host2:1234", 2048, rm.getResourceTrackerService());

    int initialMetricCount = ClusterMetrics.getMetrics().getNumRebootedNMs();
    HeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));

    nodeHeartbeat =
        nm2.nodeHeartbeat(
            new HashMap<ApplicationId, List<ContainerStatus>>(),
            true,
            recordFactory.newRecordInstance(NodeId.class));
    Assert.assertTrue(NodeAction.REBOOT.equals(nodeHeartbeat.getNodeAction()));
    checkRebootedNMCount(rm, ++initialMetricCount);
  }
  @Test
  public void testReboot() throws Exception {
    Configuration conf = new Configuration();
    rm = new MockRM(conf);
    rm.start();

    MockNM nm1 = rm.registerNode("host1:1234", 5120);
    MockNM nm2 = rm.registerNode("host2:1234", 2048);

    int initialMetricCount = ClusterMetrics.getMetrics().getNumRebootedNMs();
    NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));

    nodeHeartbeat =
        nm2.nodeHeartbeat(new HashMap<ApplicationId, List<ContainerStatus>>(), true, -100);
    Assert.assertTrue(NodeAction.RESYNC.equals(nodeHeartbeat.getNodeAction()));
    Assert.assertEquals(
        "Too far behind rm response id:0 nm response id:-100",
        nodeHeartbeat.getDiagnosticsMessage());
    checkRebootedNMCount(rm, ++initialMetricCount);
  }
 /** Decommissioning using a post-configured exclude hosts file */
 @Test
 public void testAddNewExcludePathToConfiguration() throws Exception {
   Configuration conf = new Configuration();
   rm = new MockRM(conf);
   rm.start();
   MockNM nm1 = rm.registerNode("host1:1234", 5120);
   MockNM nm2 = rm.registerNode("host2:5678", 10240);
   ClusterMetrics metrics = ClusterMetrics.getMetrics();
   assert (metrics != null);
   int initialMetricCount = metrics.getNumDecommisionedNMs();
   NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true);
   Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction());
   nodeHeartbeat = nm2.nodeHeartbeat(true);
   Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction());
   writeToHostsFile("host2");
   conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, hostFile.getAbsolutePath());
   rm.getNodesListManager().refreshNodes(conf);
   checkDecommissionedNMCount(rm, ++initialMetricCount);
   nodeHeartbeat = nm1.nodeHeartbeat(true);
   Assert.assertEquals(
       "Node should not have been decomissioned.",
       NodeAction.NORMAL,
       nodeHeartbeat.getNodeAction());
   nodeHeartbeat = nm2.nodeHeartbeat(true);
   Assert.assertEquals(
       "Node should have been decomissioned but is in state" + nodeHeartbeat.getNodeAction(),
       NodeAction.SHUTDOWN,
       nodeHeartbeat.getNodeAction());
 }
  /** Decommissioning using a pre-configured include hosts file */
  @Test
  public void testDecommissionWithIncludeHosts() throws Exception {

    writeToHostsFile("localhost", "host1", "host2");
    Configuration conf = new Configuration();
    conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile.getAbsolutePath());

    rm = new MockRM(conf);
    rm.start();

    MockNM nm1 = rm.registerNode("host1:1234", 5120);
    MockNM nm2 = rm.registerNode("host2:5678", 10240);
    MockNM nm3 = rm.registerNode("localhost:4433", 1024);

    ClusterMetrics metrics = ClusterMetrics.getMetrics();
    assert (metrics != null);
    int metricCount = metrics.getNumDecommisionedNMs();

    NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
    nodeHeartbeat = nm2.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
    nodeHeartbeat = nm3.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));

    // To test that IPs also work
    String ip = NetUtils.normalizeHostName("localhost");
    writeToHostsFile("host1", ip);

    rm.getNodesListManager().refreshNodes(conf);

    checkDecommissionedNMCount(rm, ++metricCount);

    nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
    Assert.assertEquals(1, ClusterMetrics.getMetrics().getNumDecommisionedNMs());

    nodeHeartbeat = nm2.nodeHeartbeat(true);
    Assert.assertTrue(
        "Node is not decommisioned.", NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction()));

    nodeHeartbeat = nm3.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
    Assert.assertEquals(metricCount, ClusterMetrics.getMetrics().getNumDecommisionedNMs());
  }
  /** decommissioning using a exclude hosts file */
  @Test
  public void testDecommissionWithExcludeHosts() throws Exception {
    Configuration conf = new Configuration();
    conf.set("yarn.resourcemanager.nodes.exclude-path", hostFile.getAbsolutePath());

    writeToHostsFile("");
    rm = new MockRM(conf);
    rm.start();

    MockNM nm1 = rm.registerNode("host1:1234", 5120);
    MockNM nm2 = rm.registerNode("host2:5678", 10240);

    int initialMetricCount = ClusterMetrics.getMetrics().getNumDecommisionedNMs();

    HeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
    nodeHeartbeat = nm2.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));

    writeToHostsFile("host2");

    rm.getNodesListManager().refreshNodes();

    nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
    nodeHeartbeat = nm2.nodeHeartbeat(true);
    Assert.assertTrue(
        "The decommisioned metrics are not updated",
        NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction()));
    checkDecommissionedNMCount(rm, ++initialMetricCount);
  }
  @Test
  public void testNMUnregistration() throws Exception {
    Configuration conf = new Configuration();
    rm = new MockRM(conf);
    rm.start();

    ResourceTrackerService resourceTrackerService = rm.getResourceTrackerService();
    MockNM nm1 = rm.registerNode("host1:1234", 5120);

    int shutdownNMsCount = ClusterMetrics.getMetrics().getNumShutdownNMs();
    NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));

    UnRegisterNodeManagerRequest request = Records.newRecord(UnRegisterNodeManagerRequest.class);
    request.setNodeId(nm1.getNodeId());
    resourceTrackerService.unRegisterNodeManager(request);
    checkShutdownNMCount(rm, ++shutdownNMsCount);

    // The RM should remove the node after unregistration, hence send a reboot
    // command.
    nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.RESYNC.equals(nodeHeartbeat.getNodeAction()));
  }
  @Test
  public void testARRMResponseId() throws Exception {

    MockNM nm1 = rm.registerNode("h1:1234", 5000);

    RMApp app = rm.submitApp(2000);

    // Trigger the scheduling so the AM gets 'launched'
    nm1.nodeHeartbeat(true);

    RMAppAttempt attempt = app.getCurrentAppAttempt();
    MockAM am = rm.sendAMLaunched(attempt.getAppAttemptId());

    am.registerAppAttempt();

    AllocateRequest allocateRequest =
        BuilderUtils.newAllocateRequest(attempt.getAppAttemptId(), 0, 0F, null, null);

    AllocateResponse response = amService.allocate(allocateRequest);
    Assert.assertEquals(1, response.getResponseId());
    Assert.assertFalse(response.getReboot());
    allocateRequest =
        BuilderUtils.newAllocateRequest(
            attempt.getAppAttemptId(), response.getResponseId(), 0F, null, null);

    response = amService.allocate(allocateRequest);
    Assert.assertEquals(2, response.getResponseId());
    /* try resending */
    response = amService.allocate(allocateRequest);
    Assert.assertEquals(2, response.getResponseId());

    /** try sending old request again * */
    allocateRequest = BuilderUtils.newAllocateRequest(attempt.getAppAttemptId(), 0, 0F, null, null);
    response = amService.allocate(allocateRequest);
    Assert.assertTrue(response.getReboot());
  }
  @Test
  public void testInvalidNMUnregistration() throws Exception {
    Configuration conf = new Configuration();
    rm = new MockRM(conf);
    rm.start();
    ResourceTrackerService resourceTrackerService = rm.getResourceTrackerService();
    int shutdownNMsCount = ClusterMetrics.getMetrics().getNumShutdownNMs();
    int decommisionedNMsCount = ClusterMetrics.getMetrics().getNumDecommisionedNMs();

    // Node not found for unregister
    UnRegisterNodeManagerRequest request = Records.newRecord(UnRegisterNodeManagerRequest.class);
    request.setNodeId(BuilderUtils.newNodeId("host", 1234));
    resourceTrackerService.unRegisterNodeManager(request);
    checkShutdownNMCount(rm, 0);
    checkDecommissionedNMCount(rm, 0);

    // 1. Register the Node Manager
    // 2. Exclude the same Node Manager host
    // 3. Give NM heartbeat to RM
    // 4. Unregister the Node Manager
    MockNM nm1 = new MockNM("host1:1234", 5120, resourceTrackerService);
    RegisterNodeManagerResponse response = nm1.registerNode();
    Assert.assertEquals(NodeAction.NORMAL, response.getNodeAction());
    writeToHostsFile("host2");
    conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile.getAbsolutePath());
    rm.getNodesListManager().refreshNodes(conf);
    NodeHeartbeatResponse heartbeatResponse = nm1.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.SHUTDOWN, heartbeatResponse.getNodeAction());
    checkShutdownNMCount(rm, shutdownNMsCount);
    checkDecommissionedNMCount(rm, ++decommisionedNMsCount);
    request.setNodeId(nm1.getNodeId());
    resourceTrackerService.unRegisterNodeManager(request);
    checkShutdownNMCount(rm, shutdownNMsCount);
    checkDecommissionedNMCount(rm, decommisionedNMsCount);

    // 1. Register the Node Manager
    // 2. Exclude the same Node Manager host
    // 3. Unregister the Node Manager
    MockNM nm2 = new MockNM("host2:1234", 5120, resourceTrackerService);
    RegisterNodeManagerResponse response2 = nm2.registerNode();
    Assert.assertEquals(NodeAction.NORMAL, response2.getNodeAction());
    writeToHostsFile("host1");
    conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile.getAbsolutePath());
    rm.getNodesListManager().refreshNodes(conf);
    request.setNodeId(nm2.getNodeId());
    resourceTrackerService.unRegisterNodeManager(request);
    checkShutdownNMCount(rm, shutdownNMsCount);
    checkDecommissionedNMCount(rm, ++decommisionedNMsCount);
  }
 private void syncNodeHeartbeat(MockNM nm, boolean health) throws Exception {
   nm.nodeHeartbeat(health);
   dispatcher.await();
 }
Ejemplo n.º 18
0
  @Test(timeout = 30000)
  public void testExcessReservationThanNodeManagerCapacity() throws Exception {
    YarnConfiguration conf = new YarnConfiguration();
    YarnAPIStorageFactory.setConfiguration(conf);
    RMStorageFactory.setConfiguration(conf);
    MockRM rm = new MockRM(conf);
    try {
      rm.start();

      // Register node1
      MockNM nm1 = rm.registerNode("127.0.0.1:1234", 2 * GB, 4);
      MockNM nm2 = rm.registerNode("127.0.0.1:2234", 3 * GB, 4);

      nm1.nodeHeartbeat(true);
      nm2.nodeHeartbeat(true);
      // HOP :: Sleep to allow previous events to be processed
      Thread.sleep(
          conf.getInt(
                  YarnConfiguration.HOPS_PENDING_EVENTS_RETRIEVAL_PERIOD,
                  YarnConfiguration.DEFAULT_HOPS_PENDING_EVENTS_RETRIEVAL_PERIOD)
              * 2);
      // wait..
      int waitCount = 20;
      int size = rm.getRMContext().getActiveRMNodes().size();
      while ((size = rm.getRMContext().getActiveRMNodes().size()) != 2 && waitCount-- > 0) {
        LOG.info("Waiting for node managers to register : " + size);
        Thread.sleep(100);
      }
      Assert.assertEquals(2, rm.getRMContext().getActiveRMNodes().size());
      // Submit an application
      RMApp app1 = rm.submitApp(128);

      // kick the scheduling
      nm1.nodeHeartbeat(true);
      RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
      MockAM am1 = rm.sendAMLaunched(attempt1.getAppAttemptId(), nm1);
      am1.registerAppAttempt();

      LOG.info("sending container requests ");
      am1.addRequests(new String[] {"*"}, 3 * GB, 1, 1);
      AllocateResponse alloc1Response = am1.schedule(); // send the request

      // kick the scheduler
      nm1.nodeHeartbeat(true);
      int waitCounter = 20;
      LOG.info("heartbeating nm1");
      while (alloc1Response.getAllocatedContainers().size() < 1 && waitCounter-- > 0) {
        LOG.info("Waiting for containers to be created for app 1...");
        Thread.sleep(500);
        alloc1Response = am1.schedule();
      }
      LOG.info("received container : " + alloc1Response.getAllocatedContainers().size());

      // No container should be allocated.
      // Internally it should not been reserved.
      Assert.assertTrue(alloc1Response.getAllocatedContainers().size() == 0);

      LOG.info("heartbeating nm2");
      waitCounter = 20;
      nm2.nodeHeartbeat(true);
      while (alloc1Response.getAllocatedContainers().size() < 1 && waitCounter-- > 0) {
        LOG.info("Waiting for containers to be created for app 1...");
        Thread.sleep(500);
        alloc1Response = am1.schedule();
      }
      LOG.info("received container : " + alloc1Response.getAllocatedContainers().size());
      Assert.assertTrue(alloc1Response.getAllocatedContainers().size() == 1);
    } finally {
      rm.stop();
    }
  }
  @Test
  public void testApplicationPriorityAllocation() throws Exception {

    Configuration conf = new Configuration();
    conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class, ResourceScheduler.class);
    // Set Max Application Priority as 10
    conf.setInt(YarnConfiguration.MAX_CLUSTER_LEVEL_APPLICATION_PRIORITY, 10);
    MockRM rm = new MockRM(conf);
    rm.start();

    Priority appPriority1 = Priority.newInstance(5);
    MockNM nm1 = rm.registerNode("127.0.0.1:1234", 16 * GB);
    RMApp app1 = rm.submitApp(1 * GB, appPriority1);

    // kick the scheduler, 1 GB given to AM1, remaining 15GB on nm1
    MockAM am1 = MockRM.launchAM(app1, rm, nm1);
    am1.registerAppAttempt();

    // allocate 7 containers for App1
    List<Container> allocated1 = am1.allocateAndWaitForContainers("127.0.0.1", 7, 2 * GB, nm1);

    Assert.assertEquals(7, allocated1.size());
    Assert.assertEquals(2 * GB, allocated1.get(0).getResource().getMemory());

    // check node report, 15 GB used (1 AM and 7 containers) and 1 GB available
    SchedulerNodeReport report_nm1 = rm.getResourceScheduler().getNodeReport(nm1.getNodeId());
    Assert.assertEquals(15 * GB, report_nm1.getUsedResource().getMemory());
    Assert.assertEquals(1 * GB, report_nm1.getAvailableResource().getMemory());

    // Submit the second app App2 with priority 8 (Higher than App1)
    Priority appPriority2 = Priority.newInstance(8);
    RMApp app2 = rm.submitApp(1 * GB, appPriority2);

    // kick the scheduler, 1 GB which was free is given to AM of App2
    MockAM am2 = MockRM.launchAM(app2, rm, nm1);
    am2.registerAppAttempt();

    // check node report, 16 GB used and 0 GB available
    report_nm1 = rm.getResourceScheduler().getNodeReport(nm1.getNodeId());
    Assert.assertEquals(16 * GB, report_nm1.getUsedResource().getMemory());
    Assert.assertEquals(0 * GB, report_nm1.getAvailableResource().getMemory());

    // get scheduler
    CapacityScheduler cs = (CapacityScheduler) rm.getResourceScheduler();

    // get scheduler app
    FiCaSchedulerApp schedulerAppAttempt =
        cs.getSchedulerApplications().get(app1.getApplicationId()).getCurrentAppAttempt();

    // kill 2 containers of App1 to free up some space
    int counter = 0;
    for (Container c : allocated1) {
      if (++counter > 2) {
        break;
      }
      cs.killContainer(schedulerAppAttempt.getRMContainer(c.getId()));
    }

    // check node report, 12 GB used and 4 GB available
    report_nm1 = rm.getResourceScheduler().getNodeReport(nm1.getNodeId());
    Assert.assertEquals(12 * GB, report_nm1.getUsedResource().getMemory());
    Assert.assertEquals(4 * GB, report_nm1.getAvailableResource().getMemory());

    // send updated request for App1
    am1.allocate("127.0.0.1", 2 * GB, 10, new ArrayList<ContainerId>());

    // kick the scheduler, since App2 priority is more than App1, it will get
    // remaining cluster space.
    List<Container> allocated2 = am2.allocateAndWaitForContainers("127.0.0.1", 2, 2 * GB, nm1);

    // App2 has got 2 containers now.
    Assert.assertEquals(2, allocated2.size());

    // check node report, 16 GB used and 0 GB available
    report_nm1 = rm.getResourceScheduler().getNodeReport(nm1.getNodeId());
    Assert.assertEquals(16 * GB, report_nm1.getUsedResource().getMemory());
    Assert.assertEquals(0 * GB, report_nm1.getAvailableResource().getMemory());

    rm.stop();
  }
  @Test
  public void testReconnectNode() throws Exception {
    final DrainDispatcher dispatcher = new DrainDispatcher();
    rm =
        new MockRM() {
          @Override
          protected EventHandler<SchedulerEvent> createSchedulerEventDispatcher() {
            return new SchedulerEventDispatcher(this.scheduler) {
              @Override
              public void handle(SchedulerEvent event) {
                scheduler.handle(event);
              }
            };
          }

          @Override
          protected Dispatcher createDispatcher() {
            return dispatcher;
          }
        };
    rm.start();

    MockNM nm1 = rm.registerNode("host1:1234", 5120);
    MockNM nm2 = rm.registerNode("host2:5678", 5120);
    nm1.nodeHeartbeat(true);
    nm2.nodeHeartbeat(false);
    dispatcher.await();
    checkUnealthyNMCount(rm, nm2, true, 1);
    final int expectedNMs = ClusterMetrics.getMetrics().getNumActiveNMs();
    QueueMetrics metrics = rm.getResourceScheduler().getRootQueueMetrics();
    // TODO Metrics incorrect in case of the FifoScheduler
    Assert.assertEquals(5120, metrics.getAvailableMB());

    // reconnect of healthy node
    nm1 = rm.registerNode("host1:1234", 5120);
    NodeHeartbeatResponse response = nm1.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction()));
    dispatcher.await();
    Assert.assertEquals(expectedNMs, ClusterMetrics.getMetrics().getNumActiveNMs());
    checkUnealthyNMCount(rm, nm2, true, 1);

    // reconnect of unhealthy node
    nm2 = rm.registerNode("host2:5678", 5120);
    response = nm2.nodeHeartbeat(false);
    Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction()));
    dispatcher.await();
    Assert.assertEquals(expectedNMs, ClusterMetrics.getMetrics().getNumActiveNMs());
    checkUnealthyNMCount(rm, nm2, true, 1);

    // unhealthy node changed back to healthy
    nm2 = rm.registerNode("host2:5678", 5120);
    dispatcher.await();
    response = nm2.nodeHeartbeat(true);
    response = nm2.nodeHeartbeat(true);
    dispatcher.await();
    Assert.assertEquals(5120 + 5120, metrics.getAvailableMB());

    // reconnect of node with changed capability
    nm1 = rm.registerNode("host2:5678", 10240);
    dispatcher.await();
    response = nm1.nodeHeartbeat(true);
    dispatcher.await();
    Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction()));
    Assert.assertEquals(5120 + 10240, metrics.getAvailableMB());

    // reconnect of node with changed capability and running applications
    List<ApplicationId> runningApps = new ArrayList<ApplicationId>();
    runningApps.add(ApplicationId.newInstance(1, 0));
    nm1 = rm.registerNode("host2:5678", 15360, 2, runningApps);
    dispatcher.await();
    response = nm1.nodeHeartbeat(true);
    dispatcher.await();
    Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction()));
    Assert.assertEquals(5120 + 15360, metrics.getAvailableMB());

    // reconnect healthy node changing http port
    nm1 = new MockNM("host1:1234", 5120, rm.getResourceTrackerService());
    nm1.setHttpPort(3);
    nm1.registerNode();
    dispatcher.await();
    response = nm1.nodeHeartbeat(true);
    response = nm1.nodeHeartbeat(true);
    dispatcher.await();
    RMNode rmNode = rm.getRMContext().getRMNodes().get(nm1.getNodeId());
    Assert.assertEquals(3, rmNode.getHttpPort());
    Assert.assertEquals(5120, rmNode.getTotalCapability().getMemory());
    Assert.assertEquals(5120 + 15360, metrics.getAvailableMB());
  }
 private void syncNodeLost(MockNM nm) throws Exception {
   rm.sendNodeStarted(nm);
   rm.NMwaitForState(nm.getNodeId(), NodeState.RUNNING);
   rm.sendNodeLost(nm);
   dispatcher.await();
 }
  @Test
  public void testAMRMUnusableNodes() throws Exception {

    MockNM nm1 = rm.registerNode("h1:1234", 5000);
    MockNM nm2 = rm.registerNode("h2:1234", 5000);
    MockNM nm3 = rm.registerNode("h3:1234", 5000);
    MockNM nm4 = rm.registerNode("h4:1234", 5000);

    RMApp app1 = rm.submitApp(2000);

    // Trigger the scheduling so the AM gets 'launched' on nm1
    nm1.nodeHeartbeat(true);

    RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
    MockAM am1 = rm.sendAMLaunched(attempt1.getAppAttemptId());

    // register AM returns no unusable node
    am1.registerAppAttempt();

    // allocate request returns no updated node
    AllocateRequest allocateRequest1 =
        BuilderUtils.newAllocateRequest(attempt1.getAppAttemptId(), 0, 0F, null, null);
    AMResponse response1 = amService.allocate(allocateRequest1).getAMResponse();
    List<NodeReport> updatedNodes = response1.getUpdatedNodes();
    Assert.assertEquals(0, updatedNodes.size());

    syncNodeHeartbeat(nm4, false);

    // allocate request returns updated node
    allocateRequest1 =
        BuilderUtils.newAllocateRequest(
            attempt1.getAppAttemptId(), response1.getResponseId(), 0F, null, null);
    response1 = amService.allocate(allocateRequest1).getAMResponse();
    updatedNodes = response1.getUpdatedNodes();
    Assert.assertEquals(1, updatedNodes.size());
    NodeReport nr = updatedNodes.iterator().next();
    Assert.assertEquals(nm4.getNodeId(), nr.getNodeId());
    Assert.assertEquals(NodeState.UNHEALTHY, nr.getNodeState());

    // resending the allocate request returns the same result
    response1 = amService.allocate(allocateRequest1).getAMResponse();
    updatedNodes = response1.getUpdatedNodes();
    Assert.assertEquals(1, updatedNodes.size());
    nr = updatedNodes.iterator().next();
    Assert.assertEquals(nm4.getNodeId(), nr.getNodeId());
    Assert.assertEquals(NodeState.UNHEALTHY, nr.getNodeState());

    syncNodeLost(nm3);

    // subsequent allocate request returns delta
    allocateRequest1 =
        BuilderUtils.newAllocateRequest(
            attempt1.getAppAttemptId(), response1.getResponseId(), 0F, null, null);
    response1 = amService.allocate(allocateRequest1).getAMResponse();
    updatedNodes = response1.getUpdatedNodes();
    Assert.assertEquals(1, updatedNodes.size());
    nr = updatedNodes.iterator().next();
    Assert.assertEquals(nm3.getNodeId(), nr.getNodeId());
    Assert.assertEquals(NodeState.LOST, nr.getNodeState());

    // registering another AM gives it the complete failed list
    RMApp app2 = rm.submitApp(2000);
    // Trigger nm2 heartbeat so that AM gets launched on it
    nm2.nodeHeartbeat(true);
    RMAppAttempt attempt2 = app2.getCurrentAppAttempt();
    MockAM am2 = rm.sendAMLaunched(attempt2.getAppAttemptId());

    // register AM returns all unusable nodes
    am2.registerAppAttempt();

    // allocate request returns no updated node
    AllocateRequest allocateRequest2 =
        BuilderUtils.newAllocateRequest(attempt2.getAppAttemptId(), 0, 0F, null, null);
    AMResponse response2 = amService.allocate(allocateRequest2).getAMResponse();
    updatedNodes = response2.getUpdatedNodes();
    Assert.assertEquals(0, updatedNodes.size());

    syncNodeHeartbeat(nm4, true);

    // both AM's should get delta updated nodes
    allocateRequest1 =
        BuilderUtils.newAllocateRequest(
            attempt1.getAppAttemptId(), response1.getResponseId(), 0F, null, null);
    response1 = amService.allocate(allocateRequest1).getAMResponse();
    updatedNodes = response1.getUpdatedNodes();
    Assert.assertEquals(1, updatedNodes.size());
    nr = updatedNodes.iterator().next();
    Assert.assertEquals(nm4.getNodeId(), nr.getNodeId());
    Assert.assertEquals(NodeState.RUNNING, nr.getNodeState());

    allocateRequest2 =
        BuilderUtils.newAllocateRequest(
            attempt2.getAppAttemptId(), response2.getResponseId(), 0F, null, null);
    response2 = amService.allocate(allocateRequest2).getAMResponse();
    updatedNodes = response2.getUpdatedNodes();
    Assert.assertEquals(1, updatedNodes.size());
    nr = updatedNodes.iterator().next();
    Assert.assertEquals(nm4.getNodeId(), nr.getNodeId());
    Assert.assertEquals(NodeState.RUNNING, nr.getNodeState());

    // subsequent allocate calls should return no updated nodes
    allocateRequest2 =
        BuilderUtils.newAllocateRequest(
            attempt2.getAppAttemptId(), response2.getResponseId(), 0F, null, null);
    response2 = amService.allocate(allocateRequest2).getAMResponse();
    updatedNodes = response2.getUpdatedNodes();
    Assert.assertEquals(0, updatedNodes.size());

    // how to do the above for LOST node

  }
  /** Decommissioning using a pre-configured exclude hosts file */
  @Test
  public void testDecommissionWithExcludeHosts() throws Exception {
    Configuration conf = new Configuration();
    conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, hostFile.getAbsolutePath());

    writeToHostsFile("");
    final DrainDispatcher dispatcher = new DrainDispatcher();
    rm =
        new MockRM(conf) {
          @Override
          protected Dispatcher createDispatcher() {
            return dispatcher;
          }
        };
    rm.start();

    MockNM nm1 = rm.registerNode("host1:1234", 5120);
    MockNM nm2 = rm.registerNode("host2:5678", 10240);
    MockNM nm3 = rm.registerNode("localhost:4433", 1024);

    dispatcher.await();

    int metricCount = ClusterMetrics.getMetrics().getNumDecommisionedNMs();
    NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
    nodeHeartbeat = nm2.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
    dispatcher.await();

    // To test that IPs also work
    String ip = NetUtils.normalizeHostName("localhost");
    writeToHostsFile("host2", ip);

    rm.getNodesListManager().refreshNodes(conf);

    checkDecommissionedNMCount(rm, metricCount + 2);

    nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
    nodeHeartbeat = nm2.nodeHeartbeat(true);
    Assert.assertTrue(
        "The decommisioned metrics are not updated",
        NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction()));

    nodeHeartbeat = nm3.nodeHeartbeat(true);
    Assert.assertTrue(
        "The decommisioned metrics are not updated",
        NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction()));
    dispatcher.await();

    writeToHostsFile("");
    rm.getNodesListManager().refreshNodes(conf);

    nm3 = rm.registerNode("localhost:4433", 1024);
    dispatcher.await();
    nodeHeartbeat = nm3.nodeHeartbeat(true);
    dispatcher.await();
    Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
    // decommissined node is 1 since 1 node is rejoined after updating exclude
    // file
    checkDecommissionedNMCount(rm, metricCount + 1);
  }