Beispiel #1
0
  /**
   * The test has been passed once the "terminateAfterString" has been seen.
   *
   * @param args Command line arguments for the runner
   * @param terminateAfterString the runner is searching the stdout and stderr for this string. as
   *     soon as it appears, the test has passed
   * @param failOnPatterns The runner is searching stdout and stderr for the pattern (regexp)
   *     specified here. If one appears, the test has failed
   * @param type Set the type of the runner
   * @param expectedReturnValue Expected return code from the runner.
   * @param checkLogForTerminateString If true, the runner checks also the log4j logger for the
   *     terminate string
   */
  protected void runWithArgs(
      String[] args,
      String terminateAfterString,
      String[] failOnPatterns,
      RunTypes type,
      int expectedReturnValue,
      boolean checkLogForTerminateString) {
    LOG.info("Running with args {}", Arrays.toString(args));

    outContent = new ByteArrayOutputStream();
    errContent = new ByteArrayOutputStream();
    System.setOut(new PrintStream(outContent));
    System.setErr(new PrintStream(errContent));

    // we wait for at most three minutes
    final int START_TIMEOUT_SECONDS = 180;
    final long deadline = System.currentTimeMillis() + (START_TIMEOUT_SECONDS * 1000);

    Runner runner = new Runner(args, type, expectedReturnValue);
    runner.start();

    boolean expectedStringSeen = false;
    boolean testPassedFromLog4j = false;
    do {
      sleep(1000);
      String outContentString = outContent.toString();
      String errContentString = errContent.toString();
      if (failOnPatterns != null) {
        for (String failOnString : failOnPatterns) {
          Pattern pattern = Pattern.compile(failOnString);
          if (pattern.matcher(outContentString).find()
              || pattern.matcher(errContentString).find()) {
            LOG.warn("Failing test. Output contained illegal string '" + failOnString + "'");
            sendOutput();
            // stopping runner.
            runner.sendStop();
            Assert.fail("Output contained illegal string '" + failOnString + "'");
          }
        }
      }
      // check output for the expected terminateAfterString.
      if (checkLogForTerminateString) {
        LoggingEvent matchedEvent = UtilsTest.getEventContainingString(terminateAfterString);
        if (matchedEvent != null) {
          testPassedFromLog4j = true;
          LOG.info("Found expected output in logging event {}", matchedEvent);
        }
      }

      if (outContentString.contains(terminateAfterString)
          || errContentString.contains(terminateAfterString)
          || testPassedFromLog4j) {
        expectedStringSeen = true;
        LOG.info("Found expected output in redirected streams");
        // send "stop" command to command line interface
        LOG.info("RunWithArgs: request runner to stop");
        runner.sendStop();
        // wait for the thread to stop
        try {
          runner.join(30000);
        } catch (InterruptedException e) {
          LOG.warn("Interrupted while stopping runner", e);
        }
        LOG.warn("RunWithArgs runner stopped.");
      } else {
        // check if thread died
        if (!runner.isAlive()) {
          // leave loop: the runner died, so we can not expect new strings to show up.
          break;
        }
      }
    } while (runner.getRunnerError() == null
        && !expectedStringSeen
        && System.currentTimeMillis() < deadline);

    sendOutput();

    if (runner.getRunnerError() != null) {
      // this lets the test fail.
      throw new RuntimeException("Runner failed", runner.getRunnerError());
    }
    Assert.assertTrue(
        "During the timeout period of "
            + START_TIMEOUT_SECONDS
            + " seconds the "
            + "expected string did not show up",
        expectedStringSeen);

    LOG.info("Test was successful");
  }
  /** Test TaskManager failure */
  @Test(timeout = 100000) // timeout after 100 seconds
  public void testTaskManagerFailure() {
    LOG.info("Starting testTaskManagerFailure()");
    Runner runner =
        startWithArgs(
            new String[] {
              "-j",
              flinkUberjar.getAbsolutePath(),
              "-t",
              flinkLibFolder.getAbsolutePath(),
              "-n",
              "1",
              "-jm",
              "768",
              "-tm",
              "1024",
              "-nm",
              "customName",
              "-Dfancy-configuration-value=veryFancy",
              "-Dyarn.maximum-failed-containers=3"
            },
            "Number of connected TaskManagers changed to 1. Slots available: 1",
            RunTypes.YARN_SESSION);

    Assert.assertEquals(2, getRunningContainers());

    // ------------------------ Test if JobManager web interface is accessible -------
    try {
      YarnClient yc = YarnClient.createYarnClient();
      yc.init(yarnConfiguration);
      yc.start();
      List<ApplicationReport> apps = yc.getApplications(EnumSet.of(YarnApplicationState.RUNNING));
      Assert.assertEquals(1, apps.size()); // Only one running
      ApplicationReport app = apps.get(0);
      Assert.assertEquals("customName", app.getName());
      String url = app.getTrackingUrl();
      if (!url.endsWith("/")) {
        url += "/";
      }
      if (!url.startsWith("http://")) {
        url = "http://" + url;
      }
      LOG.info("Got application URL from YARN {}", url);

      String response = TestBaseUtils.getFromHTTP(url + "taskmanagers/");
      JSONObject parsedTMs = new JSONObject(response);
      JSONArray taskManagers = parsedTMs.getJSONArray("taskmanagers");
      Assert.assertNotNull(taskManagers);
      Assert.assertEquals(1, taskManagers.length());
      Assert.assertEquals(1, taskManagers.getJSONObject(0).getInt("slotsNumber"));

      // get the configuration from webinterface & check if the dynamic properties from YARN show up
      // there.
      String jsonConfig = TestBaseUtils.getFromHTTP(url + "jobmanager/config");
      JSONArray parsed = new JSONArray(jsonConfig);
      Map<String, String> parsedConfig = WebMonitorUtils.fromKeyValueJsonArray(parsed);

      Assert.assertEquals("veryFancy", parsedConfig.get("fancy-configuration-value"));
      Assert.assertEquals("3", parsedConfig.get("yarn.maximum-failed-containers"));

      // -------------- FLINK-1902: check if jobmanager hostname/port are shown in web interface
      // first, get the hostname/port
      String oC = outContent.toString();
      Pattern p = Pattern.compile("Flink JobManager is now running on ([a-zA-Z0-9.-]+):([0-9]+)");
      Matcher matches = p.matcher(oC);
      String hostname = null;
      String port = null;
      while (matches.find()) {
        hostname = matches.group(1).toLowerCase();
        port = matches.group(2);
      }
      LOG.info("Extracted hostname:port: {} {}", hostname, port);

      Assert.assertEquals(
          "unable to find hostname in " + parsed,
          hostname,
          parsedConfig.get(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY));
      Assert.assertEquals(
          "unable to find port in " + parsed,
          port,
          parsedConfig.get(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY));

      // test logfile access
      String logs = TestBaseUtils.getFromHTTP(url + "jobmanager/log");
      Assert.assertTrue(logs.contains("Starting YARN ApplicationMaster/JobManager (Version"));
    } catch (Throwable e) {
      LOG.warn("Error while running test", e);
      Assert.fail(e.getMessage());
    }

    // ------------------------ Kill container with TaskManager  -------

    // find container id of taskManager:
    ContainerId taskManagerContainer = null;
    NodeManager nodeManager = null;
    UserGroupInformation remoteUgi = null;
    NMTokenIdentifier nmIdent = null;
    try {
      remoteUgi = UserGroupInformation.getCurrentUser();
    } catch (IOException e) {
      LOG.warn("Unable to get curr user", e);
      Assert.fail();
    }
    for (int nmId = 0; nmId < NUM_NODEMANAGERS; nmId++) {
      NodeManager nm = yarnCluster.getNodeManager(nmId);
      ConcurrentMap<ContainerId, Container> containers = nm.getNMContext().getContainers();
      for (Map.Entry<ContainerId, Container> entry : containers.entrySet()) {
        String command = Joiner.on(" ").join(entry.getValue().getLaunchContext().getCommands());
        if (command.contains(YarnTaskManagerRunner.class.getSimpleName())) {
          taskManagerContainer = entry.getKey();
          nodeManager = nm;
          nmIdent =
              new NMTokenIdentifier(taskManagerContainer.getApplicationAttemptId(), null, "", 0);
          // allow myself to do stuff with the container
          // remoteUgi.addCredentials(entry.getValue().getCredentials());
          remoteUgi.addTokenIdentifier(nmIdent);
        }
      }
      sleep(500);
    }

    Assert.assertNotNull("Unable to find container with TaskManager", taskManagerContainer);
    Assert.assertNotNull("Illegal state", nodeManager);

    List<ContainerId> toStop = new LinkedList<ContainerId>();
    toStop.add(taskManagerContainer);
    StopContainersRequest scr = StopContainersRequest.newInstance(toStop);

    try {
      nodeManager.getNMContext().getContainerManager().stopContainers(scr);
    } catch (Throwable e) {
      LOG.warn("Error stopping container", e);
      Assert.fail("Error stopping container: " + e.getMessage());
    }

    // stateful termination check:
    // wait until we saw a container being killed and AFTERWARDS a new one launched
    boolean ok = false;
    do {
      LOG.debug("Waiting for correct order of events. Output: {}", errContent.toString());

      String o = errContent.toString();
      int killedOff = o.indexOf("Container killed by the ApplicationMaster");
      if (killedOff != -1) {
        o = o.substring(killedOff);
        ok = o.indexOf("Launching container") > 0;
      }
      sleep(1000);
    } while (!ok);

    // send "stop" command to command line interface
    runner.sendStop();
    // wait for the thread to stop
    try {
      runner.join(1000);
    } catch (InterruptedException e) {
      LOG.warn("Interrupted while stopping runner", e);
    }
    LOG.warn("stopped");

    // ----------- Send output to logger
    System.setOut(originalStdout);
    System.setErr(originalStderr);
    String oC = outContent.toString();
    String eC = errContent.toString();
    LOG.info("Sending stdout content through logger: \n\n{}\n\n", oC);
    LOG.info("Sending stderr content through logger: \n\n{}\n\n", eC);

    // ------ Check if everything happened correctly
    Assert.assertTrue(
        "Expect to see failed container", eC.contains("New messages from the YARN cluster"));
    Assert.assertTrue(
        "Expect to see failed container", eC.contains("Container killed by the ApplicationMaster"));
    Assert.assertTrue(
        "Expect to see new container started",
        eC.contains("Launching container") && eC.contains("on host"));

    // cleanup auth for the subsequent tests.
    remoteUgi.getTokenIdentifiers().remove(nmIdent);

    LOG.info("Finished testTaskManagerFailure()");
  }