/** * The test has been passed once the "terminateAfterString" has been seen. * * @param args Command line arguments for the runner * @param terminateAfterString the runner is searching the stdout and stderr for this string. as * soon as it appears, the test has passed * @param failOnPatterns The runner is searching stdout and stderr for the pattern (regexp) * specified here. If one appears, the test has failed * @param type Set the type of the runner * @param expectedReturnValue Expected return code from the runner. * @param checkLogForTerminateString If true, the runner checks also the log4j logger for the * terminate string */ protected void runWithArgs( String[] args, String terminateAfterString, String[] failOnPatterns, RunTypes type, int expectedReturnValue, boolean checkLogForTerminateString) { LOG.info("Running with args {}", Arrays.toString(args)); outContent = new ByteArrayOutputStream(); errContent = new ByteArrayOutputStream(); System.setOut(new PrintStream(outContent)); System.setErr(new PrintStream(errContent)); // we wait for at most three minutes final int START_TIMEOUT_SECONDS = 180; final long deadline = System.currentTimeMillis() + (START_TIMEOUT_SECONDS * 1000); Runner runner = new Runner(args, type, expectedReturnValue); runner.start(); boolean expectedStringSeen = false; boolean testPassedFromLog4j = false; do { sleep(1000); String outContentString = outContent.toString(); String errContentString = errContent.toString(); if (failOnPatterns != null) { for (String failOnString : failOnPatterns) { Pattern pattern = Pattern.compile(failOnString); if (pattern.matcher(outContentString).find() || pattern.matcher(errContentString).find()) { LOG.warn("Failing test. Output contained illegal string '" + failOnString + "'"); sendOutput(); // stopping runner. runner.sendStop(); Assert.fail("Output contained illegal string '" + failOnString + "'"); } } } // check output for the expected terminateAfterString. if (checkLogForTerminateString) { LoggingEvent matchedEvent = UtilsTest.getEventContainingString(terminateAfterString); if (matchedEvent != null) { testPassedFromLog4j = true; LOG.info("Found expected output in logging event {}", matchedEvent); } } if (outContentString.contains(terminateAfterString) || errContentString.contains(terminateAfterString) || testPassedFromLog4j) { expectedStringSeen = true; LOG.info("Found expected output in redirected streams"); // send "stop" command to command line interface LOG.info("RunWithArgs: request runner to stop"); runner.sendStop(); // wait for the thread to stop try { runner.join(30000); } catch (InterruptedException e) { LOG.warn("Interrupted while stopping runner", e); } LOG.warn("RunWithArgs runner stopped."); } else { // check if thread died if (!runner.isAlive()) { // leave loop: the runner died, so we can not expect new strings to show up. break; } } } while (runner.getRunnerError() == null && !expectedStringSeen && System.currentTimeMillis() < deadline); sendOutput(); if (runner.getRunnerError() != null) { // this lets the test fail. throw new RuntimeException("Runner failed", runner.getRunnerError()); } Assert.assertTrue( "During the timeout period of " + START_TIMEOUT_SECONDS + " seconds the " + "expected string did not show up", expectedStringSeen); LOG.info("Test was successful"); }
/** Test TaskManager failure */ @Test(timeout = 100000) // timeout after 100 seconds public void testTaskManagerFailure() { LOG.info("Starting testTaskManagerFailure()"); Runner runner = startWithArgs( new String[] { "-j", flinkUberjar.getAbsolutePath(), "-t", flinkLibFolder.getAbsolutePath(), "-n", "1", "-jm", "768", "-tm", "1024", "-nm", "customName", "-Dfancy-configuration-value=veryFancy", "-Dyarn.maximum-failed-containers=3" }, "Number of connected TaskManagers changed to 1. Slots available: 1", RunTypes.YARN_SESSION); Assert.assertEquals(2, getRunningContainers()); // ------------------------ Test if JobManager web interface is accessible ------- try { YarnClient yc = YarnClient.createYarnClient(); yc.init(yarnConfiguration); yc.start(); List<ApplicationReport> apps = yc.getApplications(EnumSet.of(YarnApplicationState.RUNNING)); Assert.assertEquals(1, apps.size()); // Only one running ApplicationReport app = apps.get(0); Assert.assertEquals("customName", app.getName()); String url = app.getTrackingUrl(); if (!url.endsWith("/")) { url += "/"; } if (!url.startsWith("http://")) { url = "http://" + url; } LOG.info("Got application URL from YARN {}", url); String response = TestBaseUtils.getFromHTTP(url + "taskmanagers/"); JSONObject parsedTMs = new JSONObject(response); JSONArray taskManagers = parsedTMs.getJSONArray("taskmanagers"); Assert.assertNotNull(taskManagers); Assert.assertEquals(1, taskManagers.length()); Assert.assertEquals(1, taskManagers.getJSONObject(0).getInt("slotsNumber")); // get the configuration from webinterface & check if the dynamic properties from YARN show up // there. String jsonConfig = TestBaseUtils.getFromHTTP(url + "jobmanager/config"); JSONArray parsed = new JSONArray(jsonConfig); Map<String, String> parsedConfig = WebMonitorUtils.fromKeyValueJsonArray(parsed); Assert.assertEquals("veryFancy", parsedConfig.get("fancy-configuration-value")); Assert.assertEquals("3", parsedConfig.get("yarn.maximum-failed-containers")); // -------------- FLINK-1902: check if jobmanager hostname/port are shown in web interface // first, get the hostname/port String oC = outContent.toString(); Pattern p = Pattern.compile("Flink JobManager is now running on ([a-zA-Z0-9.-]+):([0-9]+)"); Matcher matches = p.matcher(oC); String hostname = null; String port = null; while (matches.find()) { hostname = matches.group(1).toLowerCase(); port = matches.group(2); } LOG.info("Extracted hostname:port: {} {}", hostname, port); Assert.assertEquals( "unable to find hostname in " + parsed, hostname, parsedConfig.get(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY)); Assert.assertEquals( "unable to find port in " + parsed, port, parsedConfig.get(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY)); // test logfile access String logs = TestBaseUtils.getFromHTTP(url + "jobmanager/log"); Assert.assertTrue(logs.contains("Starting YARN ApplicationMaster/JobManager (Version")); } catch (Throwable e) { LOG.warn("Error while running test", e); Assert.fail(e.getMessage()); } // ------------------------ Kill container with TaskManager ------- // find container id of taskManager: ContainerId taskManagerContainer = null; NodeManager nodeManager = null; UserGroupInformation remoteUgi = null; NMTokenIdentifier nmIdent = null; try { remoteUgi = UserGroupInformation.getCurrentUser(); } catch (IOException e) { LOG.warn("Unable to get curr user", e); Assert.fail(); } for (int nmId = 0; nmId < NUM_NODEMANAGERS; nmId++) { NodeManager nm = yarnCluster.getNodeManager(nmId); ConcurrentMap<ContainerId, Container> containers = nm.getNMContext().getContainers(); for (Map.Entry<ContainerId, Container> entry : containers.entrySet()) { String command = Joiner.on(" ").join(entry.getValue().getLaunchContext().getCommands()); if (command.contains(YarnTaskManagerRunner.class.getSimpleName())) { taskManagerContainer = entry.getKey(); nodeManager = nm; nmIdent = new NMTokenIdentifier(taskManagerContainer.getApplicationAttemptId(), null, "", 0); // allow myself to do stuff with the container // remoteUgi.addCredentials(entry.getValue().getCredentials()); remoteUgi.addTokenIdentifier(nmIdent); } } sleep(500); } Assert.assertNotNull("Unable to find container with TaskManager", taskManagerContainer); Assert.assertNotNull("Illegal state", nodeManager); List<ContainerId> toStop = new LinkedList<ContainerId>(); toStop.add(taskManagerContainer); StopContainersRequest scr = StopContainersRequest.newInstance(toStop); try { nodeManager.getNMContext().getContainerManager().stopContainers(scr); } catch (Throwable e) { LOG.warn("Error stopping container", e); Assert.fail("Error stopping container: " + e.getMessage()); } // stateful termination check: // wait until we saw a container being killed and AFTERWARDS a new one launched boolean ok = false; do { LOG.debug("Waiting for correct order of events. Output: {}", errContent.toString()); String o = errContent.toString(); int killedOff = o.indexOf("Container killed by the ApplicationMaster"); if (killedOff != -1) { o = o.substring(killedOff); ok = o.indexOf("Launching container") > 0; } sleep(1000); } while (!ok); // send "stop" command to command line interface runner.sendStop(); // wait for the thread to stop try { runner.join(1000); } catch (InterruptedException e) { LOG.warn("Interrupted while stopping runner", e); } LOG.warn("stopped"); // ----------- Send output to logger System.setOut(originalStdout); System.setErr(originalStderr); String oC = outContent.toString(); String eC = errContent.toString(); LOG.info("Sending stdout content through logger: \n\n{}\n\n", oC); LOG.info("Sending stderr content through logger: \n\n{}\n\n", eC); // ------ Check if everything happened correctly Assert.assertTrue( "Expect to see failed container", eC.contains("New messages from the YARN cluster")); Assert.assertTrue( "Expect to see failed container", eC.contains("Container killed by the ApplicationMaster")); Assert.assertTrue( "Expect to see new container started", eC.contains("Launching container") && eC.contains("on host")); // cleanup auth for the subsequent tests. remoteUgi.getTokenIdentifiers().remove(nmIdent); LOG.info("Finished testTaskManagerFailure()"); }