/** * Test serialization of the container manager with mock execution layer. * * @throws Exception */ private void testContainerManager(StorageAgent agent) throws Exception { FileUtils.deleteDirectory(new File(testMeta.dir)); // clean any state from previous run LogicalPlan dag = new LogicalPlan(); dag.setAttribute(LogicalPlan.APPLICATION_PATH, testMeta.dir); dag.setAttribute(OperatorContext.STORAGE_AGENT, agent); StatsListeningOperator o1 = dag.addOperator("o1", StatsListeningOperator.class); FSRecoveryHandler recoveryHandler = new FSRecoveryHandler(dag.assertAppPath(), new Configuration(false)); StreamingContainerManager scm = StreamingContainerManager.getInstance(recoveryHandler, dag, false); File expFile = new File(recoveryHandler.getDir(), FSRecoveryHandler.FILE_SNAPSHOT); Assert.assertTrue("snapshot file " + expFile, expFile.exists()); PhysicalPlan plan = scm.getPhysicalPlan(); assertEquals("number required containers", 1, plan.getContainers().size()); PTOperator o1p1 = plan.getOperators(dag.getMeta(o1)).get(0); @SuppressWarnings( "UnusedAssignment") /* sneaky: the constructor does some changes to the container */ MockContainer mc = new MockContainer(scm, o1p1.getContainer()); PTContainer originalContainer = o1p1.getContainer(); Assert.assertNotNull(o1p1.getContainer().bufferServerAddress); assertEquals(PTContainer.State.ACTIVE, o1p1.getContainer().getState()); assertEquals("state " + o1p1, PTOperator.State.PENDING_DEPLOY, o1p1.getState()); // test restore initial snapshot + log dag = new LogicalPlan(); dag.setAttribute(LogicalPlan.APPLICATION_PATH, testMeta.dir); scm = StreamingContainerManager.getInstance( new FSRecoveryHandler(dag.assertAppPath(), new Configuration(false)), dag, false); dag = scm.getLogicalPlan(); plan = scm.getPhysicalPlan(); o1p1 = plan.getOperators(dag.getOperatorMeta("o1")).get(0); assertEquals("post restore state " + o1p1, PTOperator.State.PENDING_DEPLOY, o1p1.getState()); o1 = (StatsListeningOperator) o1p1.getOperatorMeta().getOperator(); assertEquals( "containerId", originalContainer.getExternalId(), o1p1.getContainer().getExternalId()); assertEquals("stats listener", 1, o1p1.statsListeners.size()); assertEquals("number stats calls", 0, o1.processStatsCnt); // stats are not logged assertEquals("post restore 1", PTContainer.State.ALLOCATED, o1p1.getContainer().getState()); assertEquals( "post restore 1", originalContainer.bufferServerAddress, o1p1.getContainer().bufferServerAddress); StreamingContainerAgent sca = scm.getContainerAgent(originalContainer.getExternalId()); Assert.assertNotNull("allocated container restored " + originalContainer, sca); assertEquals( "memory usage allocated container", (int) OperatorContext.MEMORY_MB.defaultValue, sca.container.getAllocatedMemoryMB()); // YARN-1490 - simulate container terminated on AM recovery scm.scheduleContainerRestart(originalContainer.getExternalId()); assertEquals("memory usage of failed container", 0, sca.container.getAllocatedMemoryMB()); Checkpoint firstCheckpoint = new Checkpoint(3, 0, 0); mc = new MockContainer(scm, o1p1.getContainer()); checkpoint(scm, o1p1, firstCheckpoint); mc.stats(o1p1.getId()) .deployState(OperatorHeartbeat.DeployState.ACTIVE) .currentWindowId(3) .checkpointWindowId(3); mc.sendHeartbeat(); assertEquals("state " + o1p1, PTOperator.State.ACTIVE, o1p1.getState()); // logical plan modification triggers snapshot CreateOperatorRequest cor = new CreateOperatorRequest(); cor.setOperatorFQCN(GenericTestOperator.class.getName()); cor.setOperatorName("o2"); CreateStreamRequest csr = new CreateStreamRequest(); csr.setSourceOperatorName("o1"); csr.setSourceOperatorPortName("outport"); csr.setSinkOperatorName("o2"); csr.setSinkOperatorPortName("inport1"); FutureTask<?> lpmf = scm.logicalPlanModification(Lists.newArrayList(cor, csr)); while (!lpmf.isDone()) { scm.monitorHeartbeat(); } Assert.assertNull(lpmf.get()); // unmask exception, if any Assert.assertSame("dag references", dag, scm.getLogicalPlan()); assertEquals("number operators after plan modification", 2, dag.getAllOperators().size()); // set operator state triggers journal write o1p1.setState(PTOperator.State.INACTIVE); Checkpoint offlineCheckpoint = new Checkpoint(10, 0, 0); // write checkpoint while AM is out, // it needs to be picked up as part of restore checkpoint(scm, o1p1, offlineCheckpoint); // test restore dag = new LogicalPlan(); dag.setAttribute(LogicalPlan.APPLICATION_PATH, testMeta.dir); scm = StreamingContainerManager.getInstance( new FSRecoveryHandler(dag.assertAppPath(), new Configuration(false)), dag, false); Assert.assertNotSame("dag references", dag, scm.getLogicalPlan()); assertEquals( "number operators after restore", 2, scm.getLogicalPlan().getAllOperators().size()); dag = scm.getLogicalPlan(); plan = scm.getPhysicalPlan(); o1p1 = plan.getOperators(dag.getOperatorMeta("o1")).get(0); assertEquals("post restore state " + o1p1, PTOperator.State.INACTIVE, o1p1.getState()); o1 = (StatsListeningOperator) o1p1.getOperatorMeta().getOperator(); assertEquals("stats listener", 1, o1p1.statsListeners.size()); assertEquals("number stats calls post restore", 1, o1.processStatsCnt); assertEquals("post restore 1", PTContainer.State.ACTIVE, o1p1.getContainer().getState()); assertEquals( "post restore 1", originalContainer.bufferServerAddress, o1p1.getContainer().bufferServerAddress); // offline checkpoint detection assertEquals( "checkpoints after recovery", Lists.newArrayList(firstCheckpoint, offlineCheckpoint), o1p1.checkpoints); }
private void testRestartApp(StorageAgent agent, String appPath1) throws Exception { FileUtils.deleteDirectory(new File(testMeta.dir)); // clean any state from previous run String appId1 = "app1"; String appId2 = "app2"; String appPath2 = testMeta.dir + "/" + appId2; LogicalPlan dag = new LogicalPlan(); dag.setAttribute(LogicalPlan.APPLICATION_ID, appId1); dag.setAttribute(LogicalPlan.APPLICATION_PATH, appPath1); dag.setAttribute(LogicalPlan.APPLICATION_ATTEMPT_ID, 1); dag.setAttribute(OperatorContext.STORAGE_AGENT, agent); dag.addOperator("o1", StatsListeningOperator.class); FSRecoveryHandler recoveryHandler = new FSRecoveryHandler(dag.assertAppPath(), new Configuration(false)); StreamingContainerManager.getInstance(recoveryHandler, dag, false); // test restore initial snapshot + log dag = new LogicalPlan(); dag.setAttribute(LogicalPlan.APPLICATION_PATH, appPath1); StreamingContainerManager scm = StreamingContainerManager.getInstance( new FSRecoveryHandler(dag.assertAppPath(), new Configuration(false)), dag, false); PhysicalPlan plan = scm.getPhysicalPlan(); dag = plan.getLogicalPlan(); // original plan Assert.assertNotNull("operator", dag.getOperatorMeta("o1")); PTOperator o1p1 = plan.getOperators(dag.getOperatorMeta("o1")).get(0); long[] ids = new FSStorageAgent(appPath1 + "/" + LogicalPlan.SUBDIR_CHECKPOINTS, new Configuration()) .getWindowIds(o1p1.getId()); Assert.assertArrayEquals(new long[] {o1p1.getRecoveryCheckpoint().getWindowId()}, ids); Assert.assertNull(o1p1.getContainer().getExternalId()); // trigger journal write o1p1.getContainer().setExternalId("cid1"); scm.writeJournal(o1p1.getContainer().getSetContainerState()); dag = new LogicalPlan(); dag.setAttribute(LogicalPlan.APPLICATION_PATH, appPath2); dag.setAttribute(LogicalPlan.APPLICATION_ID, appId2); StramClient sc = new StramClient(new Configuration(), dag); try { sc.start(); sc.copyInitialState(new Path(appPath1)); } finally { sc.stop(); } scm = StreamingContainerManager.getInstance( new FSRecoveryHandler(dag.assertAppPath(), new Configuration(false)), dag, false); plan = scm.getPhysicalPlan(); dag = plan.getLogicalPlan(); assertEquals("modified appId", appId2, dag.getValue(LogicalPlan.APPLICATION_ID)); assertEquals("modified appPath", appPath2, dag.getValue(LogicalPlan.APPLICATION_PATH)); Assert.assertNotNull("operator", dag.getOperatorMeta("o1")); o1p1 = plan.getOperators(dag.getOperatorMeta("o1")).get(0); assertEquals("journal copied", "cid1", o1p1.getContainer().getExternalId()); ids = new FSStorageAgent(appPath2 + "/" + LogicalPlan.SUBDIR_CHECKPOINTS, new Configuration()) .getWindowIds(o1p1.getId()); Assert.assertArrayEquals( "checkpoints copied", new long[] {o1p1.getRecoveryCheckpoint().getWindowId()}, ids); }
/** * Launch application for the dag represented by this client. * * @throws YarnException * @throws IOException */ public void startApplication() throws YarnException, IOException { Class<?>[] defaultClasses; if (applicationType.equals(YARN_APPLICATION_TYPE)) { // TODO restrict the security check to only check if security is enabled for webservices. if (UserGroupInformation.isSecurityEnabled()) { defaultClasses = DATATORRENT_SECURITY_CLASSES; } else { defaultClasses = DATATORRENT_CLASSES; } } else { throw new IllegalStateException(applicationType + " is not a valid application type."); } LinkedHashSet<String> localJarFiles = findJars(dag, defaultClasses); if (resources != null) { localJarFiles.addAll(resources); } YarnClusterMetrics clusterMetrics = yarnClient.getYarnClusterMetrics(); LOG.info( "Got Cluster metric info from ASM" + ", numNodeManagers=" + clusterMetrics.getNumNodeManagers()); // GetClusterNodesRequest clusterNodesReq = Records.newRecord(GetClusterNodesRequest.class); // GetClusterNodesResponse clusterNodesResp = // rmClient.clientRM.getClusterNodes(clusterNodesReq); // LOG.info("Got Cluster node info from ASM"); // for (NodeReport node : clusterNodesResp.getNodeReports()) { // LOG.info("Got node report from ASM for" // + ", nodeId=" + node.getNodeId() // + ", nodeAddress" + node.getHttpAddress() // + ", nodeRackName" + node.getRackName() // + ", nodeNumContainers" + node.getNumContainers() // + ", nodeHealthStatus" + node.getHealthReport()); // } List<QueueUserACLInfo> listAclInfo = yarnClient.getQueueAclsInfo(); for (QueueUserACLInfo aclInfo : listAclInfo) { for (QueueACL userAcl : aclInfo.getUserAcls()) { LOG.info( "User ACL Info for Queue" + ", queueName=" + aclInfo.getQueueName() + ", userAcl=" + userAcl.name()); } } // Get a new application id YarnClientApplication newApp = yarnClient.createApplication(); appId = newApp.getNewApplicationResponse().getApplicationId(); // Dump out information about cluster capability as seen by the resource manager int maxMem = newApp.getNewApplicationResponse().getMaximumResourceCapability().getMemory(); LOG.info("Max mem capabililty of resources in this cluster " + maxMem); int amMemory = dag.getMasterMemoryMB(); if (amMemory > maxMem) { LOG.info( "AM memory specified above max threshold of cluster. Using max value." + ", specified=" + amMemory + ", max=" + maxMem); amMemory = maxMem; } if (dag.getAttributes().get(LogicalPlan.APPLICATION_ID) == null) { dag.setAttribute(LogicalPlan.APPLICATION_ID, appId.toString()); } // Create launch context for app master LOG.info("Setting up application submission context for ASM"); ApplicationSubmissionContext appContext = Records.newRecord(ApplicationSubmissionContext.class); // set the application id appContext.setApplicationId(appId); // set the application name appContext.setApplicationName(dag.getValue(LogicalPlan.APPLICATION_NAME)); appContext.setApplicationType(this.applicationType); if (YARN_APPLICATION_TYPE.equals(this.applicationType)) { // appContext.setMaxAppAttempts(1); // no retries until Stram is HA } // Set up the container launch context for the application master ContainerLaunchContext amContainer = Records.newRecord(ContainerLaunchContext.class); // Setup security tokens // If security is enabled get ResourceManager and NameNode delegation tokens. // Set these tokens on the container so that they are sent as part of application submission. // This also sets them up for renewal by ResourceManager. The NameNode delegation rmToken // is also used by ResourceManager to fetch the jars from HDFS and set them up for the // application master launch. if (UserGroupInformation.isSecurityEnabled()) { Credentials credentials = new Credentials(); String tokenRenewer = conf.get(YarnConfiguration.RM_PRINCIPAL); if (tokenRenewer == null || tokenRenewer.length() == 0) { throw new IOException("Can't get Master Kerberos principal for the RM to use as renewer"); } // For now, only getting tokens for the default file-system. FileSystem fs = StramClientUtils.newFileSystemInstance(conf); try { final Token<?> tokens[] = fs.addDelegationTokens(tokenRenewer, credentials); if (tokens != null) { for (Token<?> token : tokens) { LOG.info("Got dt for " + fs.getUri() + "; " + token); } } } finally { fs.close(); } addRMDelegationToken(tokenRenewer, credentials); DataOutputBuffer dob = new DataOutputBuffer(); credentials.writeTokenStorageToStream(dob); ByteBuffer fsTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength()); amContainer.setTokens(fsTokens); } // set local resources for the application master // local files or archives as needed // In this scenario, the jar file for the application master is part of the local resources Map<String, LocalResource> localResources = new HashMap<String, LocalResource>(); // copy required jar files to dfs, to be localized for containers FileSystem fs = StramClientUtils.newFileSystemInstance(conf); try { Path appsBasePath = new Path(StramClientUtils.getDTDFSRootDir(fs, conf), StramClientUtils.SUBDIR_APPS); Path appPath = new Path(appsBasePath, appId.toString()); String libJarsCsv = copyFromLocal(fs, appPath, localJarFiles.toArray(new String[] {})); LOG.info("libjars: {}", libJarsCsv); dag.getAttributes().put(LogicalPlan.LIBRARY_JARS, libJarsCsv); LaunchContainerRunnable.addFilesToLocalResources( LocalResourceType.FILE, libJarsCsv, localResources, fs); if (archives != null) { String[] localFiles = archives.split(","); String archivesCsv = copyFromLocal(fs, appPath, localFiles); LOG.info("archives: {}", archivesCsv); dag.getAttributes().put(LogicalPlan.ARCHIVES, archivesCsv); LaunchContainerRunnable.addFilesToLocalResources( LocalResourceType.ARCHIVE, archivesCsv, localResources, fs); } if (files != null) { String[] localFiles = files.split(","); String filesCsv = copyFromLocal(fs, appPath, localFiles); LOG.info("files: {}", filesCsv); dag.getAttributes().put(LogicalPlan.FILES, filesCsv); LaunchContainerRunnable.addFilesToLocalResources( LocalResourceType.FILE, filesCsv, localResources, fs); } dag.getAttributes().put(LogicalPlan.APPLICATION_PATH, appPath.toString()); if (dag.getAttributes().get(OperatorContext.STORAGE_AGENT) == null) { /* which would be the most likely case */ Path checkpointPath = new Path(appPath, LogicalPlan.SUBDIR_CHECKPOINTS); // use conf client side to pickup any proxy settings from dt-site.xml dag.setAttribute( OperatorContext.STORAGE_AGENT, new FSStorageAgent(checkpointPath.toString(), conf)); } if (dag.getAttributes().get(LogicalPlan.CONTAINER_OPTS_CONFIGURATOR) == null) { dag.setAttribute( LogicalPlan.CONTAINER_OPTS_CONFIGURATOR, new BasicContainerOptConfigurator()); } // Set the log4j properties if needed if (!log4jPropFile.isEmpty()) { Path log4jSrc = new Path(log4jPropFile); Path log4jDst = new Path(appPath, "log4j.props"); fs.copyFromLocalFile(false, true, log4jSrc, log4jDst); FileStatus log4jFileStatus = fs.getFileStatus(log4jDst); LocalResource log4jRsrc = Records.newRecord(LocalResource.class); log4jRsrc.setType(LocalResourceType.FILE); log4jRsrc.setVisibility(LocalResourceVisibility.APPLICATION); log4jRsrc.setResource(ConverterUtils.getYarnUrlFromURI(log4jDst.toUri())); log4jRsrc.setTimestamp(log4jFileStatus.getModificationTime()); log4jRsrc.setSize(log4jFileStatus.getLen()); localResources.put("log4j.properties", log4jRsrc); } if (originalAppId != null) { Path origAppPath = new Path(appsBasePath, this.originalAppId); LOG.info("Restart from {}", origAppPath); copyInitialState(origAppPath); } // push logical plan to DFS location Path cfgDst = new Path(appPath, LogicalPlan.SER_FILE_NAME); FSDataOutputStream outStream = fs.create(cfgDst, true); LogicalPlan.write(this.dag, outStream); outStream.close(); Path launchConfigDst = new Path(appPath, LogicalPlan.LAUNCH_CONFIG_FILE_NAME); outStream = fs.create(launchConfigDst, true); conf.writeXml(outStream); outStream.close(); FileStatus topologyFileStatus = fs.getFileStatus(cfgDst); LocalResource topologyRsrc = Records.newRecord(LocalResource.class); topologyRsrc.setType(LocalResourceType.FILE); topologyRsrc.setVisibility(LocalResourceVisibility.APPLICATION); topologyRsrc.setResource(ConverterUtils.getYarnUrlFromURI(cfgDst.toUri())); topologyRsrc.setTimestamp(topologyFileStatus.getModificationTime()); topologyRsrc.setSize(topologyFileStatus.getLen()); localResources.put(LogicalPlan.SER_FILE_NAME, topologyRsrc); // Set local resource info into app master container launch context amContainer.setLocalResources(localResources); // Set the necessary security tokens as needed // amContainer.setContainerTokens(containerToken); // Set the env variables to be setup in the env where the application master will be run LOG.info("Set the environment for the application master"); Map<String, String> env = new HashMap<String, String>(); // Add application jar(s) location to classpath // At some point we should not be required to add // the hadoop specific classpaths to the env. // It should be provided out of the box. // For now setting all required classpaths including // the classpath to "." for the application jar(s) // including ${CLASSPATH} will duplicate the class path in app master, removing it for now // StringBuilder classPathEnv = new StringBuilder("${CLASSPATH}:./*"); StringBuilder classPathEnv = new StringBuilder("./*"); String classpath = conf.get(YarnConfiguration.YARN_APPLICATION_CLASSPATH); for (String c : StringUtils.isBlank(classpath) ? YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH : classpath.split(",")) { if (c.equals("$HADOOP_CLIENT_CONF_DIR")) { // SPOI-2501 continue; } classPathEnv.append(':'); classPathEnv.append(c.trim()); } env.put("CLASSPATH", classPathEnv.toString()); // propagate to replace node managers user name (effective in non-secure mode) env.put("HADOOP_USER_NAME", UserGroupInformation.getLoginUser().getUserName()); amContainer.setEnvironment(env); // Set the necessary command to execute the application master ArrayList<CharSequence> vargs = new ArrayList<CharSequence>(30); // Set java executable command LOG.info("Setting up app master command"); vargs.add(javaCmd); if (dag.isDebug()) { vargs.add("-agentlib:jdwp=transport=dt_socket,server=y,suspend=n"); } // Set Xmx based on am memory size // default heap size 75% of total memory vargs.add("-Xmx" + (amMemory * 3 / 4) + "m"); vargs.add("-XX:+HeapDumpOnOutOfMemoryError"); vargs.add("-XX:HeapDumpPath=/tmp/dt-heap-" + appId.getId() + ".bin"); vargs.add("-Dhadoop.root.logger=" + (dag.isDebug() ? "DEBUG" : "INFO") + ",RFA"); vargs.add("-Dhadoop.log.dir=" + ApplicationConstants.LOG_DIR_EXPANSION_VAR); vargs.add(String.format("-D%s=%s", StreamingContainer.PROP_APP_PATH, dag.assertAppPath())); if (dag.isDebug()) { vargs.add("-Dlog4j.debug=true"); } String loggersLevel = conf.get(DTLoggerFactory.DT_LOGGERS_LEVEL); if (loggersLevel != null) { vargs.add(String.format("-D%s=%s", DTLoggerFactory.DT_LOGGERS_LEVEL, loggersLevel)); } vargs.add(StreamingAppMaster.class.getName()); vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stdout"); vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stderr"); // Get final command StringBuilder command = new StringBuilder(9 * vargs.size()); for (CharSequence str : vargs) { command.append(str).append(" "); } LOG.info("Completed setting up app master command " + command.toString()); List<String> commands = new ArrayList<String>(); commands.add(command.toString()); amContainer.setCommands(commands); // Set up resource type requirements // For now, only memory is supported so we set memory requirements Resource capability = Records.newRecord(Resource.class); capability.setMemory(amMemory); appContext.setResource(capability); // Service data is a binary blob that can be passed to the application // Not needed in this scenario // amContainer.setServiceData(serviceData); appContext.setAMContainerSpec(amContainer); // Set the priority for the application master Priority pri = Records.newRecord(Priority.class); pri.setPriority(amPriority); appContext.setPriority(pri); // Set the queue to which this application is to be submitted in the RM appContext.setQueue(queueName); // Submit the application to the applications manager // SubmitApplicationResponse submitResp = rmClient.submitApplication(appRequest); // Ignore the response as either a valid response object is returned on success // or an exception thrown to denote some form of a failure String specStr = Objects.toStringHelper("Submitting application: ") .add("name", appContext.getApplicationName()) .add("queue", appContext.getQueue()) .add("user", UserGroupInformation.getLoginUser()) .add("resource", appContext.getResource()) .toString(); LOG.info(specStr); if (dag.isDebug()) { // LOG.info("Full submission context: " + appContext); } yarnClient.submitApplication(appContext); } finally { fs.close(); } }