/** * Test serialization of the container manager with mock execution layer. * * @throws Exception */ private void testContainerManager(StorageAgent agent) throws Exception { FileUtils.deleteDirectory(new File(testMeta.dir)); // clean any state from previous run LogicalPlan dag = new LogicalPlan(); dag.setAttribute(LogicalPlan.APPLICATION_PATH, testMeta.dir); dag.setAttribute(OperatorContext.STORAGE_AGENT, agent); StatsListeningOperator o1 = dag.addOperator("o1", StatsListeningOperator.class); FSRecoveryHandler recoveryHandler = new FSRecoveryHandler(dag.assertAppPath(), new Configuration(false)); StreamingContainerManager scm = StreamingContainerManager.getInstance(recoveryHandler, dag, false); File expFile = new File(recoveryHandler.getDir(), FSRecoveryHandler.FILE_SNAPSHOT); Assert.assertTrue("snapshot file " + expFile, expFile.exists()); PhysicalPlan plan = scm.getPhysicalPlan(); assertEquals("number required containers", 1, plan.getContainers().size()); PTOperator o1p1 = plan.getOperators(dag.getMeta(o1)).get(0); @SuppressWarnings( "UnusedAssignment") /* sneaky: the constructor does some changes to the container */ MockContainer mc = new MockContainer(scm, o1p1.getContainer()); PTContainer originalContainer = o1p1.getContainer(); Assert.assertNotNull(o1p1.getContainer().bufferServerAddress); assertEquals(PTContainer.State.ACTIVE, o1p1.getContainer().getState()); assertEquals("state " + o1p1, PTOperator.State.PENDING_DEPLOY, o1p1.getState()); // test restore initial snapshot + log dag = new LogicalPlan(); dag.setAttribute(LogicalPlan.APPLICATION_PATH, testMeta.dir); scm = StreamingContainerManager.getInstance( new FSRecoveryHandler(dag.assertAppPath(), new Configuration(false)), dag, false); dag = scm.getLogicalPlan(); plan = scm.getPhysicalPlan(); o1p1 = plan.getOperators(dag.getOperatorMeta("o1")).get(0); assertEquals("post restore state " + o1p1, PTOperator.State.PENDING_DEPLOY, o1p1.getState()); o1 = (StatsListeningOperator) o1p1.getOperatorMeta().getOperator(); assertEquals( "containerId", originalContainer.getExternalId(), o1p1.getContainer().getExternalId()); assertEquals("stats listener", 1, o1p1.statsListeners.size()); assertEquals("number stats calls", 0, o1.processStatsCnt); // stats are not logged assertEquals("post restore 1", PTContainer.State.ALLOCATED, o1p1.getContainer().getState()); assertEquals( "post restore 1", originalContainer.bufferServerAddress, o1p1.getContainer().bufferServerAddress); StreamingContainerAgent sca = scm.getContainerAgent(originalContainer.getExternalId()); Assert.assertNotNull("allocated container restored " + originalContainer, sca); assertEquals( "memory usage allocated container", (int) OperatorContext.MEMORY_MB.defaultValue, sca.container.getAllocatedMemoryMB()); // YARN-1490 - simulate container terminated on AM recovery scm.scheduleContainerRestart(originalContainer.getExternalId()); assertEquals("memory usage of failed container", 0, sca.container.getAllocatedMemoryMB()); Checkpoint firstCheckpoint = new Checkpoint(3, 0, 0); mc = new MockContainer(scm, o1p1.getContainer()); checkpoint(scm, o1p1, firstCheckpoint); mc.stats(o1p1.getId()) .deployState(OperatorHeartbeat.DeployState.ACTIVE) .currentWindowId(3) .checkpointWindowId(3); mc.sendHeartbeat(); assertEquals("state " + o1p1, PTOperator.State.ACTIVE, o1p1.getState()); // logical plan modification triggers snapshot CreateOperatorRequest cor = new CreateOperatorRequest(); cor.setOperatorFQCN(GenericTestOperator.class.getName()); cor.setOperatorName("o2"); CreateStreamRequest csr = new CreateStreamRequest(); csr.setSourceOperatorName("o1"); csr.setSourceOperatorPortName("outport"); csr.setSinkOperatorName("o2"); csr.setSinkOperatorPortName("inport1"); FutureTask<?> lpmf = scm.logicalPlanModification(Lists.newArrayList(cor, csr)); while (!lpmf.isDone()) { scm.monitorHeartbeat(); } Assert.assertNull(lpmf.get()); // unmask exception, if any Assert.assertSame("dag references", dag, scm.getLogicalPlan()); assertEquals("number operators after plan modification", 2, dag.getAllOperators().size()); // set operator state triggers journal write o1p1.setState(PTOperator.State.INACTIVE); Checkpoint offlineCheckpoint = new Checkpoint(10, 0, 0); // write checkpoint while AM is out, // it needs to be picked up as part of restore checkpoint(scm, o1p1, offlineCheckpoint); // test restore dag = new LogicalPlan(); dag.setAttribute(LogicalPlan.APPLICATION_PATH, testMeta.dir); scm = StreamingContainerManager.getInstance( new FSRecoveryHandler(dag.assertAppPath(), new Configuration(false)), dag, false); Assert.assertNotSame("dag references", dag, scm.getLogicalPlan()); assertEquals( "number operators after restore", 2, scm.getLogicalPlan().getAllOperators().size()); dag = scm.getLogicalPlan(); plan = scm.getPhysicalPlan(); o1p1 = plan.getOperators(dag.getOperatorMeta("o1")).get(0); assertEquals("post restore state " + o1p1, PTOperator.State.INACTIVE, o1p1.getState()); o1 = (StatsListeningOperator) o1p1.getOperatorMeta().getOperator(); assertEquals("stats listener", 1, o1p1.statsListeners.size()); assertEquals("number stats calls post restore", 1, o1.processStatsCnt); assertEquals("post restore 1", PTContainer.State.ACTIVE, o1p1.getContainer().getState()); assertEquals( "post restore 1", originalContainer.bufferServerAddress, o1p1.getContainer().bufferServerAddress); // offline checkpoint detection assertEquals( "checkpoints after recovery", Lists.newArrayList(firstCheckpoint, offlineCheckpoint), o1p1.checkpoints); }
@Test public void testOperatorShutdown() { dag.setAttribute(OperatorContext.STORAGE_AGENT, new MemoryStorageAgent()); GenericTestOperator o1 = dag.addOperator("o1", GenericTestOperator.class); GenericTestOperator o2 = dag.addOperator("o2", GenericTestOperator.class); GenericTestOperator o3 = dag.addOperator("o3", GenericTestOperator.class); dag.addStream("stream1", o1.outport1, o2.inport1); dag.addStream("stream2", o2.outport1, o3.inport1); dag.setAttribute( o2, OperatorContext.PARTITIONER, new StatelessPartitioner<GenericTestOperator>(2)); StreamingContainerManager scm = new StreamingContainerManager(dag); PhysicalPlan physicalPlan = scm.getPhysicalPlan(); Map<PTContainer, MockContainer> mockContainers = new HashMap<>(); for (PTContainer c : physicalPlan.getContainers()) { MockContainer mc = new MockContainer(scm, c); mockContainers.put(c, mc); } // deploy all containers for (Map.Entry<PTContainer, MockContainer> ce : mockContainers.entrySet()) { ce.getValue().deploy(); } for (Map.Entry<PTContainer, MockContainer> ce : mockContainers.entrySet()) { // skip buffer server purge in monitorHeartbeat ce.getKey().bufferServerAddress = null; } PTOperator o1p1 = physicalPlan.getOperators(dag.getMeta(o1)).get(0); MockContainer mc1 = mockContainers.get(o1p1.getContainer()); MockOperatorStats o1p1mos = mc1.stats(o1p1.getId()); o1p1mos.currentWindowId(1).checkpointWindowId(1).deployState(DeployState.ACTIVE); mc1.sendHeartbeat(); PTOperator o2p1 = physicalPlan.getOperators(dag.getMeta(o2)).get(0); MockContainer mc2 = mockContainers.get(o2p1.getContainer()); MockOperatorStats o2p1mos = mc2.stats(o2p1.getId()); o2p1mos.currentWindowId(1).checkpointWindowId(1).deployState(DeployState.ACTIVE); mc2.sendHeartbeat(); Assert.assertEquals("2 partitions", 2, physicalPlan.getOperators(dag.getMeta(o2)).size()); PTOperator o2p2 = physicalPlan.getOperators(dag.getMeta(o2)).get(1); MockContainer mc3 = mockContainers.get(o2p2.getContainer()); MockOperatorStats o2p2mos = mc3.stats(o2p2.getId()); o2p2mos.currentWindowId(1).checkpointWindowId(1).deployState(DeployState.ACTIVE); mc3.sendHeartbeat(); PTOperator o3p1 = physicalPlan.getOperators(dag.getMeta(o3)).get(0); MockContainer mc4 = mockContainers.get(o3p1.getContainer()); MockOperatorStats o3p1mos = mc4.stats(o3p1.getId()); o3p1mos.currentWindowId(1).checkpointWindowId(1).deployState(DeployState.ACTIVE); mc4.sendHeartbeat(); // unifier PTOperator unifier = physicalPlan.getMergeOperators(dag.getMeta(o2)).get(0); MockContainer mc5 = mockContainers.get(unifier.getContainer()); MockOperatorStats unifierp1mos = mc5.stats(unifier.getId()); unifierp1mos.currentWindowId(1).checkpointWindowId(1).deployState(DeployState.ACTIVE); mc5.sendHeartbeat(); o1p1mos.currentWindowId(2).deployState(DeployState.SHUTDOWN); mc1.sendHeartbeat(); scm.monitorHeartbeat(); Assert.assertEquals("committedWindowId", -1, scm.getCommittedWindowId()); scm.monitorHeartbeat(); // committedWindowId updated in next cycle Assert.assertEquals("committedWindowId", 1, scm.getCommittedWindowId()); scm.processEvents(); Assert.assertEquals( "containers at committedWindowId=1", 5, physicalPlan.getContainers().size()); // checkpoint window 2 o1p1mos.checkpointWindowId(2); mc1.sendHeartbeat(); scm.monitorHeartbeat(); Assert.assertEquals("committedWindowId", 1, scm.getCommittedWindowId()); o2p1mos.currentWindowId(2).checkpointWindowId(2); o2p2mos.currentWindowId(2).checkpointWindowId(2); o3p1mos.currentWindowId(2).checkpointWindowId(2); unifierp1mos.currentWindowId(2).checkpointWindowId(2); mc2.sendHeartbeat(); mc3.sendHeartbeat(); mc4.sendHeartbeat(); mc5.sendHeartbeat(); scm.monitorHeartbeat(); // Operators are shutdown when both operators reach window Id 2 Assert.assertEquals(0, o1p1.getContainer().getOperators().size()); Assert.assertEquals(0, o2p1.getContainer().getOperators().size()); Assert.assertEquals(0, physicalPlan.getContainers().size()); }