public static void checkpoint( StreamingContainerManager scm, PTOperator oper, Checkpoint checkpoint) throws Exception { // write checkpoint while AM is out, // it needs to be picked up as part of restore StorageAgent sa = oper.getOperatorMeta().getValue(OperatorContext.STORAGE_AGENT); sa.save(oper.getOperatorMeta().getOperator(), oper.getId(), checkpoint.windowId); }
/**
 * Resolves the logical input port that identifies the given physical input.
 *
 * <p>For a regular operator the port is looked up on the input's own target; unifiers carry
 * no logical input port of their own, so the lookup is delegated to the downstream operator
 * the unifier ultimately feeds (via {@code getIdentifyingOperator}).
 *
 * @param input the physical input connection to resolve
 * @return the identifying {@link InputPortMeta} for the input's logical stream
 */
public static InputPortMeta getIdentifyingInputPortMeta(PTOperator.PTInput input) {
  final PTOperator target = input.target;
  final StreamMeta stream = input.logicalStream;
  if (target.isUnifier()) {
    // Walk past the unifier to the operator whose port metadata actually applies.
    final PTOperator identifying = getIdentifyingOperator(target);
    return getInputPortMeta(identifying.getOperatorMeta(), stream);
  }
  return getInputPortMeta(target.getOperatorMeta(), stream);
}
/**
 * Test serialization of the container manager with mock execution layer.
 *
 * <p>Exercises the full snapshot/journal round trip: initial snapshot write, restore from
 * snapshot + log, a simulated container loss on AM recovery (YARN-1490), a logical plan
 * modification that forces a new snapshot, and finally offline-checkpoint detection after a
 * second restore.
 *
 * @throws Exception
 */
private void testContainerManager(StorageAgent agent) throws Exception {
  FileUtils.deleteDirectory(new File(testMeta.dir)); // clean any state from previous run
  LogicalPlan dag = new LogicalPlan();
  dag.setAttribute(LogicalPlan.APPLICATION_PATH, testMeta.dir);
  dag.setAttribute(OperatorContext.STORAGE_AGENT, agent);
  StatsListeningOperator o1 = dag.addOperator("o1", StatsListeningOperator.class);

  // First instantiation writes the initial recovery snapshot.
  FSRecoveryHandler recoveryHandler = new FSRecoveryHandler(dag.assertAppPath(), new Configuration(false));
  StreamingContainerManager scm = StreamingContainerManager.getInstance(recoveryHandler, dag, false);
  File expFile = new File(recoveryHandler.getDir(), FSRecoveryHandler.FILE_SNAPSHOT);
  Assert.assertTrue("snapshot file " + expFile, expFile.exists());

  PhysicalPlan plan = scm.getPhysicalPlan();
  assertEquals("number required containers", 1, plan.getContainers().size());
  PTOperator o1p1 = plan.getOperators(dag.getMeta(o1)).get(0);

  @SuppressWarnings("UnusedAssignment") /* sneaky: the constructor does some changes to the container */
  MockContainer mc = new MockContainer(scm, o1p1.getContainer());
  PTContainer originalContainer = o1p1.getContainer();

  Assert.assertNotNull(o1p1.getContainer().bufferServerAddress);
  assertEquals(PTContainer.State.ACTIVE, o1p1.getContainer().getState());
  assertEquals("state " + o1p1, PTOperator.State.PENDING_DEPLOY, o1p1.getState());

  // test restore initial snapshot + log
  dag = new LogicalPlan();
  dag.setAttribute(LogicalPlan.APPLICATION_PATH, testMeta.dir);
  scm = StreamingContainerManager.getInstance(
      new FSRecoveryHandler(dag.assertAppPath(), new Configuration(false)), dag, false);
  dag = scm.getLogicalPlan(); // restored instance replaces the dag we passed in
  plan = scm.getPhysicalPlan();
  o1p1 = plan.getOperators(dag.getOperatorMeta("o1")).get(0);
  assertEquals("post restore state " + o1p1, PTOperator.State.PENDING_DEPLOY, o1p1.getState());
  o1 = (StatsListeningOperator)o1p1.getOperatorMeta().getOperator();
  assertEquals("containerId", originalContainer.getExternalId(), o1p1.getContainer().getExternalId());
  assertEquals("stats listener", 1, o1p1.statsListeners.size());
  assertEquals("number stats calls", 0, o1.processStatsCnt); // stats are not logged
  assertEquals("post restore 1", PTContainer.State.ALLOCATED, o1p1.getContainer().getState());
  assertEquals("post restore 1", originalContainer.bufferServerAddress, o1p1.getContainer().bufferServerAddress);

  StreamingContainerAgent sca = scm.getContainerAgent(originalContainer.getExternalId());
  Assert.assertNotNull("allocated container restored " + originalContainer, sca);
  assertEquals(
      "memory usage allocated container",
      (int)OperatorContext.MEMORY_MB.defaultValue,
      sca.container.getAllocatedMemoryMB());

  // YARN-1490 - simulate container terminated on AM recovery
  scm.scheduleContainerRestart(originalContainer.getExternalId());
  assertEquals("memory usage of failed container", 0, sca.container.getAllocatedMemoryMB());

  // Bring the operator back up via a mock container and confirm the checkpoint is picked up.
  Checkpoint firstCheckpoint = new Checkpoint(3, 0, 0);
  mc = new MockContainer(scm, o1p1.getContainer());
  checkpoint(scm, o1p1, firstCheckpoint);
  mc.stats(o1p1.getId())
      .deployState(OperatorHeartbeat.DeployState.ACTIVE)
      .currentWindowId(3)
      .checkpointWindowId(3);
  mc.sendHeartbeat();
  assertEquals("state " + o1p1, PTOperator.State.ACTIVE, o1p1.getState());

  // logical plan modification triggers snapshot
  CreateOperatorRequest cor = new CreateOperatorRequest();
  cor.setOperatorFQCN(GenericTestOperator.class.getName());
  cor.setOperatorName("o2");
  CreateStreamRequest csr = new CreateStreamRequest();
  csr.setSourceOperatorName("o1");
  csr.setSourceOperatorPortName("outport");
  csr.setSinkOperatorName("o2");
  csr.setSinkOperatorPortName("inport1");
  FutureTask<?> lpmf = scm.logicalPlanModification(Lists.newArrayList(cor, csr));
  while (!lpmf.isDone()) {
    // modification is processed on the heartbeat loop, so pump it until done
    scm.monitorHeartbeat();
  }
  Assert.assertNull(lpmf.get()); // unmask exception, if any
  Assert.assertSame("dag references", dag, scm.getLogicalPlan());
  assertEquals("number operators after plan modification", 2, dag.getAllOperators().size());

  // set operator state triggers journal write
  o1p1.setState(PTOperator.State.INACTIVE);

  Checkpoint offlineCheckpoint = new Checkpoint(10, 0, 0);
  // write checkpoint while AM is out,
  // it needs to be picked up as part of restore
  checkpoint(scm, o1p1, offlineCheckpoint);

  // test restore
  dag = new LogicalPlan();
  dag.setAttribute(LogicalPlan.APPLICATION_PATH, testMeta.dir);
  scm = StreamingContainerManager.getInstance(
      new FSRecoveryHandler(dag.assertAppPath(), new Configuration(false)), dag, false);
  Assert.assertNotSame("dag references", dag, scm.getLogicalPlan());
  assertEquals("number operators after restore", 2, scm.getLogicalPlan().getAllOperators().size());
  dag = scm.getLogicalPlan();
  plan = scm.getPhysicalPlan();
  o1p1 = plan.getOperators(dag.getOperatorMeta("o1")).get(0);
  assertEquals("post restore state " + o1p1, PTOperator.State.INACTIVE, o1p1.getState());
  o1 = (StatsListeningOperator)o1p1.getOperatorMeta().getOperator();
  assertEquals("stats listener", 1, o1p1.statsListeners.size());
  assertEquals("number stats calls post restore", 1, o1.processStatsCnt);
  assertEquals("post restore 1", PTContainer.State.ACTIVE, o1p1.getContainer().getState());
  assertEquals("post restore 1", originalContainer.bufferServerAddress, o1p1.getContainer().bufferServerAddress);

  // offline checkpoint detection
  assertEquals(
      "checkpoints after recovery",
      Lists.newArrayList(firstCheckpoint, offlineCheckpoint),
      o1p1.checkpoints);
}
/** * Create deploy info for operator. * * <p> * * @return {@link com.datatorrent.stram.api.OperatorDeployInfo} */ private OperatorDeployInfo createOperatorDeployInfo(PTOperator oper) { OperatorDeployInfo ndi; if (oper.isUnifier()) { UnifierDeployInfo udi = new UnifierDeployInfo(); /* the constructor auto sets the type */ try { udi.operatorAttributes = oper.getUnifiedOperatorMeta().getAttributes().clone(); } catch (CloneNotSupportedException ex) { throw new RuntimeException("Cannot clone unifier attributes", ex); } ndi = udi; } else { ndi = new OperatorDeployInfo(); Operator operator = oper.getOperatorMeta().getOperator(); if (operator instanceof InputOperator) { ndi.type = OperatorType.INPUT; if (!oper.getInputs().isEmpty()) { // If there are no input ports then it has to be an input operator. But if there are input // ports then // we check if any input port is connected which would make it a Generic operator. for (PTOperator.PTInput ptInput : oper.getInputs()) { if (ptInput.logicalStream != null && ptInput.logicalStream.getSource() != null) { ndi.type = OperatorType.GENERIC; break; } } } } else { ndi.type = OperatorType.GENERIC; } } Checkpoint checkpoint = oper.getRecoveryCheckpoint(); ProcessingMode pm = oper.getOperatorMeta().getValue(OperatorContext.PROCESSING_MODE); if (pm == ProcessingMode.AT_MOST_ONCE || pm == ProcessingMode.EXACTLY_ONCE) { // TODO: following should be handled in the container at deploy time // for exactly once container should also purge previous checkpoint // whenever new checkpoint is written. 
StorageAgent agent = oper.getOperatorMeta().getAttributes().get(OperatorContext.STORAGE_AGENT); if (agent == null) { agent = initCtx.getValue(OperatorContext.STORAGE_AGENT); } // pick checkpoint most recently written try { long[] windowIds = agent.getWindowIds(oper.getId()); long checkpointId = Stateless.WINDOW_ID; for (long windowId : windowIds) { if (windowId > checkpointId) { checkpointId = windowId; } } if (checkpoint == null || checkpoint.windowId != checkpointId) { checkpoint = new Checkpoint(checkpointId, 0, 0); } } catch (Exception e) { throw new RuntimeException("Failed to determine checkpoint window id " + oper, e); } } LOG.debug("{} recovery checkpoint {}", oper, checkpoint); ndi.checkpoint = checkpoint; ndi.name = oper.getOperatorMeta().getName(); ndi.id = oper.getId(); try { // clone map before modifying it ndi.contextAttributes = oper.getOperatorMeta().getAttributes().clone(); } catch (CloneNotSupportedException ex) { throw new RuntimeException("Cannot clone operator attributes", ex); } if (oper.isOperatorStateLess()) { ndi.contextAttributes.put(OperatorContext.STATELESS, true); } return ndi; }
/**
 * Create deploy info for StramChild.
 *
 * <p>Two-pass construction: first build an {@link OperatorDeployInfo} with its output ports
 * for every operator in PENDING_DEPLOY (recording each output as a publisher), then wire up
 * the input ports, classifying each as THREAD_LOCAL / CONTAINER_LOCAL / buffer-server based
 * on where the upstream output lives.
 *
 * @param operators physical operators requested for deployment into this container
 * @return StreamingContainerContext
 */
public List<OperatorDeployInfo> getDeployInfoList(Collection<PTOperator> operators) {
  if (container.bufferServerAddress == null) {
    throw new AssertionError("No buffer server address assigned");
  }
  // LinkedHashMap preserves deploy order; publishers tracks outputs deployed in this container.
  Map<OperatorDeployInfo, PTOperator> nodes = new LinkedHashMap<>();
  HashSet<PTOperator.PTOutput> publishers = new HashSet<>();
  PhysicalPlan physicalPlan = dnmgr.getPhysicalPlan();
  for (PTOperator oper : operators) {
    if (oper.getState() != State.PENDING_DEPLOY) {
      LOG.debug("Skipping deploy for operator {} state {}", oper, oper.getState());
      continue;
    }
    OperatorDeployInfo ndi = createOperatorDeployInfo(oper);
    nodes.put(ndi, oper);
    ndi.inputs = new ArrayList<>(oper.getInputs().size());
    ndi.outputs = new ArrayList<>(oper.getOutputs().size());
    for (PTOperator.PTOutput out : oper.getOutputs()) {
      final StreamMeta streamMeta = out.logicalStream;
      // buffer server or inline publisher
      OutputDeployInfo portInfo = new OutputDeployInfo();
      portInfo.declaredStreamId = streamMeta.getName();
      portInfo.portName = out.portName;
      try {
        // clone so per-deploy additions below don't mutate the logical plan's attributes
        portInfo.contextAttributes = streamMeta.getSource().getAttributes().clone();
      } catch (CloneNotSupportedException ex) {
        throw new RuntimeException("Cannot clone attributes", ex);
      }
      // Output is "unified" when any sink of this output is a unifier.
      boolean outputUnified = false;
      for (PTOperator.PTInput input : out.sinks) {
        if (input.target.isUnifier()) {
          outputUnified = true;
          break;
        }
      }
      portInfo.contextAttributes.put(PortContext.IS_OUTPUT_UNIFIED, outputUnified);
      if (ndi.type == OperatorDeployInfo.OperatorType.UNIFIER) {
        // input attributes of the downstream operator (first sink only).
        // NOTE(review): this replaces the cloned map (discarding IS_OUTPUT_UNIFIED just set)
        // with an un-cloned reference to the sink's attributes — presumably intentional, but
        // verify that nothing mutates portInfo.contextAttributes afterward.
        for (InputPortMeta sink : streamMeta.getSinks()) {
          portInfo.contextAttributes = sink.getAttributes();
          break;
        }
      }
      if (!out.isDownStreamInline()) {
        // at least one downstream sink is remote: publish through the buffer server
        portInfo.bufferServerHost = oper.getContainer().bufferServerAddress.getHostName();
        portInfo.bufferServerPort = oper.getContainer().bufferServerAddress.getPort();
        portInfo.bufferServerToken = oper.getContainer().getBufferServerToken();
        // Build the stream codec configuration of all sinks connected to this port
        for (PTOperator.PTInput input : out.sinks) {
          // Create mappings for all non-inline operators
          if (input.target.getContainer() != out.source.getContainer()) {
            InputPortMeta inputPortMeta = getIdentifyingInputPortMeta(input);
            StreamCodec<?> streamCodecInfo = getStreamCodec(inputPortMeta);
            Integer id = physicalPlan.getStreamCodecIdentifier(streamCodecInfo);
            if (!portInfo.streamCodecs.containsKey(id)) {
              portInfo.streamCodecs.put(id, streamCodecInfo);
            }
          }
        }
      }
      ndi.outputs.add(portInfo);
      publishers.add(out);
    }
  }

  // after we know all publishers within container, determine subscribers
  for (Map.Entry<OperatorDeployInfo, PTOperator> operEntry : nodes.entrySet()) {
    OperatorDeployInfo ndi = operEntry.getKey();
    PTOperator oper = operEntry.getValue();
    for (PTOperator.PTInput in : oper.getInputs()) {
      final StreamMeta streamMeta = in.logicalStream;
      if (streamMeta.getSource() == null) {
        throw new AssertionError("source is null: " + in);
      }
      PTOperator.PTOutput sourceOutput = in.source;
      InputDeployInfo inputInfo = new InputDeployInfo();
      inputInfo.declaredStreamId = streamMeta.getName();
      inputInfo.portName = in.portName;
      InputPortMeta inputPortMeta = getInputPortMeta(oper.getOperatorMeta(), streamMeta);
      if (inputPortMeta != null) {
        inputInfo.contextAttributes = inputPortMeta.getAttributes();
      }
      if (inputInfo.contextAttributes == null
          && ndi.type == OperatorDeployInfo.OperatorType.UNIFIER) {
        // unifier has no logical port of its own; borrow the upstream source port's attributes
        inputInfo.contextAttributes = in.source.logicalStream.getSource().getAttributes();
      }
      inputInfo.sourceNodeId = sourceOutput.source.getId();
      inputInfo.sourcePortName = sourceOutput.portName;
      if (in.partitions != null && in.partitions.mask != 0) {
        inputInfo.partitionMask = in.partitions.mask;
        inputInfo.partitionKeys = in.partitions.partitions;
      }
      if (sourceOutput.source.getContainer() == oper.getContainer()) {
        // both operators in same container
        if (!publishers.contains(sourceOutput)) {
          throw new AssertionError(
              "Source not deployed for container local stream " + sourceOutput + " " + in);
        }
        if (streamMeta.getLocality() == Locality.THREAD_LOCAL) {
          inputInfo.locality = Locality.THREAD_LOCAL;
          // THREAD_LOCAL input makes the whole operator run on the upstream thread (OIO)
          ndi.type = OperatorType.OIO;
        } else {
          inputInfo.locality = Locality.CONTAINER_LOCAL;
        }
      } else {
        // buffer server input
        PTContainer container = sourceOutput.source.getContainer();
        InetSocketAddress addr = container.bufferServerAddress;
        if (addr == null) {
          throw new AssertionError("upstream address not assigned: " + sourceOutput);
        }
        inputInfo.bufferServerHost = addr.getHostName();
        inputInfo.bufferServerPort = addr.getPort();
        inputInfo.bufferServerToken = container.getBufferServerToken();
      }
      // On the input side there is a unlikely scenario of partitions even for inline stream
      // that is being handled. Always specifying a stream codec configuration in case that
      // scenario happens.
      InputPortMeta idInputPortMeta = getIdentifyingInputPortMeta(in);
      StreamCodec<?> streamCodecInfo = getStreamCodec(idInputPortMeta);
      Integer id = physicalPlan.getStreamCodecIdentifier(streamCodecInfo);
      inputInfo.streamCodecs.put(id, streamCodecInfo);
      ndi.inputs.add(inputInfo);
    }
  }

  return new ArrayList<>(nodes.keySet());
}