Example #1
0
  public void copyInitialState(Path origAppDir) throws IOException {
    // locate previous snapshot
    String newAppDir = this.dag.assertAppPath();

    FSRecoveryHandler recoveryHandler = new FSRecoveryHandler(origAppDir.toString(), conf);
    // read snapshot against new dependencies
    Object snapshot = recoveryHandler.restore();
    if (snapshot == null) {
      throw new IllegalArgumentException("No previous application state found in " + origAppDir);
    }
    InputStream logIs = recoveryHandler.getLog();

    // modify snapshot state to switch app id
    ((StreamingContainerManager.CheckpointState) snapshot).setApplicationId(this.dag, conf);
    Path checkpointPath = new Path(newAppDir, LogicalPlan.SUBDIR_CHECKPOINTS);

    FileSystem fs = FileSystem.newInstance(origAppDir.toUri(), conf);
    // remove the path that was created by the storage agent during deserialization and replacement
    fs.delete(checkpointPath, true);

    // write snapshot to new location
    recoveryHandler = new FSRecoveryHandler(newAppDir, conf);
    recoveryHandler.save(snapshot);
    OutputStream logOs = recoveryHandler.rotateLog();
    IOUtils.copy(logIs, logOs);
    logOs.flush();
    logOs.close();
    logIs.close();

    // copy sub directories that are not present in target
    FileStatus[] lFiles = fs.listStatus(origAppDir);
    for (FileStatus f : lFiles) {
      if (f.isDirectory()) {
        String targetPath = f.getPath().toString().replace(origAppDir.toString(), newAppDir);
        if (!fs.exists(new Path(targetPath))) {
          LOG.debug("Copying {} to {}", f.getPath(), targetPath);
          FileUtil.copy(fs, f.getPath(), fs, new Path(targetPath), false, conf);
          // FSUtil.copy(fs, f, fs, new Path(targetPath), false, false, conf);
        } else {
          LOG.debug("Ignoring {} as it already exists under {}", f.getPath(), targetPath);
          // FSUtil.setPermission(fs, new Path(targetPath), new FsPermission((short)0777));
        }
      }
    }
  }
Example #2
0
  @Test
  public void testRpcFailover() throws Exception {
    String appPath = testMeta.dir;
    Configuration conf = new Configuration(false);
    final AtomicBoolean timedout = new AtomicBoolean();

    StreamingContainerUmbilicalProtocol impl =
        MockitoUtil.mockProtocol(StreamingContainerUmbilicalProtocol.class);

    Mockito.doAnswer(
            new org.mockito.stubbing.Answer<Void>() {
              @Override
              public Void answer(InvocationOnMock invocation) {
                LOG.debug("got call: " + invocation.getMethod());
                if (!timedout.get()) {
                  try {
                    timedout.set(true);
                    Thread.sleep(1000);
                  } catch (Exception e) {
                  }
                  // throw new RuntimeException("fail");
                }
                return null;
              }
            })
        .when(impl)
        .log("containerId", "timeout");

    Server server =
        new RPC.Builder(conf)
            .setProtocol(StreamingContainerUmbilicalProtocol.class)
            .setInstance(impl)
            .setBindAddress("0.0.0.0")
            .setPort(0)
            .setNumHandlers(1)
            .setVerbose(false)
            .build();
    server.start();
    InetSocketAddress address = NetUtils.getConnectAddress(server);
    LOG.info("Mock server listening at " + address);

    int rpcTimeoutMillis = 500;
    int retryDelayMillis = 100;
    int retryTimeoutMillis = 500;

    FSRecoveryHandler recoveryHandler = new FSRecoveryHandler(appPath, conf);
    URI uri =
        RecoverableRpcProxy.toConnectURI(
            address, rpcTimeoutMillis, retryDelayMillis, retryTimeoutMillis);
    recoveryHandler.writeConnectUri(uri.toString());

    RecoverableRpcProxy rp = new RecoverableRpcProxy(appPath, conf);
    StreamingContainerUmbilicalProtocol protocolProxy = rp.getProxy();
    protocolProxy.log("containerId", "msg");
    // simulate socket read timeout
    try {
      protocolProxy.log("containerId", "timeout");
      Assert.fail("expected socket timeout");
    } catch (java.net.SocketTimeoutException e) {
      // expected
    }
    Assert.assertTrue("timedout", timedout.get());
    rp.close();

    // test success on retry
    timedout.set(false);
    retryTimeoutMillis = 1500;
    uri =
        RecoverableRpcProxy.toConnectURI(
            address, rpcTimeoutMillis, retryDelayMillis, retryTimeoutMillis);
    recoveryHandler.writeConnectUri(uri.toString());

    protocolProxy.log("containerId", "timeout");
    Assert.assertTrue("timedout", timedout.get());

    rp.close();
    server.stop();
  }
Example #3
0
  /**
   * Test serialization of the container manager with mock execution layer.
   *
   * @throws Exception
   */
  private void testContainerManager(StorageAgent agent) throws Exception {
    FileUtils.deleteDirectory(new File(testMeta.dir)); // clean any state from previous run

    LogicalPlan dag = new LogicalPlan();
    dag.setAttribute(LogicalPlan.APPLICATION_PATH, testMeta.dir);
    dag.setAttribute(OperatorContext.STORAGE_AGENT, agent);

    StatsListeningOperator o1 = dag.addOperator("o1", StatsListeningOperator.class);

    FSRecoveryHandler recoveryHandler =
        new FSRecoveryHandler(dag.assertAppPath(), new Configuration(false));
    StreamingContainerManager scm =
        StreamingContainerManager.getInstance(recoveryHandler, dag, false);
    File expFile = new File(recoveryHandler.getDir(), FSRecoveryHandler.FILE_SNAPSHOT);
    Assert.assertTrue("snapshot file " + expFile, expFile.exists());

    PhysicalPlan plan = scm.getPhysicalPlan();
    assertEquals("number required containers", 1, plan.getContainers().size());

    PTOperator o1p1 = plan.getOperators(dag.getMeta(o1)).get(0);

    @SuppressWarnings(
        "UnusedAssignment") /* sneaky: the constructor does some changes to the container */
    MockContainer mc = new MockContainer(scm, o1p1.getContainer());
    PTContainer originalContainer = o1p1.getContainer();

    Assert.assertNotNull(o1p1.getContainer().bufferServerAddress);
    assertEquals(PTContainer.State.ACTIVE, o1p1.getContainer().getState());
    assertEquals("state " + o1p1, PTOperator.State.PENDING_DEPLOY, o1p1.getState());

    // test restore initial snapshot + log
    dag = new LogicalPlan();
    dag.setAttribute(LogicalPlan.APPLICATION_PATH, testMeta.dir);
    scm =
        StreamingContainerManager.getInstance(
            new FSRecoveryHandler(dag.assertAppPath(), new Configuration(false)), dag, false);
    dag = scm.getLogicalPlan();
    plan = scm.getPhysicalPlan();

    o1p1 = plan.getOperators(dag.getOperatorMeta("o1")).get(0);
    assertEquals("post restore state " + o1p1, PTOperator.State.PENDING_DEPLOY, o1p1.getState());
    o1 = (StatsListeningOperator) o1p1.getOperatorMeta().getOperator();
    assertEquals(
        "containerId", originalContainer.getExternalId(), o1p1.getContainer().getExternalId());
    assertEquals("stats listener", 1, o1p1.statsListeners.size());
    assertEquals("number stats calls", 0, o1.processStatsCnt); // stats are not logged
    assertEquals("post restore 1", PTContainer.State.ALLOCATED, o1p1.getContainer().getState());
    assertEquals(
        "post restore 1",
        originalContainer.bufferServerAddress,
        o1p1.getContainer().bufferServerAddress);

    StreamingContainerAgent sca = scm.getContainerAgent(originalContainer.getExternalId());
    Assert.assertNotNull("allocated container restored " + originalContainer, sca);
    assertEquals(
        "memory usage allocated container",
        (int) OperatorContext.MEMORY_MB.defaultValue,
        sca.container.getAllocatedMemoryMB());

    // YARN-1490 - simulate container terminated on AM recovery
    scm.scheduleContainerRestart(originalContainer.getExternalId());
    assertEquals("memory usage of failed container", 0, sca.container.getAllocatedMemoryMB());

    Checkpoint firstCheckpoint = new Checkpoint(3, 0, 0);
    mc = new MockContainer(scm, o1p1.getContainer());
    checkpoint(scm, o1p1, firstCheckpoint);
    mc.stats(o1p1.getId())
        .deployState(OperatorHeartbeat.DeployState.ACTIVE)
        .currentWindowId(3)
        .checkpointWindowId(3);
    mc.sendHeartbeat();
    assertEquals("state " + o1p1, PTOperator.State.ACTIVE, o1p1.getState());

    // logical plan modification triggers snapshot
    CreateOperatorRequest cor = new CreateOperatorRequest();
    cor.setOperatorFQCN(GenericTestOperator.class.getName());
    cor.setOperatorName("o2");
    CreateStreamRequest csr = new CreateStreamRequest();
    csr.setSourceOperatorName("o1");
    csr.setSourceOperatorPortName("outport");
    csr.setSinkOperatorName("o2");
    csr.setSinkOperatorPortName("inport1");
    FutureTask<?> lpmf = scm.logicalPlanModification(Lists.newArrayList(cor, csr));
    while (!lpmf.isDone()) {
      scm.monitorHeartbeat();
    }
    Assert.assertNull(lpmf.get()); // unmask exception, if any

    Assert.assertSame("dag references", dag, scm.getLogicalPlan());
    assertEquals("number operators after plan modification", 2, dag.getAllOperators().size());

    // set operator state triggers journal write
    o1p1.setState(PTOperator.State.INACTIVE);

    Checkpoint offlineCheckpoint = new Checkpoint(10, 0, 0);
    // write checkpoint while AM is out,
    // it needs to be picked up as part of restore
    checkpoint(scm, o1p1, offlineCheckpoint);

    // test restore
    dag = new LogicalPlan();
    dag.setAttribute(LogicalPlan.APPLICATION_PATH, testMeta.dir);
    scm =
        StreamingContainerManager.getInstance(
            new FSRecoveryHandler(dag.assertAppPath(), new Configuration(false)), dag, false);

    Assert.assertNotSame("dag references", dag, scm.getLogicalPlan());
    assertEquals(
        "number operators after restore", 2, scm.getLogicalPlan().getAllOperators().size());

    dag = scm.getLogicalPlan();
    plan = scm.getPhysicalPlan();

    o1p1 = plan.getOperators(dag.getOperatorMeta("o1")).get(0);
    assertEquals("post restore state " + o1p1, PTOperator.State.INACTIVE, o1p1.getState());
    o1 = (StatsListeningOperator) o1p1.getOperatorMeta().getOperator();
    assertEquals("stats listener", 1, o1p1.statsListeners.size());
    assertEquals("number stats calls post restore", 1, o1.processStatsCnt);
    assertEquals("post restore 1", PTContainer.State.ACTIVE, o1p1.getContainer().getState());
    assertEquals(
        "post restore 1",
        originalContainer.bufferServerAddress,
        o1p1.getContainer().bufferServerAddress);

    // offline checkpoint detection
    assertEquals(
        "checkpoints after recovery",
        Lists.newArrayList(firstCheckpoint, offlineCheckpoint),
        o1p1.checkpoints);
  }