  /** Test that there are under-replicated blocks after volume failures. */
  @Test
  public void testUnderReplicationAfterVolFailure() throws Exception {
    // The test uses DataNodeTestUtils#injectDataDirFailure() to simulate
    // volume failures which is currently not supported on Windows.
    assumeTrue(!Path.WINDOWS);

    // Bring up one more datanode
    cluster.startDataNodes(conf, 1, true, null, null);
    cluster.waitActive();

    final BlockManager bm = cluster.getNamesystem().getBlockManager();

    Path file1 = new Path("/test1");
    DFSTestUtil.createFile(fs, file1, 1024, (short) 3, 1L);
    DFSTestUtil.waitReplication(fs, file1, (short) 3);

    // Fail the first volume on both datanodes
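    // MiniDFSCluster names each DN's two volume dirs data<2*i+1> and data<2*i+2> under dataDir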
    File dn1Vol1 = new File(dataDir, "data" + (2 * 0 + 1));
    File dn2Vol1 = new File(dataDir, "data" + (2 * 1 + 1));
    DataNodeTestUtils.injectDataDirFailure(dn1Vol1, dn2Vol1);

    Path file2 = new Path("/test2");
    DFSTestUtil.createFile(fs, file2, 1024, (short) 3, 1L);
    DFSTestUtil.waitReplication(fs, file2, (short) 3);

    // the under-replicated blocks reported here are caused by the failed volumes
    int underReplicatedBlocks =
        BlockManagerTestUtil.checkHeartbeatAndGetUnderReplicatedBlocksCount(
            cluster.getNamesystem(), bm);
    assertTrue(
        "There is no under replicated block after volume failure", underReplicatedBlocks > 0);
  }
  /** Test that the NN re-learns of volume failures after restart. */
  @Test
  public void testVolFailureStatsPreservedOnNNRestart() throws Exception {
    assumeTrue(!System.getProperty("os.name").startsWith("Windows"));

    // Bring up two more datanodes that can tolerate 1 failure
    cluster.startDataNodes(conf, 2, true, null, null);
    cluster.waitActive();

    final DatanodeManager dm = cluster.getNamesystem().getBlockManager().getDatanodeManager();
    long origCapacity = DFSTestUtil.getLiveDatanodeCapacity(dm);
    long dnCapacity = DFSTestUtil.getDatanodeCapacity(dm, 0);

    // Fail the first volume on both datanodes (we have to keep the
    // third healthy so one node in the pipeline will not fail).
    File dn1Vol1 = new File(dataDir, "data" + (2 * 0 + 1));
    File dn2Vol1 = new File(dataDir, "data" + (2 * 1 + 1));
    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, false));
    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false));

    Path file1 = new Path("/test1");
    DFSTestUtil.createFile(fs, file1, 1024, (short) 2, 1L);
    DFSTestUtil.waitReplication(fs, file1, (short) 2);

    // The NN reports two volume failures
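    // wait until the NN reports 3 live DNs, 0 dead DNs, 2 failed volumes, and the reduced capacity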
    DFSTestUtil.waitForDatanodeStatus(
        dm, 3, 0, 2, origCapacity - (1 * dnCapacity), WAIT_FOR_HEARTBEATS);

    // After restarting the NN it should still see the two failures
    cluster.restartNameNode(0);
    cluster.waitActive();
    DFSTestUtil.waitForDatanodeStatus(
        dm, 3, 0, 2, origCapacity - (1 * dnCapacity), WAIT_FOR_HEARTBEATS);
  }
  /* This test starts a cluster with the given datanode capacities and racks,
   * fills it to be 30% full, then adds an empty node and starts balancing.
   * @param newCapacity new node's capacity
   * @param newRack new node's rack
   */
  private void test(long[] capacities, String[] racks, long newCapacity, String newRack)
      throws Exception {
    int numOfDatanodes = capacities.length;
    assertEquals(numOfDatanodes, racks.length);
    cluster = new MiniDFSCluster(0, CONF, capacities.length, true, true, null, racks, capacities);
    try {
      cluster.waitActive();
      client = DFSClient.createNamenode(CONF);

      long totalCapacity = 0L;
      for (long capacity : capacities) {
        totalCapacity += capacity;
      }
      // fill up the cluster to be 30% full
      long totalUsedSpace = totalCapacity * 3 / 10;
      createFile(totalUsedSpace / numOfDatanodes, (short) numOfDatanodes);
      // start up an empty node with the given capacity on the given rack
      cluster.startDataNodes(CONF, 1, true, null, new String[] {newRack}, new long[] {newCapacity});

      totalCapacity += newCapacity;

      // run balancer and validate results
      runBalancer(totalUsedSpace, totalCapacity);
    } finally {
      cluster.shutdown();
    }
  }
  /** Run fsck on a healthy cluster, then on a restarted cluster with no datanodes, and verify the reported status. */
  public void testFsck() throws Exception {
    DFSTestUtil util = new DFSTestUtil("TestFsck", 20, 3, 8 * 1024);
    MiniDFSCluster cluster = null;
    FileSystem fs = null;
    try {
      Configuration conf = new Configuration();
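      // shorten the block report interval so the NN learns about block state quickly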
      conf.setLong("dfs.blockreport.intervalMsec", 10000L);
      cluster = new MiniDFSCluster(conf, 4, true, null);
      fs = cluster.getFileSystem();
      util.createFiles(fs, "/srcdat");
      util.waitReplication(fs, "/srcdat", (short) 3);
      String outStr = runFsck(conf, 0, true, "/");
      assertTrue(outStr.contains(NamenodeFsck.HEALTHY_STATUS));
      System.out.println(outStr);
      if (fs != null) {
        try {
          fs.close();
        } catch (Exception e) {
        }
      }
      cluster.shutdown();

      // restart the cluster; bring up namenode but not the data nodes
      cluster = new MiniDFSCluster(conf, 0, false, null);
      outStr = runFsck(conf, 1, true, "/");
      // expect the result to be corrupt
      assertTrue(outStr.contains(NamenodeFsck.CORRUPT_STATUS));
      System.out.println(outStr);

      // bring up data nodes & cleanup cluster
      cluster.startDataNodes(conf, 4, true, null, null);
      cluster.waitActive();
      cluster.waitClusterUp();
      fs = cluster.getFileSystem();
      util.cleanup(fs, "/srcdat");
    } finally {
      if (fs != null) {
        try {
          fs.close();
        } catch (Exception e) {
        }
      }
      if (cluster != null) {
        cluster.shutdown();
      }
    }
  }
 /**
  * Checks whether {@link DataNode#checkDiskErrorAsync()} is being called or not. Before
  * refactoring the code, the above function was not getting called.
  *
  * @throws IOException
  * @throws InterruptedException
  */
 @Test
 public void testcheckDiskError() throws IOException, InterruptedException {
   if (cluster.getDataNodes().size() <= 0) {
     cluster.startDataNodes(conf, 1, true, null, null);
     cluster.waitActive();
   }
   DataNode dataNode = cluster.getDataNodes().get(0);
   long slackTime = dataNode.checkDiskErrorInterval / 2;
   // trigger an asynchronous disk error check
   dataNode.checkDiskErrorAsync();
   Thread.sleep(dataNode.checkDiskErrorInterval);
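   // the async check should have completed during the sleep; verify its timestamp is recent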
   long lastDiskErrorCheck = dataNode.getLastDiskErrorCheck();
   assertTrue(
       "Disk Error check is not performed within  " + dataNode.checkDiskErrorInterval + "  ms",
       ((Time.monotonicNow() - lastDiskErrorCheck)
           < (dataNode.checkDiskErrorInterval + slackTime)));
 }
  /** Test to check that a DN goes down when all its volumes have failed. */
  @Test
  public void testShutdown() throws Exception {
    if (System.getProperty("os.name").startsWith("Windows")) {
      /**
       * This test depends on the OS not allowing file creation in a directory that does not have
       * write permission for the user. Apparently that is not the case on Windows (at least under
       * Cygwin), and possibly AIX, so this test is disabled on Windows.
       */
      return;
    }
    // Bring up two more datanodes
    cluster.startDataNodes(conf, 2, true, null, null);
    cluster.waitActive();
    final int dnIndex = 0;
    String bpid = cluster.getNamesystem().getBlockPoolId();
    File storageDir = cluster.getInstanceStorageDir(dnIndex, 0);
    File dir1 = MiniDFSCluster.getRbwDir(storageDir, bpid);
    storageDir = cluster.getInstanceStorageDir(dnIndex, 1);
    File dir2 = MiniDFSCluster.getRbwDir(storageDir, bpid);
    try {
      // make the data directories of the first datanode read-only
      assertTrue("Couldn't chmod local vol", dir1.setReadOnly());
      assertTrue("Couldn't chmod local vol", dir2.setReadOnly());

      // create files and make sure that first datanode will be down
      DataNode dn = cluster.getDataNodes().get(dnIndex);
      for (int i = 0; dn.isDatanodeUp(); i++) {
        Path fileName = new Path("/test.txt" + i);
        DFSTestUtil.createFile(fs, fileName, 1024, (short) 2, 1L);
        DFSTestUtil.waitReplication(fs, fileName, (short) 2);
        fs.delete(fileName, true);
      }
    } finally {
      // restore its old permission
      FileUtil.setWritable(dir1, true);
      FileUtil.setWritable(dir2, true);
    }
  }
  /**
   * Test that individual volume failures do not cause DNs to fail, that all volumes failed on a
   * single datanode do cause it to fail, and that the capacities and liveness are adjusted
   * correctly in the NN.
   */
  @Test
  public void testSuccessiveVolumeFailures() throws Exception {
    assumeTrue(!System.getProperty("os.name").startsWith("Windows"));

    // Bring up two more datanodes
    cluster.startDataNodes(conf, 2, true, null, null);
    cluster.waitActive();

    /*
     * Calculate the total capacity of all the datanodes. Sleep for
     * three seconds to be sure the datanodes have had a chance to
     * heartbeat their capacities.
     */
    Thread.sleep(WAIT_FOR_HEARTBEATS);
    final DatanodeManager dm = cluster.getNamesystem().getBlockManager().getDatanodeManager();

    final long origCapacity = DFSTestUtil.getLiveDatanodeCapacity(dm);
    long dnCapacity = DFSTestUtil.getDatanodeCapacity(dm, 0);

    File dn1Vol1 = new File(dataDir, "data" + (2 * 0 + 1));
    File dn2Vol1 = new File(dataDir, "data" + (2 * 1 + 1));
    File dn3Vol1 = new File(dataDir, "data" + (2 * 2 + 1));
    File dn3Vol2 = new File(dataDir, "data" + (2 * 2 + 2));

    /*
     * Make the 1st volume directories on the first two datanodes
     * non-accessible.  We don't make all three 1st volume directories
     * readonly since that would cause the entire pipeline to
     * fail. The client does not retry failed nodes even though
     * perhaps they could succeed because just a single volume failed.
     */
    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, false));
    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false));

    /*
     * Create file1 and wait for 3 replicas (ie all DNs can still
     * store a block).  Then assert that all DNs are up, despite the
     * volume failures.
     */
    Path file1 = new Path("/test1");
    DFSTestUtil.createFile(fs, file1, 1024, (short) 3, 1L);
    DFSTestUtil.waitReplication(fs, file1, (short) 3);
    ArrayList<DataNode> dns = cluster.getDataNodes();
    assertTrue("DN1 should be up", dns.get(0).isDatanodeUp());
    assertTrue("DN2 should be up", dns.get(1).isDatanodeUp());
    assertTrue("DN3 should be up", dns.get(2).isDatanodeUp());

    /*
     * The metrics should confirm the volume failures.
     */
    assertCounter("VolumeFailures", 1L, getMetrics(dns.get(0).getMetrics().name()));
    assertCounter("VolumeFailures", 1L, getMetrics(dns.get(1).getMetrics().name()));
    assertCounter("VolumeFailures", 0L, getMetrics(dns.get(2).getMetrics().name()));

    // Ensure we wait a sufficient amount of time
    assert (WAIT_FOR_HEARTBEATS * 10) > WAIT_FOR_DEATH;

    // Eventually the NN should report two volume failures
    DFSTestUtil.waitForDatanodeStatus(
        dm, 3, 0, 2, origCapacity - (1 * dnCapacity), WAIT_FOR_HEARTBEATS);

    /*
     * Now fail a volume on the third datanode. We should be able to get
     * three replicas since we've already identified the other failures.
     */
    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol1, false));
    Path file2 = new Path("/test2");
    DFSTestUtil.createFile(fs, file2, 1024, (short) 3, 1L);
    DFSTestUtil.waitReplication(fs, file2, (short) 3);
    assertTrue("DN3 should still be up", dns.get(2).isDatanodeUp());
    assertCounter("VolumeFailures", 1L, getMetrics(dns.get(2).getMetrics().name()));

    ArrayList<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
    ArrayList<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
    dm.fetchDatanodes(live, dead, false);
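    // clear and fetch again to get a fresh snapshot of the datanode reports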
    live.clear();
    dead.clear();
    dm.fetchDatanodes(live, dead, false);
    assertEquals("DN3 should have 1 failed volume", 1, live.get(2).getVolumeFailures());

    /*
     * Once the datanodes have a chance to heartbeat their new capacity the
     * total capacity should be down by three volumes (assuming the host
     * did not grow or shrink the data volume while the test was running).
     */
    dnCapacity = DFSTestUtil.getDatanodeCapacity(dm, 0);
    DFSTestUtil.waitForDatanodeStatus(
        dm, 3, 0, 3, origCapacity - (3 * dnCapacity), WAIT_FOR_HEARTBEATS);

    /*
     * Now fail the 2nd volume on the 3rd datanode. All its volumes
     * are now failed and so it should report two volume failures
     * and that it's no longer up. Only wait for two replicas since
     * we'll never get a third.
     */
    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol2, false));
    Path file3 = new Path("/test3");
    DFSTestUtil.createFile(fs, file3, 1024, (short) 3, 1L);
    DFSTestUtil.waitReplication(fs, file3, (short) 2);

    // The DN should consider itself dead
    DFSTestUtil.waitForDatanodeDeath(dns.get(2));

    // And report two failed volumes
    assertCounter("VolumeFailures", 2L, getMetrics(dns.get(2).getMetrics().name()));

    // The NN considers the DN dead
    DFSTestUtil.waitForDatanodeStatus(
        dm, 2, 1, 2, origCapacity - (4 * dnCapacity), WAIT_FOR_HEARTBEATS);

    /*
     * The datanode never tries to restore the failed volume, even if
     * it's subsequently repaired, but it should see this volume on
     * restart, so file creation should be able to succeed after
     * restoring the data directories and restarting the datanodes.
     */
    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, true));
    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, true));
    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol1, true));
    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol2, true));
    cluster.restartDataNodes();
    cluster.waitActive();
    Path file4 = new Path("/test4");
    DFSTestUtil.createFile(fs, file4, 1024, (short) 3, 1L);
    DFSTestUtil.waitReplication(fs, file4, (short) 3);

    /*
     * Eventually the capacity should be restored to its original value,
     * and the volume failure count should be reported as zero by
     * both the metrics and the NN.
     */
    DFSTestUtil.waitForDatanodeStatus(dm, 3, 0, 0, origCapacity, WAIT_FOR_HEARTBEATS);
  }
  /**
   * Test that when there is a failure replicating a block the temporary and meta files are cleaned
   * up and subsequent replication succeeds.
   */
  @Test
  public void testReplicationError() throws Exception {
    // create a file of replication factor of 1
    final Path fileName = new Path("/test.txt");
    final int fileLen = 1;
    DFSTestUtil.createFile(fs, fileName, 1, (short) 1, 1L);
    DFSTestUtil.waitReplication(fs, fileName, (short) 1);

    // get the block belonging to the created file
    LocatedBlocks blocks =
        NameNodeAdapter.getBlockLocations(
            cluster.getNameNode(), fileName.toString(), 0, (long) fileLen);
    assertEquals("Should only find 1 block", blocks.locatedBlockCount(), 1);
    LocatedBlock block = blocks.get(0);

    // bring up a second datanode
    cluster.startDataNodes(conf, 1, true, null, null);
    cluster.waitActive();
    final int sndNode = 1;
    DataNode datanode = cluster.getDataNodes().get(sndNode);

    // replicate the block to the second datanode
    InetSocketAddress target = datanode.getXferAddress();
    Socket s = new Socket(target.getAddress(), target.getPort());
    // write the header.
    DataOutputStream out = new DataOutputStream(s.getOutputStream());

    DataChecksum checksum = DataChecksum.newDataChecksum(DataChecksum.Type.CRC32, 512);
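    // send only the write-block header; the connection is closed before any block data follows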
    new Sender(out)
        .writeBlock(
            block.getBlock(),
            StorageType.DEFAULT,
            BlockTokenSecretManager.DUMMY_TOKEN,
            "",
            new DatanodeInfo[0],
            new StorageType[0],
            null,
            BlockConstructionStage.PIPELINE_SETUP_CREATE,
            1,
            0L,
            0L,
            0L,
            checksum,
            CachingStrategy.newDefaultStrategy(),
            false);
    out.flush();

    // close the connection before sending the content of the block
    out.close();

    // the temporary block & meta files should be deleted
    String bpid = cluster.getNamesystem().getBlockPoolId();
    File storageDir = cluster.getInstanceStorageDir(sndNode, 0);
    File dir1 = MiniDFSCluster.getRbwDir(storageDir, bpid);
    storageDir = cluster.getInstanceStorageDir(sndNode, 1);
    File dir2 = MiniDFSCluster.getRbwDir(storageDir, bpid);
    while (dir1.listFiles().length != 0 || dir2.listFiles().length != 0) {
      Thread.sleep(100);
    }

    // then increase the file's replication factor
    fs.setReplication(fileName, (short) 2);
    // replication should succeed
    DFSTestUtil.waitReplication(fs, fileName, (short) 2);

    // clean up the file
    fs.delete(fileName, false);
  }