  /** Test that there are under-replicated blocks after volume failures. */
  @Test
  public void testUnderReplicationAfterVolFailure() throws Exception {
    // The test uses DataNodeTestUtils#injectDataDirFailure() to simulate
    // volume failures, which is currently not supported on Windows.
    assumeTrue(!Path.WINDOWS);

    // Bring up one more datanode
    cluster.startDataNodes(conf, 1, true, null, null);
    cluster.waitActive();

    final BlockManager bm = cluster.getNamesystem().getBlockManager();

    Path file1 = new Path("/test1");
    DFSTestUtil.createFile(fs, file1, 1024, (short) 3, 1L);
    DFSTestUtil.waitReplication(fs, file1, (short) 3);

    // Fail the first volume on both datanodes
    File dn1Vol1 = new File(dataDir, "data" + (2 * 0 + 1));
    File dn2Vol1 = new File(dataDir, "data" + (2 * 1 + 1));
    DataNodeTestUtils.injectDataDirFailure(dn1Vol1, dn2Vol1);

    Path file2 = new Path("/test2");
    DFSTestUtil.createFile(fs, file2, 1024, (short) 3, 1L);
    DFSTestUtil.waitReplication(fs, file2, (short) 3);

    // underReplicatedBlocks are due to failed volumes
    int underReplicatedBlocks =
        BlockManagerTestUtil.checkHeartbeatAndGetUnderReplicatedBlocksCount(
            cluster.getNamesystem(), bm);
    assertTrue("There is no under-replicated block after volume failure",
        underReplicatedBlocks > 0);
  }
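  /*
   * A minimal sketch, not part of the original test, of how the single
   * under-replication sample above could be retried until it becomes
   * positive, which makes the assertion less timing-sensitive on slow hosts.
   * It reuses only calls that already appear in the test; the helper name
   * and the polling interval are illustrative assumptions.
   */
  private static int waitForUnderReplicatedBlocks(MiniDFSCluster cluster,
      BlockManager bm, long timeoutMs) throws Exception {
    final long deadline = Time.monotonicNow() + timeoutMs;
    int count = 0;
    while (Time.monotonicNow() < deadline) {
      count = BlockManagerTestUtil.checkHeartbeatAndGetUnderReplicatedBlocksCount(
          cluster.getNamesystem(), bm);
      if (count > 0) {
        break; // failed volumes have produced under-replicated blocks
      }
      Thread.sleep(100);
    }
    return count;
  }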
  /** Test that the NN re-learns of volume failures after restart. */
  @Test
  public void testVolFailureStatsPreservedOnNNRestart() throws Exception {
    assumeTrue(!System.getProperty("os.name").startsWith("Windows"));

    // Bring up two more datanodes that can tolerate 1 failure
    cluster.startDataNodes(conf, 2, true, null, null);
    cluster.waitActive();

    final DatanodeManager dm =
        cluster.getNamesystem().getBlockManager().getDatanodeManager();
    long origCapacity = DFSTestUtil.getLiveDatanodeCapacity(dm);
    long dnCapacity = DFSTestUtil.getDatanodeCapacity(dm, 0);

    // Fail the first volume on both datanodes (we have to keep the
    // third healthy so one node in the pipeline will not fail).
    File dn1Vol1 = new File(dataDir, "data" + (2 * 0 + 1));
    File dn2Vol1 = new File(dataDir, "data" + (2 * 1 + 1));
    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, false));
    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false));

    Path file1 = new Path("/test1");
    DFSTestUtil.createFile(fs, file1, 1024, (short) 2, 1L);
    DFSTestUtil.waitReplication(fs, file1, (short) 2);

    // The NN reports two volume failures
    DFSTestUtil.waitForDatanodeStatus(dm, 3, 0, 2,
        origCapacity - (1 * dnCapacity), WAIT_FOR_HEARTBEATS);

    // After restarting the NN it still sees the two failures
    cluster.restartNameNode(0);
    cluster.waitActive();
    DFSTestUtil.waitForDatanodeStatus(dm, 3, 0, 2,
        origCapacity - (1 * dnCapacity), WAIT_FOR_HEARTBEATS);
  }
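  /*
   * Hedged sketch: the tests in this class build volume paths by hand as
   * "data" + (2 * dnIndex + 1). A small helper like the one below could
   * centralize that arithmetic. The helper name is an assumption; it relies
   * on MiniDFSCluster's default layout of two "dataN" directories per
   * datanode under the shared dataDir, as the surrounding tests already do.
   */
  private static File volumeDir(File dataDir, int dnIndex, int volIndex) {
    // dnIndex and volIndex are zero-based; directories are named data1, data2, ...
    return new File(dataDir, "data" + (2 * dnIndex + volIndex + 1));
  }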
  /**
   * This test starts a cluster with the given datanode capacities and racks,
   * fills it to be 30% full, then adds one new empty node and starts balancing.
   *
   * @param capacities capacities of the existing datanodes
   * @param racks racks of the existing datanodes
   * @param newCapacity new node's capacity
   * @param newRack new node's rack
   */
  private void test(long[] capacities, String[] racks, long newCapacity,
      String newRack) throws Exception {
    int numOfDatanodes = capacities.length;
    assertEquals(numOfDatanodes, racks.length);
    cluster = new MiniDFSCluster(0, CONF, capacities.length, true, true, null,
        racks, capacities);
    try {
      cluster.waitActive();
      client = DFSClient.createNamenode(CONF);

      long totalCapacity = 0L;
      for (long capacity : capacities) {
        totalCapacity += capacity;
      }

      // fill up the cluster to be 30% full
      long totalUsedSpace = totalCapacity * 3 / 10;
      createFile(totalUsedSpace / numOfDatanodes, (short) numOfDatanodes);

      // start up an empty node with the same capacity and on the same rack
      cluster.startDataNodes(CONF, 1, true, null,
          new String[] {newRack}, new long[] {newCapacity});
      totalCapacity += newCapacity;

      // run balancer and validate results
      runBalancer(totalUsedSpace, totalCapacity);
    } finally {
      cluster.shutdown();
    }
  }
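  /*
   * Hedged usage sketch for the helper above: bring up a single 500 MB
   * datanode, then add one empty node of the same capacity on the same rack
   * and let the balancer spread the data. The capacity, rack string, and
   * method name are illustrative assumptions, not values from the original
   * test suite.
   */
  public void testBalancerWithOneNewEmptyNode() throws Exception {
    final long capacity = 500L * 1024 * 1024; // 500 MB per datanode
    final String rack = "/default/rack0";
    test(new long[] {capacity}, new String[] {rack}, capacity, rack);
  }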
  /** Run fsck against a healthy cluster, then against the same cluster with no datanodes. */
  public void testFsck() throws Exception {
    DFSTestUtil util = new DFSTestUtil("TestFsck", 20, 3, 8 * 1024);
    MiniDFSCluster cluster = null;
    FileSystem fs = null;
    try {
      Configuration conf = new Configuration();
      conf.setLong("dfs.blockreport.intervalMsec", 10000L);
      cluster = new MiniDFSCluster(conf, 4, true, null);
      fs = cluster.getFileSystem();
      util.createFiles(fs, "/srcdat");
      util.waitReplication(fs, "/srcdat", (short) 3);
      String outStr = runFsck(conf, 0, true, "/");
      assertTrue(outStr.contains(NamenodeFsck.HEALTHY_STATUS));
      System.out.println(outStr);
      if (fs != null) {
        try {
          fs.close();
        } catch (Exception e) {
        }
      }
      cluster.shutdown();

      // restart the cluster; bring up the namenode but not the datanodes
      cluster = new MiniDFSCluster(conf, 0, false, null);
      outStr = runFsck(conf, 1, true, "/");
      // expect the result to be corrupt
      assertTrue(outStr.contains(NamenodeFsck.CORRUPT_STATUS));
      System.out.println(outStr);

      // bring up the datanodes & clean up the cluster
      cluster.startDataNodes(conf, 4, true, null, null);
      cluster.waitActive();
      cluster.waitClusterUp();
      fs = cluster.getFileSystem();
      util.cleanup(fs, "/srcdat");
    } finally {
      if (fs != null) {
        try {
          fs.close();
        } catch (Exception e) {
        }
      }
      if (cluster != null) {
        cluster.shutdown();
      }
    }
  }
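  /*
   * Minimal sketch of a convenience wrapper around runFsck(...) as it is used
   * above: run fsck on one path and assert that the HEALTHY marker is present.
   * The wrapper name is hypothetical; runFsck and NamenodeFsck.HEALTHY_STATUS
   * are taken from the surrounding test class.
   */
  private void assertFsckHealthy(Configuration conf, String path) throws Exception {
    String outStr = runFsck(conf, 0, true, path);
    assertTrue("Expected fsck to report a healthy path: " + path,
        outStr.contains(NamenodeFsck.HEALTHY_STATUS));
  }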
  /**
   * Checks whether {@link DataNode#checkDiskErrorAsync()} is being called or not.
   * Before refactoring the code, the above function was not getting called.
   *
   * @throws IOException
   * @throws InterruptedException
   */
  @Test
  public void testcheckDiskError() throws IOException, InterruptedException {
    if (cluster.getDataNodes().size() <= 0) {
      cluster.startDataNodes(conf, 1, true, null, null);
      cluster.waitActive();
    }
    DataNode dataNode = cluster.getDataNodes().get(0);
    long slackTime = dataNode.checkDiskErrorInterval / 2;
    // checking for disk error
    dataNode.checkDiskErrorAsync();
    Thread.sleep(dataNode.checkDiskErrorInterval);

    long lastDiskErrorCheck = dataNode.getLastDiskErrorCheck();
    assertTrue("Disk Error check is not performed within "
            + dataNode.checkDiskErrorInterval + " ms",
        (Time.monotonicNow() - lastDiskErrorCheck)
            < (dataNode.checkDiskErrorInterval + slackTime));
  }
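  /*
   * Hedged sketch: instead of the fixed Thread.sleep above, the disk-check
   * timestamp could be polled until it advances. Only members already used in
   * the test (getLastDiskErrorCheck, Time.monotonicNow) are reused; the
   * helper name and polling interval are illustrative.
   */
  private static boolean waitForDiskErrorCheck(DataNode dn, long timeoutMs)
      throws InterruptedException {
    final long start = Time.monotonicNow();
    final long before = dn.getLastDiskErrorCheck();
    while (Time.monotonicNow() - start < timeoutMs) {
      if (dn.getLastDiskErrorCheck() != before) {
        return true; // the async disk check has completed at least once
      }
      Thread.sleep(100);
    }
    return false;
  }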
  /** Test to check that a DN goes down when all its volumes have failed. */
  @Test
  public void testShutdown() throws Exception {
    if (System.getProperty("os.name").startsWith("Windows")) {
      /**
       * This test depends on the OS not allowing file creations on a directory
       * that does not have write permissions for the user. Apparently it is not
       * the case on Windows (at least under Cygwin), and possibly AIX. This is
       * disabled on Windows.
       */
      return;
    }
    // Bring up two more datanodes
    cluster.startDataNodes(conf, 2, true, null, null);
    cluster.waitActive();
    final int dnIndex = 0;
    String bpid = cluster.getNamesystem().getBlockPoolId();
    File storageDir = cluster.getInstanceStorageDir(dnIndex, 0);
    File dir1 = MiniDFSCluster.getRbwDir(storageDir, bpid);
    storageDir = cluster.getInstanceStorageDir(dnIndex, 1);
    File dir2 = MiniDFSCluster.getRbwDir(storageDir, bpid);
    try {
      // make the data directories of the first datanode read-only
      assertTrue("Couldn't chmod local vol", dir1.setReadOnly());
      assertTrue("Couldn't chmod local vol", dir2.setReadOnly());

      // create files and make sure that the first datanode will be down
      DataNode dn = cluster.getDataNodes().get(dnIndex);
      for (int i = 0; dn.isDatanodeUp(); i++) {
        Path fileName = new Path("/test.txt" + i);
        DFSTestUtil.createFile(fs, fileName, 1024, (short) 2, 1L);
        DFSTestUtil.waitReplication(fs, fileName, (short) 2);
        fs.delete(fileName, true);
      }
    } finally {
      // restore the old permissions
      FileUtil.setWritable(dir1, true);
      FileUtil.setWritable(dir2, true);
    }
  }
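  /*
   * Hedged sketch of the permission dance used in the try/finally above:
   * mark a set of directories read-only, run an action, and always restore
   * writability afterwards. FileUtil.setWritable and setReadOnly come from
   * the surrounding test; the helper name and the use of Runnable are
   * illustrative assumptions.
   */
  private static void withReadOnlyDirs(Runnable action, File... dirs)
      throws Exception {
    try {
      for (File dir : dirs) {
        assertTrue("Couldn't chmod local vol " + dir, dir.setReadOnly());
      }
      action.run();
    } finally {
      for (File dir : dirs) {
        FileUtil.setWritable(dir, true); // restore the old permission
      }
    }
  }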
  /**
   * Test that individual volume failures do not cause DNs to fail, that all
   * volumes failed on a single datanode do cause it to fail, and that the
   * capacities and liveness are adjusted correctly in the NN.
   */
  @Test
  public void testSuccessiveVolumeFailures() throws Exception {
    assumeTrue(!System.getProperty("os.name").startsWith("Windows"));

    // Bring up two more datanodes
    cluster.startDataNodes(conf, 2, true, null, null);
    cluster.waitActive();

    /*
     * Calculate the total capacity of all the datanodes. Sleep for
     * three seconds to be sure the datanodes have had a chance to
     * heartbeat their capacities.
     */
    Thread.sleep(WAIT_FOR_HEARTBEATS);
    final DatanodeManager dm =
        cluster.getNamesystem().getBlockManager().getDatanodeManager();
    final long origCapacity = DFSTestUtil.getLiveDatanodeCapacity(dm);
    long dnCapacity = DFSTestUtil.getDatanodeCapacity(dm, 0);

    File dn1Vol1 = new File(dataDir, "data" + (2 * 0 + 1));
    File dn2Vol1 = new File(dataDir, "data" + (2 * 1 + 1));
    File dn3Vol1 = new File(dataDir, "data" + (2 * 2 + 1));
    File dn3Vol2 = new File(dataDir, "data" + (2 * 2 + 2));

    /*
     * Make the 1st volume directories on the first two datanodes
     * non-accessible. We don't make all three 1st volume directories
     * readonly since that would cause the entire pipeline to
     * fail. The client does not retry failed nodes even though
     * perhaps they could succeed because just a single volume failed.
     */
    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, false));
    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false));

    /*
     * Create file1 and wait for 3 replicas (i.e. all DNs can still
     * store a block). Then assert that all DNs are up, despite the
     * volume failures.
     */
    Path file1 = new Path("/test1");
    DFSTestUtil.createFile(fs, file1, 1024, (short) 3, 1L);
    DFSTestUtil.waitReplication(fs, file1, (short) 3);

    ArrayList<DataNode> dns = cluster.getDataNodes();
    assertTrue("DN1 should be up", dns.get(0).isDatanodeUp());
    assertTrue("DN2 should be up", dns.get(1).isDatanodeUp());
    assertTrue("DN3 should be up", dns.get(2).isDatanodeUp());

    // The metrics should confirm the volume failures.
    assertCounter("VolumeFailures", 1L, getMetrics(dns.get(0).getMetrics().name()));
    assertCounter("VolumeFailures", 1L, getMetrics(dns.get(1).getMetrics().name()));
    assertCounter("VolumeFailures", 0L, getMetrics(dns.get(2).getMetrics().name()));

    // Ensure we wait a sufficient amount of time
    assert (WAIT_FOR_HEARTBEATS * 10) > WAIT_FOR_DEATH;

    // Eventually the NN should report two volume failures
    DFSTestUtil.waitForDatanodeStatus(dm, 3, 0, 2,
        origCapacity - (1 * dnCapacity), WAIT_FOR_HEARTBEATS);

    /*
     * Now fail a volume on the third datanode. We should be able to get
     * three replicas since we've already identified the other failures.
     */
    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol1, false));
    Path file2 = new Path("/test2");
    DFSTestUtil.createFile(fs, file2, 1024, (short) 3, 1L);
    DFSTestUtil.waitReplication(fs, file2, (short) 3);

    assertTrue("DN3 should still be up", dns.get(2).isDatanodeUp());
    assertCounter("VolumeFailures", 1L, getMetrics(dns.get(2).getMetrics().name()));

    ArrayList<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
    ArrayList<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
    dm.fetchDatanodes(live, dead, false);
    live.clear();
    dead.clear();
    dm.fetchDatanodes(live, dead, false);
    assertEquals("DN3 should have 1 failed volume", 1, live.get(2).getVolumeFailures());

    /*
     * Once the datanodes have a chance to heartbeat their new capacity the
     * total capacity should be down by three volumes (assuming the host
     * did not grow or shrink the data volume while the test was running).
     */
    dnCapacity = DFSTestUtil.getDatanodeCapacity(dm, 0);
    DFSTestUtil.waitForDatanodeStatus(dm, 3, 0, 3,
        origCapacity - (3 * dnCapacity), WAIT_FOR_HEARTBEATS);

    /*
     * Now fail the 2nd volume on the 3rd datanode. All its volumes
     * are now failed, so it should report two volume failures and
     * no longer be up. Only wait for two replicas since we'll never
     * get a third.
     */
    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol2, false));
    Path file3 = new Path("/test3");
    DFSTestUtil.createFile(fs, file3, 1024, (short) 3, 1L);
    DFSTestUtil.waitReplication(fs, file3, (short) 2);

    // The DN should consider itself dead
    DFSTestUtil.waitForDatanodeDeath(dns.get(2));

    // And report two failed volumes
    assertCounter("VolumeFailures", 2L, getMetrics(dns.get(2).getMetrics().name()));

    // The NN considers the DN dead
    DFSTestUtil.waitForDatanodeStatus(dm, 2, 1, 2,
        origCapacity - (4 * dnCapacity), WAIT_FOR_HEARTBEATS);

    /*
     * The datanode never tries to restore the failed volume, even if
     * it's subsequently repaired, but it should see this volume on
     * restart, so file creation should be able to succeed after
     * restoring the data directories and restarting the datanodes.
     */
    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, true));
    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, true));
    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol1, true));
    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol2, true));
    cluster.restartDataNodes();
    cluster.waitActive();

    Path file4 = new Path("/test4");
    DFSTestUtil.createFile(fs, file4, 1024, (short) 3, 1L);
    DFSTestUtil.waitReplication(fs, file4, (short) 3);

    /*
     * Eventually the capacity should be restored to its original value,
     * and the volume failure count should be reported as zero by both
     * the metrics and the NN.
     */
    DFSTestUtil.waitForDatanodeStatus(dm, 3, 0, 0, origCapacity, WAIT_FOR_HEARTBEATS);
  }
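  /*
   * Hedged sketch of the fetchDatanodes(...) pattern used above: collect the
   * live DatanodeDescriptors and return the failed-volume count of one of
   * them. Only calls that appear in the test are reused; the helper name is
   * hypothetical.
   */
  private static int failedVolumesOf(DatanodeManager dm, int dnIndex) {
    ArrayList<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
    ArrayList<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
    dm.fetchDatanodes(live, dead, false);
    return live.get(dnIndex).getVolumeFailures();
  }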
  /**
   * Test that when there is a failure replicating a block, the temporary and
   * meta files are cleaned up and subsequent replication succeeds.
   */
  @Test
  public void testReplicationError() throws Exception {
    // create a file with a replication factor of 1
    final Path fileName = new Path("/test.txt");
    final int fileLen = 1;
    DFSTestUtil.createFile(fs, fileName, 1, (short) 1, 1L);
    DFSTestUtil.waitReplication(fs, fileName, (short) 1);

    // get the block belonging to the created file
    LocatedBlocks blocks = NameNodeAdapter.getBlockLocations(
        cluster.getNameNode(), fileName.toString(), 0, (long) fileLen);
    assertEquals("Should only find 1 block", 1, blocks.locatedBlockCount());
    LocatedBlock block = blocks.get(0);

    // bring up a second datanode
    cluster.startDataNodes(conf, 1, true, null, null);
    cluster.waitActive();
    final int sndNode = 1;
    DataNode datanode = cluster.getDataNodes().get(sndNode);

    // replicate the block to the second datanode
    InetSocketAddress target = datanode.getXferAddress();
    Socket s = new Socket(target.getAddress(), target.getPort());

    // write the header
    DataOutputStream out = new DataOutputStream(s.getOutputStream());
    DataChecksum checksum = DataChecksum.newDataChecksum(DataChecksum.Type.CRC32, 512);
    new Sender(out).writeBlock(
        block.getBlock(),
        StorageType.DEFAULT,
        BlockTokenSecretManager.DUMMY_TOKEN,
        "",
        new DatanodeInfo[0],
        new StorageType[0],
        null,
        BlockConstructionStage.PIPELINE_SETUP_CREATE,
        1, 0L, 0L, 0L,
        checksum,
        CachingStrategy.newDefaultStrategy(),
        false);
    out.flush();

    // close the connection before sending the content of the block
    out.close();

    // the temporary block & meta files should be deleted
    String bpid = cluster.getNamesystem().getBlockPoolId();
    File storageDir = cluster.getInstanceStorageDir(sndNode, 0);
    File dir1 = MiniDFSCluster.getRbwDir(storageDir, bpid);
    storageDir = cluster.getInstanceStorageDir(sndNode, 1);
    File dir2 = MiniDFSCluster.getRbwDir(storageDir, bpid);
    while (dir1.listFiles().length != 0 || dir2.listFiles().length != 0) {
      Thread.sleep(100);
    }

    // then increase the file's replication factor
    fs.setReplication(fileName, (short) 2);
    // replication should succeed, so wait for the second replica
    DFSTestUtil.waitReplication(fs, fileName, (short) 2);

    // clean up the file
    fs.delete(fileName, false);
  }
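  /*
   * Hedged sketch: the unbounded while-loop above that waits for the rbw
   * directories to empty could be bounded with a timeout so a regression
   * fails the test instead of hanging it. Only File.listFiles and
   * Thread.sleep are reused; the helper name and timeout are illustrative.
   */
  private static void waitForEmptyDirs(long timeoutMs, File... dirs)
      throws InterruptedException {
    final long deadline = System.currentTimeMillis() + timeoutMs;
    while (System.currentTimeMillis() < deadline) {
      boolean allEmpty = true;
      for (File dir : dirs) {
        File[] files = dir.listFiles();
        if (files != null && files.length != 0) {
          allEmpty = false;
          break;
        }
      }
      if (allEmpty) {
        return; // temporary block and meta files have been cleaned up
      }
      Thread.sleep(100);
    }
    throw new AssertionError("Directories not empty after " + timeoutMs + " ms");
  }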