@Test
public void testDnFencing() throws Exception {
  // Create a file with replication level 3.
  DFSTestUtil.createFile(fs, TEST_FILE_PATH, 30 * SMALL_BLOCK, (short) 3, 1L);
  ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, TEST_FILE_PATH);

  // Drop its replication count to 1, so it becomes over-replicated.
  // Then compute the invalidation of the extra blocks and trigger
  // heartbeats so the invalidations are flushed to the DNs.
  nn1.getRpcServer().setReplication(TEST_FILE, (short) 1);
  BlockManagerTestUtil.computeInvalidationWork(
      nn1.getNamesystem().getBlockManager());
  cluster.triggerHeartbeats();

  // Transition nn2 to active even though nn1 still thinks it's active.
  banner("Failing to NN2 but let NN1 continue to think it's active");
  NameNodeAdapter.abortEditLogs(nn1);
  NameNodeAdapter.enterSafeMode(nn1, false);
  cluster.transitionToActive(1);

  // Check that the standby picked up the replication change.
  assertEquals(1, nn2.getRpcServer().getFileInfo(TEST_FILE).getReplication());

  // Dump some info for debugging purposes.
  banner("NN2 Metadata immediately after failover");
  doMetasave(nn2);

  // Even though NN2 considers the blocks over-replicated, it should
  // postpone the block invalidation because the DNs are still "stale".
  assertEquals(30, nn2.getNamesystem().getPostponedMisreplicatedBlocks());

  banner("Triggering heartbeats and block reports so that fencing is completed");
  cluster.triggerHeartbeats();
  cluster.triggerBlockReports();

  banner("Metadata after nodes have all block-reported");
  doMetasave(nn2);

  // The blocks should no longer be postponed.
  assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks());

  // Wait for NN2 to enact its deletions (replication monitor has to run, etc).
  BlockManagerTestUtil.computeInvalidationWork(
      nn2.getNamesystem().getBlockManager());
  cluster.triggerHeartbeats();
  HATestUtil.waitForDNDeletions(cluster);
  cluster.triggerDeletionReports();
  assertEquals(0, nn2.getNamesystem().getUnderReplicatedBlocks());
  assertEquals(0, nn2.getNamesystem().getPendingReplicationBlocks());

  banner("Making sure the file is still readable");
  FileSystem fs2 = cluster.getFileSystem(1);
  DFSTestUtil.readFile(fs2, TEST_FILE_PATH);

  banner("Waiting for the actual block files to get deleted from DNs.");
  waitForTrueReplication(cluster, block, 1);
}
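// The waitForTrueReplication helper called above is not part of this
// excerpt. A minimal sketch of what it could look like, assuming
// GenericTestUtils.waitFor and a hypothetical getTrueReplication helper
// that counts the replicas physically present on the cluster's DataNodes
// (rather than trusting the NameNode's metadata):
private void waitForTrueReplication(final MiniDFSCluster cluster,
    final ExtendedBlock block, final int waitFor) throws Exception {
  // Poll every 500ms, timing out after 10 seconds.
  GenericTestUtils.waitFor(new Supplier<Boolean>() {
    @Override
    public Boolean get() {
      try {
        return getTrueReplication(cluster, block) == waitFor;
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  }, 500, 10000);
}

// Hypothetical helper: counts how many DNs still hold an actual replica
// of the block on disk, regardless of what the NN metadata says.
private int getTrueReplication(MiniDFSCluster cluster, ExtendedBlock block)
    throws IOException {
  int count = 0;
  for (DataNode dn : cluster.getDataNodes()) {
    if (DataNodeTestUtils.getFSDataset(dn).getStoredBlock(
        block.getBlockPoolId(), block.getBlockId()) != null) {
      count++;
    }
  }
  return count;
}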
/**
 * Test case that reduces replication of a file with a lot of blocks
 * and then fails over right after those blocks enter the DN invalidation
 * queues on the active. Ensures that fencing is correct and no replicas
 * are lost.
 */
@Test
public void testNNClearsCommandsOnFailoverWithReplChanges() throws Exception {
  // Make lots of blocks to increase chances of triggering a bug.
  DFSTestUtil.createFile(fs, TEST_FILE_PATH, 30 * SMALL_BLOCK, (short) 1, 1L);

  banner("rolling NN1's edit log, forcing catch-up");
  HATestUtil.waitForStandbyToCatchUp(nn1, nn2);

  // Get some new replicas reported so that NN2 now considers
  // them over-replicated and schedules some more deletions.
  nn1.getRpcServer().setReplication(TEST_FILE, (short) 2);
  while (BlockManagerTestUtil.getComputedDatanodeWork(
      nn1.getNamesystem().getBlockManager()) > 0) {
    LOG.info("Getting more replication work computed");
  }
  BlockManager bm1 = nn1.getNamesystem().getBlockManager();
  while (bm1.getPendingReplicationBlocksCount() > 0) {
    BlockManagerTestUtil.updateState(bm1);
    cluster.triggerHeartbeats();
    Thread.sleep(1000);
  }

  banner("triggering BRs");
  cluster.triggerBlockReports();

  nn1.getRpcServer().setReplication(TEST_FILE, (short) 1);

  banner("computing invalidation on nn1");
  BlockManagerTestUtil.computeInvalidationWork(
      nn1.getNamesystem().getBlockManager());
  doMetasave(nn1);

  banner("computing invalidation on nn2");
  BlockManagerTestUtil.computeInvalidationWork(
      nn2.getNamesystem().getBlockManager());
  doMetasave(nn2);

  // Dump some info for debugging purposes.
  banner("Metadata immediately before failover");
  doMetasave(nn2);

  // Transition nn2 to active even though nn1 still thinks it's active.
  banner("Failing to NN2 but let NN1 continue to think it's active");
  NameNodeAdapter.abortEditLogs(nn1);
  NameNodeAdapter.enterSafeMode(nn1, false);
  BlockManagerTestUtil.computeInvalidationWork(
      nn2.getNamesystem().getBlockManager());
  cluster.transitionToActive(1);

  // Check that the standby picked up the replication change.
  assertEquals(1, nn2.getRpcServer().getFileInfo(TEST_FILE).getReplication());

  // Dump some info for debugging purposes.
  banner("Metadata immediately after failover");
  doMetasave(nn2);

  banner("Triggering heartbeats and block reports so that fencing is completed");
  cluster.triggerHeartbeats();
  cluster.triggerBlockReports();

  banner("Metadata after nodes have all block-reported");
  doMetasave(nn2);

  // The blocks should no longer be postponed.
  assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks());

  // Wait for NN2 to enact its deletions (replication monitor has to run, etc).
  BlockManagerTestUtil.computeInvalidationWork(
      nn2.getNamesystem().getBlockManager());
  HATestUtil.waitForNNToIssueDeletions(nn2);
  cluster.triggerHeartbeats();
  HATestUtil.waitForDNDeletions(cluster);
  cluster.triggerDeletionReports();
  assertEquals(0, nn2.getNamesystem().getUnderReplicatedBlocks());
  assertEquals(0, nn2.getNamesystem().getPendingReplicationBlocks());

  banner("Making sure the file is still readable");
  FileSystem fs2 = cluster.getFileSystem(1);
  DFSTestUtil.readFile(fs2, TEST_FILE_PATH);
}
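// doMetasave, used throughout these tests, is another helper outside this
// excerpt. A plausible sketch, assuming it simply dumps the BlockManager's
// metaSave output (block states, pending deletions, etc.) to stderr for
// debugging while holding the namesystem write lock:
private void doMetasave(NameNode nn) {
  nn.getNamesystem().writeLock();
  try {
    PrintWriter pw = new PrintWriter(System.err);
    nn.getNamesystem().getBlockManager().metaSave(pw);
    pw.flush();
  } finally {
    nn.getNamesystem().writeUnlock();
  }
}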
/**
 * Test case which restarts the standby node in such a way that,
 * when it exits safemode, it will want to invalidate a bunch of
 * over-replicated block replicas. Ensures that if we failover at
 * this point it won't lose data.
 */
@Test
public void testNNClearsCommandsOnFailoverAfterStartup() throws Exception {
  // Make lots of blocks to increase chances of triggering a bug.
  DFSTestUtil.createFile(fs, TEST_FILE_PATH, 30 * SMALL_BLOCK, (short) 3, 1L);

  banner("Shutting down NN2");
  cluster.shutdownNameNode(1);

  banner("Setting replication to 1, rolling edit log.");
  nn1.getRpcServer().setReplication(TEST_FILE, (short) 1);
  nn1.getRpcServer().rollEditLog();

  // Start NN2 again. When it starts up, it will see all of the
  // blocks as over-replicated, since it has the metadata for
  // replication=1, but the DNs haven't yet processed the deletions.
  banner("Starting NN2 again.");
  cluster.restartNameNode(1);
  nn2 = cluster.getNameNode(1);

  banner("triggering BRs");
  cluster.triggerBlockReports();

  // We expect that both NN1 and NN2 will have some number of
  // deletions queued up for the DNs.
  banner("computing invalidation on nn1");
  BlockManagerTestUtil.computeInvalidationWork(
      nn1.getNamesystem().getBlockManager());
  banner("computing invalidation on nn2");
  BlockManagerTestUtil.computeInvalidationWork(
      nn2.getNamesystem().getBlockManager());

  // Dump some info for debugging purposes.
  banner("Metadata immediately before failover");
  doMetasave(nn2);

  // Transition nn2 to active even though nn1 still thinks it's active.
  banner("Failing to NN2 but let NN1 continue to think it's active");
  NameNodeAdapter.abortEditLogs(nn1);
  NameNodeAdapter.enterSafeMode(nn1, false);
  cluster.transitionToActive(1);

  // Check that the standby picked up the replication change.
  assertEquals(1, nn2.getRpcServer().getFileInfo(TEST_FILE).getReplication());

  // Dump some info for debugging purposes.
  banner("Metadata immediately after failover");
  doMetasave(nn2);

  banner("Triggering heartbeats and block reports so that fencing is completed");
  cluster.triggerHeartbeats();
  cluster.triggerBlockReports();

  banner("Metadata after nodes have all block-reported");
  doMetasave(nn2);

  // The blocks should no longer be postponed.
  assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks());

  // Wait for NN2 to enact its deletions (replication monitor has to run, etc).
  BlockManagerTestUtil.computeInvalidationWork(
      nn2.getNamesystem().getBlockManager());
  HATestUtil.waitForNNToIssueDeletions(nn2);
  cluster.triggerHeartbeats();
  HATestUtil.waitForDNDeletions(cluster);
  cluster.triggerDeletionReports();
  assertEquals(0, nn2.getNamesystem().getUnderReplicatedBlocks());
  assertEquals(0, nn2.getNamesystem().getPendingReplicationBlocks());

  banner("Making sure the file is still readable");
  FileSystem fs2 = cluster.getFileSystem(1);
  DFSTestUtil.readFile(fs2, TEST_FILE_PATH);
}
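// banner is a trivial logging helper also not included in this excerpt; a
// minimal sketch that just makes the test phases easy to spot in the logs:
private void banner(String string) {
  LOG.info("\n\n\n\n================================================\n" +
      string + "\n" +
      "================================================\n\n\n");
}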