@Test public void testOpenFilesWithRename() throws Exception { Path path = new Path("/test"); doWriteAndAbort(fs, path); // check for zero sized blocks Path fileWithEmptyBlock = new Path("/test/test/test4"); fs.create(fileWithEmptyBlock); NamenodeProtocols nameNodeRpc = cluster.getNameNodeRpc(); String clientName = fs.getClient().getClientName(); // create one empty block nameNodeRpc.addBlock( fileWithEmptyBlock.toString(), clientName, null, null, HdfsConstants.GRANDFATHER_INODE_ID, null); fs.createSnapshot(path, "s2"); fs.rename(new Path("/test/test"), new Path("/test/test-renamed")); fs.delete(new Path("/test/test-renamed"), true); NameNode nameNode = cluster.getNameNode(); NameNodeAdapter.enterSafeMode(nameNode, false); NameNodeAdapter.saveNamespace(nameNode); NameNodeAdapter.leaveSafeMode(nameNode); cluster.restartNameNode(true); }
/** * Restart the cluster, optionally saving a new checkpoint. * * @param checkpoint boolean true to save a new checkpoint * @throws Exception if restart fails */ private static void restart(boolean checkpoint) throws Exception { NameNode nameNode = cluster.getNameNode(); if (checkpoint) { NameNodeAdapter.enterSafeMode(nameNode, false); NameNodeAdapter.saveNamespace(nameNode); } shutdown(); initCluster(false); }
@Test public void testDnFencing() throws Exception { // Create a file with replication level 3. DFSTestUtil.createFile(fs, TEST_FILE_PATH, 30 * SMALL_BLOCK, (short) 3, 1L); ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, TEST_FILE_PATH); // Drop its replication count to 1, so it becomes over-replicated. // Then compute the invalidation of the extra blocks and trigger // heartbeats so the invalidations are flushed to the DNs. nn1.getRpcServer().setReplication(TEST_FILE, (short) 1); BlockManagerTestUtil.computeInvalidationWork(nn1.getNamesystem().getBlockManager()); cluster.triggerHeartbeats(); // Transition nn2 to active even though nn1 still thinks it's active. banner("Failing to NN2 but let NN1 continue to think it's active"); NameNodeAdapter.abortEditLogs(nn1); NameNodeAdapter.enterSafeMode(nn1, false); cluster.transitionToActive(1); // Check that the standby picked up the replication change. assertEquals(1, nn2.getRpcServer().getFileInfo(TEST_FILE).getReplication()); // Dump some info for debugging purposes. banner("NN2 Metadata immediately after failover"); doMetasave(nn2); // Even though NN2 considers the blocks over-replicated, it should // post-pone the block invalidation because the DNs are still "stale". assertEquals(30, nn2.getNamesystem().getPostponedMisreplicatedBlocks()); banner("Triggering heartbeats and block reports so that fencing is completed"); cluster.triggerHeartbeats(); cluster.triggerBlockReports(); banner("Metadata after nodes have all block-reported"); doMetasave(nn2); // The blocks should no longer be postponed. assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks()); // Wait for NN2 to enact its deletions (replication monitor has to run, etc) BlockManagerTestUtil.computeInvalidationWork(nn2.getNamesystem().getBlockManager()); cluster.triggerHeartbeats(); HATestUtil.waitForDNDeletions(cluster); cluster.triggerDeletionReports(); assertEquals(0, nn2.getNamesystem().getUnderReplicatedBlocks()); assertEquals(0, nn2.getNamesystem().getPendingReplicationBlocks()); banner("Making sure the file is still readable"); FileSystem fs2 = cluster.getFileSystem(1); DFSTestUtil.readFile(fs2, TEST_FILE_PATH); banner("Waiting for the actual block files to get deleted from DNs."); waitForTrueReplication(cluster, block, 1); }
private void doTestMultipleSnapshots(boolean saveNamespace) throws IOException { Path path = new Path("/test"); doWriteAndAbort(fs, path); fs.createSnapshot(path, "s2"); fs.delete(new Path("/test/test"), true); fs.deleteSnapshot(path, "s2"); cluster.triggerBlockReports(); if (saveNamespace) { NameNode nameNode = cluster.getNameNode(); NameNodeAdapter.enterSafeMode(nameNode, false); NameNodeAdapter.saveNamespace(nameNode); NameNodeAdapter.leaveSafeMode(nameNode); } cluster.restartNameNode(true); }
@BeforeClass public static void setUp() throws Exception { Configuration conf = new Configuration(); conf.set("hadoop.security.auth_to_local", "RULE:[2:$1]"); dfsCluster = new MiniDFSCluster(conf, numSlaves, true, null); jConf = new JobConf(conf); mrCluster = new MiniMRCluster( 0, 0, numSlaves, dfsCluster.getFileSystem().getUri().toString(), 1, null, null, null, jConf); createTokenFileJson(); verifySecretKeysInJSONFile(); NameNodeAdapter.getDtSecretManager(dfsCluster.getNamesystem()).startThreads(); FileSystem fs = dfsCluster.getFileSystem(); p1 = new Path("file1"); p2 = new Path("file2"); p1 = fs.makeQualified(p1); }
@Test public void testWithCheckpoint() throws Exception { Path path = new Path("/test"); doWriteAndAbort(fs, path); fs.delete(new Path("/test/test"), true); NameNode nameNode = cluster.getNameNode(); NameNodeAdapter.enterSafeMode(nameNode, false); NameNodeAdapter.saveNamespace(nameNode); NameNodeAdapter.leaveSafeMode(nameNode); cluster.restartNameNode(true); // read snapshot file after restart String test2snapshotPath = Snapshot.getSnapshotPath(path.toString(), "s1/test/test2"); DFSTestUtil.readFile(fs, new Path(test2snapshotPath)); String test3snapshotPath = Snapshot.getSnapshotPath(path.toString(), "s1/test/test3"); DFSTestUtil.readFile(fs, new Path(test3snapshotPath)); }
/** * The following test first creates a file with a few blocks. It randomly truncates the replica of * the last block stored in each datanode. Finally, it triggers block synchronization to * synchronize all stored block. */ public void testBlockSynchronization() throws Exception { final int ORG_FILE_SIZE = 3000; Configuration conf = new HdfsConfiguration(); conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE); MiniDFSCluster cluster = null; try { cluster = new MiniDFSCluster.Builder(conf).numDataNodes(5).build(); cluster.waitActive(); // create a file DistributedFileSystem dfs = (DistributedFileSystem) cluster.getFileSystem(); String filestr = "/foo"; Path filepath = new Path(filestr); DFSTestUtil.createFile(dfs, filepath, ORG_FILE_SIZE, REPLICATION_NUM, 0L); assertTrue(dfs.exists(filepath)); DFSTestUtil.waitReplication(dfs, filepath, REPLICATION_NUM); // get block info for the last block LocatedBlock locatedblock = TestInterDatanodeProtocol.getLastLocatedBlock(dfs.dfs.getNamenode(), filestr); DatanodeInfo[] datanodeinfos = locatedblock.getLocations(); assertEquals(REPLICATION_NUM, datanodeinfos.length); // connect to data nodes DataNode[] datanodes = new DataNode[REPLICATION_NUM]; for (int i = 0; i < REPLICATION_NUM; i++) { datanodes[i] = cluster.getDataNode(datanodeinfos[i].getIpcPort()); assertTrue(datanodes[i] != null); } // verify Block Info ExtendedBlock lastblock = locatedblock.getBlock(); DataNode.LOG.info("newblocks=" + lastblock); for (int i = 0; i < REPLICATION_NUM; i++) { checkMetaInfo(lastblock, datanodes[i]); } DataNode.LOG.info("dfs.dfs.clientName=" + dfs.dfs.clientName); cluster.getNameNodeRpc().append(filestr, dfs.dfs.clientName); // expire lease to trigger block recovery. waitLeaseRecovery(cluster); Block[] updatedmetainfo = new Block[REPLICATION_NUM]; long oldSize = lastblock.getNumBytes(); lastblock = TestInterDatanodeProtocol.getLastLocatedBlock(dfs.dfs.getNamenode(), filestr).getBlock(); long currentGS = lastblock.getGenerationStamp(); for (int i = 0; i < REPLICATION_NUM; i++) { updatedmetainfo[i] = DataNodeTestUtils.getFSDataset(datanodes[i]) .getStoredBlock(lastblock.getBlockPoolId(), lastblock.getBlockId()); assertEquals(lastblock.getBlockId(), updatedmetainfo[i].getBlockId()); assertEquals(oldSize, updatedmetainfo[i].getNumBytes()); assertEquals(currentGS, updatedmetainfo[i].getGenerationStamp()); } // verify that lease recovery does not occur when namenode is in safemode System.out.println("Testing that lease recovery cannot happen during safemode."); filestr = "/foo.safemode"; filepath = new Path(filestr); dfs.create(filepath, (short) 1); cluster.getNameNodeRpc().setSafeMode(HdfsConstants.SafeModeAction.SAFEMODE_ENTER); assertTrue(dfs.dfs.exists(filestr)); DFSTestUtil.waitReplication(dfs, filepath, (short) 1); waitLeaseRecovery(cluster); // verify that we still cannot recover the lease LeaseManager lm = NameNodeAdapter.getLeaseManager(cluster.getNamesystem()); assertTrue("Found " + lm.countLease() + " lease, expected 1", lm.countLease() == 1); cluster.getNameNodeRpc().setSafeMode(HdfsConstants.SafeModeAction.SAFEMODE_LEAVE); } finally { if (cluster != null) { cluster.shutdown(); } } }
@Test public void testGetTokensForHftpFS() throws IOException, URISyntaxException { HftpFileSystem hfs = mock(HftpFileSystem.class); DelegationTokenSecretManager dtSecretManager = NameNodeAdapter.getDtSecretManager(dfsCluster.getNamesystem()); String renewer = "renewer"; jConf.set(JTConfig.JT_USER_NAME, renewer); DelegationTokenIdentifier dtId = new DelegationTokenIdentifier(new Text("user"), new Text(renewer), null); final Token<DelegationTokenIdentifier> t = new Token<DelegationTokenIdentifier>(dtId, dtSecretManager); final URI uri = new URI("hftp://host:2222/file1"); final String fs_addr = SecurityUtil.buildDTServiceName(uri, NameNode.DEFAULT_PORT); t.setService(new Text(fs_addr)); // when(hfs.getUri()).thenReturn(uri); Mockito.doAnswer( new Answer<URI>() { @Override public URI answer(InvocationOnMock invocation) throws Throwable { return uri; } }) .when(hfs) .getUri(); // when(hfs.getDelegationToken()).thenReturn((Token<? extends TokenIdentifier>) t); Mockito.doAnswer( new Answer<Token<DelegationTokenIdentifier>>() { @Override public Token<DelegationTokenIdentifier> answer(InvocationOnMock invocation) throws Throwable { return t; } }) .when(hfs) .getDelegationToken(renewer); // when(hfs.getDelegationTokens()).thenReturn((Token<? extends TokenIdentifier>) t); Mockito.doAnswer( new Answer<List<Token<DelegationTokenIdentifier>>>() { @Override public List<Token<DelegationTokenIdentifier>> answer(InvocationOnMock invocation) throws Throwable { return Collections.singletonList(t); } }) .when(hfs) .getDelegationTokens(renewer); // when(hfs.getCanonicalServiceName).thenReturn(fs_addr); Mockito.doAnswer( new Answer<String>() { @Override public String answer(InvocationOnMock invocation) throws Throwable { return fs_addr; } }) .when(hfs) .getCanonicalServiceName(); Credentials credentials = new Credentials(); Path p = new Path(uri.toString()); System.out.println("Path for hftp=" + p + "; fs_addr=" + fs_addr + "; rn=" + renewer); TokenCache.obtainTokensForNamenodesInternal(hfs, credentials, jConf); Collection<Token<? extends TokenIdentifier>> tns = credentials.getAllTokens(); assertEquals("number of tokens is not 1", 1, tns.size()); boolean found = false; for (Token<? extends TokenIdentifier> tt : tns) { System.out.println("token=" + tt); if (tt.getKind().equals(DelegationTokenIdentifier.HDFS_DELEGATION_KIND) && tt.getService().equals(new Text(fs_addr))) { found = true; assertEquals("different token", tt, t); } assertTrue("didn't find token for " + p, found); } }
/** * Test if {@link FSNamesystem#handleHeartbeat} can pick up replication and/or invalidate requests * and observes the max limit */ @Test public void testHeartbeat() throws Exception { final Configuration conf = new HdfsConfiguration(); final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).build(); try { cluster.waitActive(); final FSNamesystem namesystem = cluster.getNamesystem(); final HeartbeatManager hm = namesystem.getBlockManager().getDatanodeManager().getHeartbeatManager(); final String poolId = namesystem.getBlockPoolId(); final DatanodeRegistration nodeReg = DataNodeTestUtils.getDNRegistrationForBP(cluster.getDataNodes().get(0), poolId); final DatanodeDescriptor dd = NameNodeAdapter.getDatanode(namesystem, nodeReg); final String storageID = DatanodeStorage.generateUuid(); dd.updateStorage(new DatanodeStorage(storageID)); final int REMAINING_BLOCKS = 1; final int MAX_REPLICATE_LIMIT = conf.getInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_KEY, 2); final int MAX_INVALIDATE_LIMIT = DFSConfigKeys.DFS_BLOCK_INVALIDATE_LIMIT_DEFAULT; final int MAX_INVALIDATE_BLOCKS = 2 * MAX_INVALIDATE_LIMIT + REMAINING_BLOCKS; final int MAX_REPLICATE_BLOCKS = 2 * MAX_REPLICATE_LIMIT + REMAINING_BLOCKS; final DatanodeStorageInfo[] ONE_TARGET = {dd.getStorageInfo(storageID)}; try { namesystem.writeLock(); synchronized (hm) { for (int i = 0; i < MAX_REPLICATE_BLOCKS; i++) { dd.addBlockToBeReplicated( new Block(i, 0, GenerationStamp.LAST_RESERVED_STAMP), ONE_TARGET); } DatanodeCommand[] cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd, namesystem).getCommands(); assertEquals(1, cmds.length); assertEquals(DatanodeProtocol.DNA_TRANSFER, cmds[0].getAction()); assertEquals(MAX_REPLICATE_LIMIT, ((BlockCommand) cmds[0]).getBlocks().length); ArrayList<Block> blockList = new ArrayList<Block>(MAX_INVALIDATE_BLOCKS); for (int i = 0; i < MAX_INVALIDATE_BLOCKS; i++) { blockList.add(new Block(i, 0, GenerationStamp.LAST_RESERVED_STAMP)); } dd.addBlocksToBeInvalidated(blockList); cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd, namesystem).getCommands(); assertEquals(2, cmds.length); assertEquals(DatanodeProtocol.DNA_TRANSFER, cmds[0].getAction()); assertEquals(MAX_REPLICATE_LIMIT, ((BlockCommand) cmds[0]).getBlocks().length); assertEquals(DatanodeProtocol.DNA_INVALIDATE, cmds[1].getAction()); assertEquals(MAX_INVALIDATE_LIMIT, ((BlockCommand) cmds[1]).getBlocks().length); cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd, namesystem).getCommands(); assertEquals(2, cmds.length); assertEquals(DatanodeProtocol.DNA_TRANSFER, cmds[0].getAction()); assertEquals(REMAINING_BLOCKS, ((BlockCommand) cmds[0]).getBlocks().length); assertEquals(DatanodeProtocol.DNA_INVALIDATE, cmds[1].getAction()); assertEquals(MAX_INVALIDATE_LIMIT, ((BlockCommand) cmds[1]).getBlocks().length); cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd, namesystem).getCommands(); assertEquals(1, cmds.length); assertEquals(DatanodeProtocol.DNA_INVALIDATE, cmds[0].getAction()); assertEquals(REMAINING_BLOCKS, ((BlockCommand) cmds[0]).getBlocks().length); cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd, namesystem).getCommands(); assertEquals(0, cmds.length); } } finally { namesystem.writeUnlock(); } } finally { cluster.shutdown(); } }
/** * Test if {@link FSNamesystem#handleHeartbeat} correctly selects data node targets for block * recovery. */ @Test public void testHeartbeatBlockRecovery() throws Exception { final Configuration conf = new HdfsConfiguration(); final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build(); try { cluster.waitActive(); final FSNamesystem namesystem = cluster.getNamesystem(); final HeartbeatManager hm = namesystem.getBlockManager().getDatanodeManager().getHeartbeatManager(); final String poolId = namesystem.getBlockPoolId(); final DatanodeRegistration nodeReg1 = DataNodeTestUtils.getDNRegistrationForBP(cluster.getDataNodes().get(0), poolId); final DatanodeDescriptor dd1 = NameNodeAdapter.getDatanode(namesystem, nodeReg1); dd1.updateStorage(new DatanodeStorage(DatanodeStorage.generateUuid())); final DatanodeRegistration nodeReg2 = DataNodeTestUtils.getDNRegistrationForBP(cluster.getDataNodes().get(1), poolId); final DatanodeDescriptor dd2 = NameNodeAdapter.getDatanode(namesystem, nodeReg2); dd2.updateStorage(new DatanodeStorage(DatanodeStorage.generateUuid())); final DatanodeRegistration nodeReg3 = DataNodeTestUtils.getDNRegistrationForBP(cluster.getDataNodes().get(2), poolId); final DatanodeDescriptor dd3 = NameNodeAdapter.getDatanode(namesystem, nodeReg3); dd3.updateStorage(new DatanodeStorage(DatanodeStorage.generateUuid())); try { namesystem.writeLock(); synchronized (hm) { NameNodeAdapter.sendHeartBeat(nodeReg1, dd1, namesystem); NameNodeAdapter.sendHeartBeat(nodeReg2, dd2, namesystem); NameNodeAdapter.sendHeartBeat(nodeReg3, dd3, namesystem); // Test with all alive nodes. DFSTestUtil.resetLastUpdatesWithOffset(dd1, 0); DFSTestUtil.resetLastUpdatesWithOffset(dd2, 0); DFSTestUtil.resetLastUpdatesWithOffset(dd3, 0); final DatanodeStorageInfo[] storages = { dd1.getStorageInfos()[0], dd2.getStorageInfos()[0], dd3.getStorageInfos()[0] }; BlockInfo blockInfo = new BlockInfoContiguous( new Block(0, 0, GenerationStamp.LAST_RESERVED_STAMP), (short) 3); blockInfo.convertToBlockUnderConstruction(BlockUCState.UNDER_RECOVERY, storages); dd1.addBlockToBeRecovered(blockInfo); DatanodeCommand[] cmds = NameNodeAdapter.sendHeartBeat(nodeReg1, dd1, namesystem).getCommands(); assertEquals(1, cmds.length); assertEquals(DatanodeProtocol.DNA_RECOVERBLOCK, cmds[0].getAction()); BlockRecoveryCommand recoveryCommand = (BlockRecoveryCommand) cmds[0]; assertEquals(1, recoveryCommand.getRecoveringBlocks().size()); DatanodeInfo[] recoveringNodes = recoveryCommand .getRecoveringBlocks() .toArray(new BlockRecoveryCommand.RecoveringBlock[0])[0] .getLocations(); assertEquals(3, recoveringNodes.length); assertEquals(recoveringNodes[0], dd1); assertEquals(recoveringNodes[1], dd2); assertEquals(recoveringNodes[2], dd3); // Test with one stale node. DFSTestUtil.resetLastUpdatesWithOffset(dd1, 0); // More than the default stale interval of 30 seconds. DFSTestUtil.resetLastUpdatesWithOffset(dd2, -40 * 1000); DFSTestUtil.resetLastUpdatesWithOffset(dd3, 0); blockInfo = new BlockInfoContiguous( new Block(0, 0, GenerationStamp.LAST_RESERVED_STAMP), (short) 3); blockInfo.convertToBlockUnderConstruction(BlockUCState.UNDER_RECOVERY, storages); dd1.addBlockToBeRecovered(blockInfo); cmds = NameNodeAdapter.sendHeartBeat(nodeReg1, dd1, namesystem).getCommands(); assertEquals(1, cmds.length); assertEquals(DatanodeProtocol.DNA_RECOVERBLOCK, cmds[0].getAction()); recoveryCommand = (BlockRecoveryCommand) cmds[0]; assertEquals(1, recoveryCommand.getRecoveringBlocks().size()); recoveringNodes = recoveryCommand .getRecoveringBlocks() .toArray(new BlockRecoveryCommand.RecoveringBlock[0])[0] .getLocations(); assertEquals(2, recoveringNodes.length); // dd2 is skipped. assertEquals(recoveringNodes[0], dd1); assertEquals(recoveringNodes[1], dd3); // Test with all stale node. DFSTestUtil.resetLastUpdatesWithOffset(dd1, -60 * 1000); // More than the default stale interval of 30 seconds. DFSTestUtil.resetLastUpdatesWithOffset(dd2, -40 * 1000); DFSTestUtil.resetLastUpdatesWithOffset(dd3, -80 * 1000); blockInfo = new BlockInfoContiguous( new Block(0, 0, GenerationStamp.LAST_RESERVED_STAMP), (short) 3); blockInfo.convertToBlockUnderConstruction(BlockUCState.UNDER_RECOVERY, storages); dd1.addBlockToBeRecovered(blockInfo); cmds = NameNodeAdapter.sendHeartBeat(nodeReg1, dd1, namesystem).getCommands(); assertEquals(1, cmds.length); assertEquals(DatanodeProtocol.DNA_RECOVERBLOCK, cmds[0].getAction()); recoveryCommand = (BlockRecoveryCommand) cmds[0]; assertEquals(1, recoveryCommand.getRecoveringBlocks().size()); recoveringNodes = recoveryCommand .getRecoveringBlocks() .toArray(new BlockRecoveryCommand.RecoveringBlock[0])[0] .getLocations(); // Only dd1 is included since it heart beated and hence its not stale // when the list of recovery blocks is constructed. assertEquals(3, recoveringNodes.length); assertEquals(recoveringNodes[0], dd1); assertEquals(recoveringNodes[1], dd2); assertEquals(recoveringNodes[2], dd3); } } finally { namesystem.writeUnlock(); } } finally { cluster.shutdown(); } }
/** * Test case that reduces replication of a file with a lot of blocks and then fails over right * after those blocks enter the DN invalidation queues on the active. Ensures that fencing is * correct and no replicas are lost. */ @Test public void testNNClearsCommandsOnFailoverWithReplChanges() throws Exception { // Make lots of blocks to increase chances of triggering a bug. DFSTestUtil.createFile(fs, TEST_FILE_PATH, 30 * SMALL_BLOCK, (short) 1, 1L); banner("rolling NN1's edit log, forcing catch-up"); HATestUtil.waitForStandbyToCatchUp(nn1, nn2); // Get some new replicas reported so that NN2 now considers // them over-replicated and schedules some more deletions nn1.getRpcServer().setReplication(TEST_FILE, (short) 2); while (BlockManagerTestUtil.getComputedDatanodeWork(nn1.getNamesystem().getBlockManager()) > 0) { LOG.info("Getting more replication work computed"); } BlockManager bm1 = nn1.getNamesystem().getBlockManager(); while (bm1.getPendingReplicationBlocksCount() > 0) { BlockManagerTestUtil.updateState(bm1); cluster.triggerHeartbeats(); Thread.sleep(1000); } banner("triggering BRs"); cluster.triggerBlockReports(); nn1.getRpcServer().setReplication(TEST_FILE, (short) 1); banner("computing invalidation on nn1"); BlockManagerTestUtil.computeInvalidationWork(nn1.getNamesystem().getBlockManager()); doMetasave(nn1); banner("computing invalidation on nn2"); BlockManagerTestUtil.computeInvalidationWork(nn2.getNamesystem().getBlockManager()); doMetasave(nn2); // Dump some info for debugging purposes. banner("Metadata immediately before failover"); doMetasave(nn2); // Transition nn2 to active even though nn1 still thinks it's active banner("Failing to NN2 but let NN1 continue to think it's active"); NameNodeAdapter.abortEditLogs(nn1); NameNodeAdapter.enterSafeMode(nn1, false); BlockManagerTestUtil.computeInvalidationWork(nn2.getNamesystem().getBlockManager()); cluster.transitionToActive(1); // Check that the standby picked up the replication change. assertEquals(1, nn2.getRpcServer().getFileInfo(TEST_FILE).getReplication()); // Dump some info for debugging purposes. banner("Metadata immediately after failover"); doMetasave(nn2); banner("Triggering heartbeats and block reports so that fencing is completed"); cluster.triggerHeartbeats(); cluster.triggerBlockReports(); banner("Metadata after nodes have all block-reported"); doMetasave(nn2); // The block should no longer be postponed. assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks()); // Wait for NN2 to enact its deletions (replication monitor has to run, etc) BlockManagerTestUtil.computeInvalidationWork(nn2.getNamesystem().getBlockManager()); HATestUtil.waitForNNToIssueDeletions(nn2); cluster.triggerHeartbeats(); HATestUtil.waitForDNDeletions(cluster); cluster.triggerDeletionReports(); assertEquals(0, nn2.getNamesystem().getUnderReplicatedBlocks()); assertEquals(0, nn2.getNamesystem().getPendingReplicationBlocks()); banner("Making sure the file is still readable"); FileSystem fs2 = cluster.getFileSystem(1); DFSTestUtil.readFile(fs2, TEST_FILE_PATH); }
/** * Test case which restarts the standby node in such a way that, when it exits safemode, it will * want to invalidate a bunch of over-replicated block replicas. Ensures that if we failover at * this point it won't lose data. */ @Test public void testNNClearsCommandsOnFailoverAfterStartup() throws Exception { // Make lots of blocks to increase chances of triggering a bug. DFSTestUtil.createFile(fs, TEST_FILE_PATH, 30 * SMALL_BLOCK, (short) 3, 1L); banner("Shutting down NN2"); cluster.shutdownNameNode(1); banner("Setting replication to 1, rolling edit log."); nn1.getRpcServer().setReplication(TEST_FILE, (short) 1); nn1.getRpcServer().rollEditLog(); // Start NN2 again. When it starts up, it will see all of the // blocks as over-replicated, since it has the metadata for // replication=1, but the DNs haven't yet processed the deletions. banner("Starting NN2 again."); cluster.restartNameNode(1); nn2 = cluster.getNameNode(1); banner("triggering BRs"); cluster.triggerBlockReports(); // We expect that both NN1 and NN2 will have some number of // deletions queued up for the DNs. banner("computing invalidation on nn1"); BlockManagerTestUtil.computeInvalidationWork(nn1.getNamesystem().getBlockManager()); banner("computing invalidation on nn2"); BlockManagerTestUtil.computeInvalidationWork(nn2.getNamesystem().getBlockManager()); // Dump some info for debugging purposes. banner("Metadata immediately before failover"); doMetasave(nn2); // Transition nn2 to active even though nn1 still thinks it's active banner("Failing to NN2 but let NN1 continue to think it's active"); NameNodeAdapter.abortEditLogs(nn1); NameNodeAdapter.enterSafeMode(nn1, false); cluster.transitionToActive(1); // Check that the standby picked up the replication change. assertEquals(1, nn2.getRpcServer().getFileInfo(TEST_FILE).getReplication()); // Dump some info for debugging purposes. banner("Metadata immediately after failover"); doMetasave(nn2); banner("Triggering heartbeats and block reports so that fencing is completed"); cluster.triggerHeartbeats(); cluster.triggerBlockReports(); banner("Metadata after nodes have all block-reported"); doMetasave(nn2); // The block should no longer be postponed. assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks()); // Wait for NN2 to enact its deletions (replication monitor has to run, etc) BlockManagerTestUtil.computeInvalidationWork(nn2.getNamesystem().getBlockManager()); HATestUtil.waitForNNToIssueDeletions(nn2); cluster.triggerHeartbeats(); HATestUtil.waitForDNDeletions(cluster); cluster.triggerDeletionReports(); assertEquals(0, nn2.getNamesystem().getUnderReplicatedBlocks()); assertEquals(0, nn2.getNamesystem().getPendingReplicationBlocks()); banner("Making sure the file is still readable"); FileSystem fs2 = cluster.getFileSystem(1); DFSTestUtil.readFile(fs2, TEST_FILE_PATH); }
/** * Test that when there is a failure replicating a block the temporary and meta files are cleaned * up and subsequent replication succeeds. */ @Test public void testReplicationError() throws Exception { // create a file of replication factor of 1 final Path fileName = new Path("/test.txt"); final int fileLen = 1; DFSTestUtil.createFile(fs, fileName, 1, (short) 1, 1L); DFSTestUtil.waitReplication(fs, fileName, (short) 1); // get the block belonged to the created file LocatedBlocks blocks = NameNodeAdapter.getBlockLocations( cluster.getNameNode(), fileName.toString(), 0, (long) fileLen); assertEquals("Should only find 1 block", blocks.locatedBlockCount(), 1); LocatedBlock block = blocks.get(0); // bring up a second datanode cluster.startDataNodes(conf, 1, true, null, null); cluster.waitActive(); final int sndNode = 1; DataNode datanode = cluster.getDataNodes().get(sndNode); // replicate the block to the second datanode InetSocketAddress target = datanode.getXferAddress(); Socket s = new Socket(target.getAddress(), target.getPort()); // write the header. DataOutputStream out = new DataOutputStream(s.getOutputStream()); DataChecksum checksum = DataChecksum.newDataChecksum(DataChecksum.Type.CRC32, 512); new Sender(out) .writeBlock( block.getBlock(), StorageType.DEFAULT, BlockTokenSecretManager.DUMMY_TOKEN, "", new DatanodeInfo[0], new StorageType[0], null, BlockConstructionStage.PIPELINE_SETUP_CREATE, 1, 0L, 0L, 0L, checksum, CachingStrategy.newDefaultStrategy(), false); out.flush(); // close the connection before sending the content of the block out.close(); // the temporary block & meta files should be deleted String bpid = cluster.getNamesystem().getBlockPoolId(); File storageDir = cluster.getInstanceStorageDir(sndNode, 0); File dir1 = MiniDFSCluster.getRbwDir(storageDir, bpid); storageDir = cluster.getInstanceStorageDir(sndNode, 1); File dir2 = MiniDFSCluster.getRbwDir(storageDir, bpid); while (dir1.listFiles().length != 0 || dir2.listFiles().length != 0) { Thread.sleep(100); } // then increase the file's replication factor fs.setReplication(fileName, (short) 2); // replication should succeed DFSTestUtil.waitReplication(fs, fileName, (short) 1); // clean up the file fs.delete(fileName, false); }