/** * tests that the distributed block fixer obeys the limit on how many jobs to submit * simultaneously. */ @Test public void testMaxPendingJobs() throws Exception { LOG.info("Test testMaxPendingJobs started."); int stripeLength = 3; mySetup(stripeLength); long[] crcs1 = new long[3]; int[] seeds1 = new int[3]; long[] crcs2 = new long[3]; int[] seeds2 = new int[3]; Path dirPath1 = new Path("/user/dhruba/raidtestrs/1"); Path[] files1 = TestRaidDfs.createTestFiles( dirPath1, fileSizes, blockSizes, crcs1, seeds1, fileSys, (short) 1); Path dirPath2 = new Path("/user/dhruba/raidtestrs/2"); Path[] files2 = TestRaidDfs.createTestFiles( dirPath2, fileSizes, blockSizes, crcs2, seeds2, fileSys, (short) 1); Path destPath = new Path("/destraidrs/user/dhruba/raidtestrs"); LOG.info("Test testMaxPendingJobs created test files"); Configuration localConf = this.getRaidNodeConfig(conf, false); localConf.setLong("raid.blockfix.maxpendingjobs", 1L); try { cnode = RaidNode.createRaidNode(null, localConf); TestRaidDfs.waitForDirRaided(LOG, fileSys, dirPath1, destPath); TestRaidDfs.waitForDirRaided(LOG, fileSys, dirPath2, destPath); cnode.stop(); cnode.join(); DistributedFileSystem dfs = (DistributedFileSystem) fileSys; String[] corruptFiles = DFSUtil.getCorruptFiles(dfs); assertEquals("no corrupt files expected", 0, corruptFiles.length); assertEquals( "filesFixed() should return 0 before fixing files", 0, cnode.blockIntegrityMonitor.getNumFilesFixed()); // corrupt directory 1 this.corruptFiles( dirPath1, crcs1, rsCorruptFileIdx1, dfs, files1, rsNumCorruptBlocksInFiles1); cnode = RaidNode.createRaidNode(null, localConf); DistBlockIntegrityMonitor blockFixer = (DistBlockIntegrityMonitor) cnode.blockIntegrityMonitor; long start = System.currentTimeMillis(); while (blockFixer.jobsRunning() < 1 && System.currentTimeMillis() - start < 60000) { LOG.info("Test testDirBlockFix waiting for fixing job 1 to start"); Thread.sleep(1000); } assertEquals("job not running", 1, blockFixer.jobsRunning()); 
// corrupt directory 2 this.corruptFiles( dirPath2, crcs2, rsCorruptFileIdx2, dfs, files2, rsNumCorruptBlocksInFiles2); // wait until both files are fixed while (blockFixer.getNumFilesFixed() < 6 && System.currentTimeMillis() - start < 240000) { // make sure the block fixer does not start a second job while // the first one is still running assertTrue("too many jobs running", blockFixer.jobsRunning() <= 1); Thread.sleep(1000); } TestBlockFixer.verifyMetrics( fileSys, cnode, false, 6L, getTotal(rsNumCorruptBlocksInFiles1) + getTotal(rsNumCorruptBlocksInFiles2)); dfs = getDFS(conf, dfs); for (int i = 0; i < fileSizes.length; i++) { assertTrue( "file " + files1[i] + " not fixed", TestRaidDfs.validateFile(dfs, files1[i], fileSizes[i], crcs1[i])); } for (int i = 0; i < fileSizes.length; i++) { assertTrue( "file " + files2[i] + " not fixed", TestRaidDfs.validateFile(dfs, files2[i], fileSizes[i], crcs2[i])); } } catch (Exception e) { LOG.info("Test testMaxPendingJobs exception " + e + StringUtils.stringifyException(e)); throw e; } finally { myTearDown(); } }
/** tests that we can have 2 concurrent jobs fixing files (dist block fixer) */ @Test public void testConcurrentJobs() throws Exception { LOG.info("Test testConcurrentJobs started."); int stripeLength = 3; mySetup(stripeLength); long[] crcs1 = new long[3]; int[] seeds1 = new int[3]; long[] crcs2 = new long[3]; int[] seeds2 = new int[3]; Path dirPath1 = new Path("/user/dhruba/raidtestrs/1"); Path[] files1 = TestRaidDfs.createTestFiles( dirPath1, fileSizes, blockSizes, crcs1, seeds1, fileSys, (short) 1); Path dirPath2 = new Path("/user/dhruba/raidtestrs/2"); Path[] files2 = TestRaidDfs.createTestFiles( dirPath2, fileSizes, blockSizes, crcs2, seeds2, fileSys, (short) 1); Path destPath = new Path("/destraidrs/user/dhruba/raidtestrs"); LOG.info("Test testConcurrentJobs created test files"); Configuration localConf = this.getRaidNodeConfig(conf, false); localConf.setLong(BlockIntegrityMonitor.BLOCKCHECK_INTERVAL, 15000L); localConf.setLong(DistBlockIntegrityMonitor.RAIDNODE_BLOCK_FIX_SUBMISSION_INTERVAL_KEY, 15000L); localConf.setLong( DistBlockIntegrityMonitor.RAIDNODE_BLOCK_FIX_SCAN_SUBMISSION_INTERVAL_KEY, 3600000); try { cnode = RaidNode.createRaidNode(null, localConf); TestRaidDfs.waitForDirRaided(LOG, fileSys, dirPath1, destPath); TestRaidDfs.waitForDirRaided(LOG, fileSys, dirPath2, destPath); cnode.stop(); cnode.join(); DistributedFileSystem dfs = (DistributedFileSystem) fileSys; String[] corruptFiles = DFSUtil.getCorruptFiles(dfs); assertEquals("no corrupt files expected", 0, corruptFiles.length); assertEquals( "filesFixed() should return 0 before fixing files", 0, cnode.blockIntegrityMonitor.getNumFilesFixed()); // corrupt directory 1 this.corruptFiles( dirPath1, crcs1, rsCorruptFileIdx1, dfs, files1, rsNumCorruptBlocksInFiles1); cnode = RaidNode.createRaidNode(null, localConf); DistBlockIntegrityMonitor blockFixer = (DistBlockIntegrityMonitor) cnode.blockIntegrityMonitor; long start = System.currentTimeMillis(); // All files are HIGH-PRI corrupt files while 
(blockFixer.jobsRunning() < 1 && System.currentTimeMillis() - start < 60000) { LOG.info("Test testDirBlockFix waiting for fixing job 1 to start"); Thread.sleep(1000); } assertEquals("job 1 not running", 1, blockFixer.jobsRunning()); // Corrupt directory 2 this.corruptFiles( dirPath2, crcs2, rsCorruptFileIdx2, dfs, files2, rsNumCorruptBlocksInFiles2); // 1 LOW-PRI file and 2 HIGH-PRI files while (blockFixer.jobsRunning() < 3 && System.currentTimeMillis() - start < 60000) { LOG.info("Test testDirBlockFix waiting for fixing job 2 and 3 to start"); Thread.sleep(1000); } assertTrue("more than 3 jobs are running", blockFixer.jobsRunning() >= 3); while (blockFixer.getNumFilesFixed() < 6 && System.currentTimeMillis() - start < 240000) { LOG.info("Test testDirBlockFix waiting for files to be fixed."); Thread.sleep(1000); } TestBlockFixer.verifyMetrics( fileSys, cnode, false, 6L, getTotal(rsNumCorruptBlocksInFiles1) + getTotal(rsNumCorruptBlocksInFiles2)); dfs = getDFS(conf, dfs); for (int i = 0; i < fileSizes.length; i++) { assertTrue( "file " + files1[i] + " not fixed", TestRaidDfs.validateFile(dfs, files1[i], fileSizes[i], crcs1[i])); } for (int i = 0; i < fileSizes.length; i++) { assertTrue( "file " + files2[i] + " not fixed", TestRaidDfs.validateFile(dfs, files2[i], fileSizes[i], crcs2[i])); } } catch (Exception e) { LOG.info("Test testConcurrentJobs exception " + e, e); throw e; } finally { myTearDown(); } }
/** * Tests integrity of generated block. Create a file and delete a block entirely. Wait for the * block to be regenerated. Now stop RaidNode and corrupt the generated block. Test that * corruption in the generated block can be detected by clients. */ private void generatedBlockTestCommon(String testName, int blockToCorrupt, boolean local) throws Exception { LOG.info("Test " + testName + " started."); int stripeLength = 3; mySetup(stripeLength); long[] crcs = new long[3]; int[] seeds = new int[3]; Path dirPath = new Path("/user/dhruba/raidtest"); Path[] files = TestRaidDfs.createTestFiles( dirPath, fileSizes, blockSizes, crcs, seeds, fileSys, (short) 1); Path destPath = new Path("/destraid/user/dhruba"); LOG.info("Test " + testName + " created test files"); Configuration localConf = this.getRaidNodeConfig(conf, local); try { cnode = RaidNode.createRaidNode(null, localConf); TestRaidDfs.waitForDirRaided(LOG, fileSys, dirPath, destPath); cnode.stop(); cnode.join(); DistributedFileSystem dfs = (DistributedFileSystem) fileSys; String[] corruptFiles = DFSUtil.getCorruptFiles(dfs); assertEquals("no corrupt files expected", 0, corruptFiles.length); assertEquals( "filesFixed() should return 0 before fixing files", 0, cnode.blockIntegrityMonitor.getNumFilesFixed()); Integer[] corruptBlockIdxs = new Integer[] {blockToCorrupt}; TestDirectoryRaidDfs.corruptBlocksInDirectory( conf, dirPath, crcs, corruptBlockIdxs, fileSys, dfsCluster, false, true); corruptFiles = DFSUtil.getCorruptFiles(dfs); assertEquals("files not corrupted", corruptBlockIdxs.length, corruptFiles.length); int corruptFileIdx = -1; for (int i = 0; i < files.length; i++) { if (files[i].toUri().getPath().equals(corruptFiles[0])) { corruptFileIdx = i; break; } } assertNotSame("Wrong corrupt file", -1, corruptFileIdx); cnode = RaidNode.createRaidNode(null, localConf); long start = System.currentTimeMillis(); while (cnode.blockIntegrityMonitor.getNumFilesFixed() < 1 && System.currentTimeMillis() - start < 120000) { 
LOG.info("Test testDirBlockFix waiting for files to be fixed."); Thread.sleep(1000); } TestBlockFixer.verifyMetrics(fileSys, cnode, local, 1L, corruptBlockIdxs.length); // Stop RaidNode cnode.stop(); cnode.join(); cnode = null; // The block has successfully been reconstructed. dfs = getDFS(conf, dfs); assertTrue( "file not fixed", TestRaidDfs.validateFile( dfs, files[corruptFileIdx], fileSizes[corruptFileIdx], crcs[corruptFileIdx])); // Now corrupt the generated block. TestDirectoryRaidDfs.corruptBlocksInDirectory( conf, dirPath, crcs, corruptBlockIdxs, dfs, dfsCluster, false, false); try { TestRaidDfs.validateFile( dfs, files[corruptFileIdx], fileSizes[corruptFileIdx], crcs[corruptFileIdx]); fail("Expected exception not thrown"); } catch (org.apache.hadoop.fs.ChecksumException ce) { } catch (org.apache.hadoop.fs.BlockMissingException bme) { } } catch (Exception e) { LOG.info("Test " + testName + " Exception " + e, e); throw e; } finally { myTearDown(); } LOG.info("Test " + testName + " completed."); }
/**
 * Corrupt a parity file and wait for it to get fixed.
 *
 * <p>The parity file's CRC is recorded before corruption and compared with the CRC read back
 * after the block integrity monitor reports one fixed file.
 *
 * @param testName name used in the log messages emitted by this helper
 * @param local forwarded to {@code getRaidNodeConfig} and {@code TestBlockFixer.verifyMetrics};
 *     presumably selects the local rather than the distributed block fixer (confirm against
 *     those helpers)
 * @throws Exception if any step of the test fails; the exception is logged and rethrown
 */
private void implParityBlockFix(String testName, boolean local) throws Exception {
  LOG.info("Test " + testName + " started.");
  int stripeLength = 3;
  mySetup(stripeLength);
  long[] crcs = new long[3];
  int[] seeds = new int[3];
  Path dirPath = new Path("/user/dhruba/raidtest");
  // Only the side effect (files created on fileSys) is needed; the returned paths are unused.
  TestRaidDfs.createTestFiles(dirPath, fileSizes, blockSizes, crcs, seeds, fileSys, (short) 1);
  Path destPath = new Path("/destraid/user/dhruba");
  Path parityFile = new Path("/destraid/user/dhruba/raidtest");
  LOG.info("Test " + testName + " created test files");
  Configuration localConf = this.getRaidNodeConfig(conf, local);

  try {
    // First RaidNode instance: only used to raid the directory, then shut down.
    cnode = RaidNode.createRaidNode(null, localConf);
    TestRaidDfs.waitForDirRaided(LOG, fileSys, dirPath, destPath);
    cnode.stop();
    cnode.join();

    // Reference CRC of the intact parity file, checked again after the fix.
    long parityCRC = RaidDFSUtil.getCRC(fileSys, parityFile);

    FileStatus parityStat = fileSys.getFileStatus(parityFile);
    DistributedFileSystem dfs = (DistributedFileSystem) fileSys;
    LocatedBlocks locs =
        RaidDFSUtil.getBlockLocations(dfs, parityFile.toUri().getPath(), 0, parityStat.getLen());

    String[] corruptFiles = DFSUtil.getCorruptFiles(dfs);
    assertEquals("no corrupt files expected", 0, corruptFiles.length);
    assertEquals(
        "filesFixed() should return 0 before fixing files",
        0,
        cnode.blockIntegrityMonitor.getNumFilesFixed());

    // Corrupt parity blocks for different stripes.
    int[] corruptBlockIdxs = new int[] {0, 1, 2};
    for (int idx : corruptBlockIdxs) {
      corruptBlock(locs.get(idx).getBlock(), dfsCluster);
    }
    RaidDFSUtil.reportCorruptBlocks(dfs, parityFile, corruptBlockIdxs, 2 * blockSize);

    corruptFiles = DFSUtil.getCorruptFiles(dfs);
    assertEquals("file not corrupted", 1, corruptFiles.length);
    // Expected value first, actual second, so a failure report reads correctly.
    assertEquals("wrong file corrupted", parityFile.toUri().getPath(), corruptFiles[0]);

    // Second RaidNode instance: expected to repair the parity file.
    cnode = RaidNode.createRaidNode(null, localConf);
    long start = System.currentTimeMillis();
    while (cnode.blockIntegrityMonitor.getNumFilesFixed() < 1
        && System.currentTimeMillis() - start < 120000) {
      LOG.info("Test " + testName + " waiting for files to be fixed.");
      Thread.sleep(3000);
    }
    TestBlockFixer.verifyMetrics(fileSys, cnode, local, 1L, corruptBlockIdxs.length);

    long checkCRC = RaidDFSUtil.getCRC(fileSys, parityFile);
    assertEquals("file not fixed", parityCRC, checkCRC);
  } catch (Exception e) {
    // Hand the throwable to the logger so the stack trace is emitted separately from the message.
    LOG.info("Test " + testName + " Exception " + e, e);
    throw e;
  } finally {
    myTearDown();
  }
  LOG.info("Test " + testName + " completed.");
}
/** * Create a file with three stripes, corrupt a block each in two stripes, and wait for the the * file to be fixed. */ private void implDirBlockFix(boolean local, boolean hasStripeInfo, boolean corruptStripe) throws Exception { LOG.info( "Test testDirBlockFix started. local:" + local + " hasStripeInfo:" + hasStripeInfo + " corruptStripe:" + corruptStripe); int stripeLength = 3; mySetup(stripeLength); long[] crcs = new long[3]; int[] seeds = new int[3]; Path dirPath = new Path("/user/dhruba/raidtestrs"); Path[] files = TestRaidDfs.createTestFiles( dirPath, fileSizes, blockSizes, crcs, seeds, fileSys, (short) 1); Path destPath = new Path("/destraidrs/user/dhruba"); LOG.info("Test testDirBlockFix created test files"); Configuration localConf = this.getRaidNodeConfig(conf, local); // Not allow multiple running jobs localConf.setLong("raid.blockfix.maxpendingjobs", 1L); try { cnode = RaidNode.createRaidNode(null, localConf); TestRaidDfs.waitForDirRaided(LOG, fileSys, dirPath, destPath); cnode.stop(); cnode.join(); DistributedFileSystem dfs = (DistributedFileSystem) fileSys; String[] corruptFiles = DFSUtil.getCorruptFiles(dfs); assertEquals("no corrupt files expected", 0, corruptFiles.length); assertEquals( "filesFixed() should return 0 before fixing files", 0, cnode.blockIntegrityMonitor.getNumFilesFixed()); if (!hasStripeInfo) { // clear out all stripes LocalStripeStore lss = new LocalStripeStore(); lss.initialize(localConf, false, dfs); lss.clear(); } if (corruptStripe) { LocalStripeStore lss = new LocalStripeStore(); lss.initialize(localConf, false, dfs); Set<List<Block>> corruptCandidates = new HashSet<List<Block>>(lss.stripeSet.keySet()); for (List<Block> lb : corruptCandidates) { for (Codec codec : Codec.getCodecs()) { StripeInfo si = lss.getStripe(codec, lb.get(0)); if (si == null) { continue; } String oldSi = si.toString(); Collections.rotate(si.parityBlocks, 1); Collections.rotate(si.srcBlocks, 1); lss.putStripe(codec, si.parityBlocks, si.srcBlocks); String 
newSi = lss.getStripe(codec, lb.get(0)).toString(); LOG.info("Corrupt the stripe info old : " + oldSi + " new : " + newSi); } } } this.corruptFiles(dirPath, crcs, rsCorruptFileIdx1, dfs, files, rsNumCorruptBlocksInFiles1); cnode = RaidNode.createRaidNode(null, localConf); long start = System.currentTimeMillis(); while (cnode.blockIntegrityMonitor.getNumFilesFixed() < 3 && cnode.blockIntegrityMonitor.getNumFileFixFailures() < 3 && System.currentTimeMillis() - start < 120000) { LOG.info("Test testDirBlockFix waiting for files to be fixed."); Thread.sleep(1000); } long totalCorruptBlocks = getTotal(rsNumCorruptBlocksInFiles1); if (hasStripeInfo) { if (!corruptStripe) { TestBlockFixer.verifyMetrics(fileSys, cnode, local, 3L, totalCorruptBlocks); dfs = getDFS(conf, dfs); for (int i = 0; i < fileSizes.length; i++) { assertTrue( "file " + files[i] + " not fixed", TestRaidDfs.validateFile(dfs, files[i], fileSizes[i], crcs[i])); } } else { TestBlockFixer.verifyMetrics(fileSys, cnode, local, 0L, 0L); assertTrue( "should fail to fix more than 3 files", cnode.blockIntegrityMonitor.getNumFileFixFailures() >= 3L); TestBlockFixer.verifyMetrics( fileSys, cnode, LOGTYPES.OFFLINE_RECONSTRUCTION_FILE, LOGRESULTS.FAILURE, 3L, true); // Will throw stripe mismatch exception for the first blocks of 3 files TestBlockFixer.verifyMetrics( fileSys, cnode, LOGTYPES.OFFLINE_RECONSTRUCTION_STRIPE_VERIFICATION, LOGRESULTS.FAILURE, 3L, true); } } else { TestBlockFixer.verifyMetrics(fileSys, cnode, local, 0L, 0L); assertTrue( "should fail to fix more than 3 files", cnode.blockIntegrityMonitor.getNumFileFixFailures() >= 3L); TestBlockFixer.verifyMetrics( fileSys, cnode, LOGTYPES.OFFLINE_RECONSTRUCTION_GET_STRIPE, LOGRESULTS.FAILURE, totalCorruptBlocks, true); TestBlockFixer.verifyMetrics( fileSys, cnode, LOGTYPES.OFFLINE_RECONSTRUCTION_FILE, LOGRESULTS.FAILURE, 3L, true); } } catch (Exception e) { LOG.info("Test testDirBlockFix Exception " + e, e); throw e; } finally { myTearDown(); } 
LOG.info("Test testDirBlockFix completed."); }