/** * Test interruptable while blocking wait on root and meta. * * @throws IOException * @throws InterruptedException */ @Test public void testInterruptWaitOnMetaAndRoot() throws IOException, InterruptedException { HRegionInterface implementation = Mockito.mock(HRegionInterface.class); HConnection connection = mockConnection(implementation); final CatalogTracker ct = constructAndStartCatalogTracker(connection); ServerName hsa = ct.getRootLocation(); Assert.assertNull(hsa); ServerName meta = ct.getMetaLocation(); Assert.assertNull(meta); Thread t = new Thread() { @Override public void run() { try { ct.waitForMeta(); } catch (InterruptedException e) { throw new RuntimeException("Interrupted", e); } } }; t.start(); while (!t.isAlive()) Threads.sleep(1); Threads.sleep(1); assertTrue(t.isAlive()); ct.stop(); // Join the thread... should exit shortly. t.join(); }
private void startWaitAliveThenWaitItLives(final Thread t, final int ms) { t.start(); while (!t.isAlive()) { // Wait } // Wait one second. Threads.sleep(ms); Assert.assertTrue("Assert " + t.getName() + " still waiting", t.isAlive()); }
private int runTest() throws Exception { LOG.info("Starting the test"); String runtimeKey = String.format(RUN_TIME_KEY, this.getClass().getSimpleName()); long runtime = util.getConfiguration().getLong(runtimeKey, DEFAULT_RUN_TIME); String numThreadKey = String.format(NUM_THREADS_KEY, this.getClass().getSimpleName()); numThreads = util.getConfiguration().getInt(numThreadKey, DEFAULT_NUM_THREADS); ArrayList<Worker> workers = new ArrayList<>(); for (int i = 0; i < numThreads; i++) { checkException(workers); Worker worker = new Worker(); LOG.info("Launching worker thread " + worker.getName()); workers.add(worker); worker.start(); } Threads.sleep(runtime / 2); LOG.info("Stopping creating new tables"); create_table.set(false); Threads.sleep(runtime / 2); LOG.info("Runtime is up"); running.set(false); checkException(workers); for (Worker worker : workers) { worker.join(); } LOG.info("All Worker threads stopped"); // verify LOG.info("Verify actions of all threads succeeded"); checkException(workers); LOG.info("Verify namespaces"); verifyNamespaces(); LOG.info("Verify states of all tables"); verifyTables(); // RUN HBCK HBaseFsck hbck = null; try { LOG.info("Running hbck"); hbck = HbckTestingUtil.doFsck(util.getConfiguration(), false); if (HbckTestingUtil.inconsistencyFound(hbck)) { // Find the inconsistency during HBCK. Leave table and namespace undropped so that // we can check outside the test. keepObjectsAtTheEnd = true; } HbckTestingUtil.assertNoErrors(hbck); LOG.info("Finished hbck"); } finally { if (hbck != null) { hbck.close(); } } return 0; }
@Test public void testPreWALRestoreSkip() throws Exception { LOG.info(TestRegionObserverInterface.class.getName() + ".testPreWALRestoreSkip"); TableName tableName = TableName.valueOf(SimpleRegionObserver.TABLE_SKIPPED); HTable table = util.createTable(tableName, new byte[][] {A, B, C}); JVMClusterUtil.RegionServerThread rs1 = cluster.startRegionServer(); ServerName sn2 = rs1.getRegionServer().getServerName(); String regEN = table.getRegionLocations().firstEntry().getKey().getEncodedName(); util.getHBaseAdmin().move(regEN.getBytes(), sn2.getServerName().getBytes()); while (!sn2.equals(table.getRegionLocations().firstEntry().getValue())) { Thread.sleep(100); } Put put = new Put(ROW); put.add(A, A, A); put.add(B, B, B); put.add(C, C, C); table.put(put); table.flushCommits(); cluster.killRegionServer(rs1.getRegionServer().getServerName()); Threads.sleep(20000); // just to be sure that the kill has fully started. util.waitUntilAllRegionsAssigned(tableName); verifyMethodResult( SimpleRegionObserver.class, new String[] {"getCtPreWALRestore", "getCtPostWALRestore"}, tableName, new Integer[] {0, 0}); util.deleteTable(tableName); table.close(); }
@Override public void run() { while (running.get()) { switch (random.nextInt() % 2) { case 0: // start a server try { cluster.startServer(); } catch (Exception e) { LOG.warn(e); exception.compareAndSet(null, e); } break; case 1: // stop a server try { cluster.stopRandomServer(); } catch (Exception e) { LOG.warn(e); exception.compareAndSet(null, e); } default: } Threads.sleep(100); } }
@Test(timeout = 30000) public void testInfo() { HMaster master = TEST_UTIL.getHBaseCluster().getMaster(); MetricsMasterWrapperImpl info = new MetricsMasterWrapperImpl(master); assertEquals(master.getSplitPlanCount(), info.getSplitPlanCount(), 0); assertEquals(master.getMergePlanCount(), info.getMergePlanCount(), 0); assertEquals(master.getAverageLoad(), info.getAverageLoad(), 0); assertEquals(master.getClusterId(), info.getClusterId()); assertEquals(master.getMasterActiveTime(), info.getActiveTime()); assertEquals(master.getMasterStartTime(), info.getStartTime()); assertEquals(master.getMasterCoprocessors().length, info.getCoprocessors().length); assertEquals( master.getServerManager().getOnlineServersList().size(), info.getNumRegionServers()); assertEquals(5, info.getNumRegionServers()); String zkServers = info.getZookeeperQuorum(); assertEquals(zkServers.split(",").length, TEST_UTIL.getZkCluster().getZooKeeperServerNum()); final int index = 3; LOG.info("Stopping " + TEST_UTIL.getMiniHBaseCluster().getRegionServer(index)); TEST_UTIL.getMiniHBaseCluster().stopRegionServer(index, false); TEST_UTIL.getMiniHBaseCluster().waitOnRegionServer(index); // We stopped the regionserver but could take a while for the master to notice it so hang here // until it does... then move forward to see if metrics wrapper notices. while (TEST_UTIL.getHBaseCluster().getMaster().getServerManager().getOnlineServers().size() != 4) { Threads.sleep(10); } assertEquals(4, info.getNumRegionServers()); assertEquals(1, info.getNumDeadRegionServers()); assertEquals(1, info.getNumWALFiles()); }
/** * Returns a Thread pool for the RPC's to region replicas. Similar to Connection's thread pool. */ private ExecutorService getDefaultThreadPool(Configuration conf) { int maxThreads = conf.getInt("hbase.region.replica.replication.threads.max", 256); int coreThreads = conf.getInt("hbase.region.replica.replication.threads.core", 16); if (maxThreads == 0) { maxThreads = Runtime.getRuntime().availableProcessors() * 8; } if (coreThreads == 0) { coreThreads = Runtime.getRuntime().availableProcessors() * 8; } long keepAliveTime = conf.getLong("hbase.region.replica.replication.threads.keepalivetime", 60); LinkedBlockingQueue<Runnable> workQueue = new LinkedBlockingQueue<Runnable>( maxThreads * conf.getInt( HConstants.HBASE_CLIENT_MAX_TOTAL_TASKS, HConstants.DEFAULT_HBASE_CLIENT_MAX_TOTAL_TASKS)); ThreadPoolExecutor tpe = new ThreadPoolExecutor( coreThreads, maxThreads, keepAliveTime, TimeUnit.SECONDS, workQueue, Threads.newDaemonThreadFactory(this.getClass().getSimpleName() + "-rpc-shared-")); tpe.allowCoreThreadTimeOut(true); return tpe; }
@Override public void run() { while (!isStopped()) { try { NavigableMap<HRegionInfo, ServerName> regions = MetaScanner.allTableRegions(connection, TABLENAME); LOG.info("-------"); byte[] lastEndKey = HConstants.EMPTY_START_ROW; for (HRegionInfo hri : regions.navigableKeySet()) { long startKey = 0, endKey = Long.MAX_VALUE; if (!Bytes.equals(HConstants.EMPTY_START_ROW, hri.getStartKey())) { startKey = Bytes.toLong(hri.getStartKey()); } if (!Bytes.equals(HConstants.EMPTY_END_ROW, hri.getEndKey())) { endKey = Bytes.toLong(hri.getEndKey()); } LOG.info("start:" + startKey + " end:" + endKey + " hri:" + hri); Assert.assertTrue( "lastEndKey=" + Bytes.toString(lastEndKey) + ", startKey=" + Bytes.toString(hri.getStartKey()), Bytes.equals(lastEndKey, hri.getStartKey())); lastEndKey = hri.getEndKey(); } Assert.assertTrue(Bytes.equals(lastEndKey, HConstants.EMPTY_END_ROW)); LOG.info("-------"); Threads.sleep(10 + random.nextInt(50)); } catch (Throwable e) { ex = e; Assert.fail(StringUtils.stringifyException(e)); } } }
@Override public void preGetOp( final ObserverContext<RegionCoprocessorEnvironment> e, final Get get, final List<Cell> results) throws IOException { Threads.sleep(2500); }
/** * sleeping logic for static methods; handles the interrupt exception. Keeping a static version * for this to avoid re-looking for the integer values. */ protected static void sleepBeforeRetry(String msg, int sleepMultiplier) { if (sleepMultiplier > hdfsClientRetriesNumber) { LOG.warn(msg + ", retries exhausted"); return; } LOG.info(msg + ", sleeping " + baseSleepBeforeRetries + " times " + sleepMultiplier); Threads.sleep(baseSleepBeforeRetries * sleepMultiplier); }
/** * sleeping logic for static methods; handles the interrupt exception. Keeping a static version * for this to avoid re-looking for the integer values. */ private static void sleepBeforeRetry( String msg, int sleepMultiplier, int baseSleepBeforeRetries, int hdfsClientRetriesNumber) { if (sleepMultiplier > hdfsClientRetriesNumber) { LOG.debug(msg + ", retries exhausted"); return; } LOG.debug(msg + ", sleeping " + baseSleepBeforeRetries + " times " + sleepMultiplier); Threads.sleep((long) baseSleepBeforeRetries * sleepMultiplier); }
@Test public void testRecovery() throws Exception { LOG.info(TestRegionObserverInterface.class.getName() + ".testRecovery"); TableName tableName = TableName.valueOf(TEST_TABLE.getNameAsString() + ".testRecovery"); HTable table = util.createTable(tableName, new byte[][] {A, B, C}); try { JVMClusterUtil.RegionServerThread rs1 = cluster.startRegionServer(); ServerName sn2 = rs1.getRegionServer().getServerName(); String regEN = table.getRegionLocations().firstEntry().getKey().getEncodedName(); util.getHBaseAdmin().move(regEN.getBytes(), sn2.getServerName().getBytes()); while (!sn2.equals(table.getRegionLocations().firstEntry().getValue())) { Thread.sleep(100); } Put put = new Put(ROW); put.add(A, A, A); put.add(B, B, B); put.add(C, C, C); table.put(put); verifyMethodResult( SimpleRegionObserver.class, new String[] { "hadPreGet", "hadPostGet", "hadPrePut", "hadPostPut", "hadPreBatchMutate", "hadPostBatchMutate", "hadDelete" }, tableName, new Boolean[] {false, false, true, true, true, true, false}); verifyMethodResult( SimpleRegionObserver.class, new String[] {"getCtPreWALRestore", "getCtPostWALRestore", "getCtPrePut", "getCtPostPut"}, tableName, new Integer[] {0, 0, 1, 1}); cluster.killRegionServer(rs1.getRegionServer().getServerName()); Threads.sleep(1000); // Let the kill soak in. util.waitUntilAllRegionsAssigned(tableName); LOG.info("All regions assigned"); verifyMethodResult( SimpleRegionObserver.class, new String[] {"getCtPrePut", "getCtPostPut"}, tableName, new Integer[] {0, 0}); } finally { util.deleteTable(tableName); table.close(); } }
@Override public void run() { try { Thread.sleep(timeout); Threads.printThreadInfo(System.err, "TEST TIMEOUT STACK DUMP"); System.exit(1); // a timeout happened } catch (InterruptedException e) { // this is what we want } }
@Override protected void beforeWaitOnSafePoint() { if (throwException) { LOG.info("COUNTDOWN"); // Don't countdown latch until someone waiting on it otherwise, the above // afterCreatingZigZagLatch will get to the latch and no one will ever free it and we'll // be stuck; test won't go down while (this.latch.getCount() <= 0) Threads.sleep(1); this.latch.countDown(); } }
private ExecutorService createScanExecutor(Id.Stream streamId) { ThreadFactory threadFactory = Threads.newDaemonThreadFactory( String.format( "stream-%s-%s-consumer-scanner-", streamId.getNamespaceId(), streamId.getId())); ThreadPoolExecutor executor = new ThreadPoolExecutor( 1, 20, 60, TimeUnit.SECONDS, new SynchronousQueue<Runnable>(), threadFactory); executor.allowCoreThreadTimeOut(true); return executor; }
@Override public Result postAppend( final ObserverContext<RegionCoprocessorEnvironment> e, final Append append, final Result result) throws IOException { if (ct.incrementAndGet() == 1) { Threads.sleep(sleepTime.get()); } return result; }
/** * @param masters * @param fservers */ public static void shutdown(final List<MasterThread> masters, final List<FServerThread> fservers) throws IOException { LOG.debug("Shutting down HBase Cluster"); if (masters != null) { // Do backups first. JVMClusterUtil.MasterThread activeMaster = null; for (JVMClusterUtil.MasterThread t : masters) { if (!t.master.isActiveMaster()) { t.master.stopMaster(); } else { activeMaster = t; } } // Do active after. if (activeMaster != null) activeMaster.master.shutdown(); } if (fservers != null) { for (FServerThread t : fservers) { if (t.isAlive()) { try { t.getFServer().stop("Shutdown requested"); t.join(); } catch (InterruptedException e) { // continue } } } } if (masters != null) { for (JVMClusterUtil.MasterThread t : masters) { while (t.master.isAlive()) { try { // The below has been replaced to debug sometime hangs on end of // tests. // this.master.join(): Threads.threadDumpingIsAlive(t.master.getThread()); } catch (InterruptedException e) { // continue } } } } LOG.info( "Shutdown of " + ((masters != null) ? masters.size() : "0") + " master(s) and " + ((fservers != null) ? fservers.size() : "0") + " fserver(s) complete"); }
@Override public MultiResponse multi(RpcController controller, MultiRequest request) throws ServiceException { int concurrentInvocations = this.multiInvocationsCount.incrementAndGet(); try { if (concurrentInvocations >= tooManyMultiRequests) { throw new ServiceException( new RegionTooBusyException("concurrentInvocations=" + concurrentInvocations)); } Threads.sleep(multiPause); return doMultiResponse(meta, sequenceids, request); } finally { this.multiInvocationsCount.decrementAndGet(); } }
/** * Wait for Mini HBase Cluster to shut down. Presumes you've already called {@link #shutdown()}. */ public void join() { if (this.regionThreads != null) { for (Thread t : this.regionThreads) { if (t.isAlive()) { try { Threads.threadDumpingIsAlive(t); } catch (InterruptedException e) { LOG.debug("Interrupted", e); } } } } if (this.masterThreads != null) { for (Thread t : this.masterThreads) { if (t.isAlive()) { try { Threads.threadDumpingIsAlive(t); } catch (InterruptedException e) { LOG.debug("Interrupted", e); } } } } }
public void testRpcWithChaosMonkey(boolean isSyncClient) throws Throwable { LOG.info("Starting test"); Cluster cluster = new Cluster(10, 100); for (int i = 0; i < 10; i++) { cluster.startServer(); } ArrayList<SimpleClient> clients = new ArrayList<>(); // all threads should share the same rpc client AbstractRpcClient<?> rpcClient = createRpcClient(conf, isSyncClient); for (int i = 0; i < 30; i++) { String clientId = "client_" + i + "_"; LOG.info("Starting client: " + clientId); SimpleClient client = new SimpleClient(cluster, rpcClient, clientId); client.start(); clients.add(client); } LOG.info("Starting MiniChaosMonkey"); MiniChaosMonkey cm = new MiniChaosMonkey(cluster); cm.start(); Threads.sleep(30000); LOG.info("Stopping MiniChaosMonkey"); cm.stopRunning(); cm.join(); cm.rethrowException(); LOG.info("Stopping clients"); for (SimpleClient client : clients) { LOG.info("Stopping client: " + client.id); LOG.info(client.id + " numCalls:" + client.numCalls); client.stopRunning(); client.join(); client.rethrowException(); assertTrue(client.numCalls > 10); } LOG.info("Stopping RpcClient"); rpcClient.close(); LOG.info("Stopping Cluster"); cluster.stopRunning(); }
@Override public void run() { while (!isStopped()) { try { List<HRegionInfo> regions = MetaScanner.listAllRegions(TEST_UTIL.getConfiguration(), connection, false); // select a random region HRegionInfo parent = regions.get(random.nextInt(regions.size())); if (parent == null || !TABLENAME.equals(parent.getTable())) { continue; } long startKey = 0, endKey = Long.MAX_VALUE; byte[] start = parent.getStartKey(); byte[] end = parent.getEndKey(); if (!Bytes.equals(HConstants.EMPTY_START_ROW, parent.getStartKey())) { startKey = Bytes.toLong(parent.getStartKey()); } if (!Bytes.equals(HConstants.EMPTY_END_ROW, parent.getEndKey())) { endKey = Bytes.toLong(parent.getEndKey()); } if (startKey == endKey) { continue; } long midKey = BigDecimal.valueOf(startKey) .add(BigDecimal.valueOf(endKey)) .divideToIntegralValue(BigDecimal.valueOf(2)) .longValue(); HRegionInfo splita = new HRegionInfo(TABLENAME, start, Bytes.toBytes(midKey)); HRegionInfo splitb = new HRegionInfo(TABLENAME, Bytes.toBytes(midKey), end); MetaTableAccessor.splitRegion( connection, parent, splita, splitb, ServerName.valueOf("fooserver", 1, 0)); Threads.sleep(random.nextInt(200)); } catch (Throwable e) { ex = e; Assert.fail(StringUtils.stringifyException(e)); } } }
@Test(timeout = 300000) public void testClusterRestart() throws Exception { UTIL.startMiniCluster(3); while (!UTIL.getMiniHBaseCluster().getMaster().isInitialized()) { Threads.sleep(1); } LOG.info("\n\nCreating tables"); for (byte[] TABLE : TABLES) { UTIL.createTable(TABLE, FAMILY); } for (byte[] TABLE : TABLES) { UTIL.waitTableEnabled(TABLE); } List<HRegionInfo> allRegions = MetaScanner.listAllRegions(UTIL.getConfiguration(), true); assertEquals(4, allRegions.size()); LOG.info("\n\nShutting down cluster"); UTIL.shutdownMiniHBaseCluster(); LOG.info("\n\nSleeping a bit"); Thread.sleep(2000); LOG.info("\n\nStarting cluster the second time"); UTIL.restartHBaseCluster(3); // Need to use a new 'Configuration' so we make a new HConnection. // Otherwise we're reusing an HConnection that has gone stale because // the shutdown of the cluster also called shut of the connection. allRegions = MetaScanner.listAllRegions(new Configuration(UTIL.getConfiguration()), true); assertEquals(4, allRegions.size()); LOG.info("\n\nWaiting for tables to be available"); for (byte[] TABLE : TABLES) { try { UTIL.createTable(TABLE, FAMILY); assertTrue("Able to create table that should already exist", false); } catch (TableExistsException tee) { LOG.info("Table already exists as expected"); } UTIL.waitTableAvailable(TABLE); } }
public static void stopMasterAndAssignMeta(HBaseTestingUtility HTU) throws IOException, InterruptedException { // Stop master HMaster master = HTU.getHBaseCluster().getMaster(); ServerName masterAddr = master.getServerName(); master.stopMaster(); Log.info("Waiting until master thread exits"); while (HTU.getHBaseCluster().getMasterThread() != null && HTU.getHBaseCluster().getMasterThread().isAlive()) { Threads.sleep(100); } HRegionServer.TEST_SKIP_REPORTING_TRANSITION = true; // Master is down, so is the meta. We need to assign it somewhere // so that regions can be assigned during the mocking phase. HRegionServer hrs = HTU.getHBaseCluster().getLiveRegionServerThreads().get(0).getRegionServer(); ZooKeeperWatcher zkw = hrs.getZooKeeper(); MetaTableLocator mtl = new MetaTableLocator(); ServerName sn = mtl.getMetaRegionLocation(zkw); if (sn != null && !masterAddr.equals(sn)) { return; } ProtobufUtil.openRegion( hrs.getRSRpcServices(), hrs.getServerName(), HRegionInfo.FIRST_META_REGIONINFO); while (true) { sn = mtl.getMetaRegionLocation(zkw); if (sn != null && sn.equals(hrs.getServerName()) && hrs.onlineRegions.containsKey(HRegionInfo.FIRST_META_REGIONINFO.getEncodedName())) { break; } Thread.sleep(100); } }
@Override public int run(String[] arg0) throws Exception { int errCode = 0; // TODO: Make command options. // How many servers to fake. final int servers = 1; // How many regions to put on the faked servers. final int regions = 100000; // How many 'keys' in the faked regions. final long namespaceSpan = 50000000; // How long to take to pause after doing a put; make this long if you want to fake a struggling // server. final long multiPause = 0; // Check args make basic sense. if ((namespaceSpan < regions) || (regions < servers)) { throw new IllegalArgumentException( "namespaceSpan=" + namespaceSpan + " must be > regions=" + regions + " which must be > servers=" + servers); } // Set my many servers and many regions faking connection in place. getConf().set("hbase.client.connection.impl", ManyServersManyRegionsConnection.class.getName()); // Use simple kv registry rather than zk getConf().set("hbase.client.registry.impl", SimpleRegistry.class.getName()); // When to report fails. Default is we report the 10th. This means we'll see log everytime // an exception is thrown -- usually RegionTooBusyException when we have more than // hbase.test.multi.too.many requests outstanding at any time. getConf().setInt("hbase.client.start.log.errors.counter", 0); // Ugly but this is only way to pass in configs.into ManyServersManyRegionsConnection class. getConf().setInt("hbase.test.regions", regions); getConf().setLong("hbase.test.namespace.span", namespaceSpan); getConf().setLong("hbase.test.servers", servers); getConf().set("hbase.test.tablename", Bytes.toString(BIG_USER_TABLE)); getConf().setLong("hbase.test.multi.pause.when.done", multiPause); // Let there be ten outstanding requests at a time before we throw RegionBusyException. getConf().setInt("hbase.test.multi.too.many", 10); final int clients = 2; // Have them all share the same connection so they all share the same instance of // ManyServersManyRegionsConnection so I can keep an eye on how many requests by server. final ExecutorService pool = Executors.newCachedThreadPool(Threads.getNamedThreadFactory("p")); // Executors.newFixedThreadPool(servers * 10, Threads.getNamedThreadFactory("p")); // Share a connection so I can keep counts in the 'server' on concurrency. final HConnection sharedConnection = HConnectionManager.createConnection(getConf() /*, pool*/); try { Thread[] ts = new Thread[clients]; for (int j = 0; j < ts.length; j++) { final int id = j; ts[j] = new Thread("" + j) { final Configuration c = getConf(); @Override public void run() { try { cycle(id, c, sharedConnection); } catch (IOException e) { e.printStackTrace(); } } }; ts[j].start(); } for (int j = 0; j < ts.length; j++) { ts[j].join(); } } finally { sharedConnection.close(); } return errCode; }
@Test public void testInterrupt50Percent() throws IOException, InterruptedException { final AtomicInteger noEx = new AtomicInteger(0); final AtomicInteger badEx = new AtomicInteger(0); final AtomicInteger noInt = new AtomicInteger(0); final AtomicInteger done = new AtomicInteger(0); List<Thread> threads = new ArrayList<Thread>(); final int nbThread = 100; for (int i = 0; i < nbThread; i++) { Thread t = new Thread() { @Override public void run() { try { Table ht = util.getConnection().getTable(tableName); Result r = ht.get(new Get(row1)); noEx.incrementAndGet(); } catch (IOException e) { LOG.info("exception", e); if (!(e instanceof InterruptedIOException) || (e instanceof SocketTimeoutException)) { badEx.incrementAndGet(); } else { if (Thread.currentThread().isInterrupted()) { noInt.incrementAndGet(); LOG.info("The thread should NOT be with the 'interrupt' status."); } } } finally { done.incrementAndGet(); } } }; t.setName("TestClientOperationInterrupt #" + i); threads.add(t); t.start(); } for (int i = 0; i < nbThread / 2; i++) { threads.get(i).interrupt(); } boolean stillAlive = true; while (stillAlive) { stillAlive = false; for (Thread t : threads) { if (t.isAlive()) { stillAlive = true; } } Threads.sleep(10); } Assert.assertFalse(Thread.currentThread().isInterrupted()); Assert.assertTrue( " noEx: " + noEx.get() + ", badEx=" + badEx.get() + ", noInt=" + noInt.get(), noEx.get() == nbThread / 2 && badEx.get() == 0); // The problem here is that we need the server to free its handlers to handle all operations while (done.get() != nbThread) { Thread.sleep(1); } Table ht = util.getConnection().getTable(tableName); Result r = ht.get(new Get(row1)); Assert.assertFalse(r.isEmpty()); }
/** * Reproduce locking up that happens when we get an inopportune sync during setup for zigzaglatch * wait. See HBASE-14317. If below is broken, we will see this test timeout because it is locked * up. * * <p>First I need to set up some mocks for Server and RegionServerServices. I also need to set up * a dodgy WAL that will throw an exception when we go to append to it. */ @Test(timeout = 20000) public void testLockupWhenSyncInMiddleOfZigZagSetup() throws IOException { // A WAL that we can have throw exceptions when a flag is set. class DodgyFSLog extends FSHLog { // Set this when want the WAL to start throwing exceptions. volatile boolean throwException = false; // Latch to hold up processing until after another operation has had time to run. CountDownLatch latch = new CountDownLatch(1); public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf) throws IOException { super(fs, root, logDir, conf); } @Override protected void afterCreatingZigZagLatch() { // If throwException set, then append will throw an exception causing the WAL to be // rolled. We'll come in here. Hold up processing until a sync can get in before // the zigzag has time to complete its setup and get its own sync in. This is what causes // the lock up we've seen in production. if (throwException) { try { LOG.info("LATCHED"); // So, timing can have it that the test can run and the bad flush below happens // before we get here. In this case, we'll be stuck waiting on this latch but there // is nothing in the WAL pipeline to get us to the below beforeWaitOnSafePoint... // because all WALs have rolled. In this case, just give up on test. if (!this.latch.await(5, TimeUnit.SECONDS)) { LOG.warn("GIVE UP! Failed waiting on latch...Test is ABORTED!"); } } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } @Override protected void beforeWaitOnSafePoint() { if (throwException) { LOG.info("COUNTDOWN"); // Don't countdown latch until someone waiting on it otherwise, the above // afterCreatingZigZagLatch will get to the latch and no one will ever free it and we'll // be stuck; test won't go down while (this.latch.getCount() <= 0) Threads.sleep(1); this.latch.countDown(); } } @Override protected Writer createWriterInstance(Path path) throws IOException { final Writer w = super.createWriterInstance(path); return new Writer() { @Override public void close() throws IOException { w.close(); } @Override public void sync() throws IOException { if (throwException) { throw new IOException("FAKE! Failed to replace a bad datanode...SYNC"); } w.sync(); } @Override public void append(Entry entry) throws IOException { if (throwException) { throw new IOException("FAKE! Failed to replace a bad datanode...APPEND"); } w.append(entry); } @Override public long getLength() { return w.getLength(); } }; } } // Mocked up server and regionserver services. Needed below. Server server = Mockito.mock(Server.class); Mockito.when(server.getConfiguration()).thenReturn(CONF); Mockito.when(server.isStopped()).thenReturn(false); Mockito.when(server.isAborted()).thenReturn(false); RegionServerServices services = Mockito.mock(RegionServerServices.class); // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test. FileSystem fs = FileSystem.get(CONF); Path rootDir = new Path(dir + getName()); DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF); Path originalWAL = dodgyWAL.getCurrentFileName(); // I need a log roller running. LogRoller logRoller = new LogRoller(server, services); logRoller.addWAL(dodgyWAL); // There is no 'stop' once a logRoller is running.. it just dies. logRoller.start(); // Now get a region and start adding in edits. HTableDescriptor htd = new HTableDescriptor(TableName.META_TABLE_NAME); final HRegion region = initHRegion(tableName, null, null, dodgyWAL); byte[] bytes = Bytes.toBytes(getName()); NavigableMap<byte[], Integer> scopes = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR); scopes.put(COLUMN_FAMILY_BYTES, 0); MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl(); try { // First get something into memstore. Make a Put and then pull the Cell out of it. Will // manage append and sync carefully in below to manufacture hang. We keep adding same // edit. WAL subsystem doesn't care. Put put = new Put(bytes); put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), bytes); WALKey key = new WALKey( region.getRegionInfo().getEncodedNameAsBytes(), htd.getTableName(), System.currentTimeMillis(), mvcc, scopes); WALEdit edit = new WALEdit(); CellScanner CellScanner = put.cellScanner(); assertTrue(CellScanner.advance()); edit.add(CellScanner.current()); // Put something in memstore and out in the WAL. Do a big number of appends so we push // out other side of the ringbuffer. If small numbers, stuff doesn't make it to WAL for (int i = 0; i < 1000; i++) { region.put(put); } // Set it so we start throwing exceptions. LOG.info("SET throwing of exception on append"); dodgyWAL.throwException = true; // This append provokes a WAL roll request dodgyWAL.append(region.getRegionInfo(), key, edit, true); boolean exception = false; try { dodgyWAL.sync(); } catch (Exception e) { exception = true; } assertTrue("Did not get sync exception", exception); // Get a memstore flush going too so we have same hung profile as up in the issue over // in HBASE-14317. Flush hangs trying to get sequenceid because the ringbuffer is held up // by the zigzaglatch waiting on syncs to come home. Thread t = new Thread("Flusher") { public void run() { try { if (region.getMemstoreSize() <= 0) { throw new IOException("memstore size=" + region.getMemstoreSize()); } region.flush(false); } catch (IOException e) { // Can fail trying to flush in middle of a roll. Not a failure. Will succeed later // when roll completes. LOG.info("In flush", e); } LOG.info("Exiting"); }; }; t.setDaemon(true); t.start(); // Wait until while (dodgyWAL.latch.getCount() > 0) Threads.sleep(1); // Now assert I got a new WAL file put in place even though loads of errors above. assertTrue(originalWAL != dodgyWAL.getCurrentFileName()); // Can I append to it? dodgyWAL.throwException = false; try { region.put(put); } catch (Exception e) { LOG.info("In the put", e); } } finally { // To stop logRoller, its server has to say it is stopped. Mockito.when(server.isStopped()).thenReturn(true); if (logRoller != null) logRoller.close(); try { if (region != null) region.close(); if (dodgyWAL != null) dodgyWAL.close(); } catch (Exception e) { LOG.info("On way out", e); } } }
/** * Reproduce locking up that happens when there's no further syncs after append fails, and causing * an isolated sync then infinite wait. See HBASE-16960. If below is broken, we will see this test * timeout because it is locked up. * * <p>Steps for reproduce:<br> * 1. Trigger server abort through dodgyWAL1<br> * 2. Add a {@link DummyWALActionsListener} to dodgyWAL2 to cause ringbuffer event handler thread * sleep for a while thus keeping {@code endOfBatch} false<br> * 3. Publish a sync then an append which will throw exception, check whether the sync could * return */ @Test(timeout = 20000) public void testLockup16960() throws IOException { // A WAL that we can have throw exceptions when a flag is set. class DodgyFSLog extends FSHLog { // Set this when want the WAL to start throwing exceptions. volatile boolean throwException = false; public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf) throws IOException { super(fs, root, logDir, conf); } @Override protected Writer createWriterInstance(Path path) throws IOException { final Writer w = super.createWriterInstance(path); return new Writer() { @Override public void close() throws IOException { w.close(); } @Override public void sync() throws IOException { if (throwException) { throw new IOException("FAKE! Failed to replace a bad datanode...SYNC"); } w.sync(); } @Override public void append(Entry entry) throws IOException { if (throwException) { throw new IOException("FAKE! Failed to replace a bad datanode...APPEND"); } w.append(entry); } @Override public long getLength() { return w.getLength(); } }; } @Override protected long doReplaceWriter(Path oldPath, Path newPath, Writer nextWriter) throws IOException { if (throwException) { throw new FailedLogCloseException("oldPath=" + oldPath + ", newPath=" + newPath); } long oldFileLen = 0L; oldFileLen = super.doReplaceWriter(oldPath, newPath, nextWriter); return oldFileLen; } } // Mocked up server and regionserver services. Needed below. Server server = new DummyServer(CONF, ServerName.valueOf("hostname1.example.org", 1234, 1L).toString()); RegionServerServices services = Mockito.mock(RegionServerServices.class); CONF.setLong("hbase.regionserver.hlog.sync.timeout", 10000); // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, // go ahead with test. FileSystem fs = FileSystem.get(CONF); Path rootDir = new Path(dir + getName()); DodgyFSLog dodgyWAL1 = new DodgyFSLog(fs, rootDir, getName(), CONF); Path rootDir2 = new Path(dir + getName() + "2"); final DodgyFSLog dodgyWAL2 = new DodgyFSLog(fs, rootDir2, getName() + "2", CONF); // Add a listener to force ringbuffer event handler sleep for a while dodgyWAL2.registerWALActionsListener(new DummyWALActionsListener()); // I need a log roller running. LogRoller logRoller = new LogRoller(server, services); logRoller.addWAL(dodgyWAL1); logRoller.addWAL(dodgyWAL2); // There is no 'stop' once a logRoller is running.. it just dies. logRoller.start(); // Now get a region and start adding in edits. HTableDescriptor htd = new HTableDescriptor(TableName.META_TABLE_NAME); final HRegion region = initHRegion(tableName, null, null, dodgyWAL1); byte[] bytes = Bytes.toBytes(getName()); NavigableMap<byte[], Integer> scopes = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR); scopes.put(COLUMN_FAMILY_BYTES, 0); MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl(); try { Put put = new Put(bytes); put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), bytes); WALKey key = new WALKey( region.getRegionInfo().getEncodedNameAsBytes(), htd.getTableName(), System.currentTimeMillis(), mvcc, scopes); WALEdit edit = new WALEdit(); CellScanner CellScanner = put.cellScanner(); assertTrue(CellScanner.advance()); edit.add(CellScanner.current()); LOG.info("SET throwing of exception on append"); dodgyWAL1.throwException = true; // This append provokes a WAL roll request dodgyWAL1.append(region.getRegionInfo(), key, edit, true); boolean exception = false; try { dodgyWAL1.sync(); } catch (Exception e) { exception = true; } assertTrue("Did not get sync exception", exception); // LogRoller call dodgyWAL1.rollWriter get FailedLogCloseException and // cause server abort. try { // wait LogRoller exit. Thread.sleep(50); } catch (InterruptedException e) { e.printStackTrace(); } final CountDownLatch latch = new CountDownLatch(1); // make RingBufferEventHandler sleep 1s, so the following sync // endOfBatch=false key = new WALKey( region.getRegionInfo().getEncodedNameAsBytes(), TableName.valueOf("sleep"), System.currentTimeMillis(), mvcc, scopes); dodgyWAL2.append(region.getRegionInfo(), key, edit, true); Thread t = new Thread("Sync") { public void run() { try { dodgyWAL2.sync(); } catch (IOException e) { LOG.info("In sync", e); } latch.countDown(); LOG.info("Sync exiting"); }; }; t.setDaemon(true); t.start(); try { // make sure sync have published. Thread.sleep(100); } catch (InterruptedException e1) { e1.printStackTrace(); } // make append throw DamagedWALException key = new WALKey( region.getRegionInfo().getEncodedNameAsBytes(), TableName.valueOf("DamagedWALException"), System.currentTimeMillis(), mvcc, scopes); dodgyWAL2.append(region.getRegionInfo(), key, edit, true); while (latch.getCount() > 0) { Threads.sleep(100); } assertTrue(server.isAborted()); } finally { if (logRoller != null) { logRoller.close(); } try { if (region != null) { region.close(); } if (dodgyWAL1 != null) { dodgyWAL1.close(); } if (dodgyWAL2 != null) { dodgyWAL2.close(); } } catch (Exception e) { LOG.info("On way out", e); } } }
private void doIncrementalLoadTest(boolean shouldChangeRegions, boolean shouldKeepLocality) throws Exception { util = new HBaseTestingUtility(); Configuration conf = util.getConfiguration(); conf.setBoolean(HFileOutputFormat2.LOCALITY_SENSITIVE_CONF_KEY, shouldKeepLocality); int hostCount = 1; int regionNum = 5; if (shouldKeepLocality) { // We should change host count higher than hdfs replica count when MiniHBaseCluster supports // explicit hostnames parameter just like MiniDFSCluster does. hostCount = 3; regionNum = 20; } byte[][] splitKeys = generateRandomSplitKeys(regionNum - 1); String[] hostnames = new String[hostCount]; for (int i = 0; i < hostCount; ++i) { hostnames[i] = "datanode_" + i; } util.startMiniCluster(1, hostCount, hostnames); Table table = util.createTable(TABLE_NAME, FAMILIES, splitKeys); Path testDir = util.getDataTestDirOnTestFS("testLocalMRIncrementalLoad"); try (RegionLocator r = util.getConnection().getRegionLocator(TABLE_NAME); Admin admin = util.getConnection().getAdmin(); ) { assertEquals("Should start with empty table", 0, util.countRows(table)); int numRegions = r.getStartKeys().length; assertEquals("Should make " + regionNum + " regions", numRegions, regionNum); // Generate the bulk load files runIncrementalPELoad(conf, table.getTableDescriptor(), r, testDir); // This doesn't write into the table, just makes files assertEquals("HFOF should not touch actual table", 0, util.countRows(table)); // Make sure that a directory was created for every CF int dir = 0; for (FileStatus f : testDir.getFileSystem(conf).listStatus(testDir)) { for (byte[] family : FAMILIES) { if (Bytes.toString(family).equals(f.getPath().getName())) { ++dir; } } } assertEquals("Column family not found in FS.", FAMILIES.length, dir); // handle the split case if (shouldChangeRegions) { LOG.info("Changing regions in table"); admin.disableTable(table.getName()); while (util.getMiniHBaseCluster() .getMaster() .getAssignmentManager() .getRegionStates() .isRegionsInTransition()) { Threads.sleep(200); LOG.info("Waiting on table to finish disabling"); } util.deleteTable(table.getName()); byte[][] newSplitKeys = generateRandomSplitKeys(14); table = util.createTable(TABLE_NAME, FAMILIES, newSplitKeys); while (util.getConnection().getRegionLocator(TABLE_NAME).getAllRegionLocations().size() != 15 || !admin.isTableAvailable(table.getName())) { Thread.sleep(200); LOG.info("Waiting for new region assignment to happen"); } } // Perform the actual load new LoadIncrementalHFiles(conf).doBulkLoad(testDir, admin, table, r); // Ensure data shows up int expectedRows = NMapInputFormat.getNumMapTasks(conf) * ROWSPERSPLIT; assertEquals( "LoadIncrementalHFiles should put expected data in table", expectedRows, util.countRows(table)); Scan scan = new Scan(); ResultScanner results = table.getScanner(scan); for (Result res : results) { assertEquals(FAMILIES.length, res.rawCells().length); Cell first = res.rawCells()[0]; for (Cell kv : res.rawCells()) { assertTrue(CellUtil.matchingRow(first, kv)); assertTrue(Bytes.equals(CellUtil.cloneValue(first), CellUtil.cloneValue(kv))); } } results.close(); String tableDigestBefore = util.checksumRows(table); // Check region locality HDFSBlocksDistribution hbd = new HDFSBlocksDistribution(); for (HRegion region : util.getHBaseCluster().getRegions(TABLE_NAME)) { hbd.add(region.getHDFSBlocksDistribution()); } for (String hostname : hostnames) { float locality = hbd.getBlockLocalityIndex(hostname); LOG.info("locality of [" + hostname + "]: " + locality); assertEquals(100, (int) (locality * 100)); } // Cause regions to reopen admin.disableTable(TABLE_NAME); while (!admin.isTableDisabled(TABLE_NAME)) { Thread.sleep(200); LOG.info("Waiting for table to disable"); } admin.enableTable(TABLE_NAME); util.waitTableAvailable(TABLE_NAME); assertEquals( "Data should remain after reopening of regions", tableDigestBefore, util.checksumRows(table)); } finally { testDir.getFileSystem(conf).delete(testDir, true); util.deleteTable(TABLE_NAME); util.shutdownMiniCluster(); } }
@Test public void testConcurrentMetaScannerAndCatalogJanitor() throws Throwable { /* TEST PLAN: start with only one region in a table. Have a splitter * thread and metascanner threads that continously scan the meta table for regions. * CatalogJanitor from master will run frequently to clean things up */ TEST_UTIL.getConfiguration().setLong("hbase.catalogjanitor.interval", 500); setUp(); final long runtime = 30 * 1000; // 30 sec LOG.info("Starting testConcurrentMetaScannerAndCatalogJanitor"); final TableName TABLENAME = TableName.valueOf("testConcurrentMetaScannerAndCatalogJanitor"); final byte[] FAMILY = Bytes.toBytes("family"); TEST_UTIL.createTable(TABLENAME, FAMILY); class RegionMetaSplitter extends StoppableImplementation implements Runnable { Random random = new Random(); Throwable ex = null; @Override public void run() { while (!isStopped()) { try { List<HRegionInfo> regions = MetaScanner.listAllRegions(TEST_UTIL.getConfiguration(), connection, false); // select a random region HRegionInfo parent = regions.get(random.nextInt(regions.size())); if (parent == null || !TABLENAME.equals(parent.getTable())) { continue; } long startKey = 0, endKey = Long.MAX_VALUE; byte[] start = parent.getStartKey(); byte[] end = parent.getEndKey(); if (!Bytes.equals(HConstants.EMPTY_START_ROW, parent.getStartKey())) { startKey = Bytes.toLong(parent.getStartKey()); } if (!Bytes.equals(HConstants.EMPTY_END_ROW, parent.getEndKey())) { endKey = Bytes.toLong(parent.getEndKey()); } if (startKey == endKey) { continue; } long midKey = BigDecimal.valueOf(startKey) .add(BigDecimal.valueOf(endKey)) .divideToIntegralValue(BigDecimal.valueOf(2)) .longValue(); HRegionInfo splita = new HRegionInfo(TABLENAME, start, Bytes.toBytes(midKey)); HRegionInfo splitb = new HRegionInfo(TABLENAME, Bytes.toBytes(midKey), end); MetaTableAccessor.splitRegion( connection, parent, splita, splitb, ServerName.valueOf("fooserver", 1, 0)); Threads.sleep(random.nextInt(200)); } catch (Throwable e) { ex = e; Assert.fail(StringUtils.stringifyException(e)); } } } void rethrowExceptionIfAny() throws Throwable { if (ex != null) { throw ex; } } } class MetaScannerVerifier extends StoppableImplementation implements Runnable { Random random = new Random(); Throwable ex = null; @Override public void run() { while (!isStopped()) { try { NavigableMap<HRegionInfo, ServerName> regions = MetaScanner.allTableRegions(connection, TABLENAME); LOG.info("-------"); byte[] lastEndKey = HConstants.EMPTY_START_ROW; for (HRegionInfo hri : regions.navigableKeySet()) { long startKey = 0, endKey = Long.MAX_VALUE; if (!Bytes.equals(HConstants.EMPTY_START_ROW, hri.getStartKey())) { startKey = Bytes.toLong(hri.getStartKey()); } if (!Bytes.equals(HConstants.EMPTY_END_ROW, hri.getEndKey())) { endKey = Bytes.toLong(hri.getEndKey()); } LOG.info("start:" + startKey + " end:" + endKey + " hri:" + hri); Assert.assertTrue( "lastEndKey=" + Bytes.toString(lastEndKey) + ", startKey=" + Bytes.toString(hri.getStartKey()), Bytes.equals(lastEndKey, hri.getStartKey())); lastEndKey = hri.getEndKey(); } Assert.assertTrue(Bytes.equals(lastEndKey, HConstants.EMPTY_END_ROW)); LOG.info("-------"); Threads.sleep(10 + random.nextInt(50)); } catch (Throwable e) { ex = e; Assert.fail(StringUtils.stringifyException(e)); } } } void rethrowExceptionIfAny() throws Throwable { if (ex != null) { throw ex; } } } RegionMetaSplitter regionMetaSplitter = new RegionMetaSplitter(); MetaScannerVerifier metaScannerVerifier = new MetaScannerVerifier(); Thread regionMetaSplitterThread = new Thread(regionMetaSplitter); Thread metaScannerVerifierThread = new Thread(metaScannerVerifier); regionMetaSplitterThread.start(); metaScannerVerifierThread.start(); Threads.sleep(runtime); regionMetaSplitter.stop("test finished"); metaScannerVerifier.stop("test finished"); regionMetaSplitterThread.join(); metaScannerVerifierThread.join(); regionMetaSplitter.rethrowExceptionIfAny(); metaScannerVerifier.rethrowExceptionIfAny(); }
private void doIncrementalLoadTest(boolean shouldChangeRegions) throws Exception { util = new HBaseTestingUtility(); Configuration conf = util.getConfiguration(); byte[][] splitKeys = generateRandomSplitKeys(4); util.startMiniCluster(); try { HTable table = util.createTable(TABLE_NAME, FAMILIES, splitKeys); Admin admin = table.getConnection().getAdmin(); Path testDir = util.getDataTestDirOnTestFS("testLocalMRIncrementalLoad"); assertEquals("Should start with empty table", 0, util.countRows(table)); int numRegions = -1; try (RegionLocator r = table.getRegionLocator()) { numRegions = r.getStartKeys().length; } assertEquals("Should make 5 regions", numRegions, 5); // Generate the bulk load files util.startMiniMapReduceCluster(); runIncrementalPELoad(conf, table.getTableDescriptor(), table.getRegionLocator(), testDir); // This doesn't write into the table, just makes files assertEquals("HFOF should not touch actual table", 0, util.countRows(table)); // Make sure that a directory was created for every CF int dir = 0; for (FileStatus f : testDir.getFileSystem(conf).listStatus(testDir)) { for (byte[] family : FAMILIES) { if (Bytes.toString(family).equals(f.getPath().getName())) { ++dir; } } } assertEquals("Column family not found in FS.", FAMILIES.length, dir); // handle the split case if (shouldChangeRegions) { LOG.info("Changing regions in table"); admin.disableTable(table.getName()); while (util.getMiniHBaseCluster() .getMaster() .getAssignmentManager() .getRegionStates() .isRegionsInTransition()) { Threads.sleep(200); LOG.info("Waiting on table to finish disabling"); } util.deleteTable(table.getName()); byte[][] newSplitKeys = generateRandomSplitKeys(14); table = util.createTable(TABLE_NAME, FAMILIES, newSplitKeys); while (table.getRegionLocator().getAllRegionLocations().size() != 15 || !admin.isTableAvailable(table.getName())) { Thread.sleep(200); LOG.info("Waiting for new region assignment to happen"); } } // Perform the actual load new LoadIncrementalHFiles(conf).doBulkLoad(testDir, table); // Ensure data shows up int expectedRows = NMapInputFormat.getNumMapTasks(conf) * ROWSPERSPLIT; assertEquals( "LoadIncrementalHFiles should put expected data in table", expectedRows, util.countRows(table)); Scan scan = new Scan(); ResultScanner results = table.getScanner(scan); for (Result res : results) { assertEquals(FAMILIES.length, res.rawCells().length); Cell first = res.rawCells()[0]; for (Cell kv : res.rawCells()) { assertTrue(CellUtil.matchingRow(first, kv)); assertTrue(Bytes.equals(CellUtil.cloneValue(first), CellUtil.cloneValue(kv))); } } results.close(); String tableDigestBefore = util.checksumRows(table); // Cause regions to reopen admin.disableTable(TABLE_NAME); while (!admin.isTableDisabled(TABLE_NAME)) { Thread.sleep(200); LOG.info("Waiting for table to disable"); } admin.enableTable(TABLE_NAME); util.waitTableAvailable(TABLE_NAME); assertEquals( "Data should remain after reopening of regions", tableDigestBefore, util.checksumRows(table)); } finally { util.shutdownMiniMapReduceCluster(); util.shutdownMiniCluster(); } }