예제 #1
0
  /**
   * Reproduce locking up that happens when there's no further syncs after append fails, and causing
   * an isolated sync then infinite wait. See HBASE-16960. If below is broken, we will see this test
   * timeout because it is locked up.
   *
   * <p>Steps for reproduce:<br>
   * 1. Trigger server abort through dodgyWAL1<br>
   * 2. Add a {@link DummyWALActionsListener} to dodgyWAL2 to cause ringbuffer event handler thread
   * sleep for a while thus keeping {@code endOfBatch} false<br>
   * 3. Publish a sync then an append which will throw exception, check whether the sync could
   * return
   */
  @Test(timeout = 20000)
  public void testLockup16960() throws IOException {
    // A WAL that we can have throw exceptions when a flag is set.
    class DodgyFSLog extends FSHLog {
      // Set this when want the WAL to start throwing exceptions.
      volatile boolean throwException = false;

      public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf)
          throws IOException {
        super(fs, root, logDir, conf);
      }

      @Override
      protected Writer createWriterInstance(Path path) throws IOException {
        final Writer w = super.createWriterInstance(path);
        return new Writer() {
          @Override
          public void close() throws IOException {
            w.close();
          }

          @Override
          public void sync() throws IOException {
            if (throwException) {
              throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
            }
            w.sync();
          }

          @Override
          public void append(Entry entry) throws IOException {
            if (throwException) {
              throw new IOException("FAKE! Failed to replace a bad datanode...APPEND");
            }
            w.append(entry);
          }

          @Override
          public long getLength() {
            return w.getLength();
          }
        };
      }

      @Override
      protected long doReplaceWriter(Path oldPath, Path newPath, Writer nextWriter)
          throws IOException {
        if (throwException) {
          throw new FailedLogCloseException("oldPath=" + oldPath + ", newPath=" + newPath);
        }
        long oldFileLen = 0L;
        oldFileLen = super.doReplaceWriter(oldPath, newPath, nextWriter);
        return oldFileLen;
      }
    }

    // Mocked up server and regionserver services. Needed below.
    Server server =
        new DummyServer(CONF, ServerName.valueOf("hostname1.example.org", 1234, 1L).toString());
    RegionServerServices services = Mockito.mock(RegionServerServices.class);

    CONF.setLong("hbase.regionserver.hlog.sync.timeout", 10000);

    // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL,
    // go ahead with test.
    FileSystem fs = FileSystem.get(CONF);
    Path rootDir = new Path(dir + getName());
    DodgyFSLog dodgyWAL1 = new DodgyFSLog(fs, rootDir, getName(), CONF);

    Path rootDir2 = new Path(dir + getName() + "2");
    final DodgyFSLog dodgyWAL2 = new DodgyFSLog(fs, rootDir2, getName() + "2", CONF);
    // Add a listener to force ringbuffer event handler sleep for a while
    dodgyWAL2.registerWALActionsListener(new DummyWALActionsListener());

    // I need a log roller running.
    LogRoller logRoller = new LogRoller(server, services);
    logRoller.addWAL(dodgyWAL1);
    logRoller.addWAL(dodgyWAL2);
    // There is no 'stop' once a logRoller is running.. it just dies.
    logRoller.start();
    // Now get a region and start adding in edits.
    HTableDescriptor htd = new HTableDescriptor(TableName.META_TABLE_NAME);
    final HRegion region = initHRegion(tableName, null, null, dodgyWAL1);
    byte[] bytes = Bytes.toBytes(getName());
    NavigableMap<byte[], Integer> scopes = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
    scopes.put(COLUMN_FAMILY_BYTES, 0);
    MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl();
    try {
      Put put = new Put(bytes);
      put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), bytes);
      WALKey key =
          new WALKey(
              region.getRegionInfo().getEncodedNameAsBytes(),
              htd.getTableName(),
              System.currentTimeMillis(),
              mvcc,
              scopes);
      WALEdit edit = new WALEdit();
      CellScanner CellScanner = put.cellScanner();
      assertTrue(CellScanner.advance());
      edit.add(CellScanner.current());

      LOG.info("SET throwing of exception on append");
      dodgyWAL1.throwException = true;
      // This append provokes a WAL roll request
      dodgyWAL1.append(region.getRegionInfo(), key, edit, true);
      boolean exception = false;
      try {
        dodgyWAL1.sync();
      } catch (Exception e) {
        exception = true;
      }
      assertTrue("Did not get sync exception", exception);

      // LogRoller call dodgyWAL1.rollWriter get FailedLogCloseException and
      // cause server abort.
      try {
        // wait LogRoller exit.
        Thread.sleep(50);
      } catch (InterruptedException e) {
        e.printStackTrace();
      }

      final CountDownLatch latch = new CountDownLatch(1);

      // make RingBufferEventHandler sleep 1s, so the following sync
      // endOfBatch=false
      key =
          new WALKey(
              region.getRegionInfo().getEncodedNameAsBytes(),
              TableName.valueOf("sleep"),
              System.currentTimeMillis(),
              mvcc,
              scopes);
      dodgyWAL2.append(region.getRegionInfo(), key, edit, true);

      Thread t =
          new Thread("Sync") {
            public void run() {
              try {
                dodgyWAL2.sync();
              } catch (IOException e) {
                LOG.info("In sync", e);
              }
              latch.countDown();
              LOG.info("Sync exiting");
            };
          };
      t.setDaemon(true);
      t.start();
      try {
        // make sure sync have published.
        Thread.sleep(100);
      } catch (InterruptedException e1) {
        e1.printStackTrace();
      }
      // make append throw DamagedWALException
      key =
          new WALKey(
              region.getRegionInfo().getEncodedNameAsBytes(),
              TableName.valueOf("DamagedWALException"),
              System.currentTimeMillis(),
              mvcc,
              scopes);
      dodgyWAL2.append(region.getRegionInfo(), key, edit, true);

      while (latch.getCount() > 0) {
        Threads.sleep(100);
      }
      assertTrue(server.isAborted());
    } finally {
      if (logRoller != null) {
        logRoller.close();
      }
      try {
        if (region != null) {
          region.close();
        }
        if (dodgyWAL1 != null) {
          dodgyWAL1.close();
        }
        if (dodgyWAL2 != null) {
          dodgyWAL2.close();
        }
      } catch (Exception e) {
        LOG.info("On way out", e);
      }
    }
  }
예제 #2
0
  /**
   * Reproduce locking up that happens when we get an inopportune sync during setup for zigzaglatch
   * wait. See HBASE-14317. If below is broken, we will see this test timeout because it is locked
   * up.
   *
   * <p>First I need to set up some mocks for Server and RegionServerServices. I also need to set up
   * a dodgy WAL that will throw an exception when we go to append to it.
   */
  @Test(timeout = 20000)
  public void testLockupWhenSyncInMiddleOfZigZagSetup() throws IOException {
    // A WAL that we can have throw exceptions when a flag is set.
    class DodgyFSLog extends FSHLog {
      // Set this when want the WAL to start throwing exceptions.
      volatile boolean throwException = false;

      // Latch to hold up processing until after another operation has had time to run.
      CountDownLatch latch = new CountDownLatch(1);

      public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf)
          throws IOException {
        super(fs, root, logDir, conf);
      }

      @Override
      protected void afterCreatingZigZagLatch() {
        // If throwException set, then append will throw an exception causing the WAL to be
        // rolled. We'll come in here. Hold up processing until a sync can get in before
        // the zigzag has time to complete its setup and get its own sync in. This is what causes
        // the lock up we've seen in production.
        if (throwException) {
          try {
            LOG.info("LATCHED");
            // So, timing can have it that the test can run and the bad flush below happens
            // before we get here. In this case, we'll be stuck waiting on this latch but there
            // is nothing in the WAL pipeline to get us to the below beforeWaitOnSafePoint...
            // because all WALs have rolled. In this case, just give up on test.
            if (!this.latch.await(5, TimeUnit.SECONDS)) {
              LOG.warn("GIVE UP! Failed waiting on latch...Test is ABORTED!");
            }
          } catch (InterruptedException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
          }
        }
      }

      @Override
      protected void beforeWaitOnSafePoint() {
        if (throwException) {
          LOG.info("COUNTDOWN");
          // Don't countdown latch until someone waiting on it otherwise, the above
          // afterCreatingZigZagLatch will get to the latch and no one will ever free it and we'll
          // be stuck; test won't go down
          while (this.latch.getCount() <= 0) Threads.sleep(1);
          this.latch.countDown();
        }
      }

      @Override
      protected Writer createWriterInstance(Path path) throws IOException {
        final Writer w = super.createWriterInstance(path);
        return new Writer() {
          @Override
          public void close() throws IOException {
            w.close();
          }

          @Override
          public void sync() throws IOException {
            if (throwException) {
              throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
            }
            w.sync();
          }

          @Override
          public void append(Entry entry) throws IOException {
            if (throwException) {
              throw new IOException("FAKE! Failed to replace a bad datanode...APPEND");
            }
            w.append(entry);
          }

          @Override
          public long getLength() {
            return w.getLength();
          }
        };
      }
    }

    // Mocked up server and regionserver services. Needed below.
    Server server = Mockito.mock(Server.class);
    Mockito.when(server.getConfiguration()).thenReturn(CONF);
    Mockito.when(server.isStopped()).thenReturn(false);
    Mockito.when(server.isAborted()).thenReturn(false);
    RegionServerServices services = Mockito.mock(RegionServerServices.class);

    // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test.
    FileSystem fs = FileSystem.get(CONF);
    Path rootDir = new Path(dir + getName());
    DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF);
    Path originalWAL = dodgyWAL.getCurrentFileName();
    // I need a log roller running.
    LogRoller logRoller = new LogRoller(server, services);
    logRoller.addWAL(dodgyWAL);
    // There is no 'stop' once a logRoller is running.. it just dies.
    logRoller.start();
    // Now get a region and start adding in edits.
    HTableDescriptor htd = new HTableDescriptor(TableName.META_TABLE_NAME);
    final HRegion region = initHRegion(tableName, null, null, dodgyWAL);
    byte[] bytes = Bytes.toBytes(getName());
    NavigableMap<byte[], Integer> scopes = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
    scopes.put(COLUMN_FAMILY_BYTES, 0);
    MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl();
    try {
      // First get something into memstore. Make a Put and then pull the Cell out of it. Will
      // manage append and sync carefully in below to manufacture hang. We keep adding same
      // edit. WAL subsystem doesn't care.
      Put put = new Put(bytes);
      put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), bytes);
      WALKey key =
          new WALKey(
              region.getRegionInfo().getEncodedNameAsBytes(),
              htd.getTableName(),
              System.currentTimeMillis(),
              mvcc,
              scopes);
      WALEdit edit = new WALEdit();
      CellScanner CellScanner = put.cellScanner();
      assertTrue(CellScanner.advance());
      edit.add(CellScanner.current());
      // Put something in memstore and out in the WAL. Do a big number of appends so we push
      // out other side of the ringbuffer. If small numbers, stuff doesn't make it to WAL
      for (int i = 0; i < 1000; i++) {
        region.put(put);
      }
      // Set it so we start throwing exceptions.
      LOG.info("SET throwing of exception on append");
      dodgyWAL.throwException = true;
      // This append provokes a WAL roll request
      dodgyWAL.append(region.getRegionInfo(), key, edit, true);
      boolean exception = false;
      try {
        dodgyWAL.sync();
      } catch (Exception e) {
        exception = true;
      }
      assertTrue("Did not get sync exception", exception);

      // Get a memstore flush going too so we have same hung profile as up in the issue over
      // in HBASE-14317. Flush hangs trying to get sequenceid because the ringbuffer is held up
      // by the zigzaglatch waiting on syncs to come home.
      Thread t =
          new Thread("Flusher") {
            public void run() {
              try {
                if (region.getMemstoreSize() <= 0) {
                  throw new IOException("memstore size=" + region.getMemstoreSize());
                }
                region.flush(false);
              } catch (IOException e) {
                // Can fail trying to flush in middle of a roll. Not a failure. Will succeed later
                // when roll completes.
                LOG.info("In flush", e);
              }
              LOG.info("Exiting");
            };
          };
      t.setDaemon(true);
      t.start();
      // Wait until
      while (dodgyWAL.latch.getCount() > 0) Threads.sleep(1);
      // Now assert I got a new WAL file put in place even though loads of errors above.
      assertTrue(originalWAL != dodgyWAL.getCurrentFileName());
      // Can I append to it?
      dodgyWAL.throwException = false;
      try {
        region.put(put);
      } catch (Exception e) {
        LOG.info("In the put", e);
      }
    } finally {
      // To stop logRoller, its server has to say it is stopped.
      Mockito.when(server.isStopped()).thenReturn(true);
      if (logRoller != null) logRoller.close();
      try {
        if (region != null) region.close();
        if (dodgyWAL != null) dodgyWAL.close();
      } catch (Exception e) {
        LOG.info("On way out", e);
      }
    }
  }