/**
  * Test that if we fail a flush, abort gets set on close.
  *
  * @see <a href="https://issues.apache.org/jira/browse/HBASE-4270">HBASE-4270</a>
  * @throws IOException
  * @throws NodeExistsException
  * @throws KeeperException
  */
 @Test
 public void testFailedFlushAborts() throws IOException, NodeExistsException, KeeperException {
   final Server server = new MockServer(HTU, false);
   final RegionServerServices rss = HTU.createMockRegionServerService();
   HTableDescriptor htd = TEST_HTD;
   final HRegionInfo hri =
       new HRegionInfo(htd.getTableName(), HConstants.EMPTY_END_ROW, HConstants.EMPTY_END_ROW);
   HRegion region = HTU.createLocalHRegion(hri, htd);
   try {
     assertNotNull(region);
     // Spy on the region so can throw exception when close is called.
     HRegion spy = Mockito.spy(region);
     final boolean abort = false;
     Mockito.when(spy.close(abort)).thenThrow(new RuntimeException("Mocked failed close!"));
     // The CloseRegionHandler will try to get an HRegion that corresponds
     // to the passed hri -- so insert the region into the online region Set.
     rss.addToOnlineRegions(spy);
     // Assert the Server is NOT stopped before we call close region.
     assertFalse(server.isStopped());
     CloseRegionHandler handler = new CloseRegionHandler(server, rss, hri, false, false, -1);
     boolean throwable = false;
     try {
       handler.process();
     } catch (Throwable t) {
       throwable = true;
     } finally {
       assertTrue(throwable);
       // Abort calls stop so stopped flag should be set.
       assertTrue(server.isStopped());
     }
   } finally {
     HRegion.closeHRegion(region);
   }
 }
  /**
   * Inspect the log directory to recover any log file without
   * an active region server.
   */
  void splitLogAfterStartup() {
    boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors",
        HLog.SPLIT_SKIP_ERRORS_DEFAULT);
    Path logsDirPath = new Path(this.rootdir, HConstants.HREGION_LOGDIR_NAME);
    do {
      if (master.isStopped()) {
        LOG.warn("Master stopped while splitting logs");
        break;
      }
      List<ServerName> serverNames = new ArrayList<ServerName>();
      try {
        if (!this.fs.exists(logsDirPath)) return;
        FileStatus[] logFolders = FSUtils.listStatus(this.fs, logsDirPath, null);
        // Get online servers after getting log folders to avoid log folder deletion of newly
        // checked in region servers . see HBASE-5916
        Set<ServerName> onlineServers = ((HMaster) master).getServerManager().getOnlineServers()
            .keySet();

        if (logFolders == null || logFolders.length == 0) {
          LOG.debug("No log files to split, proceeding...");
          return;
        }
        for (FileStatus status : logFolders) {
          String sn = status.getPath().getName();
          // truncate splitting suffix if present (for ServerName parsing)
          if (sn.endsWith(HLog.SPLITTING_EXT)) {
            sn = sn.substring(0, sn.length() - HLog.SPLITTING_EXT.length());
          }
          ServerName serverName = ServerName.parseServerName(sn);
          if (!onlineServers.contains(serverName)) {
            LOG.info("Log folder " + status.getPath() + " doesn't belong "
                + "to a known region server, splitting");
            serverNames.add(serverName);
          } else {
            LOG.info("Log folder " + status.getPath()
                + " belongs to an existing region server");
          }
        }
        splitLog(serverNames);
        retrySplitting = false;
      } catch (IOException ioe) {
        LOG.warn("Failed splitting of " + serverNames, ioe);
        if (!checkFileSystem()) {
          LOG.warn("Bad Filesystem, exiting");
          Runtime.getRuntime().halt(1);
        }
        try {
          if (retrySplitting) {
            Thread.sleep(conf.getInt(
              "hbase.hlog.split.failure.retry.interval", 30 * 1000));
          }
        } catch (InterruptedException e) {
          LOG.warn("Interrupted, aborting since cannot return w/o splitting");
          Thread.currentThread().interrupt();
          retrySplitting = false;
          Runtime.getRuntime().halt(1);
        }
      }
    } while (retrySplitting);
  }
예제 #3
0
 /**
  * Perform time consuming opening of the daughter regions.
  *
  * @param server Hosting server instance. Can be null when testing
  * @param services Used to online/offline regions.
  * @param a first daughter region
  * @param a second daughter region
  * @throws IOException If thrown, transaction failed. Call {@link #rollback(Server,
  *     RegionServerServices)}
  */
 @VisibleForTesting
 void openDaughters(final Server server, final RegionServerServices services, Region a, Region b)
     throws IOException {
   boolean stopped = server != null && server.isStopped();
   boolean stopping = services != null && services.isStopping();
   // TODO: Is this check needed here?
   if (stopped || stopping) {
     LOG.info(
         "Not opening daughters "
             + b.getRegionInfo().getRegionNameAsString()
             + " and "
             + a.getRegionInfo().getRegionNameAsString()
             + " because stopping="
             + stopping
             + ", stopped="
             + stopped);
   } else {
     // Open daughters in parallel.
     DaughterOpener aOpener = new DaughterOpener(server, a);
     DaughterOpener bOpener = new DaughterOpener(server, b);
     aOpener.start();
     bOpener.start();
     try {
       aOpener.join();
       if (aOpener.getException() == null) {
         transition(SplitTransactionPhase.OPENED_REGION_A);
       }
       bOpener.join();
       if (bOpener.getException() == null) {
         transition(SplitTransactionPhase.OPENED_REGION_B);
       }
     } catch (InterruptedException e) {
       throw (InterruptedIOException) new InterruptedIOException().initCause(e);
     }
     if (aOpener.getException() != null) {
       throw new IOException("Failed " + aOpener.getName(), aOpener.getException());
     }
     if (bOpener.getException() != null) {
       throw new IOException("Failed " + bOpener.getName(), bOpener.getException());
     }
     if (services != null) {
       if (!services.reportRegionStateTransition(
           TransitionCode.SPLIT, parent.getRegionInfo(), hri_a, hri_b)) {
         throw new IOException(
             "Failed to report split region to master: "
                 + parent.getRegionInfo().getShortNameToLog());
       }
       // Should add it to OnlineRegions
       services.addToOnlineRegions(b);
       services.addToOnlineRegions(a);
     }
   }
 }
  /**
   * Perform time consuming opening of the merged region.
   *
   * @param server Hosting server instance. Can be null when testing
   * @param services Used to online/offline regions.
   * @param merged the merged region
   * @throws IOException If thrown, transaction failed. Call {@link #rollback(Server,
   *     RegionServerServices)}
   */
  void openMergedRegion(final Server server, final RegionServerServices services, HRegion merged)
      throws IOException {
    boolean stopped = server != null && server.isStopped();
    boolean stopping = services != null && services.isStopping();
    if (stopped || stopping) {
      LOG.info(
          "Not opening merged region  "
              + merged.getRegionNameAsString()
              + " because stopping="
              + stopping
              + ", stopped="
              + stopped);
      return;
    }
    HRegionInfo hri = merged.getRegionInfo();
    LoggingProgressable reporter =
        server == null
            ? null
            : new LoggingProgressable(
                hri,
                server
                    .getConfiguration()
                    .getLong("hbase.regionserver.regionmerge.open.log.interval", 10000));
    merged.openHRegion(reporter);

    if (services != null) {
      try {
        if (useCoordinationForAssignment) {
          services.postOpenDeployTasks(merged);
        } else if (!services.reportRegionStateTransition(
            TransitionCode.MERGED,
            mergedRegionInfo,
            region_a.getRegionInfo(),
            region_b.getRegionInfo())) {
          throw new IOException(
              "Failed to report merged region to master: " + mergedRegionInfo.getShortNameToLog());
        }
        services.addToOnlineRegions(merged);
      } catch (KeeperException ke) {
        throw new IOException(ke);
      }
    }
  }
예제 #5
0
  /**
   * Reproduce locking up that happens when we get an inopportune sync during setup for zigzaglatch
   * wait. See HBASE-14317. If below is broken, we will see this test timeout because it is locked
   * up.
   *
   * <p>First I need to set up some mocks for Server and RegionServerServices. I also need to set up
   * a dodgy WAL that will throw an exception when we go to append to it.
   */
  @Test(timeout = 20000)
  public void testLockupWhenSyncInMiddleOfZigZagSetup() throws IOException {
    // A WAL that we can have throw exceptions when a flag is set.
    class DodgyFSLog extends FSHLog {
      // Set this when want the WAL to start throwing exceptions.
      volatile boolean throwException = false;

      // Latch to hold up processing until after another operation has had time to run.
      CountDownLatch latch = new CountDownLatch(1);

      public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf)
          throws IOException {
        super(fs, root, logDir, conf);
      }

      @Override
      protected void afterCreatingZigZagLatch() {
        // If throwException set, then append will throw an exception causing the WAL to be
        // rolled. We'll come in here. Hold up processing until a sync can get in before
        // the zigzag has time to complete its setup and get its own sync in. This is what causes
        // the lock up we've seen in production.
        if (throwException) {
          try {
            LOG.info("LATCHED");
            // So, timing can have it that the test can run and the bad flush below happens
            // before we get here. In this case, we'll be stuck waiting on this latch but there
            // is nothing in the WAL pipeline to get us to the below beforeWaitOnSafePoint...
            // because all WALs have rolled. In this case, just give up on test.
            if (!this.latch.await(5, TimeUnit.SECONDS)) {
              LOG.warn("GIVE UP! Failed waiting on latch...Test is ABORTED!");
            }
          } catch (InterruptedException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
          }
        }
      }

      @Override
      protected void beforeWaitOnSafePoint() {
        if (throwException) {
          LOG.info("COUNTDOWN");
          // Don't countdown latch until someone waiting on it otherwise, the above
          // afterCreatingZigZagLatch will get to the latch and no one will ever free it and we'll
          // be stuck; test won't go down
          while (this.latch.getCount() <= 0) Threads.sleep(1);
          this.latch.countDown();
        }
      }

      @Override
      protected Writer createWriterInstance(Path path) throws IOException {
        final Writer w = super.createWriterInstance(path);
        return new Writer() {
          @Override
          public void close() throws IOException {
            w.close();
          }

          @Override
          public void sync() throws IOException {
            if (throwException) {
              throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
            }
            w.sync();
          }

          @Override
          public void append(Entry entry) throws IOException {
            if (throwException) {
              throw new IOException("FAKE! Failed to replace a bad datanode...APPEND");
            }
            w.append(entry);
          }

          @Override
          public long getLength() {
            return w.getLength();
          }
        };
      }
    }

    // Mocked up server and regionserver services. Needed below.
    Server server = Mockito.mock(Server.class);
    Mockito.when(server.getConfiguration()).thenReturn(CONF);
    Mockito.when(server.isStopped()).thenReturn(false);
    Mockito.when(server.isAborted()).thenReturn(false);
    RegionServerServices services = Mockito.mock(RegionServerServices.class);

    // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test.
    FileSystem fs = FileSystem.get(CONF);
    Path rootDir = new Path(dir + getName());
    DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF);
    Path originalWAL = dodgyWAL.getCurrentFileName();
    // I need a log roller running.
    LogRoller logRoller = new LogRoller(server, services);
    logRoller.addWAL(dodgyWAL);
    // There is no 'stop' once a logRoller is running.. it just dies.
    logRoller.start();
    // Now get a region and start adding in edits.
    HTableDescriptor htd = new HTableDescriptor(TableName.META_TABLE_NAME);
    final HRegion region = initHRegion(tableName, null, null, dodgyWAL);
    byte[] bytes = Bytes.toBytes(getName());
    NavigableMap<byte[], Integer> scopes = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
    scopes.put(COLUMN_FAMILY_BYTES, 0);
    MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl();
    try {
      // First get something into memstore. Make a Put and then pull the Cell out of it. Will
      // manage append and sync carefully in below to manufacture hang. We keep adding same
      // edit. WAL subsystem doesn't care.
      Put put = new Put(bytes);
      put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), bytes);
      WALKey key =
          new WALKey(
              region.getRegionInfo().getEncodedNameAsBytes(),
              htd.getTableName(),
              System.currentTimeMillis(),
              mvcc,
              scopes);
      WALEdit edit = new WALEdit();
      CellScanner CellScanner = put.cellScanner();
      assertTrue(CellScanner.advance());
      edit.add(CellScanner.current());
      // Put something in memstore and out in the WAL. Do a big number of appends so we push
      // out other side of the ringbuffer. If small numbers, stuff doesn't make it to WAL
      for (int i = 0; i < 1000; i++) {
        region.put(put);
      }
      // Set it so we start throwing exceptions.
      LOG.info("SET throwing of exception on append");
      dodgyWAL.throwException = true;
      // This append provokes a WAL roll request
      dodgyWAL.append(region.getRegionInfo(), key, edit, true);
      boolean exception = false;
      try {
        dodgyWAL.sync();
      } catch (Exception e) {
        exception = true;
      }
      assertTrue("Did not get sync exception", exception);

      // Get a memstore flush going too so we have same hung profile as up in the issue over
      // in HBASE-14317. Flush hangs trying to get sequenceid because the ringbuffer is held up
      // by the zigzaglatch waiting on syncs to come home.
      Thread t =
          new Thread("Flusher") {
            public void run() {
              try {
                if (region.getMemstoreSize() <= 0) {
                  throw new IOException("memstore size=" + region.getMemstoreSize());
                }
                region.flush(false);
              } catch (IOException e) {
                // Can fail trying to flush in middle of a roll. Not a failure. Will succeed later
                // when roll completes.
                LOG.info("In flush", e);
              }
              LOG.info("Exiting");
            };
          };
      t.setDaemon(true);
      t.start();
      // Wait until
      while (dodgyWAL.latch.getCount() > 0) Threads.sleep(1);
      // Now assert I got a new WAL file put in place even though loads of errors above.
      assertTrue(originalWAL != dodgyWAL.getCurrentFileName());
      // Can I append to it?
      dodgyWAL.throwException = false;
      try {
        region.put(put);
      } catch (Exception e) {
        LOG.info("In the put", e);
      }
    } finally {
      // To stop logRoller, its server has to say it is stopped.
      Mockito.when(server.isStopped()).thenReturn(true);
      if (logRoller != null) logRoller.close();
      try {
        if (region != null) region.close();
        if (dodgyWAL != null) dodgyWAL.close();
      } catch (Exception e) {
        LOG.info("On way out", e);
      }
    }
  }
예제 #6
0
  /**
   * Prepare the regions and region files.
   *
   * @param server Hosting server instance. Can be null when testing (won't try and update in zk if
   *     a null server)
   * @param services Used to online/offline regions.
   * @param user
   * @throws IOException If thrown, transaction failed. Call {@link #rollback(Server,
   *     RegionServerServices)}
   * @return Regions created
   */
  @VisibleForTesting
  PairOfSameType<Region> createDaughters(
      final Server server, final RegionServerServices services, User user) throws IOException {
    LOG.info("Starting split of region " + this.parent);
    if ((server != null && server.isStopped()) || (services != null && services.isStopping())) {
      throw new IOException("Server is stopped or stopping");
    }
    assert !this.parent.lock.writeLock().isHeldByCurrentThread()
        : "Unsafe to hold write lock while performing RPCs";

    transition(SplitTransactionPhase.BEFORE_PRE_SPLIT_HOOK);

    // Coprocessor callback
    if (this.parent.getCoprocessorHost() != null) {
      // TODO: Remove one of these
      parent.getCoprocessorHost().preSplit(user);
      parent.getCoprocessorHost().preSplit(splitrow, user);
    }

    transition(SplitTransactionPhase.AFTER_PRE_SPLIT_HOOK);

    // If true, no cluster to write meta edits to or to update znodes in.
    boolean testing =
        server == null
            ? true
            : server.getConfiguration().getBoolean("hbase.testing.nocluster", false);
    this.fileSplitTimeout =
        testing
            ? this.fileSplitTimeout
            : server
                .getConfiguration()
                .getLong("hbase.regionserver.fileSplitTimeout", this.fileSplitTimeout);

    PairOfSameType<Region> daughterRegions = stepsBeforePONR(server, services, testing);

    final List<Mutation> metaEntries = new ArrayList<Mutation>();
    boolean ret = false;
    if (this.parent.getCoprocessorHost() != null) {
      ret = parent.getCoprocessorHost().preSplitBeforePONR(splitrow, metaEntries, user);
      if (ret) {
        throw new IOException(
            "Coprocessor bypassing region "
                + parent.getRegionInfo().getRegionNameAsString()
                + " split.");
      }
      try {
        for (Mutation p : metaEntries) {
          HRegionInfo.parseRegionName(p.getRow());
        }
      } catch (IOException e) {
        LOG.error(
            "Row key of mutation from coprossor is not parsable as region name."
                + "Mutations from coprocessor should only for hbase:meta table.");
        throw e;
      }
    }

    // This is the point of no return.  Adding subsequent edits to .META. as we
    // do below when we do the daughter opens adding each to .META. can fail in
    // various interesting ways the most interesting of which is a timeout
    // BUT the edits all go through (See HBASE-3872).  IF we reach the PONR
    // then subsequent failures need to crash out this regionserver; the
    // server shutdown processing should be able to fix-up the incomplete split.
    // The offlined parent will have the daughters as extra columns.  If
    // we leave the daughter regions in place and do not remove them when we
    // crash out, then they will have their references to the parent in place
    // still and the server shutdown fixup of .META. will point to these
    // regions.
    // We should add PONR JournalEntry before offlineParentInMeta,so even if
    // OfflineParentInMeta timeout,this will cause regionserver exit,and then
    // master ServerShutdownHandler will fix daughter & avoid data loss. (See
    // HBase-4562).

    transition(SplitTransactionPhase.PONR);

    // Edit parent in meta.  Offlines parent region and adds splita and splitb
    // as an atomic update. See HBASE-7721. This update to META makes the region
    // will determine whether the region is split or not in case of failures.
    // If it is successful, master will roll-forward, if not, master will rollback
    // and assign the parent region.
    if (services != null
        && !services.reportRegionStateTransition(
            TransitionCode.SPLIT_PONR, parent.getRegionInfo(), hri_a, hri_b)) {
      // Passed PONR, let SSH clean it up
      throw new IOException(
          "Failed to notify master that split passed PONR: "
              + parent.getRegionInfo().getRegionNameAsString());
    }
    return daughterRegions;
  }
  /**
   * Prepare the merged region and region files.
   *
   * @param server Hosting server instance. Can be null when testing
   * @param services Used to online/offline regions.
   * @return merged region
   * @throws IOException If thrown, transaction failed. Call {@link #rollback(Server,
   *     RegionServerServices)}
   */
  HRegion createMergedRegion(final Server server, final RegionServerServices services)
      throws IOException {
    LOG.info(
        "Starting merge of "
            + region_a
            + " and "
            + region_b.getRegionNameAsString()
            + ", forcible="
            + forcible);
    if ((server != null && server.isStopped()) || (services != null && services.isStopping())) {
      throw new IOException("Server is stopped or stopping");
    }

    if (rsCoprocessorHost != null) {
      if (rsCoprocessorHost.preMerge(this.region_a, this.region_b)) {
        throw new IOException(
            "Coprocessor bypassing regions " + this.region_a + " " + this.region_b + " merge.");
      }
    }

    // If true, no cluster to write meta edits to or to use coordination.
    boolean testing =
        server == null
            ? true
            : server.getConfiguration().getBoolean("hbase.testing.nocluster", false);

    HRegion mergedRegion = stepsBeforePONR(server, services, testing);

    @MetaMutationAnnotation List<Mutation> metaEntries = new ArrayList<Mutation>();
    if (rsCoprocessorHost != null) {
      if (rsCoprocessorHost.preMergeCommit(this.region_a, this.region_b, metaEntries)) {
        throw new IOException(
            "Coprocessor bypassing regions " + this.region_a + " " + this.region_b + " merge.");
      }
      try {
        for (Mutation p : metaEntries) {
          HRegionInfo.parseRegionName(p.getRow());
        }
      } catch (IOException e) {
        LOG.error(
            "Row key of mutation from coprocessor is not parsable as region name."
                + "Mutations from coprocessor should only be for hbase:meta table.",
            e);
        throw e;
      }
    }

    // This is the point of no return. Similar with SplitTransaction.
    // IF we reach the PONR then subsequent failures need to crash out this
    // regionserver
    this.journal.add(JournalEntry.PONR);

    // Add merged region and delete region_a and region_b
    // as an atomic update. See HBASE-7721. This update to hbase:meta makes the region
    // will determine whether the region is merged or not in case of failures.
    // If it is successful, master will roll-forward, if not, master will
    // rollback
    if (!testing && useCoordinationForAssignment) {
      if (metaEntries.isEmpty()) {
        MetaTableAccessor.mergeRegions(
            server.getConnection(),
            mergedRegion.getRegionInfo(),
            region_a.getRegionInfo(),
            region_b.getRegionInfo(),
            server.getServerName(),
            region_a.getTableDesc().getRegionReplication());
      } else {
        mergeRegionsAndPutMetaEntries(
            server.getConnection(),
            mergedRegion.getRegionInfo(),
            region_a.getRegionInfo(),
            region_b.getRegionInfo(),
            server.getServerName(),
            metaEntries,
            region_a.getTableDesc().getRegionReplication());
      }
    } else if (services != null && !useCoordinationForAssignment) {
      if (!services.reportRegionStateTransition(
          TransitionCode.MERGE_PONR,
          mergedRegionInfo,
          region_a.getRegionInfo(),
          region_b.getRegionInfo())) {
        // Passed PONR, let SSH clean it up
        throw new IOException(
            "Failed to notify master that merge passed PONR: "
                + region_a.getRegionInfo().getRegionNameAsString()
                + " and "
                + region_b.getRegionInfo().getRegionNameAsString());
      }
    }
    return mergedRegion;
  }