/** * Take a snapshot using the specified handler. On failure the snapshot temporary working * directory is removed. NOTE: prepareToTakeSnapshot() called before this one takes care of the * rejecting the snapshot request if the table is busy with another snapshot/restore operation. * * @param snapshot the snapshot description * @param handler the snapshot handler */ private synchronized void snapshotTable( SnapshotDescription snapshot, final TakeSnapshotHandler handler) throws HBaseSnapshotException { try { handler.prepare(); this.executorService.submit(handler); this.snapshotHandlers.put(TableName.valueOf(snapshot.getTable()), handler); } catch (Exception e) { // cleanup the working directory by trying to delete it from the fs. Path workingDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir); try { if (!this.master.getMasterFileSystem().getFileSystem().delete(workingDir, true)) { LOG.error( "Couldn't delete working directory (" + workingDir + " for snapshot:" + ClientSnapshotDescriptionUtils.toString(snapshot)); } } catch (IOException e1) { LOG.error( "Couldn't delete working directory (" + workingDir + " for snapshot:" + ClientSnapshotDescriptionUtils.toString(snapshot)); } // fail the snapshot throw new SnapshotCreationException( "Could not build snapshot handler", e, ProtobufUtil.createSnapshotDesc(snapshot)); } }
// TODO consider parallelizing these operations since they are independent. Right now its just // easier to keep them serial though @Override public void snapshotRegions(List<Pair<HRegionInfo, ServerName>> regionsAndLocations) throws IOException, KeeperException { try { timeoutInjector.start(); // 1. get all the regions hosting this table. // extract each pair to separate lists Set<HRegionInfo> regions = new HashSet<HRegionInfo>(); for (Pair<HRegionInfo, ServerName> p : regionsAndLocations) { regions.add(p.getFirst()); } // 2. for each region, write all the info to disk String msg = "Starting to write region info and WALs for regions for offline snapshot:" + ClientSnapshotDescriptionUtils.toString(snapshot); LOG.info(msg); status.setStatus(msg); ThreadPoolExecutor exec = SnapshotManifest.createExecutor(conf, "DisabledTableSnapshot"); try { ModifyRegionUtils.editRegions( exec, regions, new ModifyRegionUtils.RegionEditTask() { @Override public void editRegion(final HRegionInfo regionInfo) throws IOException { snapshotManifest.addRegion(FSUtils.getTableDir(rootDir, snapshotTable), regionInfo); } }); } finally { exec.shutdown(); } } catch (Exception e) { // make sure we capture the exception to propagate back to the client later String reason = "Failed snapshot " + ClientSnapshotDescriptionUtils.toString(snapshot) + " due to exception:" + e.getMessage(); ForeignException ee = new ForeignException(reason, e); monitor.receive(ee); status.abort("Snapshot of table: " + snapshotTable + " failed because " + e.getMessage()); } finally { LOG.debug( "Marking snapshot" + ClientSnapshotDescriptionUtils.toString(snapshot) + " as finished."); // 3. mark the timer as finished - even if we got an exception, we don't need to time the // operation any further timeoutInjector.complete(); } }
/** * Check to make sure that we are OK to run the passed snapshot. Checks to make sure that we * aren't already running a snapshot or restore on the requested table. * * @param snapshot description of the snapshot we want to start * @throws HBaseSnapshotException if the filesystem could not be prepared to start the snapshot */ private synchronized void prepareToTakeSnapshot(SnapshotDescription snapshot) throws HBaseSnapshotException { FileSystem fs = master.getMasterFileSystem().getFileSystem(); Path workingDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir); TableName snapshotTable = TableName.valueOf(snapshot.getTable()); // make sure we aren't already running a snapshot if (isTakingSnapshot(snapshot)) { SnapshotSentinel handler = this.snapshotHandlers.get(snapshotTable); throw new SnapshotCreationException( "Rejected taking " + ClientSnapshotDescriptionUtils.toString(snapshot) + " because we are already running another snapshot " + (handler != null ? ("on the same table " + ClientSnapshotDescriptionUtils.toString(handler.getSnapshot())) : "with the same name"), ProtobufUtil.createSnapshotDesc(snapshot)); } // make sure we aren't running a restore on the same table if (isRestoringTable(snapshotTable)) { throw new SnapshotCreationException( "Rejected taking " + ClientSnapshotDescriptionUtils.toString(snapshot) + " because we are already have a restore in progress on the same snapshot."); } try { // delete the working directory, since we aren't running the snapshot. Likely leftovers // from a failed attempt. fs.delete(workingDir, true); // recreate the working directory for the snapshot if (!fs.mkdirs(workingDir)) { throw new SnapshotCreationException( "Couldn't create working directory (" + workingDir + ") for snapshot", ProtobufUtil.createSnapshotDesc(snapshot)); } } catch (HBaseSnapshotException e) { throw e; } catch (IOException e) { throw new SnapshotCreationException( "Exception while checking to see if snapshot could be started.", e, ProtobufUtil.createSnapshotDesc(snapshot)); } }
/** * Check if the specified snapshot is done * * @param expected * @return true if snapshot is ready to be restored, false if it is still being taken. * @throws IOException IOException if error from HDFS or RPC * @throws UnknownSnapshotException if snapshot is invalid or does not exist. */ public boolean isSnapshotDone(SnapshotDescription expected) throws IOException { // check the request to make sure it has a snapshot if (expected == null) { throw new UnknownSnapshotException( "No snapshot name passed in request, can't figure out which snapshot you want to check."); } String ssString = ClientSnapshotDescriptionUtils.toString(expected); // check to see if the sentinel exists, // and if the task is complete removes it from the in-progress snapshots map. SnapshotSentinel handler = removeSentinelIfFinished(this.snapshotHandlers, expected); // stop tracking "abandoned" handlers cleanupSentinels(); if (handler == null) { // If there's no handler in the in-progress map, it means one of the following: // - someone has already requested the snapshot state // - the requested snapshot was completed long time ago (cleanupSentinels() timeout) // - the snapshot was never requested // In those cases returns to the user the "done state" if the snapshots exists on disk, // otherwise raise an exception saying that the snapshot is not running and doesn't exist. if (!isSnapshotCompleted(expected)) { throw new UnknownSnapshotException( "Snapshot " + ssString + " is not currently running or one of the known completed snapshots."); } // was done, return true; return true; } // pass on any failure we find in the sentinel try { handler.rethrowExceptionIfFailed(); } catch (ForeignException e) { // Give some procedure info on an exception. String status; Procedure p = coordinator.getProcedure(expected.getName()); if (p != null) { status = p.getStatus(); } else { status = expected.getName() + " not found in proclist " + coordinator.getProcedureNames(); } throw new HBaseSnapshotException( "Snapshot " + ssString + " had an error. " + status, e, ProtobufUtil.createSnapshotDesc(expected)); } // check to see if we are done if (handler.isFinished()) { LOG.debug("Snapshot '" + ssString + "' has completed, notifying client."); return true; } else if (LOG.isDebugEnabled()) { LOG.debug("Snapshoting '" + ssString + "' is still in progress!"); } return false; }
private void flushSnapshot() throws ForeignException { if (regions.isEmpty()) { // No regions on this RS, we are basically done. return; } monitor.rethrowException(); // assert that the taskManager is empty. if (taskManager.hasTasks()) { throw new IllegalStateException( "Attempting to take snapshot " + ClientSnapshotDescriptionUtils.toString(snapshot) + " but we currently have outstanding tasks"); } // Add all hfiles already existing in region. for (Region region : regions) { // submit one task per region for parallelize by region. taskManager.submitTask(new RegionSnapshotTask(region)); monitor.rethrowException(); } // wait for everything to complete. LOG.debug("Flush Snapshot Tasks submitted for " + regions.size() + " regions"); try { taskManager.waitForOutstandingTasks(); } catch (InterruptedException e) { LOG.error("got interrupted exception for " + getMemberName()); throw new ForeignException(getMemberName(), e); } }
@Override public void cancel(String why) { if (finished) return; this.finished = true; LOG.info( "Stop taking snapshot=" + ClientSnapshotDescriptionUtils.toString(snapshot) + " because: " + why); CancellationException ce = new CancellationException(why); monitor.receive(new ForeignException(master.getServerName().toString(), ce)); }
/** * Restore the specified snapshot. The restore will fail if the destination table has a snapshot * or restore in progress. * * @param snapshot Snapshot Descriptor * @param hTableDescriptor Table Descriptor * @param nonceGroup unique value to prevent duplicated RPC * @param nonce unique value to prevent duplicated RPC * @return procId the ID of the restore snapshot procedure */ private synchronized long restoreSnapshot( final SnapshotDescription snapshot, final HTableDescriptor hTableDescriptor, final long nonceGroup, final long nonce) throws HBaseSnapshotException { TableName tableName = hTableDescriptor.getTableName(); // make sure we aren't running a snapshot on the same table if (isTakingSnapshot(tableName)) { throw new RestoreSnapshotException("Snapshot in progress on the restore table=" + tableName); } // make sure we aren't running a restore on the same table if (isRestoringTable(tableName)) { throw new RestoreSnapshotException("Restore already in progress on the table=" + tableName); } try { long procId = master .getMasterProcedureExecutor() .submitProcedure( new RestoreSnapshotProcedure( master.getMasterProcedureExecutor().getEnvironment(), hTableDescriptor, snapshot), nonceGroup, nonce); this.restoreTableToProcIdMap.put(tableName, procId); return procId; } catch (Exception e) { String msg = "Couldn't restore the snapshot=" + ClientSnapshotDescriptionUtils.toString(snapshot) + " on table=" + tableName; LOG.error(msg, e); throw new RestoreSnapshotException(msg, e); } }
/** * Take a snapshot based on the enabled/disabled state of the table. * * @param snapshot * @throws HBaseSnapshotException when a snapshot specific exception occurs. * @throws IOException when some sort of generic IO exception occurs. */ public void takeSnapshot(SnapshotDescription snapshot) throws IOException { // check to see if we already completed the snapshot if (isSnapshotCompleted(snapshot)) { throw new SnapshotExistsException( "Snapshot '" + snapshot.getName() + "' already stored on the filesystem.", ProtobufUtil.createSnapshotDesc(snapshot)); } LOG.debug("No existing snapshot, attempting snapshot..."); // stop tracking "abandoned" handlers cleanupSentinels(); // check to see if the table exists HTableDescriptor desc = null; try { desc = master.getTableDescriptors().get(TableName.valueOf(snapshot.getTable())); } catch (FileNotFoundException e) { String msg = "Table:" + snapshot.getTable() + " info doesn't exist!"; LOG.error(msg); throw new SnapshotCreationException(msg, e, ProtobufUtil.createSnapshotDesc(snapshot)); } catch (IOException e) { throw new SnapshotCreationException( "Error while geting table description for table " + snapshot.getTable(), e, ProtobufUtil.createSnapshotDesc(snapshot)); } if (desc == null) { throw new SnapshotCreationException( "Table '" + snapshot.getTable() + "' doesn't exist, can't take snapshot.", ProtobufUtil.createSnapshotDesc(snapshot)); } SnapshotDescription.Builder builder = snapshot.toBuilder(); // if not specified, set the snapshot format if (!snapshot.hasVersion()) { builder.setVersion(SnapshotDescriptionUtils.SNAPSHOT_LAYOUT_VERSION); } User user = RpcServer.getRequestUser(); if (User.isHBaseSecurityEnabled(master.getConfiguration()) && user != null) { builder.setOwner(user.getShortName()); } snapshot = builder.build(); // call pre coproc hook MasterCoprocessorHost cpHost = master.getMasterCoprocessorHost(); if (cpHost != null) { cpHost.preSnapshot(snapshot, desc); } // if the table is enabled, then have the RS run actually the snapshot work TableName snapshotTable = TableName.valueOf(snapshot.getTable()); if (master.getTableStateManager().isTableState(snapshotTable, TableState.State.ENABLED)) { LOG.debug("Table enabled, starting distributed snapshot."); snapshotEnabledTable(snapshot); LOG.debug("Started snapshot: " + ClientSnapshotDescriptionUtils.toString(snapshot)); } // For disabled table, snapshot is created by the master else if (master.getTableStateManager().isTableState(snapshotTable, TableState.State.DISABLED)) { LOG.debug("Table is disabled, running snapshot entirely on master."); snapshotDisabledTable(snapshot); LOG.debug("Started snapshot: " + ClientSnapshotDescriptionUtils.toString(snapshot)); } else { LOG.error( "Can't snapshot table '" + snapshot.getTable() + "', isn't open or closed, we don't know what to do!"); TablePartiallyOpenException tpoe = new TablePartiallyOpenException(snapshot.getTable() + " isn't fully open."); throw new SnapshotCreationException( "Table is not entirely open or closed", tpoe, ProtobufUtil.createSnapshotDesc(snapshot)); } // call post coproc hook if (cpHost != null) { cpHost.postSnapshot(snapshot, desc); } }
/** * Execute the core common portions of taking a snapshot. The {@link #snapshotRegions(List)} call * should get implemented for each snapshot flavor. */ @Override @edu.umd.cs.findbugs.annotations.SuppressWarnings( value = "REC_CATCH_EXCEPTION", justification = "Intentional") public void process() { String msg = "Running " + snapshot.getType() + " table snapshot " + snapshot.getName() + " " + eventType + " on table " + snapshotTable; LOG.info(msg); status.setStatus(msg); try { // If regions move after this meta scan, the region specific snapshot should fail, triggering // an external exception that gets captured here. // write down the snapshot info in the working directory SnapshotDescriptionUtils.writeSnapshotInfo(snapshot, workingDir, fs); snapshotManifest.addTableDescriptor(this.htd); monitor.rethrowException(); List<Pair<HRegionInfo, ServerName>> regionsAndLocations; if (TableName.META_TABLE_NAME.equals(snapshotTable)) { regionsAndLocations = new MetaTableLocator().getMetaRegionsAndLocations(server.getZooKeeper()); } else { regionsAndLocations = MetaTableAccessor.getTableRegionsAndLocations( server.getConnection(), snapshotTable, false); } // run the snapshot snapshotRegions(regionsAndLocations); monitor.rethrowException(); // extract each pair to separate lists Set<String> serverNames = new HashSet<String>(); for (Pair<HRegionInfo, ServerName> p : regionsAndLocations) { if (p != null && p.getFirst() != null && p.getSecond() != null) { HRegionInfo hri = p.getFirst(); if (hri.isOffline() && (hri.isSplit() || hri.isSplitParent())) continue; serverNames.add(p.getSecond().toString()); } } // flush the in-memory state, and write the single manifest status.setStatus("Consolidate snapshot: " + snapshot.getName()); snapshotManifest.consolidate(); // verify the snapshot is valid status.setStatus("Verifying snapshot: " + snapshot.getName()); verifier.verifySnapshot(this.workingDir, serverNames); // complete the snapshot, atomically moving from tmp to .snapshot dir. completeSnapshot(this.snapshotDir, this.workingDir, this.fs); msg = "Snapshot " + snapshot.getName() + " of table " + snapshotTable + " completed"; status.markComplete(msg); LOG.info(msg); metricsSnapshot.addSnapshot(status.getCompletionTimestamp() - status.getStartTime()); } catch (Exception e) { // FindBugs: REC_CATCH_EXCEPTION status.abort( "Failed to complete snapshot " + snapshot.getName() + " on table " + snapshotTable + " because " + e.getMessage()); String reason = "Failed taking snapshot " + ClientSnapshotDescriptionUtils.toString(snapshot) + " due to exception:" + e.getMessage(); LOG.error(reason, e); ForeignException ee = new ForeignException(reason, e); monitor.receive(ee); // need to mark this completed to close off and allow cleanup to happen. cancel(reason); } finally { LOG.debug("Launching cleanup of working dir:" + workingDir); try { // if the working dir is still present, the snapshot has failed. it is present we delete // it. if (fs.exists(workingDir) && !this.fs.delete(workingDir, true)) { LOG.error("Couldn't delete snapshot working directory:" + workingDir); } } catch (IOException e) { LOG.error("Couldn't delete snapshot working directory:" + workingDir); } releaseTableLock(); } }