private void doRecovery(final RecoveryStatus recoveryStatus) { assert recoveryStatus.sourceNode() != null : "can't do a recovery without a source node"; logger.trace("collecting local files for {}", recoveryStatus); Store.MetadataSnapshot metadataSnapshot = null; try { metadataSnapshot = recoveryStatus.store().getMetadataOrEmpty(); } catch (IOException e) { logger.warn("error while listing local files, recover as if there are none", e); metadataSnapshot = Store.MetadataSnapshot.EMPTY; } catch (Exception e) { // this will be logged as warning later on... logger.trace("unexpected error while listing local files, failing recovery", e); onGoingRecoveries.failRecovery( recoveryStatus.recoveryId(), new RecoveryFailedException(recoveryStatus.state(), "failed to list local files", e), true); return; } final StartRecoveryRequest request = new StartRecoveryRequest( recoveryStatus.shardId(), recoveryStatus.sourceNode(), clusterService.localNode(), false, metadataSnapshot, recoveryStatus.state().getType(), recoveryStatus.recoveryId()); final AtomicReference<RecoveryResponse> responseHolder = new AtomicReference<>(); try { logger.trace( "[{}][{}] starting recovery from {}", request.shardId().index().name(), request.shardId().id(), request.sourceNode()); recoveryStatus.indexShard().prepareForIndexRecovery(); recoveryStatus .CancellableThreads() .execute( new CancellableThreads.Interruptable() { @Override public void run() throws InterruptedException { responseHolder.set( transportService .submitRequest( request.sourceNode(), RecoverySource.Actions.START_RECOVERY, request, new FutureTransportResponseHandler<RecoveryResponse>() { @Override public RecoveryResponse newInstance() { return new RecoveryResponse(); } }) .txGet()); } }); final RecoveryResponse recoveryResponse = responseHolder.get(); assert responseHolder != null; final TimeValue recoveryTime = new TimeValue(recoveryStatus.state().getTimer().time()); // do this through ongoing recoveries to remove it from the collection onGoingRecoveries.markRecoveryAsDone(recoveryStatus.recoveryId()); if (logger.isTraceEnabled()) { StringBuilder sb = new StringBuilder(); sb.append('[') .append(request.shardId().index().name()) .append(']') .append('[') .append(request.shardId().id()) .append("] "); sb.append("recovery completed from ") .append(request.sourceNode()) .append(", took[") .append(recoveryTime) .append("]\n"); sb.append(" phase1: recovered_files [") .append(recoveryResponse.phase1FileNames.size()) .append("]") .append(" with total_size of [") .append(new ByteSizeValue(recoveryResponse.phase1TotalSize)) .append("]") .append(", took [") .append(timeValueMillis(recoveryResponse.phase1Time)) .append("], throttling_wait [") .append(timeValueMillis(recoveryResponse.phase1ThrottlingWaitTime)) .append(']') .append("\n"); sb.append(" : reusing_files [") .append(recoveryResponse.phase1ExistingFileNames.size()) .append("] with total_size of [") .append(new ByteSizeValue(recoveryResponse.phase1ExistingTotalSize)) .append("]\n"); sb.append(" phase2: start took [") .append(timeValueMillis(recoveryResponse.startTime)) .append("]\n"); sb.append(" : recovered [") .append(recoveryResponse.phase2Operations) .append("]") .append(" transaction log operations") .append(", took [") .append(timeValueMillis(recoveryResponse.phase2Time)) .append("]") .append("\n"); logger.trace(sb.toString()); } else { logger.debug( "{} recovery done from [{}], took [{}]", request.shardId(), recoveryStatus.sourceNode(), recoveryTime); } } catch (CancellableThreads.ExecutionCancelledException e) { logger.trace("recovery cancelled", e); } catch (Throwable e) { if (logger.isTraceEnabled()) { logger.trace( "[{}][{}] Got exception on recovery", e, request.shardId().index().name(), request.shardId().id()); } Throwable cause = ExceptionsHelper.unwrapCause(e); if (cause instanceof RecoveryEngineException) { // unwrap an exception that was thrown as part of the recovery cause = cause.getCause(); } // do it twice, in case we have double transport exception cause = ExceptionsHelper.unwrapCause(cause); if (cause instanceof RecoveryEngineException) { // unwrap an exception that was thrown as part of the recovery cause = cause.getCause(); } // here, we would add checks against exception that need to be retried (and not removeAndClean // in this case) if (cause instanceof IllegalIndexShardStateException || cause instanceof IndexNotFoundException || cause instanceof ShardNotFoundException) { // if the target is not ready yet, retry retryRecovery( recoveryStatus, "remote shard not ready", recoverySettings.retryDelayStateSync(), request); return; } if (cause instanceof DelayRecoveryException) { retryRecovery(recoveryStatus, cause, recoverySettings.retryDelayStateSync(), request); return; } if (cause instanceof ConnectTransportException) { logger.debug( "delaying recovery of {} for [{}] due to networking error [{}]", recoveryStatus.shardId(), recoverySettings.retryDelayNetwork(), cause.getMessage()); retryRecovery( recoveryStatus, cause.getMessage(), recoverySettings.retryDelayNetwork(), request); return; } if (cause instanceof IndexShardClosedException) { onGoingRecoveries.failRecovery( recoveryStatus.recoveryId(), new RecoveryFailedException(request, "source shard is closed", cause), false); return; } if (cause instanceof AlreadyClosedException) { onGoingRecoveries.failRecovery( recoveryStatus.recoveryId(), new RecoveryFailedException(request, "source shard is closed", cause), false); return; } onGoingRecoveries.failRecovery( recoveryStatus.recoveryId(), new RecoveryFailedException(request, e), true); } }
/** Snapshots the given shard into the gateway. */ public synchronized void snapshot(final String reason) throws IndexShardGatewaySnapshotFailedException { if (!indexShard.routingEntry().primary()) { return; // throw new IndexShardGatewaySnapshotNotAllowedException(shardId, "Snapshot not // allowed on non primary shard"); } if (indexShard.routingEntry().relocating()) { // do not snapshot when in the process of relocation of primaries so we won't get conflicts return; } if (indexShard.state() == IndexShardState.CREATED) { // shard has just been created, ignore it and return return; } if (indexShard.state() == IndexShardState.RECOVERING) { // shard is recovering, don't snapshot return; } if (snapshotLock == null) { try { snapshotLock = shardGateway.obtainSnapshotLock(); } catch (Exception e) { logger.warn("failed to obtain snapshot lock, ignoring snapshot", e); return; } } try { SnapshotStatus snapshotStatus = indexShard.snapshot( new Engine.SnapshotHandler<SnapshotStatus>() { @Override public SnapshotStatus snapshot( SnapshotIndexCommit snapshotIndexCommit, Translog.Snapshot translogSnapshot) throws EngineException { if (lastIndexVersion != snapshotIndexCommit.getGeneration() || lastTranslogId != translogSnapshot.translogId() || lastTranslogLength < translogSnapshot.length()) { logger.debug("snapshot ({}) to {} ...", reason, shardGateway); SnapshotStatus snapshotStatus = shardGateway.snapshot( new IndexShardGateway.Snapshot( snapshotIndexCommit, translogSnapshot, lastIndexVersion, lastTranslogId, lastTranslogLength, lastTotalTranslogOperations)); lastIndexVersion = snapshotIndexCommit.getGeneration(); lastTranslogId = translogSnapshot.translogId(); lastTranslogLength = translogSnapshot.length(); lastTotalTranslogOperations = translogSnapshot.estimatedTotalOperations(); return snapshotStatus; } return null; } }); if (snapshotStatus != null) { if (logger.isDebugEnabled()) { StringBuilder sb = new StringBuilder(); sb.append("snapshot (") .append(reason) .append(") completed to ") .append(shardGateway) .append(", took [") .append(TimeValue.timeValueMillis(snapshotStatus.time())) .append("]\n"); sb.append(" index : version [") .append(lastIndexVersion) .append("], number_of_files [") .append(snapshotStatus.index().numberOfFiles()) .append("] with total_size [") .append(new ByteSizeValue(snapshotStatus.index().totalSize())) .append("], took [") .append(TimeValue.timeValueMillis(snapshotStatus.index().time())) .append("]\n"); sb.append(" translog : id [") .append(lastTranslogId) .append("], number_of_operations [") .append(snapshotStatus.translog().expectedNumberOfOperations()) .append("], took [") .append(TimeValue.timeValueMillis(snapshotStatus.translog().time())) .append("]"); logger.debug(sb.toString()); } } } catch (SnapshotFailedEngineException e) { if (e.getCause() instanceof IllegalStateException) { // ignore, that's fine, snapshot has not started yet } else { throw new IndexShardGatewaySnapshotFailedException(shardId, "Failed to snapshot", e); } } catch (IllegalIndexShardStateException e) { // ignore, that's fine, snapshot has not started yet } catch (IndexShardGatewaySnapshotFailedException e) { throw e; } catch (Exception e) { throw new IndexShardGatewaySnapshotFailedException(shardId, "Failed to snapshot", e); } }