private void retryRecovery( final RecoveryTarget recoveryTarget, TimeValue retryAfter, final StartRecoveryRequest currentRequest) { try { onGoingRecoveries.resetRecovery(recoveryTarget.recoveryId(), recoveryTarget.shardId()); } catch (Exception e) { onGoingRecoveries.failRecovery( recoveryTarget.recoveryId(), new RecoveryFailedException(currentRequest, e), true); } threadPool.schedule( retryAfter, ThreadPool.Names.GENERIC, new RecoveryRunner(recoveryTarget.recoveryId())); }
private void doRecovery(final RecoveryTarget recoveryTarget) { assert recoveryTarget.sourceNode() != null : "can't do a recovery without a source node"; logger.trace("collecting local files for {}", recoveryTarget); Store.MetadataSnapshot metadataSnapshot = null; try { if (recoveryTarget.indexShard().indexSettings().isOnSharedFilesystem()) { // we are not going to copy any files, so don't bother listing files, potentially running // into concurrency issues with the primary changing files underneath us. metadataSnapshot = Store.MetadataSnapshot.EMPTY; } else { metadataSnapshot = recoveryTarget.indexShard().snapshotStoreMetadata(); } } catch (org.apache.lucene.index.IndexNotFoundException e) { // happens on an empty folder. no need to log metadataSnapshot = Store.MetadataSnapshot.EMPTY; } catch (IOException e) { logger.warn("error while listing local files, recover as if there are none", e); metadataSnapshot = Store.MetadataSnapshot.EMPTY; } catch (Exception e) { // this will be logged as warning later on... logger.trace("unexpected error while listing local files, failing recovery", e); onGoingRecoveries.failRecovery( recoveryTarget.recoveryId(), new RecoveryFailedException(recoveryTarget.state(), "failed to list local files", e), true); return; } logger.trace("{} local file count: [{}]", recoveryTarget, metadataSnapshot.size()); final StartRecoveryRequest request = new StartRecoveryRequest( recoveryTarget.shardId(), recoveryTarget.sourceNode(), clusterService.localNode(), metadataSnapshot, recoveryTarget.state().getPrimary(), recoveryTarget.recoveryId()); final AtomicReference<RecoveryResponse> responseHolder = new AtomicReference<>(); try { logger.trace( "[{}][{}] starting recovery from {}", request.shardId().getIndex().getName(), request.shardId().id(), request.sourceNode()); recoveryTarget.indexShard().prepareForIndexRecovery(); recoveryTarget .CancellableThreads() .execute( () -> responseHolder.set( transportService .submitRequest( request.sourceNode(), PeerRecoverySourceService.Actions.START_RECOVERY, request, new FutureTransportResponseHandler<RecoveryResponse>() { @Override public RecoveryResponse newInstance() { return new RecoveryResponse(); } }) .txGet())); final RecoveryResponse recoveryResponse = responseHolder.get(); assert responseHolder != null; final TimeValue recoveryTime = new TimeValue(recoveryTarget.state().getTimer().time()); // do this through ongoing recoveries to remove it from the collection onGoingRecoveries.markRecoveryAsDone(recoveryTarget.recoveryId()); if (logger.isTraceEnabled()) { StringBuilder sb = new StringBuilder(); sb.append('[') .append(request.shardId().getIndex().getName()) .append(']') .append('[') .append(request.shardId().id()) .append("] "); sb.append("recovery completed from ") .append(request.sourceNode()) .append(", took[") .append(recoveryTime) .append("]\n"); sb.append(" phase1: recovered_files [") .append(recoveryResponse.phase1FileNames.size()) .append("]") .append(" with " + "total_size of [") .append(new ByteSizeValue(recoveryResponse.phase1TotalSize)) .append("]") .append(", took [") .append(timeValueMillis(recoveryResponse.phase1Time)) .append("], throttling_wait [") .append(timeValueMillis(recoveryResponse.phase1ThrottlingWaitTime)) .append(']') .append("\n"); sb.append(" : reusing_files [") .append(recoveryResponse.phase1ExistingFileNames.size()) .append("] with " + "total_size of [") .append(new ByteSizeValue(recoveryResponse.phase1ExistingTotalSize)) .append("]\n"); sb.append(" phase2: start took [") .append(timeValueMillis(recoveryResponse.startTime)) .append("]\n"); sb.append(" : recovered [") .append(recoveryResponse.phase2Operations) .append("]") .append(" transaction log " + "operations") .append(", took [") .append(timeValueMillis(recoveryResponse.phase2Time)) .append("]") .append("\n"); logger.trace("{}", sb); } else { logger.debug( "{} recovery done from [{}], took [{}]", request.shardId(), recoveryTarget.sourceNode(), recoveryTime); } } catch (CancellableThreads.ExecutionCancelledException e) { logger.trace("recovery cancelled", e); } catch (Exception e) { if (logger.isTraceEnabled()) { logger.trace( (Supplier<?>) () -> new ParameterizedMessage( "[{}][{}] Got exception on recovery", request.shardId().getIndex().getName(), request.shardId().id()), e); } Throwable cause = ExceptionsHelper.unwrapCause(e); if (cause instanceof CancellableThreads.ExecutionCancelledException) { // this can also come from the source wrapped in a RemoteTransportException onGoingRecoveries.failRecovery( recoveryTarget.recoveryId(), new RecoveryFailedException(request, "source has canceled the" + " recovery", cause), false); return; } if (cause instanceof RecoveryEngineException) { // unwrap an exception that was thrown as part of the recovery cause = cause.getCause(); } // do it twice, in case we have double transport exception cause = ExceptionsHelper.unwrapCause(cause); if (cause instanceof RecoveryEngineException) { // unwrap an exception that was thrown as part of the recovery cause = cause.getCause(); } // here, we would add checks against exception that need to be retried (and not removeAndClean // in this case) if (cause instanceof IllegalIndexShardStateException || cause instanceof IndexNotFoundException || cause instanceof ShardNotFoundException) { // if the target is not ready yet, retry retryRecovery( recoveryTarget, "remote shard not ready", recoverySettings.retryDelayStateSync(), request); return; } if (cause instanceof DelayRecoveryException) { retryRecovery(recoveryTarget, cause, recoverySettings.retryDelayStateSync(), request); return; } if (cause instanceof ConnectTransportException) { logger.debug( "delaying recovery of {} for [{}] due to networking error [{}]", recoveryTarget.shardId(), recoverySettings.retryDelayNetwork(), cause.getMessage()); retryRecovery( recoveryTarget, cause.getMessage(), recoverySettings.retryDelayNetwork(), request); return; } if (cause instanceof AlreadyClosedException) { onGoingRecoveries.failRecovery( recoveryTarget.recoveryId(), new RecoveryFailedException(request, "source shard is " + "closed", cause), false); return; } onGoingRecoveries.failRecovery( recoveryTarget.recoveryId(), new RecoveryFailedException(request, e), true); } }