コード例 #1
0
 /**
  * Resets the given recovery and schedules a new {@code RecoveryRunner} for it after {@code
  * retryAfter}.
  *
  * <p>If resetting the recovery throws, the recovery is failed with a {@link
  * RecoveryFailedException} wrapping the error and no retry is scheduled.
  *
  * @param recoveryTarget the recovery to reset and retry
  * @param retryAfter delay before the retry is run on the {@code GENERIC} thread pool
  * @param currentRequest request of the current attempt; only used to build the failure when the
  *     reset throws
  */
 private void retryRecovery(
     final RecoveryTarget recoveryTarget,
     TimeValue retryAfter,
     final StartRecoveryRequest currentRequest) {
   try {
     onGoingRecoveries.resetRecovery(recoveryTarget.recoveryId(), recoveryTarget.shardId());
   } catch (Exception e) {
     onGoingRecoveries.failRecovery(
         recoveryTarget.recoveryId(), new RecoveryFailedException(currentRequest, e), true);
     // The recovery was just failed, so do not schedule another run for the same id.
     return;
   }
   threadPool.schedule(
       retryAfter, ThreadPool.Names.GENERIC, new RecoveryRunner(recoveryTarget.recoveryId()));
 }
コード例 #2
0
 /**
  * Logs, at trace level, that the recovery will be retried and why, then delegates to the
  * three-argument {@code retryRecovery} overload.
  *
  * @param recoveryTarget the recovery to retry
  * @param reason human-readable reason for the retry; only used in the trace message
  * @param retryAfter delay before the retry
  * @param currentRequest request of the current attempt, passed through to the delegate
  */
 protected void retryRecovery(
     final RecoveryTarget recoveryTarget,
     final String reason,
     TimeValue retryAfter,
     final StartRecoveryRequest currentRequest) {
   final String traceFormat = "will retry recovery with id [{}] in [{}] (reason [{}])";
   logger.trace(traceFormat, recoveryTarget.recoveryId(), retryAfter, reason);
   retryRecovery(recoveryTarget, retryAfter, currentRequest);
 }
コード例 #3
0
 /**
  * Logs, at trace level, that the recovery will be retried together with the throwable that
  * triggered the retry, then delegates to the three-argument {@code retryRecovery} overload.
  *
  * @param recoveryTarget the recovery to retry
  * @param reason throwable that caused the retry; passed to the logger alongside the message
  * @param retryAfter delay before the retry
  * @param currentRequest request of the current attempt, passed through to the delegate
  */
 protected void retryRecovery(
     final RecoveryTarget recoveryTarget,
     final Throwable reason,
     TimeValue retryAfter,
     final StartRecoveryRequest currentRequest) {
   // Kept as a supplier so the message object is only built when the logger requests it.
   final Supplier<?> traceMessage =
       () ->
           new ParameterizedMessage(
               "will retry recovery with id [{}] in [{}]",
               recoveryTarget.recoveryId(),
               retryAfter);
   logger.trace(traceMessage, reason);
   retryRecovery(recoveryTarget, retryAfter, currentRequest);
 }
コード例 #4
0
  /**
   * Runs one recovery attempt for {@code recoveryTarget} against its source node.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>Collect the local store metadata snapshot. An empty snapshot is used on a shared
   *       filesystem, when no Lucene index is found, or when listing fails with an {@link
   *       IOException}; any other exception fails the recovery.
   *   <li>Send a {@link StartRecoveryRequest} to the source node and obtain the {@link
   *       RecoveryResponse}.
   *   <li>On success, mark the recovery as done and log a summary (detailed at trace level, short
   *       at debug level).
   *   <li>On failure, unwrap the cause and either schedule a retry or fail the recovery, depending
   *       on the exception type.
   * </ol>
   *
   * @param recoveryTarget the recovery to run; its source node must be non-null
   */
  private void doRecovery(final RecoveryTarget recoveryTarget) {
    assert recoveryTarget.sourceNode() != null : "can't do a recovery without a source node";

    logger.trace("collecting local files for {}", recoveryTarget);
    Store.MetadataSnapshot metadataSnapshot = null;
    try {
      if (recoveryTarget.indexShard().indexSettings().isOnSharedFilesystem()) {
        // we are not going to copy any files, so don't bother listing files, potentially running
        // into concurrency issues with the primary changing files underneath us.
        metadataSnapshot = Store.MetadataSnapshot.EMPTY;
      } else {
        metadataSnapshot = recoveryTarget.indexShard().snapshotStoreMetadata();
      }
    } catch (org.apache.lucene.index.IndexNotFoundException e) {
      // happens on an empty folder. no need to log
      metadataSnapshot = Store.MetadataSnapshot.EMPTY;
    } catch (IOException e) {
      logger.warn("error while listing local files, recover as if there are none", e);
      metadataSnapshot = Store.MetadataSnapshot.EMPTY;
    } catch (Exception e) {
      // this will be logged as warning later on...
      logger.trace("unexpected error while listing local files, failing recovery", e);
      onGoingRecoveries.failRecovery(
          recoveryTarget.recoveryId(),
          new RecoveryFailedException(recoveryTarget.state(), "failed to list local files", e),
          true);
      return;
    }
    logger.trace("{} local file count: [{}]", recoveryTarget, metadataSnapshot.size());
    final StartRecoveryRequest request =
        new StartRecoveryRequest(
            recoveryTarget.shardId(),
            recoveryTarget.sourceNode(),
            clusterService.localNode(),
            metadataSnapshot,
            recoveryTarget.state().getPrimary(),
            recoveryTarget.recoveryId());

    // Holder lets the lambda below hand the response back to this method.
    final AtomicReference<RecoveryResponse> responseHolder = new AtomicReference<>();
    try {
      logger.trace(
          "[{}][{}] starting recovery from {}",
          request.shardId().getIndex().getName(),
          request.shardId().id(),
          request.sourceNode());
      recoveryTarget.indexShard().prepareForIndexRecovery();
      recoveryTarget
          .CancellableThreads()
          .execute(
              () ->
                  responseHolder.set(
                      transportService
                          .submitRequest(
                              request.sourceNode(),
                              PeerRecoverySourceService.Actions.START_RECOVERY,
                              request,
                              new FutureTransportResponseHandler<RecoveryResponse>() {
                                @Override
                                public RecoveryResponse newInstance() {
                                  return new RecoveryResponse();
                                }
                              })
                          .txGet()));
      final RecoveryResponse recoveryResponse = responseHolder.get();
      // Check the response itself: the holder is a final local and can never be null.
      assert recoveryResponse != null : "recovery response must not be null";
      final TimeValue recoveryTime = new TimeValue(recoveryTarget.state().getTimer().time());
      // do this through ongoing recoveries to remove it from the collection
      onGoingRecoveries.markRecoveryAsDone(recoveryTarget.recoveryId());
      if (logger.isTraceEnabled()) {
        // Detailed multi-line summary, only assembled when trace logging is on.
        StringBuilder sb = new StringBuilder();
        sb.append('[')
            .append(request.shardId().getIndex().getName())
            .append(']')
            .append('[')
            .append(request.shardId().id())
            .append("] ");
        sb.append("recovery completed from ")
            .append(request.sourceNode())
            .append(", took[")
            .append(recoveryTime)
            .append("]\n");
        sb.append("   phase1: recovered_files [")
            .append(recoveryResponse.phase1FileNames.size())
            .append("]")
            .append(" with total_size of [")
            .append(new ByteSizeValue(recoveryResponse.phase1TotalSize))
            .append("]")
            .append(", took [")
            .append(timeValueMillis(recoveryResponse.phase1Time))
            .append("], throttling_wait [")
            .append(timeValueMillis(recoveryResponse.phase1ThrottlingWaitTime))
            .append(']')
            .append("\n");
        sb.append("         : reusing_files   [")
            .append(recoveryResponse.phase1ExistingFileNames.size())
            .append("] with total_size of [")
            .append(new ByteSizeValue(recoveryResponse.phase1ExistingTotalSize))
            .append("]\n");
        sb.append("   phase2: start took [")
            .append(timeValueMillis(recoveryResponse.startTime))
            .append("]\n");
        sb.append("         : recovered [")
            .append(recoveryResponse.phase2Operations)
            .append("]")
            .append(" transaction log operations")
            .append(", took [")
            .append(timeValueMillis(recoveryResponse.phase2Time))
            .append("]")
            .append("\n");
        logger.trace("{}", sb);
      } else {
        logger.debug(
            "{} recovery done from [{}], took [{}]",
            request.shardId(),
            recoveryTarget.sourceNode(),
            recoveryTime);
      }
    } catch (CancellableThreads.ExecutionCancelledException e) {
      // Cancelled directly on this side: only traced here, the recovery is not failed or retried.
      logger.trace("recovery cancelled", e);
    } catch (Exception e) {
      if (logger.isTraceEnabled()) {
        logger.trace(
            (Supplier<?>)
                () ->
                    new ParameterizedMessage(
                        "[{}][{}] Got exception on recovery",
                        request.shardId().getIndex().getName(),
                        request.shardId().id()),
            e);
      }
      Throwable cause = ExceptionsHelper.unwrapCause(e);
      if (cause instanceof CancellableThreads.ExecutionCancelledException) {
        // this can also come from the source wrapped in a RemoteTransportException
        onGoingRecoveries.failRecovery(
            recoveryTarget.recoveryId(),
            new RecoveryFailedException(request, "source has canceled the recovery", cause),
            false);
        return;
      }
      if (cause instanceof RecoveryEngineException) {
        // unwrap an exception that was thrown as part of the recovery
        cause = cause.getCause();
      }
      // do it twice, in case we have double transport exception
      cause = ExceptionsHelper.unwrapCause(cause);
      if (cause instanceof RecoveryEngineException) {
        // unwrap an exception that was thrown as part of the recovery
        cause = cause.getCause();
      }

      // here, we would add checks against exception that need to be retried (and not removeAndClean
      // in this case)

      if (cause instanceof IllegalIndexShardStateException
          || cause instanceof IndexNotFoundException
          || cause instanceof ShardNotFoundException) {
        // if the target is not ready yet, retry
        retryRecovery(
            recoveryTarget,
            "remote shard not ready",
            recoverySettings.retryDelayStateSync(),
            request);
        return;
      }

      if (cause instanceof DelayRecoveryException) {
        retryRecovery(recoveryTarget, cause, recoverySettings.retryDelayStateSync(), request);
        return;
      }

      if (cause instanceof ConnectTransportException) {
        // Networking problems use the dedicated network retry delay.
        logger.debug(
            "delaying recovery of {} for [{}] due to networking error [{}]",
            recoveryTarget.shardId(),
            recoverySettings.retryDelayNetwork(),
            cause.getMessage());
        retryRecovery(
            recoveryTarget, cause.getMessage(), recoverySettings.retryDelayNetwork(), request);
        return;
      }

      if (cause instanceof AlreadyClosedException) {
        onGoingRecoveries.failRecovery(
            recoveryTarget.recoveryId(),
            new RecoveryFailedException(request, "source shard is closed", cause),
            false);
        return;
      }
      // Anything not recognised above fails the recovery, wrapping the original exception.
      onGoingRecoveries.failRecovery(
          recoveryTarget.recoveryId(), new RecoveryFailedException(request, e), true);
    }
  }