コード例 #1
0
 protected void retryRecovery(
     final RecoveryStatus recoveryStatus,
     final String reason,
     TimeValue retryAfter,
     final StartRecoveryRequest currentRequest) {
   logger.trace(
       "will retry recovery with id [{}] in [{}] (reason [{}])",
       recoveryStatus.recoveryId(),
       retryAfter,
       reason);
   retryRecovery(recoveryStatus, retryAfter, currentRequest);
 }
コード例 #2
0
  private void doRecovery(final RecoveryStatus recoveryStatus) {
    assert recoveryStatus.sourceNode() != null : "can't do a recovery without a source node";

    logger.trace("collecting local files for {}", recoveryStatus);
    Store.MetadataSnapshot metadataSnapshot = null;
    try {
      metadataSnapshot = recoveryStatus.store().getMetadataOrEmpty();
    } catch (IOException e) {
      logger.warn("error while listing local files, recover as if there are none", e);
      metadataSnapshot = Store.MetadataSnapshot.EMPTY;
    } catch (Exception e) {
      // this will be logged as warning later on...
      logger.trace("unexpected error while listing local files, failing recovery", e);
      onGoingRecoveries.failRecovery(
          recoveryStatus.recoveryId(),
          new RecoveryFailedException(recoveryStatus.state(), "failed to list local files", e),
          true);
      return;
    }
    final StartRecoveryRequest request =
        new StartRecoveryRequest(
            recoveryStatus.shardId(),
            recoveryStatus.sourceNode(),
            clusterService.localNode(),
            false,
            metadataSnapshot,
            recoveryStatus.state().getType(),
            recoveryStatus.recoveryId());

    final AtomicReference<RecoveryResponse> responseHolder = new AtomicReference<>();
    try {
      logger.trace(
          "[{}][{}] starting recovery from {}",
          request.shardId().index().name(),
          request.shardId().id(),
          request.sourceNode());
      recoveryStatus.indexShard().prepareForIndexRecovery();
      recoveryStatus
          .CancellableThreads()
          .execute(
              new CancellableThreads.Interruptable() {
                @Override
                public void run() throws InterruptedException {
                  responseHolder.set(
                      transportService
                          .submitRequest(
                              request.sourceNode(),
                              RecoverySource.Actions.START_RECOVERY,
                              request,
                              new FutureTransportResponseHandler<RecoveryResponse>() {
                                @Override
                                public RecoveryResponse newInstance() {
                                  return new RecoveryResponse();
                                }
                              })
                          .txGet());
                }
              });
      final RecoveryResponse recoveryResponse = responseHolder.get();
      assert responseHolder != null;
      final TimeValue recoveryTime = new TimeValue(recoveryStatus.state().getTimer().time());
      // do this through ongoing recoveries to remove it from the collection
      onGoingRecoveries.markRecoveryAsDone(recoveryStatus.recoveryId());
      if (logger.isTraceEnabled()) {
        StringBuilder sb = new StringBuilder();
        sb.append('[')
            .append(request.shardId().index().name())
            .append(']')
            .append('[')
            .append(request.shardId().id())
            .append("] ");
        sb.append("recovery completed from ")
            .append(request.sourceNode())
            .append(", took[")
            .append(recoveryTime)
            .append("]\n");
        sb.append("   phase1: recovered_files [")
            .append(recoveryResponse.phase1FileNames.size())
            .append("]")
            .append(" with total_size of [")
            .append(new ByteSizeValue(recoveryResponse.phase1TotalSize))
            .append("]")
            .append(", took [")
            .append(timeValueMillis(recoveryResponse.phase1Time))
            .append("], throttling_wait [")
            .append(timeValueMillis(recoveryResponse.phase1ThrottlingWaitTime))
            .append(']')
            .append("\n");
        sb.append("         : reusing_files   [")
            .append(recoveryResponse.phase1ExistingFileNames.size())
            .append("] with total_size of [")
            .append(new ByteSizeValue(recoveryResponse.phase1ExistingTotalSize))
            .append("]\n");
        sb.append("   phase2: start took [")
            .append(timeValueMillis(recoveryResponse.startTime))
            .append("]\n");
        sb.append("         : recovered [")
            .append(recoveryResponse.phase2Operations)
            .append("]")
            .append(" transaction log operations")
            .append(", took [")
            .append(timeValueMillis(recoveryResponse.phase2Time))
            .append("]")
            .append("\n");
        logger.trace(sb.toString());
      } else {
        logger.debug(
            "{} recovery done from [{}], took [{}]",
            request.shardId(),
            recoveryStatus.sourceNode(),
            recoveryTime);
      }
    } catch (CancellableThreads.ExecutionCancelledException e) {
      logger.trace("recovery cancelled", e);
    } catch (Throwable e) {

      if (logger.isTraceEnabled()) {
        logger.trace(
            "[{}][{}] Got exception on recovery",
            e,
            request.shardId().index().name(),
            request.shardId().id());
      }
      Throwable cause = ExceptionsHelper.unwrapCause(e);
      if (cause instanceof RecoveryEngineException) {
        // unwrap an exception that was thrown as part of the recovery
        cause = cause.getCause();
      }
      // do it twice, in case we have double transport exception
      cause = ExceptionsHelper.unwrapCause(cause);
      if (cause instanceof RecoveryEngineException) {
        // unwrap an exception that was thrown as part of the recovery
        cause = cause.getCause();
      }

      // here, we would add checks against exception that need to be retried (and not removeAndClean
      // in this case)

      if (cause instanceof IllegalIndexShardStateException
          || cause instanceof IndexNotFoundException
          || cause instanceof ShardNotFoundException) {
        // if the target is not ready yet, retry
        retryRecovery(
            recoveryStatus,
            "remote shard not ready",
            recoverySettings.retryDelayStateSync(),
            request);
        return;
      }

      if (cause instanceof DelayRecoveryException) {
        retryRecovery(recoveryStatus, cause, recoverySettings.retryDelayStateSync(), request);
        return;
      }

      if (cause instanceof ConnectTransportException) {
        logger.debug(
            "delaying recovery of {} for [{}] due to networking error [{}]",
            recoveryStatus.shardId(),
            recoverySettings.retryDelayNetwork(),
            cause.getMessage());
        retryRecovery(
            recoveryStatus, cause.getMessage(), recoverySettings.retryDelayNetwork(), request);
        return;
      }

      if (cause instanceof IndexShardClosedException) {
        onGoingRecoveries.failRecovery(
            recoveryStatus.recoveryId(),
            new RecoveryFailedException(request, "source shard is closed", cause),
            false);
        return;
      }

      if (cause instanceof AlreadyClosedException) {
        onGoingRecoveries.failRecovery(
            recoveryStatus.recoveryId(),
            new RecoveryFailedException(request, "source shard is closed", cause),
            false);
        return;
      }

      onGoingRecoveries.failRecovery(
          recoveryStatus.recoveryId(), new RecoveryFailedException(request, e), true);
    }
  }