Exemplo n.º 1
0
  /**
   * Runs a single recovery attempt for the given recovery target against its source node.
   *
   * <p>The method proceeds in three stages:
   *
   * <ol>
   *   <li>Read the local store's metadata snapshot. An {@link IOException} is logged and treated
   *       as "no local files"; any other exception fails the recovery and returns.
   *   <li>Build a {@code StartRecoveryRequest}, prepare the shard for index recovery, and submit
   *       the request to the source node through the recovery's cancellable threads, waiting on
   *       the returned future via {@code txGet()}.
   *   <li>On success, mark the recovery as done and log a summary (detailed at trace level, one
   *       line at debug level otherwise).
   * </ol>
   *
   * <p>Failures of the remote call are unwrapped (twice, to cope with doubly wrapped transport
   * exceptions) and then either retried after a delay, or reported through {@code
   * onGoingRecoveries.failRecovery}. A cancelled execution is only logged.
   *
   * @param recoveryStatus the recovery to run; its source node must not be {@code null}
   */
  private void doRecovery(final RecoveryStatus recoveryStatus) {
    assert recoveryStatus.sourceNode() != null : "can't do a recovery without a source node";

    logger.trace("collecting local files for {}", recoveryStatus);
    Store.MetadataSnapshot metadataSnapshot;
    try {
      metadataSnapshot = recoveryStatus.store().getMetadataOrEmpty();
    } catch (IOException e) {
      logger.warn("error while listing local files, recover as if there are none", e);
      metadataSnapshot = Store.MetadataSnapshot.EMPTY;
    } catch (Exception e) {
      // this will be logged as warning later on...
      logger.trace("unexpected error while listing local files, failing recovery", e);
      onGoingRecoveries.failRecovery(
          recoveryStatus.recoveryId(),
          new RecoveryFailedException(recoveryStatus.state(), "failed to list local files", e),
          true);
      return;
    }
    final StartRecoveryRequest request =
        new StartRecoveryRequest(
            recoveryStatus.shardId(),
            recoveryStatus.sourceNode(),
            clusterService.localNode(),
            false,
            metadataSnapshot,
            recoveryStatus.state().getType(),
            recoveryStatus.recoveryId());

    // Holder lets the anonymous Interruptable hand the response back to this method.
    final AtomicReference<RecoveryResponse> responseHolder = new AtomicReference<>();
    try {
      logger.trace(
          "[{}][{}] starting recovery from {}",
          request.shardId().index().name(),
          request.shardId().id(),
          request.sourceNode());
      recoveryStatus.indexShard().prepareForIndexRecovery();
      recoveryStatus
          .CancellableThreads()
          .execute(
              new CancellableThreads.Interruptable() {
                @Override
                public void run() throws InterruptedException {
                  responseHolder.set(
                      transportService
                          .submitRequest(
                              request.sourceNode(),
                              RecoverySource.Actions.START_RECOVERY,
                              request,
                              new FutureTransportResponseHandler<RecoveryResponse>() {
                                @Override
                                public RecoveryResponse newInstance() {
                                  return new RecoveryResponse();
                                }
                              })
                          .txGet());
                }
              });
      final RecoveryResponse recoveryResponse = responseHolder.get();
      // Check the response itself: the holder is a local final reference and is never null, so
      // asserting on it would verify nothing.
      assert recoveryResponse != null : "recovery completed without a response";
      final TimeValue recoveryTime = new TimeValue(recoveryStatus.state().getTimer().time());
      // do this through ongoing recoveries to remove it from the collection
      onGoingRecoveries.markRecoveryAsDone(recoveryStatus.recoveryId());
      if (logger.isTraceEnabled()) {
        // Multi-line per-phase summary, only assembled when trace logging is on.
        StringBuilder sb = new StringBuilder();
        sb.append('[')
            .append(request.shardId().index().name())
            .append(']')
            .append('[')
            .append(request.shardId().id())
            .append("] ");
        sb.append("recovery completed from ")
            .append(request.sourceNode())
            .append(", took[")
            .append(recoveryTime)
            .append("]\n");
        sb.append("   phase1: recovered_files [")
            .append(recoveryResponse.phase1FileNames.size())
            .append("]")
            .append(" with total_size of [")
            .append(new ByteSizeValue(recoveryResponse.phase1TotalSize))
            .append("]")
            .append(", took [")
            .append(timeValueMillis(recoveryResponse.phase1Time))
            .append("], throttling_wait [")
            .append(timeValueMillis(recoveryResponse.phase1ThrottlingWaitTime))
            .append(']')
            .append("\n");
        sb.append("         : reusing_files   [")
            .append(recoveryResponse.phase1ExistingFileNames.size())
            .append("] with total_size of [")
            .append(new ByteSizeValue(recoveryResponse.phase1ExistingTotalSize))
            .append("]\n");
        sb.append("   phase2: start took [")
            .append(timeValueMillis(recoveryResponse.startTime))
            .append("]\n");
        sb.append("         : recovered [")
            .append(recoveryResponse.phase2Operations)
            .append("]")
            .append(" transaction log operations")
            .append(", took [")
            .append(timeValueMillis(recoveryResponse.phase2Time))
            .append("]")
            .append("\n");
        logger.trace(sb.toString());
      } else {
        logger.debug(
            "{} recovery done from [{}], took [{}]",
            request.shardId(),
            recoveryStatus.sourceNode(),
            recoveryTime);
      }
    } catch (CancellableThreads.ExecutionCancelledException e) {
      logger.trace("recovery cancelled", e);
    } catch (Throwable e) {

      if (logger.isTraceEnabled()) {
        logger.trace(
            "[{}][{}] Got exception on recovery",
            e,
            request.shardId().index().name(),
            request.shardId().id());
      }
      Throwable cause = ExceptionsHelper.unwrapCause(e);
      if (cause instanceof RecoveryEngineException) {
        // unwrap an exception that was thrown as part of the recovery
        cause = cause.getCause();
      }
      // do it twice, in case we have double transport exception
      cause = ExceptionsHelper.unwrapCause(cause);
      if (cause instanceof RecoveryEngineException) {
        // unwrap an exception that was thrown as part of the recovery
        cause = cause.getCause();
      }

      // here, we would add checks against exception that need to be retried (and not removeAndClean
      // in this case)

      if (cause instanceof IllegalIndexShardStateException
          || cause instanceof IndexNotFoundException
          || cause instanceof ShardNotFoundException) {
        // if the target is not ready yet, retry
        retryRecovery(
            recoveryStatus,
            "remote shard not ready",
            recoverySettings.retryDelayStateSync(),
            request);
        return;
      }

      if (cause instanceof DelayRecoveryException) {
        retryRecovery(recoveryStatus, cause, recoverySettings.retryDelayStateSync(), request);
        return;
      }

      if (cause instanceof ConnectTransportException) {
        logger.debug(
            "delaying recovery of {} for [{}] due to networking error [{}]",
            recoveryStatus.shardId(),
            recoverySettings.retryDelayNetwork(),
            cause.getMessage());
        retryRecovery(
            recoveryStatus, cause.getMessage(), recoverySettings.retryDelayNetwork(), request);
        return;
      }

      // Both exception types are reported identically as "source shard is closed".
      // NOTE(review): the trailing boolean is false here and true for every other failure;
      // presumably it controls whether the failure is reported further - confirm against
      // failRecovery.
      if (cause instanceof IndexShardClosedException || cause instanceof AlreadyClosedException) {
        onGoingRecoveries.failRecovery(
            recoveryStatus.recoveryId(),
            new RecoveryFailedException(request, "source shard is closed", cause),
            false);
        return;
      }

      // Anything else is a hard failure; the original (still wrapped) exception is attached.
      onGoingRecoveries.failRecovery(
          recoveryStatus.recoveryId(), new RecoveryFailedException(request, e), true);
    }
  }
  /**
   * Takes a snapshot of this shard and hands it to the shard gateway.
   *
   * <p>The call is a silent no-op when the shard is not a primary, is relocating, is in the
   * {@code CREATED} or {@code RECOVERING} state, or when the snapshot lock cannot be obtained.
   * The gateway is only invoked when the index commit generation or the translog id differs from
   * the last recorded snapshot, or when the translog has grown since then; after a gateway
   * snapshot the recorded values are updated.
   *
   * @param reason free-text reason for the snapshot, used in debug log output
   * @throws IndexShardGatewaySnapshotFailedException if the snapshot fails for a reason other than
   *     the shard not being in a state that allows snapshotting
   */
  public synchronized void snapshot(final String reason)
      throws IndexShardGatewaySnapshotFailedException {
    // Non-primary shards are skipped quietly (no exception is raised), and relocating primaries
    // are skipped so that we won't get conflicts.
    if (!indexShard.routingEntry().primary() || indexShard.routingEntry().relocating()) {
      return;
    }
    // A shard that has just been created, or that is still recovering, is not snapshotted.
    if (indexShard.state() == IndexShardState.CREATED
        || indexShard.state() == IndexShardState.RECOVERING) {
      return;
    }

    // The lock is obtained on first use and kept in the field for later calls.
    if (snapshotLock == null) {
      try {
        snapshotLock = shardGateway.obtainSnapshotLock();
      } catch (Exception e) {
        logger.warn("failed to obtain snapshot lock, ignoring snapshot", e);
        return;
      }
    }

    try {
      final SnapshotStatus status =
          indexShard.snapshot(
              new Engine.SnapshotHandler<SnapshotStatus>() {
                @Override
                public SnapshotStatus snapshot(
                    SnapshotIndexCommit snapshotIndexCommit, Translog.Snapshot translogSnapshot)
                    throws EngineException {
                  // Nothing to do when the commit generation and translog id match the last
                  // snapshot and the translog has not grown.
                  final boolean unchanged =
                      lastIndexVersion == snapshotIndexCommit.getGeneration()
                          && lastTranslogId == translogSnapshot.translogId()
                          && lastTranslogLength >= translogSnapshot.length();
                  if (unchanged) {
                    return null;
                  }

                  logger.debug("snapshot ({}) to {} ...", reason, shardGateway);
                  final IndexShardGateway.Snapshot gatewaySnapshot =
                      new IndexShardGateway.Snapshot(
                          snapshotIndexCommit,
                          translogSnapshot,
                          lastIndexVersion,
                          lastTranslogId,
                          lastTranslogLength,
                          lastTotalTranslogOperations);
                  final SnapshotStatus result = shardGateway.snapshot(gatewaySnapshot);

                  // Record what was just snapshotted so the next call can detect changes.
                  lastIndexVersion = snapshotIndexCommit.getGeneration();
                  lastTranslogId = translogSnapshot.translogId();
                  lastTranslogLength = translogSnapshot.length();
                  lastTotalTranslogOperations = translogSnapshot.estimatedTotalOperations();
                  return result;
                }
              });

      // A null status means the handler decided there was nothing new to snapshot.
      if (status != null && logger.isDebugEnabled()) {
        final String summary =
            "snapshot ("
                + reason
                + ") completed to "
                + shardGateway
                + ", took ["
                + TimeValue.timeValueMillis(status.time())
                + "]\n"
                + "    index    : version ["
                + lastIndexVersion
                + "], number_of_files ["
                + status.index().numberOfFiles()
                + "] with total_size ["
                + new ByteSizeValue(status.index().totalSize())
                + "], took ["
                + TimeValue.timeValueMillis(status.index().time())
                + "]\n"
                + "    translog : id      ["
                + lastTranslogId
                + "], number_of_operations ["
                + status.translog().expectedNumberOfOperations()
                + "], took ["
                + TimeValue.timeValueMillis(status.translog().time())
                + "]";
        logger.debug(summary);
      }
    } catch (SnapshotFailedEngineException e) {
      // An IllegalStateException cause is fine to ignore: the snapshot has not started yet.
      if (!(e.getCause() instanceof IllegalStateException)) {
        throw new IndexShardGatewaySnapshotFailedException(shardId, "Failed to snapshot", e);
      }
    } catch (IllegalIndexShardStateException ignored) {
      // ignore, that's fine, snapshot has not started yet
    } catch (IndexShardGatewaySnapshotFailedException e) {
      // Already the right type; pass it through without re-wrapping.
      throw e;
    } catch (Exception e) {
      throw new IndexShardGatewaySnapshotFailedException(shardId, "Failed to snapshot", e);
    }
  }