/**
   * Perform phase 3 of the recovery process
   *
   * <p>Phase3 again takes a snapshot of the translog, however this time the snapshot is acquired
   * under a write lock. The translog operations are sent to the target node where they are
   * replayed.
   *
   * <p>{@code InternalEngine#recover} is responsible for taking the snapshot of the translog, and
   * after phase 3 completes the snapshots from all three phases are released.
   */
  @Override
  public void phase3(Translog.Snapshot snapshot) throws ElasticsearchException {
    if (shard.state() == IndexShardState.CLOSED) {
      throw new IndexShardClosedException(request.shardId());
    }
    cancellableThreads.checkForCancel();
    StopWatch stopWatch = new StopWatch().start();
    final int totalOperations;
    logger.trace(
        "[{}][{}] recovery [phase3] to {}: sending transaction log operations",
        indexName,
        shardId,
        request.targetNode());

    // Send the translog operations to the target node
    totalOperations = sendSnapshot(snapshot);

    cancellableThreads.execute(
        new Interruptable() {
          @Override
          public void run() throws InterruptedException {
            // Send the FINALIZE request to the target node. The finalize request
            // clears unreferenced translog files, refreshes the engine now that
            // new segments are available, and enables garbage collection of
            // tombstone files. The shard is also moved to the POST_RECOVERY phase
            // during this time
            transportService
                .submitRequest(
                    request.targetNode(),
                    RecoveryTarget.Actions.FINALIZE,
                    new RecoveryFinalizeRecoveryRequest(request.recoveryId(), request.shardId()),
                    TransportRequestOptions.options()
                        .withTimeout(recoverySettings.internalActionLongTimeout()),
                    EmptyTransportResponseHandler.INSTANCE_SAME)
                .txGet();
          }
        });

    if (request.markAsRelocated()) {
      // TODO what happens if the recovery process fails afterwards, we need to mark this back to
      // started
      try {
        shard.relocated("to " + request.targetNode());
      } catch (IllegalIndexShardStateException e) {
        // we can ignore this exception since, on the other node, when it moved to phase3
        // it will also send shard started, which might cause the index shard we work against
        // to move be closed by the time we get to the the relocated method
      }
    }
    stopWatch.stop();
    logger.trace(
        "[{}][{}] recovery [phase3] to {}: took [{}]",
        indexName,
        shardId,
        request.targetNode(),
        stopWatch.totalTime());
    response.phase3Time = stopWatch.totalTime().millis();
    response.phase3Operations = totalOperations;
  }
  /** finalizes the recovery process */
  public void finalizeRecovery() {
    if (shard.state() == IndexShardState.CLOSED) {
      throw new IndexShardClosedException(request.shardId());
    }
    cancellableThreads.checkForCancel();
    StopWatch stopWatch = new StopWatch().start();
    logger.trace("[{}][{}] finalizing recovery to {}", indexName, shardId, request.targetNode());
    cancellableThreads.execute(recoveryTarget::finalizeRecovery);

    if (request.isPrimaryRelocation()) {
      // in case of primary relocation we have to ensure that the cluster state on the primary
      // relocation target has all
      // replica shards that have recovered or are still recovering from the current primary,
      // otherwise replication actions
      // will not be send to these replicas. To accomplish this, first block new recoveries, then
      // take version of latest cluster
      // state. This means that no new recovery can be completed based on information of a newer
      // cluster state than the current one.
      try (Releasable ignored =
          delayNewRecoveries.apply(
              "primary relocation hand-off in progress or completed for " + shardId)) {
        final long currentClusterStateVersion = currentClusterStateVersionSupplier.get();
        logger.trace(
            "[{}][{}] waiting on {} to have cluster state with version [{}]",
            indexName,
            shardId,
            request.targetNode(),
            currentClusterStateVersion);
        cancellableThreads.execute(
            () -> recoveryTarget.ensureClusterStateVersion(currentClusterStateVersion));

        logger.trace(
            "[{}][{}] performing relocation hand-off to {}",
            indexName,
            shardId,
            request.targetNode());
        cancellableThreads.execute(() -> shard.relocated("to " + request.targetNode()));
      }
      /**
       * if the recovery process fails after setting the shard state to RELOCATED, both relocation
       * source and target are failed (see {@link IndexShard#updateRoutingEntry}).
       */
    }
    stopWatch.stop();
    logger.trace(
        "[{}][{}] finalizing recovery to {}: took [{}]",
        indexName,
        shardId,
        request.targetNode(),
        stopWatch.totalTime());
  }
    @Override
    public ClusterState execute(ClusterState currentState) throws Exception {
      if (cancellableThreads.isCancelled()
          == false) { // no need to run this if recovery is canceled
        IndexMetaData indexMetaData =
            clusterService.state().metaData().getIndices().get(indexService.index().getName());
        ImmutableOpenMap<String, MappingMetaData> metaDataMappings = null;
        if (indexMetaData != null) {
          metaDataMappings = indexMetaData.getMappings();
        }
        // default mapping should not be sent back, it can only be updated by put mapping API, and
        // its
        // a full in place replace, we don't want to override a potential update coming into it
        for (DocumentMapper documentMapper : indexService.mapperService().docMappers(false)) {

          MappingMetaData mappingMetaData =
              metaDataMappings == null ? null : metaDataMappings.get(documentMapper.type());
          if (mappingMetaData == null
              || !documentMapper.refreshSource().equals(mappingMetaData.source())) {
            // not on master yet in the right form
            documentMappersToUpdate.add(documentMapper);
          }
        }
      }
      return currentState;
    }
  /**
   * Perform phase2 of the recovery process
   *
   * <p>Phase2 takes a snapshot of the current translog *without* acquiring the write lock (however,
   * the translog snapshot is a point-in-time view of the translog). It then sends each translog
   * operation to the target node so it can be replayed into the new shard.
   */
  public void phase2(Translog.Snapshot snapshot) {
    if (shard.state() == IndexShardState.CLOSED) {
      throw new IndexShardClosedException(request.shardId());
    }
    cancellableThreads.checkForCancel();

    StopWatch stopWatch = new StopWatch().start();

    logger.trace(
        "{} recovery [phase2] to {}: sending transaction log operations",
        request.shardId(),
        request.targetNode());
    // Send all the snapshot's translog operations to the target
    int totalOperations = sendSnapshot(snapshot);
    stopWatch.stop();
    logger.trace(
        "{} recovery [phase2] to {}: took [{}]",
        request.shardId(),
        request.targetNode(),
        stopWatch.totalTime());
    response.phase2Time = stopWatch.totalTime().millis();
    response.phase2Operations = totalOperations;
  }
  protected void prepareTargetForTranslog(final int totalTranslogOps) throws IOException {
    StopWatch stopWatch = new StopWatch().start();
    logger.trace(
        "{} recovery [phase1] to {}: prepare remote engine for translog",
        request.shardId(),
        request.targetNode());
    final long startEngineStart = stopWatch.totalTime().millis();
    // Send a request preparing the new shard's translog to receive
    // operations. This ensures the shard engine is started and disables
    // garbage collection (not the JVM's GC!) of tombstone deletes
    cancellableThreads.executeIO(
        () ->
            recoveryTarget.prepareForTranslogOperations(
                totalTranslogOps, shard.segmentStats(false).getMaxUnsafeAutoIdTimestamp()));
    stopWatch.stop();

    response.startTime = stopWatch.totalTime().millis() - startEngineStart;
    logger.trace(
        "{} recovery [phase1] to {}: remote engine start took [{}]",
        request.shardId(),
        request.targetNode(),
        stopWatch.totalTime());
  }
 /** Cancels the recovery and interrupts all eligible threads. */
 public void cancel(String reason) {
   cancellableThreads.cancel(reason);
 }
  /**
   * Send the given snapshot's operations to this handler's target node.
   *
   * <p>Operations are bulked into a single request depending on an operation count limit or
   * size-in-bytes limit
   *
   * @return the total number of translog operations that were sent
   */
  protected int sendSnapshot(final Translog.Snapshot snapshot) {
    int ops = 0;
    long size = 0;
    int totalOperations = 0;
    final List<Translog.Operation> operations = new ArrayList<>();
    Translog.Operation operation;
    try {
      operation = snapshot.next(); // this ex should bubble up
    } catch (IOException ex) {
      throw new ElasticsearchException("failed to get next operation from translog", ex);
    }

    if (operation == null) {
      logger.trace(
          "[{}][{}] no translog operations to send to {}",
          indexName,
          shardId,
          request.targetNode());
    }
    while (operation != null) {
      if (shard.state() == IndexShardState.CLOSED) {
        throw new IndexShardClosedException(request.shardId());
      }
      cancellableThreads.checkForCancel();
      operations.add(operation);
      ops += 1;
      size += operation.estimateSize();
      totalOperations++;

      // Check if this request is past bytes threshold, and
      // if so, send it off
      if (size >= chunkSizeInBytes) {

        // don't throttle translog, since we lock for phase3 indexing,
        // so we need to move it as fast as possible. Note, since we
        // index docs to replicas while the index files are recovered
        // the lock can potentially be removed, in which case, it might
        // make sense to re-enable throttling in this phase
        cancellableThreads.execute(
            () -> recoveryTarget.indexTranslogOperations(operations, snapshot.totalOperations()));
        if (logger.isTraceEnabled()) {
          logger.trace(
              "[{}][{}] sent batch of [{}][{}] (total: [{}]) translog operations to {}",
              indexName,
              shardId,
              ops,
              new ByteSizeValue(size),
              snapshot.totalOperations(),
              request.targetNode());
        }

        ops = 0;
        size = 0;
        operations.clear();
      }
      try {
        operation = snapshot.next(); // this ex should bubble up
      } catch (IOException ex) {
        throw new ElasticsearchException("failed to get next operation from translog", ex);
      }
    }
    // send the leftover
    if (!operations.isEmpty()) {
      cancellableThreads.execute(
          () -> recoveryTarget.indexTranslogOperations(operations, snapshot.totalOperations()));
    }
    if (logger.isTraceEnabled()) {
      logger.trace(
          "[{}][{}] sent final batch of [{}][{}] (total: [{}]) translog operations to {}",
          indexName,
          shardId,
          ops,
          new ByteSizeValue(size),
          snapshot.totalOperations(),
          request.targetNode());
    }
    return totalOperations;
  }
  /**
   * Perform phase1 of the recovery operations. Once this {@link IndexCommit} snapshot has been
   * performed no commit operations (files being fsync'd) are effectively allowed on this index
   * until all recovery phases are done
   *
   * <p>Phase1 examines the segment files on the target node and copies over the segments that are
   * missing. Only segments that have the same size and checksum can be reused
   */
  public void phase1(final IndexCommit snapshot, final Translog.View translogView) {
    cancellableThreads.checkForCancel();
    // Total size of segment files that are recovered
    long totalSize = 0;
    // Total size of segment files that were able to be re-used
    long existingTotalSize = 0;
    final Store store = shard.store();
    store.incRef();
    try {
      StopWatch stopWatch = new StopWatch().start();
      final Store.MetadataSnapshot recoverySourceMetadata;
      try {
        recoverySourceMetadata = store.getMetadata(snapshot);
      } catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException ex) {
        shard.failShard("recovery", ex);
        throw ex;
      }
      for (String name : snapshot.getFileNames()) {
        final StoreFileMetaData md = recoverySourceMetadata.get(name);
        if (md == null) {
          logger.info(
              "Snapshot differs from actual index for file: {} meta: {}",
              name,
              recoverySourceMetadata.asMap());
          throw new CorruptIndexException(
              "Snapshot differs from actual index - maybe index was removed metadata has "
                  + recoverySourceMetadata.asMap().size()
                  + " files",
              name);
        }
      }
      // Generate a "diff" of all the identical, different, and missing
      // segment files on the target node, using the existing files on
      // the source node
      String recoverySourceSyncId = recoverySourceMetadata.getSyncId();
      String recoveryTargetSyncId = request.metadataSnapshot().getSyncId();
      final boolean recoverWithSyncId =
          recoverySourceSyncId != null && recoverySourceSyncId.equals(recoveryTargetSyncId);
      if (recoverWithSyncId) {
        final long numDocsTarget = request.metadataSnapshot().getNumDocs();
        final long numDocsSource = recoverySourceMetadata.getNumDocs();
        if (numDocsTarget != numDocsSource) {
          throw new IllegalStateException(
              "try to recover "
                  + request.shardId()
                  + " from primary shard with sync id but number "
                  + "of docs differ: "
                  + numDocsTarget
                  + " ("
                  + request.sourceNode().getName()
                  + ", primary) vs "
                  + numDocsSource
                  + "("
                  + request.targetNode().getName()
                  + ")");
        }
        // we shortcut recovery here because we have nothing to copy. but we must still start the
        // engine on the target.
        // so we don't return here
        logger.trace(
            "[{}][{}] skipping [phase1] to {} - identical sync id [{}] found on both source and target",
            indexName,
            shardId,
            request.targetNode(),
            recoverySourceSyncId);
      } else {
        final Store.RecoveryDiff diff =
            recoverySourceMetadata.recoveryDiff(request.metadataSnapshot());
        for (StoreFileMetaData md : diff.identical) {
          response.phase1ExistingFileNames.add(md.name());
          response.phase1ExistingFileSizes.add(md.length());
          existingTotalSize += md.length();
          if (logger.isTraceEnabled()) {
            logger.trace(
                "[{}][{}] recovery [phase1] to {}: not recovering [{}], exists in local store and has checksum [{}],"
                    + " size [{}]",
                indexName,
                shardId,
                request.targetNode(),
                md.name(),
                md.checksum(),
                md.length());
          }
          totalSize += md.length();
        }
        List<StoreFileMetaData> phase1Files =
            new ArrayList<>(diff.different.size() + diff.missing.size());
        phase1Files.addAll(diff.different);
        phase1Files.addAll(diff.missing);
        for (StoreFileMetaData md : phase1Files) {
          if (request.metadataSnapshot().asMap().containsKey(md.name())) {
            logger.trace(
                "[{}][{}] recovery [phase1] to {}: recovering [{}], exists in local store, but is different: remote "
                    + "[{}], local [{}]",
                indexName,
                shardId,
                request.targetNode(),
                md.name(),
                request.metadataSnapshot().asMap().get(md.name()),
                md);
          } else {
            logger.trace(
                "[{}][{}] recovery [phase1] to {}: recovering [{}], does not exists in remote",
                indexName,
                shardId,
                request.targetNode(),
                md.name());
          }
          response.phase1FileNames.add(md.name());
          response.phase1FileSizes.add(md.length());
          totalSize += md.length();
        }

        response.phase1TotalSize = totalSize;
        response.phase1ExistingTotalSize = existingTotalSize;

        logger.trace(
            "[{}][{}] recovery [phase1] to {}: recovering_files [{}] with total_size [{}], reusing_files [{}] with "
                + "total_size [{}]",
            indexName,
            shardId,
            request.targetNode(),
            response.phase1FileNames.size(),
            new ByteSizeValue(totalSize),
            response.phase1ExistingFileNames.size(),
            new ByteSizeValue(existingTotalSize));
        cancellableThreads.execute(
            () ->
                recoveryTarget.receiveFileInfo(
                    response.phase1FileNames,
                    response.phase1FileSizes,
                    response.phase1ExistingFileNames,
                    response.phase1ExistingFileSizes,
                    translogView.totalOperations()));
        // How many bytes we've copied since we last called RateLimiter.pause
        final Function<StoreFileMetaData, OutputStream> outputStreamFactories =
            md ->
                new BufferedOutputStream(
                    new RecoveryOutputStream(md, translogView), chunkSizeInBytes);
        sendFiles(
            store,
            phase1Files.toArray(new StoreFileMetaData[phase1Files.size()]),
            outputStreamFactories);
        // Send the CLEAN_FILES request, which takes all of the files that
        // were transferred and renames them from their temporary file
        // names to the actual file names. It also writes checksums for
        // the files after they have been renamed.
        //
        // Once the files have been renamed, any other files that are not
        // related to this recovery (out of date segments, for example)
        // are deleted
        try {
          cancellableThreads.executeIO(
              () ->
                  recoveryTarget.cleanFiles(
                      translogView.totalOperations(), recoverySourceMetadata));
        } catch (RemoteTransportException | IOException targetException) {
          final IOException corruptIndexException;
          // we realized that after the index was copied and we wanted to finalize the recovery
          // the index was corrupted:
          //   - maybe due to a broken segments file on an empty index (transferred with no
          // checksum)
          //   - maybe due to old segments without checksums or length only checks
          if ((corruptIndexException = ExceptionsHelper.unwrapCorruption(targetException))
              != null) {
            try {
              final Store.MetadataSnapshot recoverySourceMetadata1 = store.getMetadata(snapshot);
              StoreFileMetaData[] metadata =
                  StreamSupport.stream(recoverySourceMetadata1.spliterator(), false)
                      .toArray(size -> new StoreFileMetaData[size]);
              ArrayUtil.timSort(
                  metadata,
                  (o1, o2) -> {
                    return Long.compare(o1.length(), o2.length()); // check small files first
                  });
              for (StoreFileMetaData md : metadata) {
                cancellableThreads.checkForCancel();
                logger.debug(
                    "{} checking integrity for file {} after remove corruption exception",
                    shard.shardId(),
                    md);
                if (store.checkIntegrityNoException(md)
                    == false) { // we are corrupted on the primary -- fail!
                  shard.failShard("recovery", corruptIndexException);
                  logger.warn(
                      "{} Corrupted file detected {} checksum mismatch", shard.shardId(), md);
                  throw corruptIndexException;
                }
              }
            } catch (IOException ex) {
              targetException.addSuppressed(ex);
              throw targetException;
            }
            // corruption has happened on the way to replica
            RemoteTransportException exception =
                new RemoteTransportException(
                    "File corruption occurred on recovery but " + "checksums are ok", null);
            exception.addSuppressed(targetException);
            logger.warn(
                (org.apache.logging.log4j.util.Supplier<?>)
                    () ->
                        new ParameterizedMessage(
                            "{} Remote file corruption during finalization of recovery on node {}. local checksum OK",
                            shard.shardId(),
                            request.targetNode()),
                corruptIndexException);
            throw exception;
          } else {
            throw targetException;
          }
        }
      }

      prepareTargetForTranslog(translogView.totalOperations());

      logger.trace(
          "[{}][{}] recovery [phase1] to {}: took [{}]",
          indexName,
          shardId,
          request.targetNode(),
          stopWatch.totalTime());
      response.phase1Time = stopWatch.totalTime().millis();
    } catch (Exception e) {
      throw new RecoverFilesRecoveryException(
          request.shardId(), response.phase1FileNames.size(), new ByteSizeValue(totalSize), e);
    } finally {
      store.decRef();
    }
  }
  /**
   * Send the given snapshot's operations to this handler's target node.
   *
   * <p>Operations are bulked into a single request depending on an operation count limit or
   * size-in-bytes limit
   *
   * @return the total number of translog operations that were sent
   */
  protected int sendSnapshot(Translog.Snapshot snapshot) throws ElasticsearchException {
    int ops = 0;
    long size = 0;
    int totalOperations = 0;
    final List<Translog.Operation> operations = Lists.newArrayList();
    Translog.Operation operation = snapshot.next();

    final TransportRequestOptions recoveryOptions =
        TransportRequestOptions.options()
            .withCompress(recoverySettings.compress())
            .withType(TransportRequestOptions.Type.RECOVERY)
            .withTimeout(recoverySettings.internalActionLongTimeout());

    if (operation == null) {
      logger.trace(
          "[{}][{}] no translog operations (id: [{}]) to send to {}",
          indexName,
          shardId,
          snapshot.translogId(),
          request.targetNode());
    }
    while (operation != null) {
      if (shard.state() == IndexShardState.CLOSED) {
        throw new IndexShardClosedException(request.shardId());
      }
      cancellableThreads.checkForCancel();
      operations.add(operation);
      ops += 1;
      size += operation.estimateSize();
      totalOperations++;

      // Check if this request is past the size or bytes threshold, and
      // if so, send it off
      if (ops >= recoverySettings.translogOps()
          || size >= recoverySettings.translogSize().bytes()) {

        // don't throttle translog, since we lock for phase3 indexing,
        // so we need to move it as fast as possible. Note, since we
        // index docs to replicas while the index files are recovered
        // the lock can potentially be removed, in which case, it might
        // make sense to re-enable throttling in this phase
        //                if (recoverySettings.rateLimiter() != null) {
        //                    recoverySettings.rateLimiter().pause(size);
        //                }

        if (logger.isTraceEnabled()) {
          logger.trace(
              "[{}][{}] sending batch of [{}][{}] (total: [{}], id: [{}]) translog operations to {}",
              indexName,
              shardId,
              ops,
              new ByteSizeValue(size),
              shard.translog().estimatedNumberOfOperations(),
              snapshot.translogId(),
              request.targetNode());
        }
        cancellableThreads.execute(
            new Interruptable() {
              @Override
              public void run() throws InterruptedException {
                final RecoveryTranslogOperationsRequest translogOperationsRequest =
                    new RecoveryTranslogOperationsRequest(
                        request.recoveryId(),
                        request.shardId(),
                        operations,
                        shard.translog().estimatedNumberOfOperations());
                transportService
                    .submitRequest(
                        request.targetNode(),
                        RecoveryTarget.Actions.TRANSLOG_OPS,
                        translogOperationsRequest,
                        recoveryOptions,
                        EmptyTransportResponseHandler.INSTANCE_SAME)
                    .txGet();
              }
            });

        ops = 0;
        size = 0;
        operations.clear();
      }
      operation = snapshot.next();
    }
    // send the leftover
    if (logger.isTraceEnabled()) {
      logger.trace(
          "[{}][{}] sending final batch of [{}][{}] (total: [{}], id: [{}]) translog operations to {}",
          indexName,
          shardId,
          ops,
          new ByteSizeValue(size),
          shard.translog().estimatedNumberOfOperations(),
          snapshot.translogId(),
          request.targetNode());
    }
    if (!operations.isEmpty()) {
      cancellableThreads.execute(
          new Interruptable() {
            @Override
            public void run() throws InterruptedException {
              RecoveryTranslogOperationsRequest translogOperationsRequest =
                  new RecoveryTranslogOperationsRequest(
                      request.recoveryId(),
                      request.shardId(),
                      operations,
                      shard.translog().estimatedNumberOfOperations());
              transportService
                  .submitRequest(
                      request.targetNode(),
                      RecoveryTarget.Actions.TRANSLOG_OPS,
                      translogOperationsRequest,
                      recoveryOptions,
                      EmptyTransportResponseHandler.INSTANCE_SAME)
                  .txGet();
            }
          });
    }
    return totalOperations;
  }
  /**
   * Ensures that the mapping in the cluster state is the same as the mapping in our mapper service.
   * If the mapping is not in sync, sends a request to update it in the cluster state and blocks
   * until it has finished being updated.
   */
  private void updateMappingOnMaster() {
    // we test that the cluster state is in sync with our in memory mapping stored by the
    // mapperService
    // we have to do it under the "cluster state update" thread to make sure that one doesn't modify
    // it
    // while we're checking
    final BlockingQueue<DocumentMapper> documentMappersToUpdate =
        ConcurrentCollections.newBlockingQueue();
    final CountDownLatch latch = new CountDownLatch(1);
    final AtomicReference<Throwable> mappingCheckException = new AtomicReference<>();

    // we use immediate as this is a very light weight check and we don't wait to delay recovery
    clusterService.submitStateUpdateTask(
        "recovery_mapping_check",
        Priority.IMMEDIATE,
        new MappingUpdateTask(
            clusterService,
            indexService,
            recoverySettings,
            latch,
            documentMappersToUpdate,
            mappingCheckException,
            this.cancellableThreads));
    cancellableThreads.execute(
        new Interruptable() {
          @Override
          public void run() throws InterruptedException {
            latch.await();
          }
        });
    if (mappingCheckException.get() != null) {
      logger.warn("error during mapping check, failing recovery", mappingCheckException.get());
      throw new ElasticsearchException("error during mapping check", mappingCheckException.get());
    }
    if (documentMappersToUpdate.isEmpty()) {
      return;
    }
    final CountDownLatch updatedOnMaster = new CountDownLatch(documentMappersToUpdate.size());
    MappingUpdatedAction.MappingUpdateListener listener =
        new MappingUpdatedAction.MappingUpdateListener() {
          @Override
          public void onMappingUpdate() {
            updatedOnMaster.countDown();
          }

          @Override
          public void onFailure(Throwable t) {
            logger.debug(
                "{} recovery to {}: failed to update mapping on master",
                request.shardId(),
                request.targetNode(),
                t);
            updatedOnMaster.countDown();
          }
        };
    for (DocumentMapper documentMapper : documentMappersToUpdate) {
      mappingUpdatedAction.updateMappingOnMaster(
          indexService.index().getName(), documentMapper, indexService.indexUUID(), listener);
    }
    cancellableThreads.execute(
        new Interruptable() {
          @Override
          public void run() throws InterruptedException {
            try {
              if (!updatedOnMaster.await(
                  recoverySettings.internalActionTimeout().millis(), TimeUnit.MILLISECONDS)) {
                logger.debug(
                    "[{}][{}] recovery [phase2] to {}: waiting on pending mapping update timed out. waited [{}]",
                    indexName,
                    shardId,
                    request.targetNode(),
                    recoverySettings.internalActionTimeout());
              }
            } catch (InterruptedException e) {
              Thread.currentThread().interrupt();
              logger.debug("interrupted while waiting for mapping to update on master");
            }
          }
        });
  }
  /**
   * Perform phase2 of the recovery process
   *
   * <p>Phase2 takes a snapshot of the current translog *without* acquiring the write lock (however,
   * the translog snapshot is a point-in-time view of the translog). It then sends each translog
   * operation to the target node so it can be replayed into the new shard.
   *
   * <p>{@code InternalEngine#recover} is responsible for taking the snapshot of the translog and
   * releasing it once all 3 phases of recovery are complete
   */
  @Override
  public void phase2(Translog.Snapshot snapshot) throws ElasticsearchException {
    if (shard.state() == IndexShardState.CLOSED) {
      throw new IndexShardClosedException(request.shardId());
    }
    cancellableThreads.checkForCancel();
    logger.trace("{} recovery [phase2] to {}: start", request.shardId(), request.targetNode());
    StopWatch stopWatch = new StopWatch().start();
    cancellableThreads.execute(
        new Interruptable() {
          @Override
          public void run() throws InterruptedException {
            // Send a request preparing the new shard's translog to receive
            // operations. This ensures the shard engine is started and disables
            // garbage collection (not the JVM's GC!) of tombstone deletes
            transportService
                .submitRequest(
                    request.targetNode(),
                    RecoveryTarget.Actions.PREPARE_TRANSLOG,
                    new RecoveryPrepareForTranslogOperationsRequest(
                        request.recoveryId(),
                        request.shardId(),
                        shard.translog().estimatedNumberOfOperations()),
                    TransportRequestOptions.options()
                        .withTimeout(recoverySettings.internalActionTimeout()),
                    EmptyTransportResponseHandler.INSTANCE_SAME)
                .txGet();
          }
        });

    stopWatch.stop();
    response.startTime = stopWatch.totalTime().millis();
    logger.trace(
        "{} recovery [phase2] to {}: start took [{}]",
        request.shardId(),
        request.targetNode(),
        stopWatch.totalTime());

    logger.trace(
        "{} recovery [phase2] to {}: updating current mapping to master",
        request.shardId(),
        request.targetNode());
    // Ensure that the mappings are synced with the master node
    updateMappingOnMaster();

    logger.trace(
        "{} recovery [phase2] to {}: sending transaction log operations",
        request.shardId(),
        request.targetNode());
    stopWatch = new StopWatch().start();
    // Send all the snapshot's translog operations to the target
    int totalOperations = sendSnapshot(snapshot);
    stopWatch.stop();
    logger.trace(
        "{} recovery [phase2] to {}: took [{}]",
        request.shardId(),
        request.targetNode(),
        stopWatch.totalTime());
    response.phase2Time = stopWatch.totalTime().millis();
    response.phase2Operations = totalOperations;
  }
  /**
   * Perform phase1 of the recovery operations. Once this {@link SnapshotIndexCommit} snapshot has
   * been performed no commit operations (files being fsync'd) are effectively allowed on this index
   * until all recovery phases are done
   *
   * <p>Phase1 examines the segment files on the target node and copies over the segments that are
   * missing. Only segments that have the same size and checksum can be reused
   *
   * <p>{@code InternalEngine#recover} is responsible for snapshotting the index and releasing the
   * snapshot once all 3 phases of recovery are complete
   */
  @Override
  public void phase1(final SnapshotIndexCommit snapshot) throws ElasticsearchException {
    cancellableThreads.checkForCancel();
    // Total size of segment files that are recovered
    long totalSize = 0;
    // Total size of segment files that were able to be re-used
    long existingTotalSize = 0;
    final Store store = shard.store();
    store.incRef();
    try {
      StopWatch stopWatch = new StopWatch().start();
      final Store.MetadataSnapshot recoverySourceMetadata = store.getMetadata(snapshot);
      for (String name : snapshot.getFiles()) {
        final StoreFileMetaData md = recoverySourceMetadata.get(name);
        if (md == null) {
          logger.info(
              "Snapshot differs from actual index for file: {} meta: {}",
              name,
              recoverySourceMetadata.asMap());
          throw new CorruptIndexException(
              "Snapshot differs from actual index - maybe index was removed metadata has "
                  + recoverySourceMetadata.asMap().size()
                  + " files");
        }
      }
      String recoverySourceSyncId = recoverySourceMetadata.getSyncId();
      String recoveryTargetSyncId = request.metadataSnapshot().getSyncId();
      final boolean recoverWithSyncId =
          recoverySourceSyncId != null && recoverySourceSyncId.equals(recoveryTargetSyncId);
      if (recoverWithSyncId) {
        final long numDocsTarget = request.metadataSnapshot().getNumDocs();
        final long numDocsSource = recoverySourceMetadata.getNumDocs();
        if (numDocsTarget != numDocsSource) {
          throw new IllegalStateException(
              "try to recover "
                  + request.shardId()
                  + " from primary shard with sync id but number of docs differ: "
                  + numDocsTarget
                  + " ("
                  + request.sourceNode().getName()
                  + ", primary) vs "
                  + numDocsSource
                  + "("
                  + request.targetNode().getName()
                  + ")");
        }
        // we shortcut recovery here because we have nothing to copy. but we must still start the
        // engine on the target.
        // so we don't return here
        logger.trace(
            "[{}][{}] skipping [phase1] to {} - identical sync id [{}] found on both source and target",
            indexName,
            shardId,
            request.targetNode(),
            recoverySourceSyncId);
      } else {

        // Generate a "diff" of all the identical, different, and missing
        // segment files on the target node, using the existing files on
        // the source node
        final Store.RecoveryDiff diff =
            recoverySourceMetadata.recoveryDiff(request.metadataSnapshot());
        for (StoreFileMetaData md : diff.identical) {
          response.phase1ExistingFileNames.add(md.name());
          response.phase1ExistingFileSizes.add(md.length());
          existingTotalSize += md.length();
          if (logger.isTraceEnabled()) {
            logger.trace(
                "[{}][{}] recovery [phase1] to {}: not recovering [{}], exists in local store and has checksum [{}], size [{}]",
                indexName,
                shardId,
                request.targetNode(),
                md.name(),
                md.checksum(),
                md.length());
          }
          totalSize += md.length();
        }
        for (StoreFileMetaData md : Iterables.concat(diff.different, diff.missing)) {
          if (request.metadataSnapshot().asMap().containsKey(md.name())) {
            logger.trace(
                "[{}][{}] recovery [phase1] to {}: recovering [{}], exists in local store, but is different: remote [{}], local [{}]",
                indexName,
                shardId,
                request.targetNode(),
                md.name(),
                request.metadataSnapshot().get(md.name()),
                md);
          } else {
            logger.trace(
                "[{}][{}] recovery [phase1] to {}: recovering [{}], does not exists in remote",
                indexName,
                shardId,
                request.targetNode(),
                md.name());
          }
          response.phase1FileNames.add(md.name());
          response.phase1FileSizes.add(md.length());
          totalSize += md.length();
        }
        response.phase1TotalSize = totalSize;
        response.phase1ExistingTotalSize = existingTotalSize;

        logger.trace(
            "[{}][{}] recovery [phase1] to {}: recovering_files [{}] with total_size [{}], reusing_files [{}] with total_size [{}]",
            indexName,
            shardId,
            request.targetNode(),
            response.phase1FileNames.size(),
            new ByteSizeValue(totalSize),
            response.phase1ExistingFileNames.size(),
            new ByteSizeValue(existingTotalSize));
        cancellableThreads.execute(
            new Interruptable() {
              @Override
              public void run() throws InterruptedException {
                RecoveryFilesInfoRequest recoveryInfoFilesRequest =
                    new RecoveryFilesInfoRequest(
                        request.recoveryId(),
                        request.shardId(),
                        response.phase1FileNames,
                        response.phase1FileSizes,
                        response.phase1ExistingFileNames,
                        response.phase1ExistingFileSizes,
                        shard.translog().estimatedNumberOfOperations(),
                        response.phase1TotalSize,
                        response.phase1ExistingTotalSize);
                transportService
                    .submitRequest(
                        request.targetNode(),
                        RecoveryTarget.Actions.FILES_INFO,
                        recoveryInfoFilesRequest,
                        TransportRequestOptions.options()
                            .withTimeout(recoverySettings.internalActionTimeout()),
                        EmptyTransportResponseHandler.INSTANCE_SAME)
                    .txGet();
              }
            });

        // This latch will be used to wait until all files have been transferred to the target node
        final CountDownLatch latch = new CountDownLatch(response.phase1FileNames.size());
        final CopyOnWriteArrayList<Throwable> exceptions = new CopyOnWriteArrayList<>();
        final AtomicReference<Throwable> corruptedEngine = new AtomicReference<>();
        int fileIndex = 0;
        ThreadPoolExecutor pool;

        // How many bytes we've copied since we last called RateLimiter.pause
        final AtomicLong bytesSinceLastPause = new AtomicLong();

        for (final String name : response.phase1FileNames) {
          long fileSize = response.phase1FileSizes.get(fileIndex);

          // Files are split into two categories, files that are "small"
          // (under 5mb) and other files. Small files are transferred
          // using a separate thread pool dedicated to small files.
          //
          // The idea behind this is that while we are transferring an
          // older, large index, a user may create a new index, but that
          // index will not be able to recover until the large index
          // finishes, by using two different thread pools we can allow
          // tiny files (like segments for a brand new index) to be
          // recovered while ongoing large segment recoveries are
          // happening. It also allows these pools to be configured
          // separately.
          if (fileSize > RecoverySettings.SMALL_FILE_CUTOFF_BYTES) {
            pool = recoverySettings.concurrentStreamPool();
          } else {
            pool = recoverySettings.concurrentSmallFileStreamPool();
          }

          pool.execute(
              new AbstractRunnable() {
                @Override
                public void onFailure(Throwable t) {
                  // we either got rejected or the store can't be incremented / we are canceled
                  logger.debug("Failed to transfer file [" + name + "] on recovery");
                }

                public void onAfter() {
                  // Signify this file has completed by decrementing the latch
                  latch.countDown();
                }

                @Override
                protected void doRun() {
                  cancellableThreads.checkForCancel();
                  store.incRef();
                  final StoreFileMetaData md = recoverySourceMetadata.get(name);
                  try (final IndexInput indexInput =
                      store.directory().openInput(name, IOContext.READONCE)) {
                    final int BUFFER_SIZE = (int) recoverySettings.fileChunkSize().bytes();
                    final byte[] buf = new byte[BUFFER_SIZE];
                    boolean shouldCompressRequest = recoverySettings.compress();
                    if (CompressorFactory.isCompressed(indexInput)) {
                      shouldCompressRequest = false;
                    }

                    final long len = indexInput.length();
                    long readCount = 0;
                    final TransportRequestOptions requestOptions =
                        TransportRequestOptions.options()
                            .withCompress(shouldCompressRequest)
                            .withType(TransportRequestOptions.Type.RECOVERY)
                            .withTimeout(recoverySettings.internalActionTimeout());

                    while (readCount < len) {
                      if (shard.state()
                          == IndexShardState.CLOSED) { // check if the shard got closed on us
                        throw new IndexShardClosedException(shard.shardId());
                      }
                      int toRead =
                          readCount + BUFFER_SIZE > len ? (int) (len - readCount) : BUFFER_SIZE;
                      final long position = indexInput.getFilePointer();

                      // Pause using the rate limiter, if desired, to throttle the recovery
                      RateLimiter rl = recoverySettings.rateLimiter();
                      long throttleTimeInNanos = 0;
                      if (rl != null) {
                        long bytes = bytesSinceLastPause.addAndGet(toRead);
                        if (bytes > rl.getMinPauseCheckBytes()) {
                          // Time to pause
                          bytesSinceLastPause.addAndGet(-bytes);
                          throttleTimeInNanos = rl.pause(bytes);
                          shard.recoveryStats().addThrottleTime(throttleTimeInNanos);
                        }
                      }
                      indexInput.readBytes(buf, 0, toRead, false);
                      final BytesArray content = new BytesArray(buf, 0, toRead);
                      readCount += toRead;
                      final boolean lastChunk = readCount == len;
                      final RecoveryFileChunkRequest fileChunkRequest =
                          new RecoveryFileChunkRequest(
                              request.recoveryId(),
                              request.shardId(),
                              md,
                              position,
                              content,
                              lastChunk,
                              shard.translog().estimatedNumberOfOperations(),
                              throttleTimeInNanos);
                      cancellableThreads.execute(
                          new Interruptable() {
                            @Override
                            public void run() throws InterruptedException {
                              // Actually send the file chunk to the target node, waiting for it to
                              // complete
                              transportService
                                  .submitRequest(
                                      request.targetNode(),
                                      RecoveryTarget.Actions.FILE_CHUNK,
                                      fileChunkRequest,
                                      requestOptions,
                                      EmptyTransportResponseHandler.INSTANCE_SAME)
                                  .txGet();
                            }
                          });
                    }
                  } catch (Throwable e) {
                    final Throwable corruptIndexException;
                    if ((corruptIndexException = ExceptionsHelper.unwrapCorruption(e)) != null) {
                      if (store.checkIntegrity(md)
                          == false) { // we are corrupted on the primary -- fail!
                        logger.warn(
                            "{} Corrupted file detected {} checksum mismatch", shard.shardId(), md);
                        if (corruptedEngine.compareAndSet(null, corruptIndexException) == false) {
                          // if we are not the first exception, add ourselves as suppressed to the
                          // main one:
                          corruptedEngine.get().addSuppressed(e);
                        }
                      } else { // corruption has happened on the way to replica
                        RemoteTransportException exception =
                            new RemoteTransportException(
                                "File corruption occurred on recovery but checksums are ok", null);
                        exception.addSuppressed(e);
                        exceptions.add(0, exception); // last exception first
                        logger.warn(
                            "{} Remote file corruption on node {}, recovering {}. local checksum OK",
                            corruptIndexException,
                            shard.shardId(),
                            request.targetNode(),
                            md);
                      }
                    } else {
                      exceptions.add(0, e); // last exceptions first
                    }
                  } finally {
                    store.decRef();
                  }
                }
              });
          fileIndex++;
        }

        cancellableThreads.execute(
            new Interruptable() {
              @Override
              public void run() throws InterruptedException {
                // Wait for all files that need to be transferred to finish transferring
                latch.await();
              }
            });

        if (corruptedEngine.get() != null) {
          throw corruptedEngine.get();
        } else {
          ExceptionsHelper.rethrowAndSuppress(exceptions);
        }

        cancellableThreads.execute(
            new Interruptable() {
              @Override
              public void run() throws InterruptedException {
                // Send the CLEAN_FILES request, which takes all of the files that
                // were transferred and renames them from their temporary file
                // names to the actual file names. It also writes checksums for
                // the files after they have been renamed.
                //
                // Once the files have been renamed, any other files that are not
                // related to this recovery (out of date segments, for example)
                // are deleted
                try {
                  transportService
                      .submitRequest(
                          request.targetNode(),
                          RecoveryTarget.Actions.CLEAN_FILES,
                          new RecoveryCleanFilesRequest(
                              request.recoveryId(),
                              shard.shardId(),
                              recoverySourceMetadata,
                              shard.translog().estimatedNumberOfOperations()),
                          TransportRequestOptions.options()
                              .withTimeout(recoverySettings.internalActionTimeout()),
                          EmptyTransportResponseHandler.INSTANCE_SAME)
                      .txGet();
                } catch (RemoteTransportException remoteException) {
                  final IOException corruptIndexException;
                  // we realized that after the index was copied and we wanted to finalize the
                  // recovery
                  // the index was corrupted:
                  //   - maybe due to a broken segments file on an empty index (transferred with no
                  // checksum)
                  //   - maybe due to old segments without checksums or length only checks
                  if ((corruptIndexException = ExceptionsHelper.unwrapCorruption(remoteException))
                      != null) {
                    try {
                      final Store.MetadataSnapshot recoverySourceMetadata =
                          store.getMetadata(snapshot);
                      StoreFileMetaData[] metadata =
                          Iterables.toArray(recoverySourceMetadata, StoreFileMetaData.class);
                      ArrayUtil.timSort(
                          metadata,
                          new Comparator<StoreFileMetaData>() {
                            @Override
                            public int compare(StoreFileMetaData o1, StoreFileMetaData o2) {
                              return Long.compare(
                                  o1.length(), o2.length()); // check small files first
                            }
                          });
                      for (StoreFileMetaData md : metadata) {
                        logger.debug(
                            "{} checking integrity for file {} after remove corruption exception",
                            shard.shardId(),
                            md);
                        if (store.checkIntegrity(md)
                            == false) { // we are corrupted on the primary -- fail!
                          logger.warn(
                              "{} Corrupted file detected {} checksum mismatch",
                              shard.shardId(),
                              md);
                          throw corruptIndexException;
                        }
                      }
                    } catch (IOException ex) {
                      remoteException.addSuppressed(ex);
                      throw remoteException;
                    }
                    // corruption has happened on the way to replica
                    RemoteTransportException exception =
                        new RemoteTransportException(
                            "File corruption occurred on recovery but checksums are ok", null);
                    exception.addSuppressed(remoteException);
                    logger.warn(
                        "{} Remote file corruption during finalization on node {}, recovering {}. local checksum OK",
                        corruptIndexException,
                        shard.shardId(),
                        request.targetNode());
                    throw exception;
                  } else {
                    throw remoteException;
                  }
                }
              }
            });
      }
      stopWatch.stop();
      logger.trace(
          "[{}][{}] recovery [phase1] to {}: took [{}]",
          indexName,
          shardId,
          request.targetNode(),
          stopWatch.totalTime());
      response.phase1Time = stopWatch.totalTime().millis();
    } catch (Throwable e) {
      throw new RecoverFilesRecoveryException(
          request.shardId(), response.phase1FileNames.size(), new ByteSizeValue(totalSize), e);
    } finally {
      store.decRef();
    }
  }