コード例 #1
0
  private void doSnapshot(final Snapshot snapshot) throws IndexShardGatewaySnapshotFailedException {
    ImmutableMap<String, BlobMetaData> blobs;
    try {
      blobs = blobContainer.listBlobs();
    } catch (IOException e) {
      throw new IndexShardGatewaySnapshotFailedException(shardId, "failed to list blobs", e);
    }

    long generation = findLatestFileNameGeneration(blobs);
    CommitPoints commitPoints = buildCommitPoints(blobs);

    currentSnapshotStatus.index().startTime(System.currentTimeMillis());
    currentSnapshotStatus.updateStage(SnapshotStatus.Stage.INDEX);

    final SnapshotIndexCommit snapshotIndexCommit = snapshot.indexCommit();
    final Translog.Snapshot translogSnapshot = snapshot.translogSnapshot();

    final CountDownLatch indexLatch = new CountDownLatch(snapshotIndexCommit.getFiles().length);
    final CopyOnWriteArrayList<Throwable> failures = new CopyOnWriteArrayList<Throwable>();
    final List<CommitPoint.FileInfo> indexCommitPointFiles = Lists.newArrayList();

    int indexNumberOfFiles = 0;
    long indexTotalFilesSize = 0;
    for (final String fileName : snapshotIndexCommit.getFiles()) {
      StoreFileMetaData md;
      try {
        md = store.metaData(fileName);
      } catch (IOException e) {
        throw new IndexShardGatewaySnapshotFailedException(
            shardId, "Failed to get store file metadata", e);
      }

      boolean snapshotRequired = false;
      if (snapshot.indexChanged() && fileName.equals(snapshotIndexCommit.getSegmentsFileName())) {
        snapshotRequired = true; // we want to always snapshot the segment file if the index changed
      }

      CommitPoint.FileInfo fileInfo = commitPoints.findPhysicalIndexFile(fileName);
      if (fileInfo == null
          || !fileInfo.isSame(md)
          || !commitPointFileExistsInBlobs(fileInfo, blobs)) {
        // commit point file does not exists in any commit point, or has different length, or does
        // not fully exists in the listed blobs
        snapshotRequired = true;
      }

      if (snapshotRequired) {
        indexNumberOfFiles++;
        indexTotalFilesSize += md.length();
        // create a new FileInfo
        try {
          CommitPoint.FileInfo snapshotFileInfo =
              new CommitPoint.FileInfo(
                  fileNameFromGeneration(++generation), fileName, md.length(), md.checksum());
          indexCommitPointFiles.add(snapshotFileInfo);
          snapshotFile(snapshotIndexCommit.getDirectory(), snapshotFileInfo, indexLatch, failures);
        } catch (IOException e) {
          failures.add(e);
          indexLatch.countDown();
        }
      } else {
        indexCommitPointFiles.add(fileInfo);
        indexLatch.countDown();
      }
    }
    currentSnapshotStatus.index().files(indexNumberOfFiles, indexTotalFilesSize);

    try {
      indexLatch.await();
    } catch (InterruptedException e) {
      failures.add(e);
    }
    if (!failures.isEmpty()) {
      throw new IndexShardGatewaySnapshotFailedException(
          shardId(), "Failed to perform snapshot (index files)", failures.get(failures.size() - 1));
    }

    currentSnapshotStatus
        .index()
        .time(System.currentTimeMillis() - currentSnapshotStatus.index().startTime());

    currentSnapshotStatus.updateStage(SnapshotStatus.Stage.TRANSLOG);
    currentSnapshotStatus.translog().startTime(System.currentTimeMillis());

    // Note, we assume the snapshot is always started from "base 0". We need to seek forward if we
    // want to lastTranslogPosition if we want the delta
    List<CommitPoint.FileInfo> translogCommitPointFiles = Lists.newArrayList();
    int expectedNumberOfOperations = 0;
    boolean snapshotRequired = false;
    if (snapshot.newTranslogCreated()) {
      if (translogSnapshot.lengthInBytes() > 0) {
        snapshotRequired = true;
        expectedNumberOfOperations = translogSnapshot.estimatedTotalOperations();
      }
    } else {
      // if we have a commit point, check that we have all the files listed in it in the blob store
      if (!commitPoints.commits().isEmpty()) {
        CommitPoint commitPoint = commitPoints.commits().get(0);
        boolean allTranslogFilesExists = true;
        for (CommitPoint.FileInfo fileInfo : commitPoint.translogFiles()) {
          if (!commitPointFileExistsInBlobs(fileInfo, blobs)) {
            allTranslogFilesExists = false;
            break;
          }
        }
        // if everything exists, we can seek forward in case there are new operations, otherwise, we
        // copy over all again...
        if (allTranslogFilesExists) {
          translogCommitPointFiles.addAll(commitPoint.translogFiles());
          if (snapshot.sameTranslogNewOperations()) {
            translogSnapshot.seekForward(snapshot.lastTranslogLength());
            if (translogSnapshot.lengthInBytes() > 0) {
              snapshotRequired = true;
              expectedNumberOfOperations =
                  translogSnapshot.estimatedTotalOperations()
                      - snapshot.lastTotalTranslogOperations();
            }
          } // else (no operations, nothing to snapshot)
        } else {
          // a full translog snapshot is required
          if (translogSnapshot.lengthInBytes() > 0) {
            expectedNumberOfOperations = translogSnapshot.estimatedTotalOperations();
            snapshotRequired = true;
          }
        }
      } else {
        // no commit point, snapshot all the translog
        if (translogSnapshot.lengthInBytes() > 0) {
          expectedNumberOfOperations = translogSnapshot.estimatedTotalOperations();
          snapshotRequired = true;
        }
      }
    }
    currentSnapshotStatus.translog().expectedNumberOfOperations(expectedNumberOfOperations);

    if (snapshotRequired) {
      CommitPoint.FileInfo addedTranslogFileInfo =
          new CommitPoint.FileInfo(
              fileNameFromGeneration(++generation),
              "translog-" + translogSnapshot.translogId(),
              translogSnapshot.lengthInBytes(),
              null /* no need for checksum in translog */);
      translogCommitPointFiles.add(addedTranslogFileInfo);
      try {
        snapshotTranslog(translogSnapshot, addedTranslogFileInfo);
      } catch (Exception e) {
        throw new IndexShardGatewaySnapshotFailedException(
            shardId, "Failed to snapshot translog", e);
      }
    }
    currentSnapshotStatus
        .translog()
        .time(System.currentTimeMillis() - currentSnapshotStatus.translog().startTime());

    // now create and write the commit point
    currentSnapshotStatus.updateStage(SnapshotStatus.Stage.FINALIZE);
    long version = 0;
    if (!commitPoints.commits().isEmpty()) {
      version = commitPoints.commits().iterator().next().version() + 1;
    }
    String commitPointName = "commit-" + Long.toString(version, Character.MAX_RADIX);
    CommitPoint commitPoint =
        new CommitPoint(
            version,
            commitPointName,
            CommitPoint.Type.GENERATED,
            indexCommitPointFiles,
            translogCommitPointFiles);
    try {
      byte[] commitPointData = CommitPoints.toXContent(commitPoint);
      blobContainer.writeBlob(
          commitPointName, new FastByteArrayInputStream(commitPointData), commitPointData.length);
    } catch (Exception e) {
      throw new IndexShardGatewaySnapshotFailedException(
          shardId, "Failed to write commit point", e);
    }

    // delete all files that are not referenced by any commit point
    // build a new CommitPoint, that includes this one and all the saved ones
    List<CommitPoint> newCommitPointsList = Lists.newArrayList();
    newCommitPointsList.add(commitPoint);
    for (CommitPoint point : commitPoints) {
      if (point.type() == CommitPoint.Type.SAVED) {
        newCommitPointsList.add(point);
      }
    }
    CommitPoints newCommitPoints = new CommitPoints(newCommitPointsList);
    // first, go over and delete all the commit points
    for (String blobName : blobs.keySet()) {
      if (!blobName.startsWith("commit-")) {
        continue;
      }
      long checkedVersion =
          Long.parseLong(blobName.substring("commit-".length()), Character.MAX_RADIX);
      if (!newCommitPoints.hasVersion(checkedVersion)) {
        try {
          blobContainer.deleteBlob(blobName);
        } catch (IOException e) {
          // ignore
        }
      }
    }
    // now go over all the blobs, and if they don't exists in a commit point, delete them
    for (String blobName : blobs.keySet()) {
      String name = blobName;
      if (!name.startsWith("__")) {
        continue;
      }
      if (blobName.contains(".part")) {
        name = blobName.substring(0, blobName.indexOf(".part"));
      }
      if (newCommitPoints.findNameFile(name) == null) {
        try {
          blobContainer.deleteBlob(blobName);
        } catch (IOException e) {
          // ignore, will delete it laters
        }
      }
    }
  }
コード例 #2
0
  /**
   * Perform phase1 of the recovery operations. Once this {@link SnapshotIndexCommit} snapshot has
   * been performed no commit operations (files being fsync'd) are effectively allowed on this index
   * until all recovery phases are done
   *
   * <p>Phase1 examines the segment files on the target node and copies over the segments that are
   * missing. Only segments that have the same size and checksum can be reused
   *
   * <p>{@code InternalEngine#recover} is responsible for snapshotting the index and releasing the
   * snapshot once all 3 phases of recovery are complete
   */
  @Override
  public void phase1(final SnapshotIndexCommit snapshot) throws ElasticsearchException {
    cancellableThreads.checkForCancel();
    // Total size of segment files that are recovered
    long totalSize = 0;
    // Total size of segment files that were able to be re-used
    long existingTotalSize = 0;
    final Store store = shard.store();
    store.incRef();
    try {
      StopWatch stopWatch = new StopWatch().start();
      final Store.MetadataSnapshot recoverySourceMetadata = store.getMetadata(snapshot);
      for (String name : snapshot.getFiles()) {
        final StoreFileMetaData md = recoverySourceMetadata.get(name);
        if (md == null) {
          logger.info(
              "Snapshot differs from actual index for file: {} meta: {}",
              name,
              recoverySourceMetadata.asMap());
          throw new CorruptIndexException(
              "Snapshot differs from actual index - maybe index was removed metadata has "
                  + recoverySourceMetadata.asMap().size()
                  + " files");
        }
      }
      String recoverySourceSyncId = recoverySourceMetadata.getSyncId();
      String recoveryTargetSyncId = request.metadataSnapshot().getSyncId();
      final boolean recoverWithSyncId =
          recoverySourceSyncId != null && recoverySourceSyncId.equals(recoveryTargetSyncId);
      if (recoverWithSyncId) {
        final long numDocsTarget = request.metadataSnapshot().getNumDocs();
        final long numDocsSource = recoverySourceMetadata.getNumDocs();
        if (numDocsTarget != numDocsSource) {
          throw new IllegalStateException(
              "try to recover "
                  + request.shardId()
                  + " from primary shard with sync id but number of docs differ: "
                  + numDocsTarget
                  + " ("
                  + request.sourceNode().getName()
                  + ", primary) vs "
                  + numDocsSource
                  + "("
                  + request.targetNode().getName()
                  + ")");
        }
        // we shortcut recovery here because we have nothing to copy. but we must still start the
        // engine on the target.
        // so we don't return here
        logger.trace(
            "[{}][{}] skipping [phase1] to {} - identical sync id [{}] found on both source and target",
            indexName,
            shardId,
            request.targetNode(),
            recoverySourceSyncId);
      } else {

        // Generate a "diff" of all the identical, different, and missing
        // segment files on the target node, using the existing files on
        // the source node
        final Store.RecoveryDiff diff =
            recoverySourceMetadata.recoveryDiff(request.metadataSnapshot());
        for (StoreFileMetaData md : diff.identical) {
          response.phase1ExistingFileNames.add(md.name());
          response.phase1ExistingFileSizes.add(md.length());
          existingTotalSize += md.length();
          if (logger.isTraceEnabled()) {
            logger.trace(
                "[{}][{}] recovery [phase1] to {}: not recovering [{}], exists in local store and has checksum [{}], size [{}]",
                indexName,
                shardId,
                request.targetNode(),
                md.name(),
                md.checksum(),
                md.length());
          }
          totalSize += md.length();
        }
        for (StoreFileMetaData md : Iterables.concat(diff.different, diff.missing)) {
          if (request.metadataSnapshot().asMap().containsKey(md.name())) {
            logger.trace(
                "[{}][{}] recovery [phase1] to {}: recovering [{}], exists in local store, but is different: remote [{}], local [{}]",
                indexName,
                shardId,
                request.targetNode(),
                md.name(),
                request.metadataSnapshot().get(md.name()),
                md);
          } else {
            logger.trace(
                "[{}][{}] recovery [phase1] to {}: recovering [{}], does not exists in remote",
                indexName,
                shardId,
                request.targetNode(),
                md.name());
          }
          response.phase1FileNames.add(md.name());
          response.phase1FileSizes.add(md.length());
          totalSize += md.length();
        }
        response.phase1TotalSize = totalSize;
        response.phase1ExistingTotalSize = existingTotalSize;

        logger.trace(
            "[{}][{}] recovery [phase1] to {}: recovering_files [{}] with total_size [{}], reusing_files [{}] with total_size [{}]",
            indexName,
            shardId,
            request.targetNode(),
            response.phase1FileNames.size(),
            new ByteSizeValue(totalSize),
            response.phase1ExistingFileNames.size(),
            new ByteSizeValue(existingTotalSize));
        cancellableThreads.execute(
            new Interruptable() {
              @Override
              public void run() throws InterruptedException {
                RecoveryFilesInfoRequest recoveryInfoFilesRequest =
                    new RecoveryFilesInfoRequest(
                        request.recoveryId(),
                        request.shardId(),
                        response.phase1FileNames,
                        response.phase1FileSizes,
                        response.phase1ExistingFileNames,
                        response.phase1ExistingFileSizes,
                        shard.translog().estimatedNumberOfOperations(),
                        response.phase1TotalSize,
                        response.phase1ExistingTotalSize);
                transportService
                    .submitRequest(
                        request.targetNode(),
                        RecoveryTarget.Actions.FILES_INFO,
                        recoveryInfoFilesRequest,
                        TransportRequestOptions.options()
                            .withTimeout(recoverySettings.internalActionTimeout()),
                        EmptyTransportResponseHandler.INSTANCE_SAME)
                    .txGet();
              }
            });

        // This latch will be used to wait until all files have been transferred to the target node
        final CountDownLatch latch = new CountDownLatch(response.phase1FileNames.size());
        final CopyOnWriteArrayList<Throwable> exceptions = new CopyOnWriteArrayList<>();
        final AtomicReference<Throwable> corruptedEngine = new AtomicReference<>();
        int fileIndex = 0;
        ThreadPoolExecutor pool;

        // How many bytes we've copied since we last called RateLimiter.pause
        final AtomicLong bytesSinceLastPause = new AtomicLong();

        for (final String name : response.phase1FileNames) {
          long fileSize = response.phase1FileSizes.get(fileIndex);

          // Files are split into two categories, files that are "small"
          // (under 5mb) and other files. Small files are transferred
          // using a separate thread pool dedicated to small files.
          //
          // The idea behind this is that while we are transferring an
          // older, large index, a user may create a new index, but that
          // index will not be able to recover until the large index
          // finishes, by using two different thread pools we can allow
          // tiny files (like segments for a brand new index) to be
          // recovered while ongoing large segment recoveries are
          // happening. It also allows these pools to be configured
          // separately.
          if (fileSize > RecoverySettings.SMALL_FILE_CUTOFF_BYTES) {
            pool = recoverySettings.concurrentStreamPool();
          } else {
            pool = recoverySettings.concurrentSmallFileStreamPool();
          }

          pool.execute(
              new AbstractRunnable() {
                @Override
                public void onFailure(Throwable t) {
                  // we either got rejected or the store can't be incremented / we are canceled
                  logger.debug("Failed to transfer file [" + name + "] on recovery");
                }

                public void onAfter() {
                  // Signify this file has completed by decrementing the latch
                  latch.countDown();
                }

                @Override
                protected void doRun() {
                  cancellableThreads.checkForCancel();
                  store.incRef();
                  final StoreFileMetaData md = recoverySourceMetadata.get(name);
                  try (final IndexInput indexInput =
                      store.directory().openInput(name, IOContext.READONCE)) {
                    final int BUFFER_SIZE = (int) recoverySettings.fileChunkSize().bytes();
                    final byte[] buf = new byte[BUFFER_SIZE];
                    boolean shouldCompressRequest = recoverySettings.compress();
                    if (CompressorFactory.isCompressed(indexInput)) {
                      shouldCompressRequest = false;
                    }

                    final long len = indexInput.length();
                    long readCount = 0;
                    final TransportRequestOptions requestOptions =
                        TransportRequestOptions.options()
                            .withCompress(shouldCompressRequest)
                            .withType(TransportRequestOptions.Type.RECOVERY)
                            .withTimeout(recoverySettings.internalActionTimeout());

                    while (readCount < len) {
                      if (shard.state()
                          == IndexShardState.CLOSED) { // check if the shard got closed on us
                        throw new IndexShardClosedException(shard.shardId());
                      }
                      int toRead =
                          readCount + BUFFER_SIZE > len ? (int) (len - readCount) : BUFFER_SIZE;
                      final long position = indexInput.getFilePointer();

                      // Pause using the rate limiter, if desired, to throttle the recovery
                      RateLimiter rl = recoverySettings.rateLimiter();
                      long throttleTimeInNanos = 0;
                      if (rl != null) {
                        long bytes = bytesSinceLastPause.addAndGet(toRead);
                        if (bytes > rl.getMinPauseCheckBytes()) {
                          // Time to pause
                          bytesSinceLastPause.addAndGet(-bytes);
                          throttleTimeInNanos = rl.pause(bytes);
                          shard.recoveryStats().addThrottleTime(throttleTimeInNanos);
                        }
                      }
                      indexInput.readBytes(buf, 0, toRead, false);
                      final BytesArray content = new BytesArray(buf, 0, toRead);
                      readCount += toRead;
                      final boolean lastChunk = readCount == len;
                      final RecoveryFileChunkRequest fileChunkRequest =
                          new RecoveryFileChunkRequest(
                              request.recoveryId(),
                              request.shardId(),
                              md,
                              position,
                              content,
                              lastChunk,
                              shard.translog().estimatedNumberOfOperations(),
                              throttleTimeInNanos);
                      cancellableThreads.execute(
                          new Interruptable() {
                            @Override
                            public void run() throws InterruptedException {
                              // Actually send the file chunk to the target node, waiting for it to
                              // complete
                              transportService
                                  .submitRequest(
                                      request.targetNode(),
                                      RecoveryTarget.Actions.FILE_CHUNK,
                                      fileChunkRequest,
                                      requestOptions,
                                      EmptyTransportResponseHandler.INSTANCE_SAME)
                                  .txGet();
                            }
                          });
                    }
                  } catch (Throwable e) {
                    final Throwable corruptIndexException;
                    if ((corruptIndexException = ExceptionsHelper.unwrapCorruption(e)) != null) {
                      if (store.checkIntegrity(md)
                          == false) { // we are corrupted on the primary -- fail!
                        logger.warn(
                            "{} Corrupted file detected {} checksum mismatch", shard.shardId(), md);
                        if (corruptedEngine.compareAndSet(null, corruptIndexException) == false) {
                          // if we are not the first exception, add ourselves as suppressed to the
                          // main one:
                          corruptedEngine.get().addSuppressed(e);
                        }
                      } else { // corruption has happened on the way to replica
                        RemoteTransportException exception =
                            new RemoteTransportException(
                                "File corruption occurred on recovery but checksums are ok", null);
                        exception.addSuppressed(e);
                        exceptions.add(0, exception); // last exception first
                        logger.warn(
                            "{} Remote file corruption on node {}, recovering {}. local checksum OK",
                            corruptIndexException,
                            shard.shardId(),
                            request.targetNode(),
                            md);
                      }
                    } else {
                      exceptions.add(0, e); // last exceptions first
                    }
                  } finally {
                    store.decRef();
                  }
                }
              });
          fileIndex++;
        }

        cancellableThreads.execute(
            new Interruptable() {
              @Override
              public void run() throws InterruptedException {
                // Wait for all files that need to be transferred to finish transferring
                latch.await();
              }
            });

        if (corruptedEngine.get() != null) {
          throw corruptedEngine.get();
        } else {
          ExceptionsHelper.rethrowAndSuppress(exceptions);
        }

        cancellableThreads.execute(
            new Interruptable() {
              @Override
              public void run() throws InterruptedException {
                // Send the CLEAN_FILES request, which takes all of the files that
                // were transferred and renames them from their temporary file
                // names to the actual file names. It also writes checksums for
                // the files after they have been renamed.
                //
                // Once the files have been renamed, any other files that are not
                // related to this recovery (out of date segments, for example)
                // are deleted
                try {
                  transportService
                      .submitRequest(
                          request.targetNode(),
                          RecoveryTarget.Actions.CLEAN_FILES,
                          new RecoveryCleanFilesRequest(
                              request.recoveryId(),
                              shard.shardId(),
                              recoverySourceMetadata,
                              shard.translog().estimatedNumberOfOperations()),
                          TransportRequestOptions.options()
                              .withTimeout(recoverySettings.internalActionTimeout()),
                          EmptyTransportResponseHandler.INSTANCE_SAME)
                      .txGet();
                } catch (RemoteTransportException remoteException) {
                  final IOException corruptIndexException;
                  // we realized that after the index was copied and we wanted to finalize the
                  // recovery
                  // the index was corrupted:
                  //   - maybe due to a broken segments file on an empty index (transferred with no
                  // checksum)
                  //   - maybe due to old segments without checksums or length only checks
                  if ((corruptIndexException = ExceptionsHelper.unwrapCorruption(remoteException))
                      != null) {
                    try {
                      final Store.MetadataSnapshot recoverySourceMetadata =
                          store.getMetadata(snapshot);
                      StoreFileMetaData[] metadata =
                          Iterables.toArray(recoverySourceMetadata, StoreFileMetaData.class);
                      ArrayUtil.timSort(
                          metadata,
                          new Comparator<StoreFileMetaData>() {
                            @Override
                            public int compare(StoreFileMetaData o1, StoreFileMetaData o2) {
                              return Long.compare(
                                  o1.length(), o2.length()); // check small files first
                            }
                          });
                      for (StoreFileMetaData md : metadata) {
                        logger.debug(
                            "{} checking integrity for file {} after remove corruption exception",
                            shard.shardId(),
                            md);
                        if (store.checkIntegrity(md)
                            == false) { // we are corrupted on the primary -- fail!
                          logger.warn(
                              "{} Corrupted file detected {} checksum mismatch",
                              shard.shardId(),
                              md);
                          throw corruptIndexException;
                        }
                      }
                    } catch (IOException ex) {
                      remoteException.addSuppressed(ex);
                      throw remoteException;
                    }
                    // corruption has happened on the way to replica
                    RemoteTransportException exception =
                        new RemoteTransportException(
                            "File corruption occurred on recovery but checksums are ok", null);
                    exception.addSuppressed(remoteException);
                    logger.warn(
                        "{} Remote file corruption during finalization on node {}, recovering {}. local checksum OK",
                        corruptIndexException,
                        shard.shardId(),
                        request.targetNode());
                    throw exception;
                  } else {
                    throw remoteException;
                  }
                }
              }
            });
      }
      stopWatch.stop();
      logger.trace(
          "[{}][{}] recovery [phase1] to {}: took [{}]",
          indexName,
          shardId,
          request.targetNode(),
          stopWatch.totalTime());
      response.phase1Time = stopWatch.totalTime().millis();
    } catch (Throwable e) {
      throw new RecoverFilesRecoveryException(
          request.shardId(), response.phase1FileNames.size(), new ByteSizeValue(totalSize), e);
    } finally {
      store.decRef();
    }
  }