private void doSnapshot(final Snapshot snapshot) throws IndexShardGatewaySnapshotFailedException { ImmutableMap<String, BlobMetaData> blobs; try { blobs = blobContainer.listBlobs(); } catch (IOException e) { throw new IndexShardGatewaySnapshotFailedException(shardId, "failed to list blobs", e); } long generation = findLatestFileNameGeneration(blobs); CommitPoints commitPoints = buildCommitPoints(blobs); currentSnapshotStatus.index().startTime(System.currentTimeMillis()); currentSnapshotStatus.updateStage(SnapshotStatus.Stage.INDEX); final SnapshotIndexCommit snapshotIndexCommit = snapshot.indexCommit(); final Translog.Snapshot translogSnapshot = snapshot.translogSnapshot(); final CountDownLatch indexLatch = new CountDownLatch(snapshotIndexCommit.getFiles().length); final CopyOnWriteArrayList<Throwable> failures = new CopyOnWriteArrayList<Throwable>(); final List<CommitPoint.FileInfo> indexCommitPointFiles = Lists.newArrayList(); int indexNumberOfFiles = 0; long indexTotalFilesSize = 0; for (final String fileName : snapshotIndexCommit.getFiles()) { StoreFileMetaData md; try { md = store.metaData(fileName); } catch (IOException e) { throw new IndexShardGatewaySnapshotFailedException( shardId, "Failed to get store file metadata", e); } boolean snapshotRequired = false; if (snapshot.indexChanged() && fileName.equals(snapshotIndexCommit.getSegmentsFileName())) { snapshotRequired = true; // we want to always snapshot the segment file if the index changed } CommitPoint.FileInfo fileInfo = commitPoints.findPhysicalIndexFile(fileName); if (fileInfo == null || !fileInfo.isSame(md) || !commitPointFileExistsInBlobs(fileInfo, blobs)) { // commit point file does not exists in any commit point, or has different length, or does // not fully exists in the listed blobs snapshotRequired = true; } if (snapshotRequired) { indexNumberOfFiles++; indexTotalFilesSize += md.length(); // create a new FileInfo try { CommitPoint.FileInfo snapshotFileInfo = new CommitPoint.FileInfo( fileNameFromGeneration(++generation), fileName, md.length(), md.checksum()); indexCommitPointFiles.add(snapshotFileInfo); snapshotFile(snapshotIndexCommit.getDirectory(), snapshotFileInfo, indexLatch, failures); } catch (IOException e) { failures.add(e); indexLatch.countDown(); } } else { indexCommitPointFiles.add(fileInfo); indexLatch.countDown(); } } currentSnapshotStatus.index().files(indexNumberOfFiles, indexTotalFilesSize); try { indexLatch.await(); } catch (InterruptedException e) { failures.add(e); } if (!failures.isEmpty()) { throw new IndexShardGatewaySnapshotFailedException( shardId(), "Failed to perform snapshot (index files)", failures.get(failures.size() - 1)); } currentSnapshotStatus .index() .time(System.currentTimeMillis() - currentSnapshotStatus.index().startTime()); currentSnapshotStatus.updateStage(SnapshotStatus.Stage.TRANSLOG); currentSnapshotStatus.translog().startTime(System.currentTimeMillis()); // Note, we assume the snapshot is always started from "base 0". We need to seek forward if we // want to lastTranslogPosition if we want the delta List<CommitPoint.FileInfo> translogCommitPointFiles = Lists.newArrayList(); int expectedNumberOfOperations = 0; boolean snapshotRequired = false; if (snapshot.newTranslogCreated()) { if (translogSnapshot.lengthInBytes() > 0) { snapshotRequired = true; expectedNumberOfOperations = translogSnapshot.estimatedTotalOperations(); } } else { // if we have a commit point, check that we have all the files listed in it in the blob store if (!commitPoints.commits().isEmpty()) { CommitPoint commitPoint = commitPoints.commits().get(0); boolean allTranslogFilesExists = true; for (CommitPoint.FileInfo fileInfo : commitPoint.translogFiles()) { if (!commitPointFileExistsInBlobs(fileInfo, blobs)) { allTranslogFilesExists = false; break; } } // if everything exists, we can seek forward in case there are new operations, otherwise, we // copy over all again... if (allTranslogFilesExists) { translogCommitPointFiles.addAll(commitPoint.translogFiles()); if (snapshot.sameTranslogNewOperations()) { translogSnapshot.seekForward(snapshot.lastTranslogLength()); if (translogSnapshot.lengthInBytes() > 0) { snapshotRequired = true; expectedNumberOfOperations = translogSnapshot.estimatedTotalOperations() - snapshot.lastTotalTranslogOperations(); } } // else (no operations, nothing to snapshot) } else { // a full translog snapshot is required if (translogSnapshot.lengthInBytes() > 0) { expectedNumberOfOperations = translogSnapshot.estimatedTotalOperations(); snapshotRequired = true; } } } else { // no commit point, snapshot all the translog if (translogSnapshot.lengthInBytes() > 0) { expectedNumberOfOperations = translogSnapshot.estimatedTotalOperations(); snapshotRequired = true; } } } currentSnapshotStatus.translog().expectedNumberOfOperations(expectedNumberOfOperations); if (snapshotRequired) { CommitPoint.FileInfo addedTranslogFileInfo = new CommitPoint.FileInfo( fileNameFromGeneration(++generation), "translog-" + translogSnapshot.translogId(), translogSnapshot.lengthInBytes(), null /* no need for checksum in translog */); translogCommitPointFiles.add(addedTranslogFileInfo); try { snapshotTranslog(translogSnapshot, addedTranslogFileInfo); } catch (Exception e) { throw new IndexShardGatewaySnapshotFailedException( shardId, "Failed to snapshot translog", e); } } currentSnapshotStatus .translog() .time(System.currentTimeMillis() - currentSnapshotStatus.translog().startTime()); // now create and write the commit point currentSnapshotStatus.updateStage(SnapshotStatus.Stage.FINALIZE); long version = 0; if (!commitPoints.commits().isEmpty()) { version = commitPoints.commits().iterator().next().version() + 1; } String commitPointName = "commit-" + Long.toString(version, Character.MAX_RADIX); CommitPoint commitPoint = new CommitPoint( version, commitPointName, CommitPoint.Type.GENERATED, indexCommitPointFiles, translogCommitPointFiles); try { byte[] commitPointData = CommitPoints.toXContent(commitPoint); blobContainer.writeBlob( commitPointName, new FastByteArrayInputStream(commitPointData), commitPointData.length); } catch (Exception e) { throw new IndexShardGatewaySnapshotFailedException( shardId, "Failed to write commit point", e); } // delete all files that are not referenced by any commit point // build a new CommitPoint, that includes this one and all the saved ones List<CommitPoint> newCommitPointsList = Lists.newArrayList(); newCommitPointsList.add(commitPoint); for (CommitPoint point : commitPoints) { if (point.type() == CommitPoint.Type.SAVED) { newCommitPointsList.add(point); } } CommitPoints newCommitPoints = new CommitPoints(newCommitPointsList); // first, go over and delete all the commit points for (String blobName : blobs.keySet()) { if (!blobName.startsWith("commit-")) { continue; } long checkedVersion = Long.parseLong(blobName.substring("commit-".length()), Character.MAX_RADIX); if (!newCommitPoints.hasVersion(checkedVersion)) { try { blobContainer.deleteBlob(blobName); } catch (IOException e) { // ignore } } } // now go over all the blobs, and if they don't exists in a commit point, delete them for (String blobName : blobs.keySet()) { String name = blobName; if (!name.startsWith("__")) { continue; } if (blobName.contains(".part")) { name = blobName.substring(0, blobName.indexOf(".part")); } if (newCommitPoints.findNameFile(name) == null) { try { blobContainer.deleteBlob(blobName); } catch (IOException e) { // ignore, will delete it laters } } } }
/** * Perform phase1 of the recovery operations. Once this {@link SnapshotIndexCommit} snapshot has * been performed no commit operations (files being fsync'd) are effectively allowed on this index * until all recovery phases are done * * <p>Phase1 examines the segment files on the target node and copies over the segments that are * missing. Only segments that have the same size and checksum can be reused * * <p>{@code InternalEngine#recover} is responsible for snapshotting the index and releasing the * snapshot once all 3 phases of recovery are complete */ @Override public void phase1(final SnapshotIndexCommit snapshot) throws ElasticsearchException { cancellableThreads.checkForCancel(); // Total size of segment files that are recovered long totalSize = 0; // Total size of segment files that were able to be re-used long existingTotalSize = 0; final Store store = shard.store(); store.incRef(); try { StopWatch stopWatch = new StopWatch().start(); final Store.MetadataSnapshot recoverySourceMetadata = store.getMetadata(snapshot); for (String name : snapshot.getFiles()) { final StoreFileMetaData md = recoverySourceMetadata.get(name); if (md == null) { logger.info( "Snapshot differs from actual index for file: {} meta: {}", name, recoverySourceMetadata.asMap()); throw new CorruptIndexException( "Snapshot differs from actual index - maybe index was removed metadata has " + recoverySourceMetadata.asMap().size() + " files"); } } String recoverySourceSyncId = recoverySourceMetadata.getSyncId(); String recoveryTargetSyncId = request.metadataSnapshot().getSyncId(); final boolean recoverWithSyncId = recoverySourceSyncId != null && recoverySourceSyncId.equals(recoveryTargetSyncId); if (recoverWithSyncId) { final long numDocsTarget = request.metadataSnapshot().getNumDocs(); final long numDocsSource = recoverySourceMetadata.getNumDocs(); if (numDocsTarget != numDocsSource) { throw new IllegalStateException( "try to recover " + request.shardId() + " from primary shard with sync id but number of docs differ: " + numDocsTarget + " (" + request.sourceNode().getName() + ", primary) vs " + numDocsSource + "(" + request.targetNode().getName() + ")"); } // we shortcut recovery here because we have nothing to copy. but we must still start the // engine on the target. // so we don't return here logger.trace( "[{}][{}] skipping [phase1] to {} - identical sync id [{}] found on both source and target", indexName, shardId, request.targetNode(), recoverySourceSyncId); } else { // Generate a "diff" of all the identical, different, and missing // segment files on the target node, using the existing files on // the source node final Store.RecoveryDiff diff = recoverySourceMetadata.recoveryDiff(request.metadataSnapshot()); for (StoreFileMetaData md : diff.identical) { response.phase1ExistingFileNames.add(md.name()); response.phase1ExistingFileSizes.add(md.length()); existingTotalSize += md.length(); if (logger.isTraceEnabled()) { logger.trace( "[{}][{}] recovery [phase1] to {}: not recovering [{}], exists in local store and has checksum [{}], size [{}]", indexName, shardId, request.targetNode(), md.name(), md.checksum(), md.length()); } totalSize += md.length(); } for (StoreFileMetaData md : Iterables.concat(diff.different, diff.missing)) { if (request.metadataSnapshot().asMap().containsKey(md.name())) { logger.trace( "[{}][{}] recovery [phase1] to {}: recovering [{}], exists in local store, but is different: remote [{}], local [{}]", indexName, shardId, request.targetNode(), md.name(), request.metadataSnapshot().get(md.name()), md); } else { logger.trace( "[{}][{}] recovery [phase1] to {}: recovering [{}], does not exists in remote", indexName, shardId, request.targetNode(), md.name()); } response.phase1FileNames.add(md.name()); response.phase1FileSizes.add(md.length()); totalSize += md.length(); } response.phase1TotalSize = totalSize; response.phase1ExistingTotalSize = existingTotalSize; logger.trace( "[{}][{}] recovery [phase1] to {}: recovering_files [{}] with total_size [{}], reusing_files [{}] with total_size [{}]", indexName, shardId, request.targetNode(), response.phase1FileNames.size(), new ByteSizeValue(totalSize), response.phase1ExistingFileNames.size(), new ByteSizeValue(existingTotalSize)); cancellableThreads.execute( new Interruptable() { @Override public void run() throws InterruptedException { RecoveryFilesInfoRequest recoveryInfoFilesRequest = new RecoveryFilesInfoRequest( request.recoveryId(), request.shardId(), response.phase1FileNames, response.phase1FileSizes, response.phase1ExistingFileNames, response.phase1ExistingFileSizes, shard.translog().estimatedNumberOfOperations(), response.phase1TotalSize, response.phase1ExistingTotalSize); transportService .submitRequest( request.targetNode(), RecoveryTarget.Actions.FILES_INFO, recoveryInfoFilesRequest, TransportRequestOptions.options() .withTimeout(recoverySettings.internalActionTimeout()), EmptyTransportResponseHandler.INSTANCE_SAME) .txGet(); } }); // This latch will be used to wait until all files have been transferred to the target node final CountDownLatch latch = new CountDownLatch(response.phase1FileNames.size()); final CopyOnWriteArrayList<Throwable> exceptions = new CopyOnWriteArrayList<>(); final AtomicReference<Throwable> corruptedEngine = new AtomicReference<>(); int fileIndex = 0; ThreadPoolExecutor pool; // How many bytes we've copied since we last called RateLimiter.pause final AtomicLong bytesSinceLastPause = new AtomicLong(); for (final String name : response.phase1FileNames) { long fileSize = response.phase1FileSizes.get(fileIndex); // Files are split into two categories, files that are "small" // (under 5mb) and other files. Small files are transferred // using a separate thread pool dedicated to small files. // // The idea behind this is that while we are transferring an // older, large index, a user may create a new index, but that // index will not be able to recover until the large index // finishes, by using two different thread pools we can allow // tiny files (like segments for a brand new index) to be // recovered while ongoing large segment recoveries are // happening. It also allows these pools to be configured // separately. if (fileSize > RecoverySettings.SMALL_FILE_CUTOFF_BYTES) { pool = recoverySettings.concurrentStreamPool(); } else { pool = recoverySettings.concurrentSmallFileStreamPool(); } pool.execute( new AbstractRunnable() { @Override public void onFailure(Throwable t) { // we either got rejected or the store can't be incremented / we are canceled logger.debug("Failed to transfer file [" + name + "] on recovery"); } public void onAfter() { // Signify this file has completed by decrementing the latch latch.countDown(); } @Override protected void doRun() { cancellableThreads.checkForCancel(); store.incRef(); final StoreFileMetaData md = recoverySourceMetadata.get(name); try (final IndexInput indexInput = store.directory().openInput(name, IOContext.READONCE)) { final int BUFFER_SIZE = (int) recoverySettings.fileChunkSize().bytes(); final byte[] buf = new byte[BUFFER_SIZE]; boolean shouldCompressRequest = recoverySettings.compress(); if (CompressorFactory.isCompressed(indexInput)) { shouldCompressRequest = false; } final long len = indexInput.length(); long readCount = 0; final TransportRequestOptions requestOptions = TransportRequestOptions.options() .withCompress(shouldCompressRequest) .withType(TransportRequestOptions.Type.RECOVERY) .withTimeout(recoverySettings.internalActionTimeout()); while (readCount < len) { if (shard.state() == IndexShardState.CLOSED) { // check if the shard got closed on us throw new IndexShardClosedException(shard.shardId()); } int toRead = readCount + BUFFER_SIZE > len ? (int) (len - readCount) : BUFFER_SIZE; final long position = indexInput.getFilePointer(); // Pause using the rate limiter, if desired, to throttle the recovery RateLimiter rl = recoverySettings.rateLimiter(); long throttleTimeInNanos = 0; if (rl != null) { long bytes = bytesSinceLastPause.addAndGet(toRead); if (bytes > rl.getMinPauseCheckBytes()) { // Time to pause bytesSinceLastPause.addAndGet(-bytes); throttleTimeInNanos = rl.pause(bytes); shard.recoveryStats().addThrottleTime(throttleTimeInNanos); } } indexInput.readBytes(buf, 0, toRead, false); final BytesArray content = new BytesArray(buf, 0, toRead); readCount += toRead; final boolean lastChunk = readCount == len; final RecoveryFileChunkRequest fileChunkRequest = new RecoveryFileChunkRequest( request.recoveryId(), request.shardId(), md, position, content, lastChunk, shard.translog().estimatedNumberOfOperations(), throttleTimeInNanos); cancellableThreads.execute( new Interruptable() { @Override public void run() throws InterruptedException { // Actually send the file chunk to the target node, waiting for it to // complete transportService .submitRequest( request.targetNode(), RecoveryTarget.Actions.FILE_CHUNK, fileChunkRequest, requestOptions, EmptyTransportResponseHandler.INSTANCE_SAME) .txGet(); } }); } } catch (Throwable e) { final Throwable corruptIndexException; if ((corruptIndexException = ExceptionsHelper.unwrapCorruption(e)) != null) { if (store.checkIntegrity(md) == false) { // we are corrupted on the primary -- fail! logger.warn( "{} Corrupted file detected {} checksum mismatch", shard.shardId(), md); if (corruptedEngine.compareAndSet(null, corruptIndexException) == false) { // if we are not the first exception, add ourselves as suppressed to the // main one: corruptedEngine.get().addSuppressed(e); } } else { // corruption has happened on the way to replica RemoteTransportException exception = new RemoteTransportException( "File corruption occurred on recovery but checksums are ok", null); exception.addSuppressed(e); exceptions.add(0, exception); // last exception first logger.warn( "{} Remote file corruption on node {}, recovering {}. local checksum OK", corruptIndexException, shard.shardId(), request.targetNode(), md); } } else { exceptions.add(0, e); // last exceptions first } } finally { store.decRef(); } } }); fileIndex++; } cancellableThreads.execute( new Interruptable() { @Override public void run() throws InterruptedException { // Wait for all files that need to be transferred to finish transferring latch.await(); } }); if (corruptedEngine.get() != null) { throw corruptedEngine.get(); } else { ExceptionsHelper.rethrowAndSuppress(exceptions); } cancellableThreads.execute( new Interruptable() { @Override public void run() throws InterruptedException { // Send the CLEAN_FILES request, which takes all of the files that // were transferred and renames them from their temporary file // names to the actual file names. It also writes checksums for // the files after they have been renamed. // // Once the files have been renamed, any other files that are not // related to this recovery (out of date segments, for example) // are deleted try { transportService .submitRequest( request.targetNode(), RecoveryTarget.Actions.CLEAN_FILES, new RecoveryCleanFilesRequest( request.recoveryId(), shard.shardId(), recoverySourceMetadata, shard.translog().estimatedNumberOfOperations()), TransportRequestOptions.options() .withTimeout(recoverySettings.internalActionTimeout()), EmptyTransportResponseHandler.INSTANCE_SAME) .txGet(); } catch (RemoteTransportException remoteException) { final IOException corruptIndexException; // we realized that after the index was copied and we wanted to finalize the // recovery // the index was corrupted: // - maybe due to a broken segments file on an empty index (transferred with no // checksum) // - maybe due to old segments without checksums or length only checks if ((corruptIndexException = ExceptionsHelper.unwrapCorruption(remoteException)) != null) { try { final Store.MetadataSnapshot recoverySourceMetadata = store.getMetadata(snapshot); StoreFileMetaData[] metadata = Iterables.toArray(recoverySourceMetadata, StoreFileMetaData.class); ArrayUtil.timSort( metadata, new Comparator<StoreFileMetaData>() { @Override public int compare(StoreFileMetaData o1, StoreFileMetaData o2) { return Long.compare( o1.length(), o2.length()); // check small files first } }); for (StoreFileMetaData md : metadata) { logger.debug( "{} checking integrity for file {} after remove corruption exception", shard.shardId(), md); if (store.checkIntegrity(md) == false) { // we are corrupted on the primary -- fail! logger.warn( "{} Corrupted file detected {} checksum mismatch", shard.shardId(), md); throw corruptIndexException; } } } catch (IOException ex) { remoteException.addSuppressed(ex); throw remoteException; } // corruption has happened on the way to replica RemoteTransportException exception = new RemoteTransportException( "File corruption occurred on recovery but checksums are ok", null); exception.addSuppressed(remoteException); logger.warn( "{} Remote file corruption during finalization on node {}, recovering {}. local checksum OK", corruptIndexException, shard.shardId(), request.targetNode()); throw exception; } else { throw remoteException; } } } }); } stopWatch.stop(); logger.trace( "[{}][{}] recovery [phase1] to {}: took [{}]", indexName, shardId, request.targetNode(), stopWatch.totalTime()); response.phase1Time = stopWatch.totalTime().millis(); } catch (Throwable e) { throw new RecoverFilesRecoveryException( request.shardId(), response.phase1FileNames.size(), new ByteSizeValue(totalSize), e); } finally { store.decRef(); } }