/** * Perform phase 3 of the recovery process * * <p>Phase3 again takes a snapshot of the translog; however, this time the snapshot is acquired * under a write lock. The translog operations are sent to the target node where they are * replayed. * * <p>{@code InternalEngine#recover} is responsible for taking the snapshot of the translog, and * after phase 3 completes the snapshots from all three phases are released. */ @Override public void phase3(Translog.Snapshot snapshot) throws ElasticsearchException { if (shard.state() == IndexShardState.CLOSED) { throw new IndexShardClosedException(request.shardId()); } cancellableThreads.checkForCancel(); StopWatch stopWatch = new StopWatch().start(); final int totalOperations; logger.trace( "[{}][{}] recovery [phase3] to {}: sending transaction log operations", indexName, shardId, request.targetNode()); // Send the translog operations to the target node totalOperations = sendSnapshot(snapshot); cancellableThreads.execute( new Interruptable() { @Override public void run() throws InterruptedException { // Send the FINALIZE request to the target node. The finalize request // clears unreferenced translog files, refreshes the engine now that // new segments are available, and enables garbage collection of // tombstone files. The shard is also moved to the POST_RECOVERY phase // during this time transportService .submitRequest( request.targetNode(), RecoveryTarget.Actions.FINALIZE, new RecoveryFinalizeRecoveryRequest(request.recoveryId(), request.shardId()), TransportRequestOptions.options() .withTimeout(recoverySettings.internalActionLongTimeout()), EmptyTransportResponseHandler.INSTANCE_SAME) .txGet(); } }); if (request.markAsRelocated()) { // TODO what happens if the recovery process fails afterwards, we need to mark this back to // started try { shard.relocated("to " + request.targetNode()); } catch (IllegalIndexShardStateException e) { // we can ignore this exception since, on the other node, when it moved to phase3 // it will also send a shard started event, which might cause the index shard we work against // to be closed by the time we get to the relocated method } } stopWatch.stop(); logger.trace( "[{}][{}] recovery [phase3] to {}: took [{}]", indexName, shardId, request.targetNode(), stopWatch.totalTime()); response.phase3Time = stopWatch.totalTime().millis(); response.phase3Operations = totalOperations; }
/** finalizes the recovery process */ public void finalizeRecovery() { if (shard.state() == IndexShardState.CLOSED) { throw new IndexShardClosedException(request.shardId()); } cancellableThreads.checkForCancel(); StopWatch stopWatch = new StopWatch().start(); logger.trace("[{}][{}] finalizing recovery to {}", indexName, shardId, request.targetNode()); cancellableThreads.execute(recoveryTarget::finalizeRecovery); if (request.isPrimaryRelocation()) { // in case of primary relocation we have to ensure that the cluster state on the primary // relocation target has all // replica shards that have recovered or are still recovering from the current primary, // otherwise replication actions // will not be sent to these replicas. To accomplish this, first block new recoveries, then // take the version of the latest cluster // state. This means that no new recovery can be completed based on information of a newer // cluster state than the current one. try (Releasable ignored = delayNewRecoveries.apply( "primary relocation hand-off in progress or completed for " + shardId)) { final long currentClusterStateVersion = currentClusterStateVersionSupplier.get(); logger.trace( "[{}][{}] waiting on {} to have cluster state with version [{}]", indexName, shardId, request.targetNode(), currentClusterStateVersion); cancellableThreads.execute( () -> recoveryTarget.ensureClusterStateVersion(currentClusterStateVersion)); logger.trace( "[{}][{}] performing relocation hand-off to {}", indexName, shardId, request.targetNode()); cancellableThreads.execute(() -> shard.relocated("to " + request.targetNode())); } /** * if the recovery process fails after setting the shard state to RELOCATED, both relocation * source and target are failed (see {@link IndexShard#updateRoutingEntry}). */ } stopWatch.stop(); logger.trace( "[{}][{}] finalizing recovery to {}: took [{}]", indexName, shardId, request.targetNode(), stopWatch.totalTime()); }
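The hand-off ordering above (block new recoveries, capture the current cluster state version, wait for the relocation target to reach it, then mark the shard relocated) can be hard to see through the transport plumbing. Below is a minimal, self-contained sketch of just that ordering; the class and member names (RelocationHandOffSketch, Target, markShardRelocated) are illustrative stand-ins, not Elasticsearch APIs.

import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;

/** Minimal sketch of the primary-relocation hand-off ordering, with simplified stand-ins. */
class RelocationHandOffSketch {
    // Stand-in for the "delay new recoveries" guard; the real code gets a Releasable back.
    private final AtomicBoolean newRecoveriesBlocked = new AtomicBoolean(false);
    // Stand-in for the version of the last cluster state applied on this (source) node.
    private final AtomicLong localClusterStateVersion = new AtomicLong(0);

    interface Target {
        // Blocks until the relocation target has applied a cluster state of at least this version.
        void ensureClusterStateVersion(long version) throws InterruptedException;
    }

    void handOff(Target target, Runnable markShardRelocated) throws InterruptedException {
        // 1. Block new recoveries so no replica can complete a recovery based on a cluster
        //    state newer than the version captured below.
        newRecoveriesBlocked.set(true);
        try {
            // 2. Capture the newest cluster state version this node has applied.
            long version = localClusterStateVersion.get();
            // 3. Wait until the target has seen a state at least as new, so it knows about
            //    every replica that recovered (or is recovering) from the current primary.
            target.ensureClusterStateVersion(version);
            // 4. Only now flip the shard to RELOCATED and stop accepting writes locally.
            markShardRelocated.run();
        } finally {
            // 5. Allow new recoveries again (the real code does this by closing the Releasable).
            newRecoveriesBlocked.set(false);
        }
    }
}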
@Override public String toString() { return "ShardRecoveryHandler{" + "shardId=" + request.shardId() + ", sourceNode=" + request.sourceNode() + ", targetNode=" + request.targetNode() + '}'; }
void sendFiles( Store store, StoreFileMetaData[] files, Function<StoreFileMetaData, OutputStream> outputStreamFactory) throws Exception { store.incRef(); try { ArrayUtil.timSort( files, (a, b) -> Long.compare(a.length(), b.length())); // send smallest first for (int i = 0; i < files.length; i++) { final StoreFileMetaData md = files[i]; try (final IndexInput indexInput = store.directory().openInput(md.name(), IOContext.READONCE)) { // it's fine that only the indexInput is inside the try-with-resources block; the copy // method handles exceptions during close correctly and doesn't hide the original exception. Streams.copy( new InputStreamIndexInput(indexInput, md.length()), outputStreamFactory.apply(md)); } catch (Exception e) { final IOException corruptIndexException; if ((corruptIndexException = ExceptionsHelper.unwrapCorruption(e)) != null) { if (store.checkIntegrityNoException(md) == false) { // we are corrupted on the primary -- fail! logger.warn("{} Corrupted file detected {} checksum mismatch", shardId, md); failEngine(corruptIndexException); throw corruptIndexException; } else { // corruption has happened on the way to the replica RemoteTransportException exception = new RemoteTransportException( "File corruption occurred on recovery but " + "checksums are ok", null); exception.addSuppressed(e); logger.warn( (org.apache.logging.log4j.util.Supplier<?>) () -> new ParameterizedMessage( "{} Remote file corruption on node {}, recovering {}. local checksum OK", shardId, request.targetNode(), md), corruptIndexException); throw exception; } } else { throw e; } } } } finally { store.decRef(); } }
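The catch block above separates two very different failures: the source file itself failing its checksum (the primary is corrupt and the engine must be failed) versus the bytes being damaged after they left the source (only this recovery attempt should fail). A rough, self-contained sketch of that triage, using a plain CRC32 over a local file instead of Lucene's stored checksums; the method names and the expected-checksum parameter are illustrative assumptions.

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.zip.CRC32;

/** Sketch: when sending a file fails with a corruption error, re-verify the local copy to
 *  decide whether the primary itself is corrupted or the data was damaged in transit. */
class CorruptionTriageSketch {
    static boolean localCopyMatches(Path file, long expectedChecksum) throws IOException {
        CRC32 crc = new CRC32();
        try (InputStream in = Files.newInputStream(file)) {
            byte[] buf = new byte[8192];
            int n;
            while ((n = in.read(buf)) != -1) {
                crc.update(buf, 0, n);
            }
        }
        return crc.getValue() == expectedChecksum;
    }

    static void handleSendFailure(Path file, long expectedChecksum, Exception corruption) throws Exception {
        if (!localCopyMatches(file, expectedChecksum)) {
            // The source copy itself is bad: surface the corruption so the shard/engine is failed.
            throw corruption;
        }
        // Local checksum is fine, so the bytes were damaged on the wire or on the target side:
        // fail only this recovery attempt and keep the primary healthy.
        throw new IOException("file corruption occurred during recovery but local checksums are ok", corruption);
    }
}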
/** * Perform phase2 of the recovery process * * <p>Phase2 takes a snapshot of the current translog *without* acquiring the write lock (however, * the translog snapshot is a point-in-time view of the translog). It then sends each translog * operation to the target node so it can be replayed into the new shard. */ public void phase2(Translog.Snapshot snapshot) { if (shard.state() == IndexShardState.CLOSED) { throw new IndexShardClosedException(request.shardId()); } cancellableThreads.checkForCancel(); StopWatch stopWatch = new StopWatch().start(); logger.trace( "{} recovery [phase2] to {}: sending transaction log operations", request.shardId(), request.targetNode()); // Send all the snapshot's translog operations to the target int totalOperations = sendSnapshot(snapshot); stopWatch.stop(); logger.trace( "{} recovery [phase2] to {}: took [{}]", request.shardId(), request.targetNode(), stopWatch.totalTime()); response.phase2Time = stopWatch.totalTime().millis(); response.phase2Operations = totalOperations; }
protected void prepareTargetForTranslog(final int totalTranslogOps) throws IOException { StopWatch stopWatch = new StopWatch().start(); logger.trace( "{} recovery [phase1] to {}: prepare remote engine for translog", request.shardId(), request.targetNode()); final long startEngineStart = stopWatch.totalTime().millis(); // Send a request preparing the new shard's translog to receive // operations. This ensures the shard engine is started and disables // garbage collection (not the JVM's GC!) of tombstone deletes cancellableThreads.executeIO( () -> recoveryTarget.prepareForTranslogOperations( totalTranslogOps, shard.segmentStats(false).getMaxUnsafeAutoIdTimestamp())); stopWatch.stop(); response.startTime = stopWatch.totalTime().millis() - startEngineStart; logger.trace( "{} recovery [phase1] to {}: remote engine start took [{}]", request.shardId(), request.targetNode(), stopWatch.totalTime()); }
private RecoverySourceHandler createRecoverySourceHandler( StartRecoveryRequest request, String targetAllocationId, IndexShard shard) { RecoverySourceHandler handler; final RemoteRecoveryTargetHandler recoveryTarget = new RemoteRecoveryTargetHandler( request.recoveryId(), request.shardId(), targetAllocationId, transportService, request.targetNode(), recoverySettings, throttleTime -> shard.recoveryStats().addThrottleTime(throttleTime)); Supplier<Long> currentClusterStateVersionSupplier = () -> clusterService.state().getVersion(); if (shard.indexSettings().isOnSharedFilesystem()) { handler = new SharedFSRecoverySourceHandler( shard, recoveryTarget, request, currentClusterStateVersionSupplier, this::delayNewRecoveries, logger); } else { handler = new RecoverySourceHandler( shard, recoveryTarget, request, currentClusterStateVersionSupplier, this::delayNewRecoveries, recoverySettings.getChunkSize().bytesAsInt(), logger); } return handler; }
private RecoveryResponse recover(final StartRecoveryRequest request) throws IOException { final IndexService indexService = indicesService.indexServiceSafe(request.shardId().getIndex()); final IndexShard shard = indexService.getShard(request.shardId().id()); // Before starting recovery, make sure our (the source) cluster state already knows the target // node and marks the target shard as initializing, otherwise index operations will not be // routed to it properly RoutingNode node = clusterService.state().getRoutingNodes().node(request.targetNode().getId()); if (node == null) { logger.debug( "delaying recovery of {} as source node {} is unknown", request.shardId(), request.targetNode()); throw new DelayRecoveryException( "source node does not have the node [" + request.targetNode() + "] in its state yet"); } ShardRouting routingEntry = shard.routingEntry(); if (request.isPrimaryRelocation() && (routingEntry.relocating() == false || routingEntry.relocatingNodeId().equals(request.targetNode().getId()) == false)) { logger.debug( "delaying recovery of {} as source shard is not yet marked as relocating to {}", request.shardId(), request.targetNode()); throw new DelayRecoveryException( "source shard is not yet marked as relocating to [" + request.targetNode() + "]"); } ShardRouting targetShardRouting = node.getByShardId(request.shardId()); if (targetShardRouting == null) { logger.debug( "delaying recovery of {} as it is not listed as assigned to target node {}", request.shardId(), request.targetNode()); throw new DelayRecoveryException( "source node does not have the shard listed in its state as allocated on the node"); } if (!targetShardRouting.initializing()) { logger.debug( "delaying recovery of {} as it is not listed as initializing on the target node {}. known shard state is [{}]", request.shardId(), request.targetNode(), targetShardRouting.state()); throw new DelayRecoveryException( "source node has the state of the target shard to be [" + targetShardRouting.state() + "], expecting it to be [initializing]"); } RecoverySourceHandler handler = ongoingRecoveries.addNewRecovery(request, targetShardRouting.allocationId().getId(), shard); logger.trace( "[{}][{}] starting recovery to {}", request.shardId().getIndex().getName(), request.shardId().id(), request.targetNode()); try { return handler.recoverToTarget(); } finally { ongoingRecoveries.remove(shard, handler); } }
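DelayRecoveryException is not a hard failure: the requesting side treats it as "not yet" and retries the start-recovery request later, so the checks above act as preconditions that are polled until the cluster state catches up. A simplified sketch of that contract; the names (DelayRecovery, startWithRetry) and the retry interval are hypothetical, not the real retry mechanism.

import java.util.concurrent.TimeUnit;
import java.util.function.BooleanSupplier;

/** Sketch of the "delay recovery" contract: the source refuses to start until its own cluster
 *  state lists the target shard as initializing; the caller simply retries after a pause. */
class DelayedStartSketch {
    static class DelayRecovery extends RuntimeException {
        DelayRecovery(String msg) { super(msg); }
    }

    static void startRecovery(BooleanSupplier targetShardIsInitializing, Runnable doRecover) {
        if (!targetShardIsInitializing.getAsBoolean()) {
            // Equivalent in spirit to throwing DelayRecoveryException: "not yet, ask again later".
            throw new DelayRecovery("target shard is not listed as initializing yet");
        }
        doRecover.run();
    }

    static void startWithRetry(BooleanSupplier ready, Runnable doRecover) throws InterruptedException {
        while (true) {
            try {
                startRecovery(ready, doRecover);
                return;
            } catch (DelayRecovery e) {
                TimeUnit.MILLISECONDS.sleep(500); // retry interval is illustrative, not a real setting
            }
        }
    }
}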
private RecoveryResponse recover(final StartRecoveryRequest request) { final IndexService indexService = indicesService.indexServiceSafe(request.shardId().index().name()); final IndexShard shard = indexService.getShard(request.shardId().id()); // Before starting recovery, make sure our (the source) cluster state already knows the target // node and marks the target shard as initializing, otherwise index operations will not be // routed to it properly RoutingNode node = clusterService.state().getRoutingNodes().node(request.targetNode().id()); if (node == null) { logger.debug( "delaying recovery of {} as source node {} is unknown", request.shardId(), request.targetNode()); throw new DelayRecoveryException( "source node does not have the node [" + request.targetNode() + "] in its state yet"); } ShardRouting targetShardRouting = null; for (ShardRouting shardRouting : node) { if (shardRouting.shardId().equals(request.shardId())) { targetShardRouting = shardRouting; break; } } if (targetShardRouting == null) { logger.debug( "delaying recovery of {} as it is not listed as assigned to target node {}", request.shardId(), request.targetNode()); throw new DelayRecoveryException( "source node does not have the shard listed in its state as allocated on the node"); } if (!targetShardRouting.initializing()) { logger.debug( "delaying recovery of {} as it is not listed as initializing on the target node {}. known shard state is [{}]", request.shardId(), request.targetNode(), targetShardRouting.state()); throw new DelayRecoveryException( "source node has the state of the target shard to be [" + targetShardRouting.state() + "], expecting it to be [initializing]"); } logger.trace( "[{}][{}] starting recovery to {}, mark_as_relocated {}", request.shardId().index().name(), request.shardId().id(), request.targetNode(), request.markAsRelocated()); final RecoverySourceHandler handler; if (IndexMetaData.isOnSharedFilesystem(shard.indexSettings())) { handler = new SharedFSRecoverySourceHandler( shard, request, recoverySettings, transportService, logger); } else { handler = new RecoverySourceHandler(shard, request, recoverySettings, transportService, logger); } ongoingRecoveries.add(shard, handler); try { return handler.recoverToTarget(); } finally { ongoingRecoveries.remove(shard, handler); } }
/** * Send the given snapshot's operations to this handler's target node. * * <p>Operations are bulked into a single request depending on an operation count limit or * size-in-bytes limit * * @return the total number of translog operations that were sent */ protected int sendSnapshot(final Translog.Snapshot snapshot) { int ops = 0; long size = 0; int totalOperations = 0; final List<Translog.Operation> operations = new ArrayList<>(); Translog.Operation operation; try { operation = snapshot.next(); // this ex should bubble up } catch (IOException ex) { throw new ElasticsearchException("failed to get next operation from translog", ex); } if (operation == null) { logger.trace( "[{}][{}] no translog operations to send to {}", indexName, shardId, request.targetNode()); } while (operation != null) { if (shard.state() == IndexShardState.CLOSED) { throw new IndexShardClosedException(request.shardId()); } cancellableThreads.checkForCancel(); operations.add(operation); ops += 1; size += operation.estimateSize(); totalOperations++; // Check if this request is past bytes threshold, and // if so, send it off if (size >= chunkSizeInBytes) { // don't throttle translog, since we lock for phase3 indexing, // so we need to move it as fast as possible. Note, since we // index docs to replicas while the index files are recovered // the lock can potentially be removed, in which case, it might // make sense to re-enable throttling in this phase cancellableThreads.execute( () -> recoveryTarget.indexTranslogOperations(operations, snapshot.totalOperations())); if (logger.isTraceEnabled()) { logger.trace( "[{}][{}] sent batch of [{}][{}] (total: [{}]) translog operations to {}", indexName, shardId, ops, new ByteSizeValue(size), snapshot.totalOperations(), request.targetNode()); } ops = 0; size = 0; operations.clear(); } try { operation = snapshot.next(); // this ex should bubble up } catch (IOException ex) { throw new ElasticsearchException("failed to get next operation from translog", ex); } } // send the leftover if (!operations.isEmpty()) { cancellableThreads.execute( () -> recoveryTarget.indexTranslogOperations(operations, snapshot.totalOperations())); } if (logger.isTraceEnabled()) { logger.trace( "[{}][{}] sent final batch of [{}][{}] (total: [{}]) translog operations to {}", indexName, shardId, ops, new ByteSizeValue(size), snapshot.totalOperations(), request.targetNode()); } return totalOperations; }
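The batching logic above is easier to follow in isolation: operations accumulate until their estimated serialized size crosses the chunk threshold, each full batch triggers one indexTranslogOperations round-trip, and whatever is left over is flushed at the end. A minimal sketch, assuming a stripped-down Operation type and a generic sendBatch callback standing in for the recovery target.

import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;

/** Sketch of the translog batching loop: accumulate operations until the byte-size threshold
 *  is reached, flush the batch, and always flush the leftovers at the end. */
class TranslogBatchingSketch {
    interface Operation {
        long estimateSize(); // estimated serialized size in bytes
    }

    static int sendAll(Iterable<Operation> snapshot, long chunkSizeInBytes, Consumer<List<Operation>> sendBatch) {
        List<Operation> batch = new ArrayList<>();
        long batchBytes = 0;
        int total = 0;
        for (Operation op : snapshot) {
            batch.add(op);
            batchBytes += op.estimateSize();
            total++;
            if (batchBytes >= chunkSizeInBytes) {
                sendBatch.accept(batch);   // one network round-trip per chunk
                batch = new ArrayList<>(); // start a fresh batch (the real code clears and reuses the list)
                batchBytes = 0;
            }
        }
        if (!batch.isEmpty()) {
            sendBatch.accept(batch); // leftover operations that never reached the threshold
        }
        return total;
    }
}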
/** * Perform phase1 of the recovery operations. Once this {@link IndexCommit} snapshot has been * performed no commit operations (files being fsync'd) are effectively allowed on this index * until all recovery phases are done * * <p>Phase1 examines the segment files on the target node and copies over the segments that are * missing. Only segments that have the same size and checksum can be reused */ public void phase1(final IndexCommit snapshot, final Translog.View translogView) { cancellableThreads.checkForCancel(); // Total size of segment files that are recovered long totalSize = 0; // Total size of segment files that were able to be re-used long existingTotalSize = 0; final Store store = shard.store(); store.incRef(); try { StopWatch stopWatch = new StopWatch().start(); final Store.MetadataSnapshot recoverySourceMetadata; try { recoverySourceMetadata = store.getMetadata(snapshot); } catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException ex) { shard.failShard("recovery", ex); throw ex; } for (String name : snapshot.getFileNames()) { final StoreFileMetaData md = recoverySourceMetadata.get(name); if (md == null) { logger.info( "Snapshot differs from actual index for file: {} meta: {}", name, recoverySourceMetadata.asMap()); throw new CorruptIndexException( "Snapshot differs from actual index - maybe index was removed; metadata has " + recoverySourceMetadata.asMap().size() + " files", name); } } // Generate a "diff" of all the identical, different, and missing // segment files on the target node, using the existing files on // the source node String recoverySourceSyncId = recoverySourceMetadata.getSyncId(); String recoveryTargetSyncId = request.metadataSnapshot().getSyncId(); final boolean recoverWithSyncId = recoverySourceSyncId != null && recoverySourceSyncId.equals(recoveryTargetSyncId); if (recoverWithSyncId) { final long numDocsTarget = request.metadataSnapshot().getNumDocs(); final long numDocsSource = recoverySourceMetadata.getNumDocs(); if (numDocsTarget != numDocsSource) { throw new IllegalStateException( "try to recover " + request.shardId() + " from primary shard with sync id but number " + "of docs differ: " + numDocsSource + " (" + request.sourceNode().getName() + ", primary) vs " + numDocsTarget + " (" + request.targetNode().getName() + ")"); } // we shortcut recovery here because we have nothing to copy, but we must still start the // engine on the target. 
// so we don't return here logger.trace( "[{}][{}] skipping [phase1] to {} - identical sync id [{}] found on both source and target", indexName, shardId, request.targetNode(), recoverySourceSyncId); } else { final Store.RecoveryDiff diff = recoverySourceMetadata.recoveryDiff(request.metadataSnapshot()); for (StoreFileMetaData md : diff.identical) { response.phase1ExistingFileNames.add(md.name()); response.phase1ExistingFileSizes.add(md.length()); existingTotalSize += md.length(); if (logger.isTraceEnabled()) { logger.trace( "[{}][{}] recovery [phase1] to {}: not recovering [{}], exists in local store and has checksum [{}]," + " size [{}]", indexName, shardId, request.targetNode(), md.name(), md.checksum(), md.length()); } totalSize += md.length(); } List<StoreFileMetaData> phase1Files = new ArrayList<>(diff.different.size() + diff.missing.size()); phase1Files.addAll(diff.different); phase1Files.addAll(diff.missing); for (StoreFileMetaData md : phase1Files) { if (request.metadataSnapshot().asMap().containsKey(md.name())) { logger.trace( "[{}][{}] recovery [phase1] to {}: recovering [{}], exists in local store, but is different: remote " + "[{}], local [{}]", indexName, shardId, request.targetNode(), md.name(), request.metadataSnapshot().asMap().get(md.name()), md); } else { logger.trace( "[{}][{}] recovery [phase1] to {}: recovering [{}], does not exists in remote", indexName, shardId, request.targetNode(), md.name()); } response.phase1FileNames.add(md.name()); response.phase1FileSizes.add(md.length()); totalSize += md.length(); } response.phase1TotalSize = totalSize; response.phase1ExistingTotalSize = existingTotalSize; logger.trace( "[{}][{}] recovery [phase1] to {}: recovering_files [{}] with total_size [{}], reusing_files [{}] with " + "total_size [{}]", indexName, shardId, request.targetNode(), response.phase1FileNames.size(), new ByteSizeValue(totalSize), response.phase1ExistingFileNames.size(), new ByteSizeValue(existingTotalSize)); cancellableThreads.execute( () -> recoveryTarget.receiveFileInfo( response.phase1FileNames, response.phase1FileSizes, response.phase1ExistingFileNames, response.phase1ExistingFileSizes, translogView.totalOperations())); // How many bytes we've copied since we last called RateLimiter.pause final Function<StoreFileMetaData, OutputStream> outputStreamFactories = md -> new BufferedOutputStream( new RecoveryOutputStream(md, translogView), chunkSizeInBytes); sendFiles( store, phase1Files.toArray(new StoreFileMetaData[phase1Files.size()]), outputStreamFactories); // Send the CLEAN_FILES request, which takes all of the files that // were transferred and renames them from their temporary file // names to the actual file names. It also writes checksums for // the files after they have been renamed. 
// // Once the files have been renamed, any other files that are not // related to this recovery (out of date segments, for example) // are deleted try { cancellableThreads.executeIO( () -> recoveryTarget.cleanFiles( translogView.totalOperations(), recoverySourceMetadata)); } catch (RemoteTransportException | IOException targetException) { final IOException corruptIndexException; // we realized that after the index was copied and we wanted to finalize the recovery // the index was corrupted: // - maybe due to a broken segments file on an empty index (transferred with no // checksum) // - maybe due to old segments without checksums or length only checks if ((corruptIndexException = ExceptionsHelper.unwrapCorruption(targetException)) != null) { try { final Store.MetadataSnapshot recoverySourceMetadata1 = store.getMetadata(snapshot); StoreFileMetaData[] metadata = StreamSupport.stream(recoverySourceMetadata1.spliterator(), false) .toArray(size -> new StoreFileMetaData[size]); ArrayUtil.timSort( metadata, (o1, o2) -> { return Long.compare(o1.length(), o2.length()); // check small files first }); for (StoreFileMetaData md : metadata) { cancellableThreads.checkForCancel(); logger.debug( "{} checking integrity for file {} after remove corruption exception", shard.shardId(), md); if (store.checkIntegrityNoException(md) == false) { // we are corrupted on the primary -- fail! shard.failShard("recovery", corruptIndexException); logger.warn( "{} Corrupted file detected {} checksum mismatch", shard.shardId(), md); throw corruptIndexException; } } } catch (IOException ex) { targetException.addSuppressed(ex); throw targetException; } // corruption has happened on the way to replica RemoteTransportException exception = new RemoteTransportException( "File corruption occurred on recovery but " + "checksums are ok", null); exception.addSuppressed(targetException); logger.warn( (org.apache.logging.log4j.util.Supplier<?>) () -> new ParameterizedMessage( "{} Remote file corruption during finalization of recovery on node {}. local checksum OK", shard.shardId(), request.targetNode()), corruptIndexException); throw exception; } else { throw targetException; } } } prepareTargetForTranslog(translogView.totalOperations()); logger.trace( "[{}][{}] recovery [phase1] to {}: took [{}]", indexName, shardId, request.targetNode(), stopWatch.totalTime()); response.phase1Time = stopWatch.totalTime().millis(); } catch (Exception e) { throw new RecoverFilesRecoveryException( request.shardId(), response.phase1FileNames.size(), new ByteSizeValue(totalSize), e); } finally { store.decRef(); } }
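The heart of phase1 is the recovery "diff" that decides which files can be skipped because the target already holds byte-identical copies. The sketch below shows the basic idea of classifying files as identical, different, or missing by name, length, and checksum; note that the real Store#recoveryDiff is more conservative (it groups files per segment so a segment is only reused as a whole), so this is a simplification with illustrative types.

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Objects;

/** Sketch of the phase1 file "diff": classify each source file against the target's metadata. */
class RecoveryDiffSketch {
    static class FileMeta {
        final String name; final long length; final String checksum;
        FileMeta(String name, long length, String checksum) {
            this.name = name; this.length = length; this.checksum = checksum;
        }
    }

    static class Diff {
        final List<FileMeta> identical = new ArrayList<>(); // target already has them, skip copying
        final List<FileMeta> different = new ArrayList<>(); // same name but different content, must be sent
        final List<FileMeta> missing = new ArrayList<>();   // target does not have them at all, must be sent
    }

    static Diff diff(Map<String, FileMeta> source, Map<String, FileMeta> target) {
        Diff diff = new Diff();
        for (FileMeta src : source.values()) {
            FileMeta tgt = target.get(src.name);
            if (tgt == null) {
                diff.missing.add(src);
            } else if (tgt.length == src.length && Objects.equals(tgt.checksum, src.checksum)) {
                diff.identical.add(src);
            } else {
                diff.different.add(src);
            }
        }
        return diff;
    }
}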
/** * Send the given snapshot's operations to this handler's target node. * * <p>Operations are bulked into a single request depending on an operation count limit or * size-in-bytes limit * * @return the total number of translog operations that were sent */ protected int sendSnapshot(Translog.Snapshot snapshot) throws ElasticsearchException { int ops = 0; long size = 0; int totalOperations = 0; final List<Translog.Operation> operations = Lists.newArrayList(); Translog.Operation operation = snapshot.next(); final TransportRequestOptions recoveryOptions = TransportRequestOptions.options() .withCompress(recoverySettings.compress()) .withType(TransportRequestOptions.Type.RECOVERY) .withTimeout(recoverySettings.internalActionLongTimeout()); if (operation == null) { logger.trace( "[{}][{}] no translog operations (id: [{}]) to send to {}", indexName, shardId, snapshot.translogId(), request.targetNode()); } while (operation != null) { if (shard.state() == IndexShardState.CLOSED) { throw new IndexShardClosedException(request.shardId()); } cancellableThreads.checkForCancel(); operations.add(operation); ops += 1; size += operation.estimateSize(); totalOperations++; // Check if this request is past the size or bytes threshold, and // if so, send it off if (ops >= recoverySettings.translogOps() || size >= recoverySettings.translogSize().bytes()) { // don't throttle translog, since we lock for phase3 indexing, // so we need to move it as fast as possible. Note, since we // index docs to replicas while the index files are recovered // the lock can potentially be removed, in which case, it might // make sense to re-enable throttling in this phase // if (recoverySettings.rateLimiter() != null) { // recoverySettings.rateLimiter().pause(size); // } if (logger.isTraceEnabled()) { logger.trace( "[{}][{}] sending batch of [{}][{}] (total: [{}], id: [{}]) translog operations to {}", indexName, shardId, ops, new ByteSizeValue(size), shard.translog().estimatedNumberOfOperations(), snapshot.translogId(), request.targetNode()); } cancellableThreads.execute( new Interruptable() { @Override public void run() throws InterruptedException { final RecoveryTranslogOperationsRequest translogOperationsRequest = new RecoveryTranslogOperationsRequest( request.recoveryId(), request.shardId(), operations, shard.translog().estimatedNumberOfOperations()); transportService .submitRequest( request.targetNode(), RecoveryTarget.Actions.TRANSLOG_OPS, translogOperationsRequest, recoveryOptions, EmptyTransportResponseHandler.INSTANCE_SAME) .txGet(); } }); ops = 0; size = 0; operations.clear(); } operation = snapshot.next(); } // send the leftover if (logger.isTraceEnabled()) { logger.trace( "[{}][{}] sending final batch of [{}][{}] (total: [{}], id: [{}]) translog operations to {}", indexName, shardId, ops, new ByteSizeValue(size), shard.translog().estimatedNumberOfOperations(), snapshot.translogId(), request.targetNode()); } if (!operations.isEmpty()) { cancellableThreads.execute( new Interruptable() { @Override public void run() throws InterruptedException { RecoveryTranslogOperationsRequest translogOperationsRequest = new RecoveryTranslogOperationsRequest( request.recoveryId(), request.shardId(), operations, shard.translog().estimatedNumberOfOperations()); transportService .submitRequest( request.targetNode(), RecoveryTarget.Actions.TRANSLOG_OPS, translogOperationsRequest, recoveryOptions, EmptyTransportResponseHandler.INSTANCE_SAME) .txGet(); } }); } return totalOperations; }
/** * Perform phase2 of the recovery process * * <p>Phase2 takes a snapshot of the current translog *without* acquiring the write lock (however, * the translog snapshot is a point-in-time view of the translog). It then sends each translog * operation to the target node so it can be replayed into the new shard. * * <p>{@code InternalEngine#recover} is responsible for taking the snapshot of the translog and * releasing it once all 3 phases of recovery are complete */ @Override public void phase2(Translog.Snapshot snapshot) throws ElasticsearchException { if (shard.state() == IndexShardState.CLOSED) { throw new IndexShardClosedException(request.shardId()); } cancellableThreads.checkForCancel(); logger.trace("{} recovery [phase2] to {}: start", request.shardId(), request.targetNode()); StopWatch stopWatch = new StopWatch().start(); cancellableThreads.execute( new Interruptable() { @Override public void run() throws InterruptedException { // Send a request preparing the new shard's translog to receive // operations. This ensures the shard engine is started and disables // garbage collection (not the JVM's GC!) of tombstone deletes transportService .submitRequest( request.targetNode(), RecoveryTarget.Actions.PREPARE_TRANSLOG, new RecoveryPrepareForTranslogOperationsRequest( request.recoveryId(), request.shardId(), shard.translog().estimatedNumberOfOperations()), TransportRequestOptions.options() .withTimeout(recoverySettings.internalActionTimeout()), EmptyTransportResponseHandler.INSTANCE_SAME) .txGet(); } }); stopWatch.stop(); response.startTime = stopWatch.totalTime().millis(); logger.trace( "{} recovery [phase2] to {}: start took [{}]", request.shardId(), request.targetNode(), stopWatch.totalTime()); logger.trace( "{} recovery [phase2] to {}: updating current mapping to master", request.shardId(), request.targetNode()); // Ensure that the mappings are synced with the master node updateMappingOnMaster(); logger.trace( "{} recovery [phase2] to {}: sending transaction log operations", request.shardId(), request.targetNode()); stopWatch = new StopWatch().start(); // Send all the snapshot's translog operations to the target int totalOperations = sendSnapshot(snapshot); stopWatch.stop(); logger.trace( "{} recovery [phase2] to {}: took [{}]", request.shardId(), request.targetNode(), stopWatch.totalTime()); response.phase2Time = stopWatch.totalTime().millis(); response.phase2Operations = totalOperations; }
/** * Perform phase1 of the recovery operations. Once this {@link SnapshotIndexCommit} snapshot has * been performed no commit operations (files being fsync'd) are effectively allowed on this index * until all recovery phases are done * * <p>Phase1 examines the segment files on the target node and copies over the segments that are * missing. Only segments that have the same size and checksum can be reused * * <p>{@code InternalEngine#recover} is responsible for snapshotting the index and releasing the * snapshot once all 3 phases of recovery are complete */ @Override public void phase1(final SnapshotIndexCommit snapshot) throws ElasticsearchException { cancellableThreads.checkForCancel(); // Total size of segment files that are recovered long totalSize = 0; // Total size of segment files that were able to be re-used long existingTotalSize = 0; final Store store = shard.store(); store.incRef(); try { StopWatch stopWatch = new StopWatch().start(); final Store.MetadataSnapshot recoverySourceMetadata = store.getMetadata(snapshot); for (String name : snapshot.getFiles()) { final StoreFileMetaData md = recoverySourceMetadata.get(name); if (md == null) { logger.info( "Snapshot differs from actual index for file: {} meta: {}", name, recoverySourceMetadata.asMap()); throw new CorruptIndexException( "Snapshot differs from actual index - maybe index was removed; metadata has " + recoverySourceMetadata.asMap().size() + " files"); } } String recoverySourceSyncId = recoverySourceMetadata.getSyncId(); String recoveryTargetSyncId = request.metadataSnapshot().getSyncId(); final boolean recoverWithSyncId = recoverySourceSyncId != null && recoverySourceSyncId.equals(recoveryTargetSyncId); if (recoverWithSyncId) { final long numDocsTarget = request.metadataSnapshot().getNumDocs(); final long numDocsSource = recoverySourceMetadata.getNumDocs(); if (numDocsTarget != numDocsSource) { throw new IllegalStateException( "try to recover " + request.shardId() + " from primary shard with sync id but number of docs differ: " + numDocsSource + " (" + request.sourceNode().getName() + ", primary) vs " + numDocsTarget + " (" + request.targetNode().getName() + ")"); } // we shortcut recovery here because we have nothing to copy, but we must still start the // engine on the target. 
// so we don't return here logger.trace( "[{}][{}] skipping [phase1] to {} - identical sync id [{}] found on both source and target", indexName, shardId, request.targetNode(), recoverySourceSyncId); } else { // Generate a "diff" of all the identical, different, and missing // segment files on the target node, using the existing files on // the source node final Store.RecoveryDiff diff = recoverySourceMetadata.recoveryDiff(request.metadataSnapshot()); for (StoreFileMetaData md : diff.identical) { response.phase1ExistingFileNames.add(md.name()); response.phase1ExistingFileSizes.add(md.length()); existingTotalSize += md.length(); if (logger.isTraceEnabled()) { logger.trace( "[{}][{}] recovery [phase1] to {}: not recovering [{}], exists in local store and has checksum [{}], size [{}]", indexName, shardId, request.targetNode(), md.name(), md.checksum(), md.length()); } totalSize += md.length(); } for (StoreFileMetaData md : Iterables.concat(diff.different, diff.missing)) { if (request.metadataSnapshot().asMap().containsKey(md.name())) { logger.trace( "[{}][{}] recovery [phase1] to {}: recovering [{}], exists in local store, but is different: remote [{}], local [{}]", indexName, shardId, request.targetNode(), md.name(), request.metadataSnapshot().get(md.name()), md); } else { logger.trace( "[{}][{}] recovery [phase1] to {}: recovering [{}], does not exists in remote", indexName, shardId, request.targetNode(), md.name()); } response.phase1FileNames.add(md.name()); response.phase1FileSizes.add(md.length()); totalSize += md.length(); } response.phase1TotalSize = totalSize; response.phase1ExistingTotalSize = existingTotalSize; logger.trace( "[{}][{}] recovery [phase1] to {}: recovering_files [{}] with total_size [{}], reusing_files [{}] with total_size [{}]", indexName, shardId, request.targetNode(), response.phase1FileNames.size(), new ByteSizeValue(totalSize), response.phase1ExistingFileNames.size(), new ByteSizeValue(existingTotalSize)); cancellableThreads.execute( new Interruptable() { @Override public void run() throws InterruptedException { RecoveryFilesInfoRequest recoveryInfoFilesRequest = new RecoveryFilesInfoRequest( request.recoveryId(), request.shardId(), response.phase1FileNames, response.phase1FileSizes, response.phase1ExistingFileNames, response.phase1ExistingFileSizes, shard.translog().estimatedNumberOfOperations(), response.phase1TotalSize, response.phase1ExistingTotalSize); transportService .submitRequest( request.targetNode(), RecoveryTarget.Actions.FILES_INFO, recoveryInfoFilesRequest, TransportRequestOptions.options() .withTimeout(recoverySettings.internalActionTimeout()), EmptyTransportResponseHandler.INSTANCE_SAME) .txGet(); } }); // This latch will be used to wait until all files have been transferred to the target node final CountDownLatch latch = new CountDownLatch(response.phase1FileNames.size()); final CopyOnWriteArrayList<Throwable> exceptions = new CopyOnWriteArrayList<>(); final AtomicReference<Throwable> corruptedEngine = new AtomicReference<>(); int fileIndex = 0; ThreadPoolExecutor pool; // How many bytes we've copied since we last called RateLimiter.pause final AtomicLong bytesSinceLastPause = new AtomicLong(); for (final String name : response.phase1FileNames) { long fileSize = response.phase1FileSizes.get(fileIndex); // Files are split into two categories, files that are "small" // (under 5mb) and other files. Small files are transferred // using a separate thread pool dedicated to small files. 
// // The idea behind this is that while we are transferring an // older, large index, a user may create a new index, but that // index will not be able to recover until the large index // finishes, by using two different thread pools we can allow // tiny files (like segments for a brand new index) to be // recovered while ongoing large segment recoveries are // happening. It also allows these pools to be configured // separately. if (fileSize > RecoverySettings.SMALL_FILE_CUTOFF_BYTES) { pool = recoverySettings.concurrentStreamPool(); } else { pool = recoverySettings.concurrentSmallFileStreamPool(); } pool.execute( new AbstractRunnable() { @Override public void onFailure(Throwable t) { // we either got rejected or the store can't be incremented / we are canceled logger.debug("Failed to transfer file [" + name + "] on recovery"); } public void onAfter() { // Signify this file has completed by decrementing the latch latch.countDown(); } @Override protected void doRun() { cancellableThreads.checkForCancel(); store.incRef(); final StoreFileMetaData md = recoverySourceMetadata.get(name); try (final IndexInput indexInput = store.directory().openInput(name, IOContext.READONCE)) { final int BUFFER_SIZE = (int) recoverySettings.fileChunkSize().bytes(); final byte[] buf = new byte[BUFFER_SIZE]; boolean shouldCompressRequest = recoverySettings.compress(); if (CompressorFactory.isCompressed(indexInput)) { shouldCompressRequest = false; } final long len = indexInput.length(); long readCount = 0; final TransportRequestOptions requestOptions = TransportRequestOptions.options() .withCompress(shouldCompressRequest) .withType(TransportRequestOptions.Type.RECOVERY) .withTimeout(recoverySettings.internalActionTimeout()); while (readCount < len) { if (shard.state() == IndexShardState.CLOSED) { // check if the shard got closed on us throw new IndexShardClosedException(shard.shardId()); } int toRead = readCount + BUFFER_SIZE > len ? (int) (len - readCount) : BUFFER_SIZE; final long position = indexInput.getFilePointer(); // Pause using the rate limiter, if desired, to throttle the recovery RateLimiter rl = recoverySettings.rateLimiter(); long throttleTimeInNanos = 0; if (rl != null) { long bytes = bytesSinceLastPause.addAndGet(toRead); if (bytes > rl.getMinPauseCheckBytes()) { // Time to pause bytesSinceLastPause.addAndGet(-bytes); throttleTimeInNanos = rl.pause(bytes); shard.recoveryStats().addThrottleTime(throttleTimeInNanos); } } indexInput.readBytes(buf, 0, toRead, false); final BytesArray content = new BytesArray(buf, 0, toRead); readCount += toRead; final boolean lastChunk = readCount == len; final RecoveryFileChunkRequest fileChunkRequest = new RecoveryFileChunkRequest( request.recoveryId(), request.shardId(), md, position, content, lastChunk, shard.translog().estimatedNumberOfOperations(), throttleTimeInNanos); cancellableThreads.execute( new Interruptable() { @Override public void run() throws InterruptedException { // Actually send the file chunk to the target node, waiting for it to // complete transportService .submitRequest( request.targetNode(), RecoveryTarget.Actions.FILE_CHUNK, fileChunkRequest, requestOptions, EmptyTransportResponseHandler.INSTANCE_SAME) .txGet(); } }); } } catch (Throwable e) { final Throwable corruptIndexException; if ((corruptIndexException = ExceptionsHelper.unwrapCorruption(e)) != null) { if (store.checkIntegrity(md) == false) { // we are corrupted on the primary -- fail! 
logger.warn( "{} Corrupted file detected {} checksum mismatch", shard.shardId(), md); if (corruptedEngine.compareAndSet(null, corruptIndexException) == false) { // if we are not the first exception, add ourselves as suppressed to the // main one: corruptedEngine.get().addSuppressed(e); } } else { // corruption has happened on the way to replica RemoteTransportException exception = new RemoteTransportException( "File corruption occurred on recovery but checksums are ok", null); exception.addSuppressed(e); exceptions.add(0, exception); // last exception first logger.warn( "{} Remote file corruption on node {}, recovering {}. local checksum OK", corruptIndexException, shard.shardId(), request.targetNode(), md); } } else { exceptions.add(0, e); // last exceptions first } } finally { store.decRef(); } } }); fileIndex++; } cancellableThreads.execute( new Interruptable() { @Override public void run() throws InterruptedException { // Wait for all files that need to be transferred to finish transferring latch.await(); } }); if (corruptedEngine.get() != null) { throw corruptedEngine.get(); } else { ExceptionsHelper.rethrowAndSuppress(exceptions); } cancellableThreads.execute( new Interruptable() { @Override public void run() throws InterruptedException { // Send the CLEAN_FILES request, which takes all of the files that // were transferred and renames them from their temporary file // names to the actual file names. It also writes checksums for // the files after they have been renamed. // // Once the files have been renamed, any other files that are not // related to this recovery (out of date segments, for example) // are deleted try { transportService .submitRequest( request.targetNode(), RecoveryTarget.Actions.CLEAN_FILES, new RecoveryCleanFilesRequest( request.recoveryId(), shard.shardId(), recoverySourceMetadata, shard.translog().estimatedNumberOfOperations()), TransportRequestOptions.options() .withTimeout(recoverySettings.internalActionTimeout()), EmptyTransportResponseHandler.INSTANCE_SAME) .txGet(); } catch (RemoteTransportException remoteException) { final IOException corruptIndexException; // we realized that after the index was copied and we wanted to finalize the // recovery // the index was corrupted: // - maybe due to a broken segments file on an empty index (transferred with no // checksum) // - maybe due to old segments without checksums or length only checks if ((corruptIndexException = ExceptionsHelper.unwrapCorruption(remoteException)) != null) { try { final Store.MetadataSnapshot recoverySourceMetadata = store.getMetadata(snapshot); StoreFileMetaData[] metadata = Iterables.toArray(recoverySourceMetadata, StoreFileMetaData.class); ArrayUtil.timSort( metadata, new Comparator<StoreFileMetaData>() { @Override public int compare(StoreFileMetaData o1, StoreFileMetaData o2) { return Long.compare( o1.length(), o2.length()); // check small files first } }); for (StoreFileMetaData md : metadata) { logger.debug( "{} checking integrity for file {} after remove corruption exception", shard.shardId(), md); if (store.checkIntegrity(md) == false) { // we are corrupted on the primary -- fail! 
logger.warn( "{} Corrupted file detected {} checksum mismatch", shard.shardId(), md); throw corruptIndexException; } } } catch (IOException ex) { remoteException.addSuppressed(ex); throw remoteException; } // corruption has happened on the way to replica RemoteTransportException exception = new RemoteTransportException( "File corruption occurred on recovery but checksums are ok", null); exception.addSuppressed(remoteException); logger.warn( "{} Remote file corruption during finalization on node {}, recovering {}. local checksum OK", corruptIndexException, shard.shardId(), request.targetNode()); throw exception; } else { throw remoteException; } } } }); } stopWatch.stop(); logger.trace( "[{}][{}] recovery [phase1] to {}: took [{}]", indexName, shardId, request.targetNode(), stopWatch.totalTime()); response.phase1Time = stopWatch.totalTime().millis(); } catch (Throwable e) { throw new RecoverFilesRecoveryException( request.shardId(), response.phase1FileNames.size(), new ByteSizeValue(totalSize), e); } finally { store.decRef(); } }
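The throttling in the chunk-sending loop above follows a common pattern: instead of pausing after every chunk, the sender accumulates bytes in a shared counter and only pauses once enough data has piled up to make a sleep worthwhile (the role played by RateLimiter#getMinPauseCheckBytes). Below is a self-contained sketch of that pattern using a plain sleep instead of Lucene's RateLimiter; the class name and the pause computation are simplified assumptions.

import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

/** Sketch of the recovery throttling pattern: bytes accumulate across chunk sends and the
 *  sender only pauses once enough bytes have piled up, instead of pausing on every tiny chunk. */
class RecoveryThrottleSketch {
    private final AtomicLong bytesSinceLastPause = new AtomicLong();
    private final double maxBytesPerSec;
    private final long minPauseCheckBytes; // analogous in spirit to RateLimiter#getMinPauseCheckBytes

    RecoveryThrottleSketch(double maxBytesPerSec, long minPauseCheckBytes) {
        this.maxBytesPerSec = maxBytesPerSec;
        this.minPauseCheckBytes = minPauseCheckBytes;
    }

    /** Returns the time spent throttling, in nanoseconds. */
    long maybePause(int bytesJustSent) throws InterruptedException {
        long accumulated = bytesSinceLastPause.addAndGet(bytesJustSent);
        if (accumulated < minPauseCheckBytes) {
            return 0; // too little data since the last pause to be worth sleeping for
        }
        bytesSinceLastPause.addAndGet(-accumulated); // consume the accumulated budget
        // Sleep roughly as long as it should take to send the accumulated bytes at the target rate.
        long pauseNanos = (long) (accumulated / maxBytesPerSec * TimeUnit.SECONDS.toNanos(1));
        TimeUnit.NANOSECONDS.sleep(pauseNanos);
        return pauseNanos;
    }
}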