/**
 * Restores shard from {@link RestoreSource} associated with this shard in routing table
 *
 * @param recoveryState recovery state
 */
public void restore(final RecoveryState recoveryState) {
    RestoreSource restoreSource = indexShard.routingEntry().restoreSource();
    if (restoreSource == null) {
        throw new IndexShardRestoreFailedException(shardId, "empty restore source");
    }
    if (logger.isTraceEnabled()) {
        logger.trace("[{}] restoring shard [{}]", restoreSource.snapshotId(), shardId);
    }
    try {
        recoveryState.getTranslog().totalOperations(0);
        recoveryState.getTranslog().totalOperationsOnStart(0);
        indexShard.prepareForIndexRecovery();
        IndexShardRepository indexShardRepository = repositoriesService.indexShardRepository(restoreSource.snapshotId().getRepository());
        ShardId snapshotShardId = shardId;
        if (!shardId.getIndex().equals(restoreSource.index())) {
            snapshotShardId = new ShardId(restoreSource.index(), shardId.id());
        }
        indexShardRepository.restore(restoreSource.snapshotId(), shardId, snapshotShardId, recoveryState);
        indexShard.prepareForTranslogRecovery();
        indexShard.finalizeRecovery();
        indexShard.postRecovery("restore done");
        restoreService.indexShardRestoreCompleted(restoreSource.snapshotId(), shardId);
    } catch (Throwable t) {
        if (Lucene.isCorruptionException(t)) {
            restoreService.failRestore(restoreSource.snapshotId(), shardId());
        }
        throw new IndexShardRestoreFailedException(shardId, "restore failed", t);
    }
}
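/*
 * Illustrative sketch (an addition, not part of the original class): the shard-id
 * remapping above matters when a snapshot index is restored under a different name
 * (e.g. via restore rename patterns), because the snapshot's shard files live under
 * the snapshot's own index name. The helper below is a hypothetical extraction of
 * that logic.
 */
private static ShardId resolveSnapshotShardId(ShardId targetShardId, RestoreSource restoreSource) {
    if (targetShardId.getIndex().equals(restoreSource.index())) {
        // index restored under its original name; shard ids line up 1:1
        return targetShardId;
    }
    // index renamed on restore; read the snapshot files under the snapshot's index name
    return new ShardId(restoreSource.index(), targetShardId.id());
}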
private void processFailure(SearchContext context, Throwable t) {
    freeContext(context.id());
    try {
        if (Lucene.isCorruptionException(t)) {
            context.indexShard().failShard("search execution corruption failure", t);
        }
    } catch (Throwable e) {
        logger.warn("failed to process shard failure to (potentially) send back shard failure on corruption", e);
    }
}
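/*
 * Sketch of a typical call site (hypothetical; the executeQueryPhase shape and the
 * createAndPutContext/cleanContext helpers are assumptions for illustration): any
 * failure escaping a search phase is routed through processFailure so that the
 * context is freed and a corrupted shard is failed eagerly.
 */
public QuerySearchResult executeQueryPhase(ShardSearchRequest request) {
    SearchContext context = createAndPutContext(request);
    try {
        queryPhase.execute(context);
        return context.queryResult();
    } catch (Throwable e) {
        processFailure(context, e); // frees the context; fails the shard on corruption
        throw ExceptionsHelper.convertToRuntime(e);
    } finally {
        cleanContext(context);
    }
}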
@Override
public CommitId flush(boolean force, boolean waitIfOngoing) throws EngineException {
    ensureOpen();
    final byte[] newCommitId;
    /*
     * Unfortunately the lock order is important here. We have to acquire the readlock first otherwise
     * if we are flushing at the end of the recovery while holding the write lock we can deadlock if:
     *  Thread 1: flushes via API and gets the flush lock but blocks on the readlock since Thread 2 has the writeLock
     *  Thread 2: flushes at the end of the recovery holding the writeLock and blocks on the flushLock owned by Thread 1
     */
    try (ReleasableLock lock = readLock.acquire()) {
        ensureOpen();
        if (flushLock.tryLock() == false) {
            // if we can't get the lock right away we block if needed otherwise barf
            if (waitIfOngoing) {
                logger.trace("waiting for in-flight flush to finish");
                flushLock.lock();
                logger.trace("acquired flush lock after blocking");
            } else {
                throw new FlushNotAllowedEngineException(shardId, "already flushing...");
            }
        } else {
            logger.trace("acquired flush lock immediately");
        }
        try {
            if (indexWriter.hasUncommittedChanges() || force) {
                ensureCanFlush();
                try {
                    translog.prepareCommit();
                    logger.trace("starting commit for flush; commitTranslog=true");
                    commitIndexWriter(indexWriter, translog, null);
                    logger.trace("finished commit for flush");
                    // we need to refresh in order to clear older version values
                    refresh("version_table_flush");
                    // after refresh documents can be retrieved from the index so we can now commit the translog
                    translog.commit();
                } catch (Throwable e) {
                    throw new FlushFailedEngineException(shardId, e);
                }
            }
            /*
             * we have to inc-ref the store here since if the engine is closed by a tragic event
             * we don't acquire the write lock and wait until we have exclusive access. This might also
             * dec the store reference which can essentially close the store and unless we can inc the reference
             * we can't use it.
             */
            store.incRef();
            try {
                // reread the last committed segment infos
                lastCommittedSegmentInfos = store.readLastCommittedSegmentsInfo();
            } catch (Throwable e) {
                if (isClosed.get() == false) {
                    logger.warn("failed to read latest segment infos on flush", e);
                    if (Lucene.isCorruptionException(e)) {
                        throw new FlushFailedEngineException(shardId, e);
                    }
                }
            } finally {
                store.decRef();
            }
            newCommitId = lastCommittedSegmentInfos.getId();
        } catch (FlushFailedEngineException ex) {
            maybeFailEngine("flush", ex);
            throw ex;
        } finally {
            flushLock.unlock();
        }
    }
    // We don't have to do this here; we do it defensively to make sure that even if wall clock time
    // is misbehaving (e.g., moves backwards) we will at least still sometimes prune deleted tombstones:
    if (engineConfig.isEnableGcDeletes()) {
        pruneDeletedTombstones();
    }
    return new CommitId(newCommitId);
}
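/*
 * Usage sketch (hypothetical caller; `engine` stands in for an engine instance):
 * the two flags control what happens when another flush already holds the flush lock.
 */
void flushExamples(Engine engine) {
    // force=true, waitIfOngoing=true: commit even without uncommitted changes,
    // blocking until any in-flight flush releases the flush lock
    Engine.CommitId commitId = engine.flush(true, true);

    // force=false, waitIfOngoing=false: throws instead of blocking if a flush is in-flight
    try {
        engine.flush(false, false);
    } catch (FlushNotAllowedEngineException e) {
        // another flush holds the flush lock; the caller may retry later
    }
}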