private synchronized void scheduleSnapshotIfNeeded() {
    if (!shardGateway.requiresSnapshot()) {
        return;
    }
    if (!shardGateway.requiresSnapshotScheduling()) {
        return;
    }
    if (!indexShard.routingEntry().primary()) {
        // we only do snapshotting on the primary shard
        return;
    }
    if (!indexShard.routingEntry().started()) {
        // we only schedule when the cluster assumes we have started
        return;
    }
    if (snapshotScheduleFuture != null) {
        // we are already scheduling this one, ignore
        return;
    }
    if (snapshotInterval.millis() != -1) {
        // we need to schedule a snapshot
        if (logger.isDebugEnabled()) {
            logger.debug("scheduling snapshot every [{}]", snapshotInterval);
        }
        snapshotScheduleFuture = threadPool.schedule(snapshotInterval, ThreadPool.Names.SNAPSHOT, snapshotRunnable);
    }
}
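// A minimal sketch of what the scheduled runnable above could look like. This is an
// illustration, not the class's actual snapshotRunnable: run one snapshot pass, then
// reschedule itself so the periodic loop continues at the same interval.
private class SnapshotRunnable implements Runnable {
    @Override
    public void run() {
        try {
            snapshot("scheduled");
        } catch (Exception e) {
            logger.warn("failed to snapshot (scheduled)", e);
        }
        // hypothetical rescheduling step, assuming the same interval and thread pool
        snapshotScheduleFuture = threadPool.schedule(snapshotInterval, ThreadPool.Names.SNAPSHOT, this);
    }
}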
private Directory getStoreDirectory(String index, int shardId) {
    Set<String> nodes = cluster().nodesInclude("test");
    assertThat(nodes.isEmpty(), equalTo(false));
    IndicesService indicesService = cluster().getInstance(IndicesService.class, nodes.iterator().next());
    InternalIndexShard indexShard = (InternalIndexShard) indicesService.indexService(index).shard(shardId);
    return indexShard.store().directory();
}
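// A hypothetical usage of the helper above, assuming the standard Lucene Directory
// API: list the shard's on-disk files so a test can assert on them (or deliberately
// corrupt them). The test name and assertions are illustrative, not from the source.
public void testStoreDirectoryListsFiles() throws IOException {
    Directory dir = getStoreDirectory("test", 0);
    String[] files = dir.listAll();
    // after indexing, the shard directory should contain at least one segment file
    assertThat(files.length > 0, equalTo(true));
}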
@Override
public RecoveryStatus recover() throws IndexShardGatewayRecoveryException {
    // in the none case, we simply start the shard
    indexShard.start();
    return new RecoveryStatus(
            new RecoveryStatus.Index(-1, 0, new SizeValue(0, SizeUnit.BYTES)),
            new RecoveryStatus.Translog(-1, 0, new SizeValue(0, SizeUnit.BYTES)));
}
@Override
protected ShardTermlistResponse shardOperation(ShardTermlistRequest request) throws ElasticSearchException {
    synchronized (termlistMutex) {
        InternalIndexShard indexShard =
                (InternalIndexShard) indicesService.indexServiceSafe(request.index()).shardSafe(request.shardId());
        Engine.Searcher searcher = indexShard.searcher();
        try {
            Set<String> set = new CompactHashSet();
            Fields fields = MultiFields.getFields(searcher.reader());
            if (fields != null) {
                for (Iterator<String> it = fields.iterator(); it.hasNext(); ) {
                    String field = it.next();
                    if (field.charAt(0) == '_') {
                        // skip internal fields such as _uid and _source
                        continue;
                    }
                    if (request.getField() == null || field.equals(request.getField())) {
                        Terms terms = fields.terms(field);
                        if (terms != null) {
                            TermsEnum termsEnum = terms.iterator(null);
                            BytesRef text;
                            while ((text = termsEnum.next()) != null) {
                                set.add(text.utf8ToString());
                            }
                        }
                    }
                }
            }
            return new ShardTermlistResponse(request.index(), request.shardId(), set);
        } catch (IOException ex) {
            throw new ElasticSearchException(ex.getMessage(), ex);
        } finally {
            // always release the searcher so the underlying reader reference is not leaked
            searcher.release();
        }
    }
}
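// A hedged sketch of how the per-shard term sets could be merged on the coordinating
// node. The response classes follow the usual broadcast-action pattern, but the
// getTermList() accessor and the TermlistResponse constructor are assumptions for
// illustration, not the plugin's confirmed API.
protected TermlistResponse mergeShardResponses(List<ShardTermlistResponse> shardResponses) {
    Set<String> merged = new TreeSet<String>(); // sorted union over all shards
    for (ShardTermlistResponse shardResponse : shardResponses) {
        merged.addAll(shardResponse.getTermList()); // hypothetical accessor
    }
    return new TermlistResponse(merged);
}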
@Override
protected ShardStatus shardOperation(IndexShardStatusRequest request) throws ElasticSearchException {
    InternalIndexService indexService = (InternalIndexService) indicesService.indexServiceSafe(request.index());
    InternalIndexShard indexShard = (InternalIndexShard) indexService.shardSafe(request.shardId());
    ShardStatus shardStatus = new ShardStatus(indexShard.routingEntry());
    shardStatus.state = indexShard.state();
    try {
        shardStatus.storeSize = indexShard.store().estimateSize();
    } catch (IOException e) {
        // failure to get the store size...
    }
    if (indexShard.state() == IndexShardState.STARTED) {
        // shardStatus.estimatedFlushableMemorySize = indexShard.estimateFlushableMemorySize();
        shardStatus.translogId = indexShard.translog().currentId();
        shardStatus.translogOperations = indexShard.translog().estimatedNumberOfOperations();
        Engine.Searcher searcher = indexShard.searcher();
        try {
            shardStatus.docs = new DocsStatus();
            shardStatus.docs.numDocs = searcher.reader().numDocs();
            shardStatus.docs.maxDoc = searcher.reader().maxDoc();
            shardStatus.docs.deletedDocs = searcher.reader().numDeletedDocs();
        } finally {
            searcher.release();
        }
        shardStatus.mergeStats = indexShard.mergeScheduler().stats();
        shardStatus.refreshStats = indexShard.refreshStats();
        shardStatus.flushStats = indexShard.flushStats();
    }
    if (request.recovery) {
        // check ongoing recovery (from peer or gateway)
        RecoveryStatus peerRecoveryStatus = indexShard.peerRecoveryStatus();
        if (peerRecoveryStatus == null) {
            peerRecoveryStatus = peerRecoveryTarget.peerRecoveryStatus(indexShard.shardId());
        }
        if (peerRecoveryStatus != null) {
            PeerRecoveryStatus.Stage stage;
            switch (peerRecoveryStatus.stage()) {
                case INIT:
                    stage = PeerRecoveryStatus.Stage.INIT;
                    break;
                case INDEX:
                    stage = PeerRecoveryStatus.Stage.INDEX;
                    break;
                case TRANSLOG:
                    stage = PeerRecoveryStatus.Stage.TRANSLOG;
                    break;
                case FINALIZE:
                    stage = PeerRecoveryStatus.Stage.FINALIZE;
                    break;
                case DONE:
                    stage = PeerRecoveryStatus.Stage.DONE;
                    break;
                default:
                    stage = PeerRecoveryStatus.Stage.INIT;
            }
            shardStatus.peerRecoveryStatus = new PeerRecoveryStatus(stage,
                    peerRecoveryStatus.startTime(),
                    peerRecoveryStatus.time(),
                    peerRecoveryStatus.phase1TotalSize(),
                    peerRecoveryStatus.phase1ExistingTotalSize(),
                    peerRecoveryStatus.currentFilesSize(),
                    peerRecoveryStatus.currentTranslogOperations());
        }
        IndexShardGatewayService gatewayService =
                indexService.shardInjector(request.shardId()).getInstance(IndexShardGatewayService.class);
        org.elasticsearch.index.gateway.RecoveryStatus gatewayRecoveryStatus = gatewayService.recoveryStatus();
        if (gatewayRecoveryStatus != null) {
            GatewayRecoveryStatus.Stage stage;
            switch (gatewayRecoveryStatus.stage()) {
                case INIT:
                    stage = GatewayRecoveryStatus.Stage.INIT;
                    break;
                case INDEX:
                    stage = GatewayRecoveryStatus.Stage.INDEX;
                    break;
                case TRANSLOG:
                    stage = GatewayRecoveryStatus.Stage.TRANSLOG;
                    break;
                case DONE:
                    stage = GatewayRecoveryStatus.Stage.DONE;
                    break;
                default:
                    stage = GatewayRecoveryStatus.Stage.INIT;
            }
            shardStatus.gatewayRecoveryStatus = new GatewayRecoveryStatus(stage,
                    gatewayRecoveryStatus.startTime(),
                    gatewayRecoveryStatus.time(),
                    gatewayRecoveryStatus.index().totalSize(),
                    gatewayRecoveryStatus.index().reusedTotalSize(),
                    gatewayRecoveryStatus.index().currentFilesSize(),
                    gatewayRecoveryStatus.translog().currentTranslogOperations());
        }
    }
    if (request.snapshot) {
        IndexShardGatewayService gatewayService =
                indexService.shardInjector(request.shardId()).getInstance(IndexShardGatewayService.class);
        SnapshotStatus snapshotStatus = gatewayService.snapshotStatus();
        if (snapshotStatus != null) {
            GatewaySnapshotStatus.Stage stage;
            switch (snapshotStatus.stage()) {
                case DONE:
                    stage = GatewaySnapshotStatus.Stage.DONE;
                    break;
                case FAILURE:
                    stage = GatewaySnapshotStatus.Stage.FAILURE;
                    break;
                case TRANSLOG:
                    stage = GatewaySnapshotStatus.Stage.TRANSLOG;
                    break;
                case FINALIZE:
                    stage = GatewaySnapshotStatus.Stage.FINALIZE;
                    break;
                case INDEX:
                    stage = GatewaySnapshotStatus.Stage.INDEX;
                    break;
                default:
                    stage = GatewaySnapshotStatus.Stage.NONE;
                    break;
            }
            shardStatus.gatewaySnapshotStatus = new GatewaySnapshotStatus(stage,
                    snapshotStatus.startTime(),
                    snapshotStatus.time(),
                    snapshotStatus.index().totalSize(),
                    snapshotStatus.translog().expectedNumberOfOperations());
        }
    }
    return shardStatus;
}
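// The three switch blocks above each translate one Stage enum into another whose
// constants share the same names. A hedged alternative sketch: map by name via
// Enum.valueOf and fall back to a default when the target has no matching constant.
// This is an illustration of the pattern, not how the class actually does it.
private static <E extends Enum<E>> E mapStage(Enum<?> from, Class<E> to, E fallback) {
    try {
        return Enum.valueOf(to, from.name());
    } catch (IllegalArgumentException e) {
        return fallback; // target enum has no constant with this name
    }
}
// usage: mapStage(peerRecoveryStatus.stage(), PeerRecoveryStatus.Stage.class, PeerRecoveryStatus.Stage.INIT)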
private void recoverTranslog(CommitPoint commitPoint, ImmutableMap<String, BlobMetaData> blobs)
        throws IndexShardGatewayRecoveryException {
    if (commitPoint.translogFiles().isEmpty()) {
        // no translog files, bail
        indexShard.start("post recovery from gateway, no translog");
        return;
    }
    try {
        indexShard.performRecoveryPrepareForTranslog();
        final AtomicReference<Throwable> failure = new AtomicReference<Throwable>();
        final CountDownLatch latch = new CountDownLatch(1);
        final Iterator<CommitPoint.FileInfo> transIt = commitPoint.translogFiles().iterator();

        blobContainer.readBlob(transIt.next().name(), new BlobContainer.ReadBlobListener() {
            FastByteArrayOutputStream bos = new FastByteArrayOutputStream();
            boolean ignore = false;

            @Override
            public synchronized void onPartial(byte[] data, int offset, int size) throws IOException {
                if (ignore) {
                    return;
                }
                bos.write(data, offset, size);
                // if we don't have enough to read the size header of the first operation,
                // bail and wait for the next chunk
                if (bos.size() < 4) {
                    return;
                }
                BytesStreamInput si = new BytesStreamInput(bos.unsafeByteArray(), 0, bos.size());
                int position;
                while (true) {
                    try {
                        position = si.position();
                        if (position + 4 > bos.size()) {
                            break;
                        }
                        int opSize = si.readInt();
                        int curPos = si.position();
                        if ((si.position() + opSize) > bos.size()) {
                            break;
                        }
                        Translog.Operation operation = TranslogStreams.readTranslogOperation(si);
                        if ((si.position() - curPos) != opSize) {
                            logger.warn("mismatch in size, expected [{}], got [{}]", opSize, si.position() - curPos);
                        }
                        recoveryStatus.translog().addTranslogOperations(1);
                        indexShard.performRecoveryOperation(operation);
                        if (si.position() >= bos.size()) {
                            position = si.position();
                            break;
                        }
                    } catch (Exception e) {
                        logger.warn("failed to retrieve translog after [{}] operations, ignoring the rest, considered corrupted",
                                e, recoveryStatus.translog().currentTranslogOperations());
                        ignore = true;
                        latch.countDown();
                        return;
                    }
                }
                // carry over the bytes of any partially read operation to the next onPartial call
                FastByteArrayOutputStream newBos = new FastByteArrayOutputStream();
                int leftOver = bos.size() - position;
                if (leftOver > 0) {
                    newBos.write(bos.unsafeByteArray(), position, leftOver);
                }
                bos = newBos;
            }

            @Override
            public synchronized void onCompleted() {
                if (ignore) {
                    return;
                }
                if (!transIt.hasNext()) {
                    latch.countDown();
                    return;
                }
                blobContainer.readBlob(transIt.next().name(), this);
            }

            @Override
            public void onFailure(Throwable t) {
                failure.set(t);
                latch.countDown();
            }
        });

        latch.await();
        if (failure.get() != null) {
            throw failure.get();
        }
        indexShard.performRecoveryFinalization(true);
    } catch (Throwable e) {
        throw new IndexShardGatewayRecoveryException(shardId, "Failed to recover translog", e);
    }
}
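// The callback above parses a stream of length-prefixed translog operations: a
// 4-byte big-endian size header, followed by exactly that many payload bytes.
// A minimal, self-contained sketch of a writer producing that framing; the real
// encoding of a Translog.Operation lives in TranslogStreams, so the payload here
// is a stand-in byte array:
static void writeFramedOperation(java.io.DataOutputStream out, byte[] opBytes) throws java.io.IOException {
    out.writeInt(opBytes.length); // 4-byte size header, matching the si.readInt() above
    out.write(opBytes);           // operation payload of exactly opSize bytes
}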
@Override
public void recover(RecoveryStatus recoveryStatus) throws IndexShardGatewayRecoveryException {
    this.recoveryStatus = recoveryStatus;
    final ImmutableMap<String, BlobMetaData> blobs;
    try {
        blobs = blobContainer.listBlobs();
    } catch (IOException e) {
        throw new IndexShardGatewayRecoveryException(shardId, "Failed to list content of gateway", e);
    }
    List<CommitPoint> commitPointsList = Lists.newArrayList();
    boolean atLeastOneCommitPointExists = false;
    for (String name : blobs.keySet()) {
        if (name.startsWith("commit-")) {
            atLeastOneCommitPointExists = true;
            try {
                commitPointsList.add(CommitPoints.fromXContent(blobContainer.readBlobFully(name)));
            } catch (Exception e) {
                logger.warn("failed to read commit point [{}]", e, name);
            }
        }
    }
    if (atLeastOneCommitPointExists && commitPointsList.isEmpty()) {
        // no commit point managed to load, bail so we won't corrupt the index; this will require
        // manual intervention
        throw new IndexShardGatewayRecoveryException(shardId, "Commit points exist but none could be loaded", null);
    }
    CommitPoints commitPoints = new CommitPoints(commitPointsList);
    if (commitPoints.commits().isEmpty()) {
        // no commit points, clean the store just so we won't recover wrong files
        try {
            indexShard.store().deleteContent();
        } catch (IOException e) {
            logger.warn("failed to clean store before starting shard", e);
        }
        recoveryStatus.index().startTime(System.currentTimeMillis());
        recoveryStatus.index().time(System.currentTimeMillis() - recoveryStatus.index().startTime());
        recoveryStatus.translog().startTime(System.currentTimeMillis());
        recoveryStatus.translog().time(System.currentTimeMillis() - recoveryStatus.translog().startTime());
        return;
    }
    for (CommitPoint commitPoint : commitPoints) {
        if (!commitPointExistsInBlobs(commitPoint, blobs)) {
            logger.warn("listed commit_point [{}]/[{}], but not all files exist, ignoring",
                    commitPoint.name(), commitPoint.version());
            continue;
        }
        try {
            recoveryStatus.index().startTime(System.currentTimeMillis());
            recoveryStatus.updateStage(RecoveryStatus.Stage.INDEX);
            recoverIndex(commitPoint, blobs);
            recoveryStatus.index().time(System.currentTimeMillis() - recoveryStatus.index().startTime());

            recoveryStatus.translog().startTime(System.currentTimeMillis());
            recoveryStatus.updateStage(RecoveryStatus.Stage.TRANSLOG);
            recoverTranslog(commitPoint, blobs);
            // measure the translog phase from its own start time, not the index phase's
            recoveryStatus.translog().time(System.currentTimeMillis() - recoveryStatus.translog().startTime());
            return;
        } catch (Exception e) {
            throw new IndexShardGatewayRecoveryException(shardId,
                    "failed to recover commit_point [" + commitPoint.name() + "]/[" + commitPoint.version() + "]", e);
        }
    }
    throw new IndexShardGatewayRecoveryException(shardId, "No commit point data is available in gateway", null);
}
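// commitPointExistsInBlobs is used above but defined elsewhere. A plausible sketch,
// under the assumption that a commit point is only usable when every file it
// references is present in the blob listing, and that CommitPoint exposes its index
// files alongside the translogFiles() used earlier:
private boolean commitPointExistsInBlobs(CommitPoint commitPoint, ImmutableMap<String, BlobMetaData> blobs) {
    for (CommitPoint.FileInfo fileInfo : Iterables.concat(commitPoint.indexFiles(), commitPoint.translogFiles())) {
        if (!blobs.containsKey(fileInfo.name())) {
            return false; // a referenced file is missing from the gateway
        }
    }
    return true;
}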
/** Snapshots the given shard into the gateway. */
public synchronized void snapshot(final String reason) throws IndexShardGatewaySnapshotFailedException {
    if (!indexShard.routingEntry().primary()) {
        return;
        // throw new IndexShardGatewaySnapshotNotAllowedException(shardId, "Snapshot not allowed on non primary shard");
    }
    if (indexShard.routingEntry().relocating()) {
        // do not snapshot while primaries are relocating so we won't get conflicts
        return;
    }
    if (indexShard.state() == IndexShardState.CREATED) {
        // shard has just been created, ignore it and return
        return;
    }
    if (indexShard.state() == IndexShardState.RECOVERING) {
        // shard is recovering, don't snapshot
        return;
    }
    if (snapshotLock == null) {
        try {
            snapshotLock = shardGateway.obtainSnapshotLock();
        } catch (Exception e) {
            logger.warn("failed to obtain snapshot lock, ignoring snapshot", e);
            return;
        }
    }
    try {
        SnapshotStatus snapshotStatus = indexShard.snapshot(new Engine.SnapshotHandler<SnapshotStatus>() {
            @Override
            public SnapshotStatus snapshot(SnapshotIndexCommit snapshotIndexCommit, Translog.Snapshot translogSnapshot)
                    throws EngineException {
                if (lastIndexVersion != snapshotIndexCommit.getGeneration()
                        || lastTranslogId != translogSnapshot.translogId()
                        || lastTranslogLength < translogSnapshot.length()) {
                    logger.debug("snapshot ({}) to {} ...", reason, shardGateway);
                    SnapshotStatus snapshotStatus = shardGateway.snapshot(new IndexShardGateway.Snapshot(
                            snapshotIndexCommit, translogSnapshot, lastIndexVersion, lastTranslogId,
                            lastTranslogLength, lastTotalTranslogOperations));
                    lastIndexVersion = snapshotIndexCommit.getGeneration();
                    lastTranslogId = translogSnapshot.translogId();
                    lastTranslogLength = translogSnapshot.length();
                    lastTotalTranslogOperations = translogSnapshot.estimatedTotalOperations();
                    return snapshotStatus;
                }
                return null;
            }
        });
        if (snapshotStatus != null) {
            if (logger.isDebugEnabled()) {
                StringBuilder sb = new StringBuilder();
                sb.append("snapshot (").append(reason).append(") completed to ").append(shardGateway)
                        .append(", took [").append(TimeValue.timeValueMillis(snapshotStatus.time())).append("]\n");
                sb.append("    index    : version [").append(lastIndexVersion)
                        .append("], number_of_files [").append(snapshotStatus.index().numberOfFiles())
                        .append("] with total_size [").append(new ByteSizeValue(snapshotStatus.index().totalSize()))
                        .append("], took [").append(TimeValue.timeValueMillis(snapshotStatus.index().time())).append("]\n");
                sb.append("    translog : id [").append(lastTranslogId)
                        .append("], number_of_operations [").append(snapshotStatus.translog().expectedNumberOfOperations())
                        .append("], took [").append(TimeValue.timeValueMillis(snapshotStatus.translog().time())).append("]");
                logger.debug(sb.toString());
            }
        }
    } catch (SnapshotFailedEngineException e) {
        if (e.getCause() instanceof IllegalStateException) {
            // ignore, that's fine, snapshot has not started yet
        } else {
            throw new IndexShardGatewaySnapshotFailedException(shardId, "Failed to snapshot", e);
        }
    } catch (IllegalIndexShardStateException e) {
        // ignore, that's fine, snapshot has not started yet
    } catch (IndexShardGatewaySnapshotFailedException e) {
        throw e;
    } catch (Exception e) {
        throw new IndexShardGatewaySnapshotFailedException(shardId, "Failed to snapshot", e);
    }
}
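// The handler above snapshots only when something changed since the last run. The
// condition reads as a "dirty" predicate; a sketch of it extracted into a helper
// (the field names mirror those used above, the extraction itself is illustrative):
private boolean snapshotNeeded(SnapshotIndexCommit commit, Translog.Snapshot translog) {
    return lastIndexVersion != commit.getGeneration()    // new Lucene commit generation
            || lastTranslogId != translog.translogId()   // translog rolled over to a new id
            || lastTranslogLength < translog.length();   // more operations were appended
}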
/** Recovers the state of the shard from the gateway. */
public void recover(final boolean indexShouldExists, final RecoveryListener listener)
        throws IndexShardGatewayRecoveryException, IgnoreGatewayRecoveryException {
    if (indexShard.state() == IndexShardState.CLOSED) {
        // got closed on us, just ignore this recovery
        listener.onIgnoreRecovery("shard closed");
        return;
    }
    if (!indexShard.routingEntry().primary()) {
        listener.onRecoveryFailed(new IndexShardGatewayRecoveryException(shardId,
                "Trying to recover when the shard is in backup state", null));
        return;
    }
    try {
        if (indexShard.routingEntry().restoreSource() != null) {
            indexShard.recovering("from snapshot");
        } else {
            indexShard.recovering("from gateway");
        }
    } catch (IllegalIndexShardStateException e) {
        // that's fine, since we might be called concurrently, just ignore this, we are already recovering
        listener.onIgnoreRecovery("already in recovering process, " + e.getMessage());
        return;
    }
    threadPool.generic().execute(new Runnable() {
        @Override
        public void run() {
            recoveryStatus = new RecoveryStatus();
            recoveryStatus.updateStage(RecoveryStatus.Stage.INIT);
            try {
                if (indexShard.routingEntry().restoreSource() != null) {
                    logger.debug("restoring from {} ...", indexShard.routingEntry().restoreSource());
                    snapshotService.restore(recoveryStatus);
                } else {
                    logger.debug("starting recovery from {} ...", shardGateway);
                    shardGateway.recover(indexShouldExists, recoveryStatus);
                }
                lastIndexVersion = recoveryStatus.index().version();
                lastTranslogId = -1;
                lastTranslogLength = 0;
                lastTotalTranslogOperations = recoveryStatus.translog().currentTranslogOperations();

                // start the shard if the gateway has not started it already. Note that if the gateway
                // moved the shard to POST_RECOVERY, it may have been started as well if:
                // 1) master sent a new cluster state indicating shard is initializing
                // 2) IndicesClusterStateService#applyInitializingShard will send a shard started event
                // 3) master will mark shard as started and this will be processed locally.
                IndexShardState shardState = indexShard.state();
                if (shardState != IndexShardState.POST_RECOVERY && shardState != IndexShardState.STARTED) {
                    indexShard.postRecovery("post recovery from gateway");
                }
                // refresh the shard
                indexShard.refresh(new Engine.Refresh("post_gateway").force(true));
                recoveryStatus.time(System.currentTimeMillis() - recoveryStatus.startTime());
                recoveryStatus.updateStage(RecoveryStatus.Stage.DONE);
                // check trace first: trace implies debug, so the detailed message must win
                if (logger.isTraceEnabled()) {
                    StringBuilder sb = new StringBuilder();
                    sb.append("recovery completed from ").append(shardGateway).append(", took [")
                            .append(timeValueMillis(recoveryStatus.time())).append("]\n");
                    sb.append("    index    : files [").append(recoveryStatus.index().numberOfFiles())
                            .append("] with total_size [").append(new ByteSizeValue(recoveryStatus.index().totalSize()))
                            .append("], took [").append(TimeValue.timeValueMillis(recoveryStatus.index().time())).append("]\n");
                    sb.append("             : recovered_files [").append(recoveryStatus.index().numberOfRecoveredFiles())
                            .append("] with total_size [").append(new ByteSizeValue(recoveryStatus.index().recoveredTotalSize()))
                            .append("]\n");
                    sb.append("             : reusing_files   [").append(recoveryStatus.index().numberOfReusedFiles())
                            .append("] with total_size [").append(new ByteSizeValue(recoveryStatus.index().reusedTotalSize()))
                            .append("]\n");
                    sb.append("    start    : took [").append(TimeValue.timeValueMillis(recoveryStatus.start().time()))
                            .append("], check_index [").append(timeValueMillis(recoveryStatus.start().checkIndexTime())).append("]\n");
                    sb.append("    translog : number_of_operations [").append(recoveryStatus.translog().currentTranslogOperations())
                            .append("], took [").append(TimeValue.timeValueMillis(recoveryStatus.translog().time())).append("]");
                    logger.trace(sb.toString());
                } else if (logger.isDebugEnabled()) {
                    logger.debug("recovery completed from [{}], took [{}]", shardGateway, timeValueMillis(recoveryStatus.time()));
                }
                listener.onRecoveryDone();
                scheduleSnapshotIfNeeded();
            } catch (IndexShardGatewayRecoveryException e) {
                if (indexShard.state() == IndexShardState.CLOSED) {
                    // got closed on us, just ignore this recovery
                    listener.onIgnoreRecovery("shard closed");
                    return;
                }
                if ((e.getCause() instanceof IndexShardClosedException)
                        || (e.getCause() instanceof IndexShardNotStartedException)) {
                    // got closed on us, just ignore this recovery
                    listener.onIgnoreRecovery("shard closed");
                    return;
                }
                listener.onRecoveryFailed(e);
            } catch (IndexShardClosedException e) {
                listener.onIgnoreRecovery("shard closed");
            } catch (IndexShardNotStartedException e) {
                listener.onIgnoreRecovery("shard closed");
            } catch (Exception e) {
                if (indexShard.state() == IndexShardState.CLOSED) {
                    // got closed on us, just ignore this recovery
                    listener.onIgnoreRecovery("shard closed");
                    return;
                }
                listener.onRecoveryFailed(new IndexShardGatewayRecoveryException(shardId, "failed recovery", e));
            }
        }
    });
}
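// The listener contract used above, reconstructed as a sketch from the three
// callbacks this method invokes; the real interface is defined elsewhere, so treat
// the exact signatures as assumptions:
public interface RecoveryListener {
    void onRecoveryDone();                                       // gateway recovery finished
    void onIgnoreRecovery(String reason);                        // recovery skipped (e.g. shard closed)
    void onRecoveryFailed(IndexShardGatewayRecoveryException e); // recovery failed hard
}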