public class GatewayService extends AbstractLifecycleComponent implements ClusterStateListener {

  public static final Setting<Integer> EXPECTED_NODES_SETTING =
      Setting.intSetting("gateway.expected_nodes", -1, -1, Property.NodeScope);
  public static final Setting<Integer> EXPECTED_DATA_NODES_SETTING =
      Setting.intSetting("gateway.expected_data_nodes", -1, -1, Property.NodeScope);
  public static final Setting<Integer> EXPECTED_MASTER_NODES_SETTING =
      Setting.intSetting("gateway.expected_master_nodes", -1, -1, Property.NodeScope);
  public static final Setting<TimeValue> RECOVER_AFTER_TIME_SETTING =
      Setting.positiveTimeSetting(
          "gateway.recover_after_time", TimeValue.timeValueMillis(0), Property.NodeScope);
  public static final Setting<Integer> RECOVER_AFTER_NODES_SETTING =
      Setting.intSetting("gateway.recover_after_nodes", -1, -1, Property.NodeScope);
  public static final Setting<Integer> RECOVER_AFTER_DATA_NODES_SETTING =
      Setting.intSetting("gateway.recover_after_data_nodes", -1, -1, Property.NodeScope);
  public static final Setting<Integer> RECOVER_AFTER_MASTER_NODES_SETTING =
      Setting.intSetting("gateway.recover_after_master_nodes", 0, 0, Property.NodeScope);

  public static final ClusterBlock STATE_NOT_RECOVERED_BLOCK =
      new ClusterBlock(
          1,
          "state not recovered / initialized",
          true,
          true,
          RestStatus.SERVICE_UNAVAILABLE,
          ClusterBlockLevel.ALL);

  public static final TimeValue DEFAULT_RECOVER_AFTER_TIME_IF_EXPECTED_NODES_IS_SET =
      TimeValue.timeValueMinutes(5);

  private final Gateway gateway;
  private final ThreadPool threadPool;
  private final AllocationService allocationService;
  private final ClusterService clusterService;

  private final TimeValue recoverAfterTime;
  private final int recoverAfterNodes;
  private final int expectedNodes;
  private final int recoverAfterDataNodes;
  private final int expectedDataNodes;
  private final int recoverAfterMasterNodes;
  private final int expectedMasterNodes;

  private final AtomicBoolean recovered = new AtomicBoolean();
  private final AtomicBoolean scheduledRecovery = new AtomicBoolean();

  @Inject
  public GatewayService(
      Settings settings,
      AllocationService allocationService,
      ClusterService clusterService,
      ThreadPool threadPool,
      GatewayMetaState metaState,
      TransportNodesListGatewayMetaState listGatewayMetaState,
      Discovery discovery,
      IndicesService indicesService) {
    super(settings);
    this.gateway =
        new Gateway(
            settings, clusterService, metaState, listGatewayMetaState, discovery, indicesService);
    this.allocationService = allocationService;
    this.clusterService = clusterService;
    this.threadPool = threadPool;
    // allow to control a delay of when indices will get created
    this.expectedNodes = EXPECTED_NODES_SETTING.get(this.settings);
    this.expectedDataNodes = EXPECTED_DATA_NODES_SETTING.get(this.settings);
    this.expectedMasterNodes = EXPECTED_MASTER_NODES_SETTING.get(this.settings);

    if (RECOVER_AFTER_TIME_SETTING.exists(this.settings)) {
      recoverAfterTime = RECOVER_AFTER_TIME_SETTING.get(this.settings);
    } else if (expectedNodes >= 0 || expectedDataNodes >= 0 || expectedMasterNodes >= 0) {
      recoverAfterTime = DEFAULT_RECOVER_AFTER_TIME_IF_EXPECTED_NODES_IS_SET;
    } else {
      recoverAfterTime = null;
    }
    this.recoverAfterNodes = RECOVER_AFTER_NODES_SETTING.get(this.settings);
    this.recoverAfterDataNodes = RECOVER_AFTER_DATA_NODES_SETTING.get(this.settings);
    // default the recover after master nodes to the minimum master nodes in the discovery
    if (RECOVER_AFTER_MASTER_NODES_SETTING.exists(this.settings)) {
      recoverAfterMasterNodes = RECOVER_AFTER_MASTER_NODES_SETTING.get(this.settings);
    } else {
      // TODO: change me once the minimum_master_nodes is changed too
      recoverAfterMasterNodes = settings.getAsInt("discovery.zen.minimum_master_nodes", -1);
    }

    // Add the not recovered as initial state block, we don't allow anything until it is removed
    this.clusterService.addInitialStateBlock(STATE_NOT_RECOVERED_BLOCK);
  }

  @Override
  protected void doStart() {
    // use post applied so that the state will be visible to the background recovery thread we
    // spawn in performStateRecovery
    clusterService.addListener(this);
  }

  @Override
  protected void doStop() {
    clusterService.removeListener(this);
  }

  @Override
  protected void doClose() {}

  @Override
  public void clusterChanged(final ClusterChangedEvent event) {
    if (lifecycle.stoppedOrClosed()) {
      return;
    }

    final ClusterState state = event.state();

    if (state.nodes().isLocalNodeElectedMaster() == false) {
      // not our job to recover
      return;
    }
    if (state.blocks().hasGlobalBlock(STATE_NOT_RECOVERED_BLOCK) == false) {
      // already recovered
      return;
    }

    DiscoveryNodes nodes = state.nodes();
    if (state.nodes().getMasterNodeId() == null) {
      logger.debug("not recovering from gateway, no master elected yet");
    } else if (recoverAfterNodes != -1
        && (nodes.getMasterAndDataNodes().size()) < recoverAfterNodes) {
      logger.debug(
          "not recovering from gateway, nodes_size (data+master) [{}] < recover_after_nodes [{}]",
          nodes.getMasterAndDataNodes().size(),
          recoverAfterNodes);
    } else if (recoverAfterDataNodes != -1
        && nodes.getDataNodes().size() < recoverAfterDataNodes) {
      logger.debug(
          "not recovering from gateway, nodes_size (data) [{}] < recover_after_data_nodes [{}]",
          nodes.getDataNodes().size(),
          recoverAfterDataNodes);
    } else if (recoverAfterMasterNodes != -1
        && nodes.getMasterNodes().size() < recoverAfterMasterNodes) {
      logger.debug(
          "not recovering from gateway, nodes_size (master) [{}] < recover_after_master_nodes [{}]",
          nodes.getMasterNodes().size(),
          recoverAfterMasterNodes);
    } else {
      boolean enforceRecoverAfterTime;
      String reason;
      if (expectedNodes == -1 && expectedMasterNodes == -1 && expectedDataNodes == -1) {
        // no expected number of nodes is set, honor the recover_after_time setting if present
        enforceRecoverAfterTime = true;
        reason = "recover_after_time was set to [" + recoverAfterTime + "]";
      } else {
        // one of the expected settings is set, see if all of them are met, and ignore the timeout
        // in this case
        enforceRecoverAfterTime = false;
        reason = "";
        if (expectedNodes != -1 && (nodes.getMasterAndDataNodes().size() < expectedNodes)) {
          // does not meet the expected...
          enforceRecoverAfterTime = true;
          reason =
              "expecting ["
                  + expectedNodes
                  + "] nodes, but only have ["
                  + nodes.getMasterAndDataNodes().size()
                  + "]";
        } else if (expectedDataNodes != -1 && (nodes.getDataNodes().size() < expectedDataNodes)) {
          // does not meet the expected...
          enforceRecoverAfterTime = true;
          reason =
              "expecting ["
                  + expectedDataNodes
                  + "] data nodes, but only have ["
                  + nodes.getDataNodes().size()
                  + "]";
        } else if (expectedMasterNodes != -1
            && (nodes.getMasterNodes().size() < expectedMasterNodes)) {
          // does not meet the expected...
          enforceRecoverAfterTime = true;
          reason =
              "expecting ["
                  + expectedMasterNodes
                  + "] master nodes, but only have ["
                  + nodes.getMasterNodes().size()
                  + "]";
        }
      }
      performStateRecovery(enforceRecoverAfterTime, reason);
    }
  }

  private void performStateRecovery(boolean enforceRecoverAfterTime, String reason) {
    final Gateway.GatewayStateRecoveredListener recoveryListener = new GatewayRecoveryListener();

    if (enforceRecoverAfterTime && recoverAfterTime != null) {
      if (scheduledRecovery.compareAndSet(false, true)) {
        logger.info("delaying initial state recovery for [{}]. {}", recoverAfterTime, reason);
        threadPool.schedule(
            recoverAfterTime,
            ThreadPool.Names.GENERIC,
            () -> {
              if (recovered.compareAndSet(false, true)) {
                logger.info(
                    "recover_after_time [{}] elapsed. performing state recovery...",
                    recoverAfterTime);
                gateway.performStateRecovery(recoveryListener);
              }
            });
      }
    } else {
      if (recovered.compareAndSet(false, true)) {
        threadPool
            .generic()
            .execute(
                new AbstractRunnable() {
                  @Override
                  public void onFailure(Exception e) {
                    logger.warn("recovery failed", e);
                    // we reset `recovered` in the listener; don't reset it here, otherwise there
                    // might be a race that resets it to false while a new recovery is already
                    // running
                    recoveryListener.onFailure("state recovery failed: " + e.getMessage());
                  }

                  @Override
                  protected void doRun() throws Exception {
                    gateway.performStateRecovery(recoveryListener);
                  }
                });
      }
    }
  }

  public Gateway getGateway() {
    return gateway;
  }

  class GatewayRecoveryListener implements Gateway.GatewayStateRecoveredListener {

    @Override
    public void onSuccess(final ClusterState recoveredState) {
      logger.trace("successful state recovery, importing cluster state...");
      clusterService.submitStateUpdateTask(
          "local-gateway-elected-state",
          new ClusterStateUpdateTask() {
            @Override
            public ClusterState execute(ClusterState currentState) {
              assert currentState.metaData().indices().isEmpty();

              // remove the block, since we recovered from gateway
              ClusterBlocks.Builder blocks =
                  ClusterBlocks.builder()
                      .blocks(currentState.blocks())
                      .blocks(recoveredState.blocks())
                      .removeGlobalBlock(STATE_NOT_RECOVERED_BLOCK);

              MetaData.Builder metaDataBuilder = MetaData.builder(recoveredState.metaData());
              // automatically generate a UID for the metadata if we need to
              metaDataBuilder.generateClusterUuidIfNeeded();

              if (MetaData.SETTING_READ_ONLY_SETTING.get(recoveredState.metaData().settings())
                  || MetaData.SETTING_READ_ONLY_SETTING.get(currentState.metaData().settings())) {
                blocks.addGlobalBlock(MetaData.CLUSTER_READ_ONLY_BLOCK);
              }

              for (IndexMetaData indexMetaData : recoveredState.metaData()) {
                metaDataBuilder.put(indexMetaData, false);
                blocks.addBlocks(indexMetaData);
              }

              // update the state to reflect the new metadata and routing
              ClusterState updatedState =
                  ClusterState.builder(currentState)
                      .blocks(blocks)
                      .metaData(metaDataBuilder)
                      .build();

              // initialize all index routing tables as empty
              RoutingTable.Builder routingTableBuilder =
                  RoutingTable.builder(updatedState.routingTable());
              for (ObjectCursor<IndexMetaData> cursor :
                  updatedState.metaData().indices().values()) {
                routingTableBuilder.addAsRecovery(cursor.value);
              }
              // start with 0 based versions for routing table
              routingTableBuilder.version(0);

              // now, reroute
              updatedState =
                  ClusterState.builder(updatedState)
                      .routingTable(routingTableBuilder.build())
                      .build();
              return allocationService.reroute(updatedState, "state recovered");
            }

            @Override
            public void onFailure(String source, Exception e) {
              logger.error(
                  (Supplier<?>)
                      () -> new ParameterizedMessage("unexpected failure during [{}]", source),
                  e);
              GatewayRecoveryListener.this.onFailure("failed to update cluster state");
            }

            @Override
            public void clusterStateProcessed(
                String source, ClusterState oldState, ClusterState newState) {
              logger.info(
                  "recovered [{}] indices into cluster_state",
                  newState.metaData().indices().size());
            }
          });
    }

    @Override
    public void onFailure(String message) {
      recovered.set(false);
      scheduledRecovery.set(false);
      // don't remove the block here, we don't want to allow anything in such a case
      logger.info("metadata state not restored, reason: {}", message);
    }
  }

  // used for testing
  public TimeValue recoverAfterTime() {
    return recoverAfterTime;
  }
}
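/*
 * Illustrative sketch (not part of the original source): how the gateway.* settings declared
 * above interact. If gateway.recover_after_time is absent but any gateway.expected_* value is
 * set, GatewayService falls back to DEFAULT_RECOVER_AFTER_TIME_IF_EXPECTED_NODES_IS_SET (five
 * minutes) before forcing state recovery. The class name and setting values below are
 * hypothetical.
 */
class GatewayServiceSettingsExample {
  public static void main(String[] args) {
    Settings settings =
        Settings.builder()
            .put("gateway.expected_data_nodes", 3) // implies the five-minute default delay
            .put("gateway.recover_after_data_nodes", 2) // recovery may start once 2 data nodes join
            .build();
    // recover_after_time is not set explicitly, so the constructor above picks the default
    assert GatewayService.RECOVER_AFTER_TIME_SETTING.exists(settings) == false;
    System.out.println(
        "expected data nodes: " + GatewayService.EXPECTED_DATA_NODES_SETTING.get(settings));
    System.out.println(
        "recover after data nodes: "
            + GatewayService.RECOVER_AFTER_DATA_NODES_SETTING.get(settings));
  }
}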
/** A node level service that deletes expired docs on node primary shards. */
public class IndicesTTLService extends AbstractLifecycleComponent<IndicesTTLService> {

  public static final Setting<TimeValue> INDICES_TTL_INTERVAL_SETTING =
      Setting.positiveTimeSetting(
          "indices.ttl.interval",
          TimeValue.timeValueSeconds(60),
          Property.Dynamic,
          Property.NodeScope);

  private final ClusterService clusterService;
  private final IndicesService indicesService;
  private final TransportBulkAction bulkAction;

  private final int bulkSize;
  private PurgerThread purgerThread;

  @Inject
  public IndicesTTLService(
      Settings settings,
      ClusterService clusterService,
      IndicesService indicesService,
      ClusterSettings clusterSettings,
      TransportBulkAction bulkAction) {
    super(settings);
    this.clusterService = clusterService;
    this.indicesService = indicesService;
    TimeValue interval = INDICES_TTL_INTERVAL_SETTING.get(settings);
    this.bulkAction = bulkAction;
    this.bulkSize = this.settings.getAsInt("indices.ttl.bulk_size", 10000);
    this.purgerThread =
        new PurgerThread(EsExecutors.threadName(settings, "[ttl_expire]"), interval);
    clusterSettings.addSettingsUpdateConsumer(
        INDICES_TTL_INTERVAL_SETTING, this.purgerThread::resetInterval);
  }

  @Override
  protected void doStart() {
    this.purgerThread.start();
  }

  @Override
  protected void doStop() {
    try {
      this.purgerThread.shutdown();
    } catch (InterruptedException e) {
      // we intentionally do not want to restore the interruption flag, we're about to shutdown
      // anyway
    }
  }

  @Override
  protected void doClose() {}

  private class PurgerThread extends Thread {
    private final AtomicBoolean running = new AtomicBoolean(true);
    private final Notifier notifier;
    private final CountDownLatch shutdownLatch = new CountDownLatch(1);

    public PurgerThread(String name, TimeValue interval) {
      super(name);
      setDaemon(true);
      this.notifier = new Notifier(interval);
    }

    public void shutdown() throws InterruptedException {
      if (running.compareAndSet(true, false)) {
        notifier.doNotify();
        shutdownLatch.await();
      }
    }

    public void resetInterval(TimeValue interval) {
      notifier.setTimeout(interval);
    }

    @Override
    public void run() {
      try {
        while (running.get()) {
          try {
            List<IndexShard> shardsToPurge = getShardsToPurge();
            purgeShards(shardsToPurge);
          } catch (Throwable e) {
            if (running.get()) {
              logger.warn("failed to execute ttl purge", e);
            }
          }
          if (running.get()) {
            notifier.await();
          }
        }
      } finally {
        shutdownLatch.countDown();
      }
    }

    /**
     * Returns the shards to purge, i.e., the local started primary shards that have TTL enabled
     * for at least one type and disable_purge set to false.
     */
    private List<IndexShard> getShardsToPurge() {
      List<IndexShard> shardsToPurge = new ArrayList<>();
      MetaData metaData = clusterService.state().metaData();
      for (IndexService indexService : indicesService) {
        // check the value of disable_purge for this index
        IndexMetaData indexMetaData = metaData.index(indexService.index());
        if (indexMetaData == null) {
          continue;
        }
        if (indexService.getIndexSettings().isTTLPurgeDisabled()) {
          continue;
        }

        // check if ttl is enabled for at least one type of this index
        boolean hasTTLEnabled = false;
        for (String type : indexService.mapperService().types()) {
          DocumentMapper documentType = indexService.mapperService().documentMapper(type);
          if (documentType.TTLFieldMapper().enabled()) {
            hasTTLEnabled = true;
            break;
          }
        }
        if (hasTTLEnabled) {
          for (IndexShard indexShard : indexService) {
            if (indexShard.state() == IndexShardState.STARTED
                && indexShard.routingEntry().primary()
                && indexShard.routingEntry().started()) {
              shardsToPurge.add(indexShard);
            }
          }
        }
      }
      return shardsToPurge;
    }

    public TimeValue getInterval() {
      return notifier.getTimeout();
    }
  }

  private void purgeShards(List<IndexShard> shardsToPurge) {
    for (IndexShard shardToPurge : shardsToPurge) {
      Query query =
          shardToPurge
              .mapperService()
              .fullName(TTLFieldMapper.NAME)
              .rangeQuery(null, System.currentTimeMillis(), false, true);
      Engine.Searcher searcher = shardToPurge.acquireSearcher("indices_ttl");
      try {
        logger.debug(
            "[{}][{}] purging shard",
            shardToPurge.routingEntry().index(),
            shardToPurge.routingEntry().id());
        ExpiredDocsCollector expiredDocsCollector = new ExpiredDocsCollector();
        searcher.searcher().search(query, expiredDocsCollector);
        List<DocToPurge> docsToPurge = expiredDocsCollector.getDocsToPurge();

        BulkRequest bulkRequest = new BulkRequest();
        for (DocToPurge docToPurge : docsToPurge) {
          bulkRequest.add(
              new DeleteRequest()
                  .index(shardToPurge.routingEntry().getIndexName())
                  .type(docToPurge.type)
                  .id(docToPurge.id)
                  .version(docToPurge.version)
                  .routing(docToPurge.routing));
          bulkRequest = processBulkIfNeeded(bulkRequest, false);
        }
        processBulkIfNeeded(bulkRequest, true);
      } catch (Exception e) {
        logger.warn("failed to purge", e);
      } finally {
        searcher.close();
      }
    }
  }

  private static class DocToPurge {
    public final String type;
    public final String id;
    public final long version;
    public final String routing;

    public DocToPurge(String type, String id, long version, String routing) {
      this.type = type;
      this.id = id;
      this.version = version;
      this.routing = routing;
    }
  }

  private class ExpiredDocsCollector extends SimpleCollector {
    private LeafReaderContext context;
    private List<DocToPurge> docsToPurge = new ArrayList<>();

    public ExpiredDocsCollector() {}

    @Override
    public void setScorer(Scorer scorer) {}

    @Override
    public boolean needsScores() {
      return false;
    }

    @Override
    public void collect(int doc) {
      try {
        FieldsVisitor fieldsVisitor = new FieldsVisitor(false);
        context.reader().document(doc, fieldsVisitor);
        Uid uid = fieldsVisitor.uid();
        final long version =
            Versions.loadVersion(
                context.reader(), new Term(UidFieldMapper.NAME, uid.toBytesRef()));
        docsToPurge.add(new DocToPurge(uid.type(), uid.id(), version, fieldsVisitor.routing()));
      } catch (Exception e) {
        logger.trace("failed to collect doc", e);
      }
    }

    @Override
    public void doSetNextReader(LeafReaderContext context) throws IOException {
      this.context = context;
    }

    public List<DocToPurge> getDocsToPurge() {
      return this.docsToPurge;
    }
  }

  private BulkRequest processBulkIfNeeded(BulkRequest bulkRequest, boolean force) {
    if ((force && bulkRequest.numberOfActions() > 0)
        || bulkRequest.numberOfActions() >= bulkSize) {
      try {
        bulkAction.executeBulk(
            bulkRequest,
            new ActionListener<BulkResponse>() {
              @Override
              public void onResponse(BulkResponse bulkResponse) {
                if (bulkResponse.hasFailures()) {
                  int failedItems = 0;
                  for (BulkItemResponse response : bulkResponse) {
                    if (response.isFailed()) failedItems++;
                  }
                  if (logger.isTraceEnabled()) {
                    logger.trace(
                        "bulk deletion failures for [{}]/[{}] items, failure message: [{}]",
                        failedItems,
                        bulkResponse.getItems().length,
                        bulkResponse.buildFailureMessage());
                  } else {
                    logger.error(
                        "bulk deletion failures for [{}]/[{}] items",
                        failedItems,
                        bulkResponse.getItems().length);
                  }
                } else {
                  logger.trace("bulk deletion took {}ms", bulkResponse.getTookInMillis());
                }
              }

              @Override
              public void onFailure(Throwable e) {
                if (logger.isTraceEnabled()) {
                  logger.trace("failed to execute bulk", e);
                } else {
                  logger.warn("failed to execute bulk: ", e);
                }
              }
            });
      } catch (Exception e) {
        logger.warn("failed to process bulk", e);
      }
      bulkRequest = new BulkRequest();
    }
    return bulkRequest;
  }

  private static final class Notifier {
    private final ReentrantLock lock = new ReentrantLock();
    private final Condition condition = lock.newCondition();
    private volatile TimeValue timeout;

    public Notifier(TimeValue timeout) {
      assert timeout != null;
      this.timeout = timeout;
    }

    public void await() {
      lock.lock();
      try {
        condition.await(timeout.millis(), TimeUnit.MILLISECONDS);
      } catch (InterruptedException e) {
        // we intentionally do not want to restore the interruption flag, we're about to shutdown
        // anyway
      } finally {
        lock.unlock();
      }
    }

    public void setTimeout(TimeValue timeout) {
      assert timeout != null;
      this.timeout = timeout;
      doNotify();
    }

    public TimeValue getTimeout() {
      return timeout;
    }

    public void doNotify() {
      lock.lock();
      try {
        condition.signalAll();
      } finally {
        lock.unlock();
      }
    }
  }
}
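/*
 * Illustrative sketch (not part of the original source): the Notifier pattern above in
 * isolation — a worker that sleeps between purge rounds but wakes immediately when notified,
 * mirroring PurgerThread.shutdown() (flip the flag first, then signal). Plain
 * java.util.concurrent; the class and variable names are hypothetical.
 */
class NotifierPatternExample {
  public static void main(String[] args) throws InterruptedException {
    final java.util.concurrent.locks.ReentrantLock lock =
        new java.util.concurrent.locks.ReentrantLock();
    final java.util.concurrent.locks.Condition condition = lock.newCondition();
    final java.util.concurrent.atomic.AtomicBoolean running =
        new java.util.concurrent.atomic.AtomicBoolean(true);

    Thread worker =
        new Thread(
            () -> {
              while (running.get()) {
                System.out.println("purge round");
                lock.lock();
                try {
                  // wait up to the interval; doNotify() (signalAll) cuts the wait short
                  condition.await(500, java.util.concurrent.TimeUnit.MILLISECONDS);
                } catch (InterruptedException e) {
                  return;
                } finally {
                  lock.unlock();
                }
              }
            },
            "[ttl_expire]");
    worker.setDaemon(true);
    worker.start();

    Thread.sleep(100);
    // shutdown: flip the flag before signalling so the loop condition observes it on wake-up
    running.set(false);
    lock.lock();
    try {
      condition.signalAll();
    } finally {
      lock.unlock();
    }
    worker.join(1_000);
  }
}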
public class ClusterService extends AbstractLifecycleComponent {

  public static final Setting<TimeValue> CLUSTER_SERVICE_SLOW_TASK_LOGGING_THRESHOLD_SETTING =
      Setting.positiveTimeSetting(
          "cluster.service.slow_task_logging_threshold",
          TimeValue.timeValueSeconds(30),
          Property.Dynamic,
          Property.NodeScope);

  public static final String UPDATE_THREAD_NAME = "clusterService#updateTask";
  private final ThreadPool threadPool;
  private final ClusterName clusterName;

  private BiConsumer<ClusterChangedEvent, Discovery.AckListener> clusterStatePublisher;

  private final OperationRouting operationRouting;

  private final ClusterSettings clusterSettings;

  private TimeValue slowTaskLoggingThreshold;

  private volatile PrioritizedEsThreadPoolExecutor updateTasksExecutor;

  /** These three state listener collections change infrequently - CopyOnWriteArrayList is just fine. */
  private final Collection<ClusterStateListener> priorityClusterStateListeners =
      new CopyOnWriteArrayList<>();
  private final Collection<ClusterStateListener> clusterStateListeners =
      new CopyOnWriteArrayList<>();
  private final Collection<ClusterStateListener> lastClusterStateListeners =
      new CopyOnWriteArrayList<>();
  private final Map<ClusterStateTaskExecutor, List<UpdateTask>> updateTasksPerExecutor =
      new HashMap<>();
  // TODO this is rather frequently changing I guess a Synced Set would be better here and a
  // dedicated remove API
  private final Collection<ClusterStateListener> postAppliedListeners =
      new CopyOnWriteArrayList<>();
  private final Iterable<ClusterStateListener> preAppliedListeners =
      Iterables.concat(
          priorityClusterStateListeners, clusterStateListeners, lastClusterStateListeners);

  private final LocalNodeMasterListeners localNodeMasterListeners;

  private final Queue<NotifyTimeout> onGoingTimeouts = ConcurrentCollections.newQueue();

  private volatile ClusterState clusterState;

  private final ClusterBlocks.Builder initialBlocks;

  private NodeConnectionsService nodeConnectionsService;

  public ClusterService(
      Settings settings, ClusterSettings clusterSettings, ThreadPool threadPool) {
    super(settings);
    this.operationRouting = new OperationRouting(settings, clusterSettings);
    this.threadPool = threadPool;
    this.clusterSettings = clusterSettings;
    this.clusterName = ClusterName.CLUSTER_NAME_SETTING.get(settings);
    // will be replaced on doStart.
    this.clusterState = ClusterState.builder(clusterName).build();

    this.clusterSettings.addSettingsUpdateConsumer(
        CLUSTER_SERVICE_SLOW_TASK_LOGGING_THRESHOLD_SETTING, this::setSlowTaskLoggingThreshold);

    this.slowTaskLoggingThreshold =
        CLUSTER_SERVICE_SLOW_TASK_LOGGING_THRESHOLD_SETTING.get(settings);

    localNodeMasterListeners = new LocalNodeMasterListeners(threadPool);

    initialBlocks = ClusterBlocks.builder();
  }

  private void setSlowTaskLoggingThreshold(TimeValue slowTaskLoggingThreshold) {
    this.slowTaskLoggingThreshold = slowTaskLoggingThreshold;
  }

  public synchronized void setClusterStatePublisher(
      BiConsumer<ClusterChangedEvent, Discovery.AckListener> publisher) {
    clusterStatePublisher = publisher;
  }

  public synchronized void setLocalNode(DiscoveryNode localNode) {
    assert clusterState.nodes().getLocalNodeId() == null : "local node is already set";
    DiscoveryNodes.Builder nodeBuilder =
        DiscoveryNodes.builder(clusterState.nodes()).add(localNode).localNodeId(localNode.getId());
    this.clusterState = ClusterState.builder(clusterState).nodes(nodeBuilder).build();
  }

  public synchronized void setNodeConnectionsService(
      NodeConnectionsService nodeConnectionsService) {
    assert this.nodeConnectionsService == null : "nodeConnectionsService is already set";
    this.nodeConnectionsService = nodeConnectionsService;
  }

  /** Adds an initial block to be set on the first cluster state created. */
  public synchronized void addInitialStateBlock(ClusterBlock block) throws IllegalStateException {
    if (lifecycle.started()) {
      throw new IllegalStateException("can't set initial block when started");
    }
    initialBlocks.addGlobalBlock(block);
  }

  /** Removes an initial block to be set on the first cluster state created. */
  public synchronized void removeInitialStateBlock(ClusterBlock block)
      throws IllegalStateException {
    removeInitialStateBlock(block.id());
  }

  /** Removes an initial block to be set on the first cluster state created. */
  public synchronized void removeInitialStateBlock(int blockId) throws IllegalStateException {
    if (lifecycle.started()) {
      throw new IllegalStateException("can't remove initial block when started");
    }
    initialBlocks.removeGlobalBlock(blockId);
  }

  @Override
  protected synchronized void doStart() {
    Objects.requireNonNull(
        clusterStatePublisher, "please set a cluster state publisher before starting");
    Objects.requireNonNull(
        clusterState.nodes().getLocalNode(), "please set the local node before starting");
    Objects.requireNonNull(
        nodeConnectionsService, "please set the node connection service before starting");
    add(localNodeMasterListeners);
    this.clusterState = ClusterState.builder(clusterState).blocks(initialBlocks).build();
    this.updateTasksExecutor =
        EsExecutors.newSinglePrioritizing(
            UPDATE_THREAD_NAME,
            daemonThreadFactory(settings, UPDATE_THREAD_NAME),
            threadPool.getThreadContext());
  }

  @Override
  protected synchronized void doStop() {
    for (NotifyTimeout onGoingTimeout : onGoingTimeouts) {
      try {
        onGoingTimeout.cancel();
        onGoingTimeout.listener.onClose();
      } catch (Exception ex) {
        logger.debug("failed to notify listeners on shutdown", ex);
      }
    }
    ThreadPool.terminate(updateTasksExecutor, 10, TimeUnit.SECONDS);
    // close timeout listeners that did not have an ongoing timeout
    postAppliedListeners
        .stream()
        .filter(listener -> listener instanceof TimeoutClusterStateListener)
        .map(listener -> (TimeoutClusterStateListener) listener)
        .forEach(TimeoutClusterStateListener::onClose);
    remove(localNodeMasterListeners);
  }

  @Override
  protected synchronized void doClose() {}

  /** The local node. */
  public DiscoveryNode localNode() {
    DiscoveryNode localNode = clusterState.getNodes().getLocalNode();
    if (localNode == null) {
      throw new IllegalStateException("No local node found. Is the node started?");
    }
    return localNode;
  }

  public OperationRouting operationRouting() {
    return operationRouting;
  }

  /** The current cluster state. */
  public ClusterState state() {
    return this.clusterState;
  }

  /** Adds a priority listener for updated cluster states. */
  public void addFirst(ClusterStateListener listener) {
    priorityClusterStateListeners.add(listener);
  }

  /** Adds a listener that is notified last for updated cluster states. */
  public void addLast(ClusterStateListener listener) {
    lastClusterStateListeners.add(listener);
  }

  /** Adds a listener for updated cluster states. */
  public void add(ClusterStateListener listener) {
    clusterStateListeners.add(listener);
  }

  /** Removes a listener for updated cluster states. */
  public void remove(ClusterStateListener listener) {
    clusterStateListeners.remove(listener);
    priorityClusterStateListeners.remove(listener);
    lastClusterStateListeners.remove(listener);
    postAppliedListeners.remove(listener);
    for (Iterator<NotifyTimeout> it = onGoingTimeouts.iterator(); it.hasNext(); ) {
      NotifyTimeout timeout = it.next();
      if (timeout.listener.equals(listener)) {
        timeout.cancel();
        it.remove();
      }
    }
  }

  /** Add a listener for on/off local node master events */
  public void add(LocalNodeMasterListener listener) {
    localNodeMasterListeners.add(listener);
  }

  /** Remove the given listener for on/off local master events */
  public void remove(LocalNodeMasterListener listener) {
    localNodeMasterListeners.remove(listener);
  }

  /**
   * Adds a cluster state listener that will time out after the provided timeout, and is executed
   * after the cluster state has been successfully applied, i.e. is in state {@link
   * org.elasticsearch.cluster.ClusterState.ClusterStateStatus#APPLIED}. NOTE: a {@code null}
   * timeout means that the listener will never be removed automatically.
   */
  public void add(@Nullable final TimeValue timeout, final TimeoutClusterStateListener listener) {
    if (lifecycle.stoppedOrClosed()) {
      listener.onClose();
      return;
    }
    // call the post added notification on the same event thread
    try {
      updateTasksExecutor.execute(
          new SourcePrioritizedRunnable(Priority.HIGH, "_add_listener_") {
            @Override
            public void run() {
              if (timeout != null) {
                NotifyTimeout notifyTimeout = new NotifyTimeout(listener, timeout);
                notifyTimeout.future =
                    threadPool.schedule(timeout, ThreadPool.Names.GENERIC, notifyTimeout);
                onGoingTimeouts.add(notifyTimeout);
              }
              postAppliedListeners.add(listener);
              listener.postAdded();
            }
          });
    } catch (EsRejectedExecutionException e) {
      if (lifecycle.stoppedOrClosed()) {
        listener.onClose();
      } else {
        throw e;
      }
    }
  }

  /**
   * Submits a cluster state update task; unlike {@link #submitStateUpdateTask(String, Object,
   * ClusterStateTaskConfig, ClusterStateTaskExecutor, ClusterStateTaskListener)}, submitted updates
   * will not be batched.
   *
   * @param source the source of the cluster state update task
   * @param updateTask the full context for the cluster state update task
   */
  public void submitStateUpdateTask(final String source, final ClusterStateUpdateTask updateTask) {
    submitStateUpdateTask(source, updateTask, updateTask, updateTask, updateTask);
  }

  /**
   * Submits a cluster state update task; submitted updates will be batched across the same
   * instance of executor. The exact batching semantics depend on the underlying implementation,
   * but a rough guideline is that if the update task is submitted while there are pending update
   * tasks for the same executor, these update tasks will all be executed on the executor in a
   * single batch.
   *
   * @param source the source of the cluster state update task
   * @param task the state needed for the cluster state update task
   * @param config the cluster state update task configuration
   * @param executor the cluster state update task executor; tasks that share the same executor
   *     will be executed in batches on this executor
   * @param listener callback after the cluster state update task completes
   * @param <T> the type of the cluster state update task state
   */
  public <T> void submitStateUpdateTask(
      final String source,
      final T task,
      final ClusterStateTaskConfig config,
      final ClusterStateTaskExecutor<T> executor,
      final ClusterStateTaskListener listener) {
    submitStateUpdateTasks(source, Collections.singletonMap(task, listener), config, executor);
  }

  /**
   * Submits a batch of cluster state update tasks; submitted updates are guaranteed to be
   * processed together, potentially with more tasks of the same executor.
   *
   * @param source the source of the cluster state update task
   * @param tasks a map of update tasks and their corresponding listeners
   * @param config the cluster state update task configuration
   * @param executor the cluster state update task executor; tasks that share the same executor
   *     will be executed in batches on this executor
   * @param <T> the type of the cluster state update task state
   */
  public <T> void submitStateUpdateTasks(
      final String source,
      final Map<T, ClusterStateTaskListener> tasks,
      final ClusterStateTaskConfig config,
      final ClusterStateTaskExecutor<T> executor) {
    if (!lifecycle.started()) {
      return;
    }
    if (tasks.isEmpty()) {
      return;
    }
    try {
      // convert to an identity map to check for dups based on update tasks semantics of using
      // identity instead of equals
      final IdentityHashMap<T, ClusterStateTaskListener> tasksIdentity =
          new IdentityHashMap<>(tasks);
      final List<UpdateTask<T>> updateTasks =
          tasksIdentity
              .entrySet()
              .stream()
              .map(
                  entry ->
                      new UpdateTask<>(
                          source,
                          entry.getKey(),
                          config,
                          executor,
                          safe(entry.getValue(), logger)))
              .collect(Collectors.toList());

      synchronized (updateTasksPerExecutor) {
        List<UpdateTask> existingTasks =
            updateTasksPerExecutor.computeIfAbsent(executor, k -> new ArrayList<>());
        for (@SuppressWarnings("unchecked") UpdateTask<T> existing : existingTasks) {
          if (tasksIdentity.containsKey(existing.task)) {
            throw new IllegalStateException(
                "task ["
                    + executor.describeTasks(Collections.singletonList(existing.task))
                    + "] with source ["
                    + source
                    + "] is already queued");
          }
        }
        existingTasks.addAll(updateTasks);
      }

      final UpdateTask<T> firstTask = updateTasks.get(0);

      if (config.timeout() != null) {
        updateTasksExecutor.execute(
            firstTask,
            threadPool.scheduler(),
            config.timeout(),
            () ->
                threadPool
                    .generic()
                    .execute(
                        () -> {
                          for (UpdateTask<T> task : updateTasks) {
                            if (task.processed.getAndSet(true) == false) {
                              logger.debug(
                                  "cluster state update task [{}] timed out after [{}]",
                                  source,
                                  config.timeout());
                              task.listener.onFailure(
                                  source,
                                  new ProcessClusterEventTimeoutException(
                                      config.timeout(), source));
                            }
                          }
                        }));
      } else {
        updateTasksExecutor.execute(firstTask);
      }
    } catch (EsRejectedExecutionException e) {
      // ignore cases where we are shutting down..., there is really nothing interesting
      // to be done here...
      if (!lifecycle.stoppedOrClosed()) {
        throw e;
      }
    }
  }

  /** Returns the tasks that are pending. */
  public List<PendingClusterTask> pendingTasks() {
    PrioritizedEsThreadPoolExecutor.Pending[] pendings = updateTasksExecutor.getPending();
    List<PendingClusterTask> pendingClusterTasks = new ArrayList<>(pendings.length);
    for (PrioritizedEsThreadPoolExecutor.Pending pending : pendings) {
      final String source;
      final long timeInQueue;
      // we have to capture the task as it will be nulled after execution and we don't want it to
      // change while we check things here.
      final Object task = pending.task;
      if (task == null) {
        continue;
      } else if (task instanceof SourcePrioritizedRunnable) {
        SourcePrioritizedRunnable runnable = (SourcePrioritizedRunnable) task;
        source = runnable.source();
        timeInQueue = runnable.getAgeInMillis();
      } else {
        assert false : "expected SourcePrioritizedRunnable got " + task.getClass();
        source = "unknown [" + task.getClass() + "]";
        timeInQueue = 0;
      }
      pendingClusterTasks.add(
          new PendingClusterTask(
              pending.insertionOrder,
              pending.priority,
              new Text(source),
              timeInQueue,
              pending.executing));
    }
    return pendingClusterTasks;
  }

  /** Returns the number of currently pending tasks. */
  public int numberOfPendingTasks() {
    return updateTasksExecutor.getNumberOfPendingTasks();
  }

  /**
   * Returns the maximum wait time for tasks in the queue.
   *
   * @return a zero time value if the queue is empty, otherwise the wait time of the oldest task
   *     in the queue
   */
  public TimeValue getMaxTaskWaitTime() {
    return updateTasksExecutor.getMaxTaskWaitTime();
  }

  /** asserts that the current thread is the cluster state update thread */
  public static boolean assertClusterStateThread() {
    assert Thread.currentThread().getName().contains(ClusterService.UPDATE_THREAD_NAME)
        : "not called from the cluster state update thread";
    return true;
  }

  public ClusterName getClusterName() {
    return clusterName;
  }

  abstract static class SourcePrioritizedRunnable extends PrioritizedRunnable {
    protected final String source;

    public SourcePrioritizedRunnable(Priority priority, String source) {
      super(priority);
      this.source = source;
    }

    public String source() {
      return source;
    }
  }

  <T> void runTasksForExecutor(ClusterStateTaskExecutor<T> executor) {
    final ArrayList<UpdateTask<T>> toExecute = new ArrayList<>();
    final Map<String, ArrayList<T>> processTasksBySource = new HashMap<>();
    synchronized (updateTasksPerExecutor) {
      List<UpdateTask> pending = updateTasksPerExecutor.remove(executor);
      if (pending != null) {
        for (UpdateTask<T> task : pending) {
          if (task.processed.getAndSet(true) == false) {
            logger.trace("will process {}", task.toString(executor));
            toExecute.add(task);
            processTasksBySource
                .computeIfAbsent(task.source, s -> new ArrayList<>())
                .add(task.task);
          } else {
            logger.trace("skipping {}, already processed", task.toString(executor));
          }
        }
      }
    }
    if (toExecute.isEmpty()) {
      return;
    }
    final String tasksSummary =
        processTasksBySource
            .entrySet()
            .stream()
            .map(
                entry -> {
                  String tasks = executor.describeTasks(entry.getValue());
                  return tasks.isEmpty() ? entry.getKey() : entry.getKey() + "[" + tasks + "]";
                })
            .reduce((s1, s2) -> s1 + ", " + s2)
            .orElse("");

    if (!lifecycle.started()) {
      logger.debug("processing [{}]: ignoring, cluster_service not started", tasksSummary);
      return;
    }
    logger.debug("processing [{}]: execute", tasksSummary);
    ClusterState previousClusterState = clusterState;
    if (!previousClusterState.nodes().isLocalNodeElectedMaster() && executor.runOnlyOnMaster()) {
      logger.debug("failing [{}]: local node is no longer master", tasksSummary);
      toExecute.stream().forEach(task -> task.listener.onNoLongerMaster(task.source));
      return;
    }
    ClusterStateTaskExecutor.BatchResult<T> batchResult;
    long startTimeNS = currentTimeInNanos();
    try {
      List<T> inputs =
          toExecute.stream().map(tUpdateTask -> tUpdateTask.task).collect(Collectors.toList());
      batchResult = executor.execute(previousClusterState, inputs);
    } catch (Exception e) {
      TimeValue executionTime =
          TimeValue.timeValueMillis(
              Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS)));
      if (logger.isTraceEnabled()) {
        logger.trace(
            (Supplier<?>)
                () ->
                    new ParameterizedMessage(
                        "failed to execute cluster state update in [{}], state:\nversion [{}], source [{}]\n{}{}{}",
                        executionTime,
                        previousClusterState.version(),
                        tasksSummary,
                        previousClusterState.nodes().prettyPrint(),
                        previousClusterState.routingTable().prettyPrint(),
                        previousClusterState.getRoutingNodes().prettyPrint()),
            e);
      }
      warnAboutSlowTaskIfNeeded(executionTime, tasksSummary);
      batchResult =
          ClusterStateTaskExecutor.BatchResult.<T>builder()
              .failures(toExecute.stream().map(updateTask -> updateTask.task)::iterator, e)
              .build(previousClusterState);
    }

    assert batchResult.executionResults != null;
    assert batchResult.executionResults.size() == toExecute.size()
        : String.format(
            Locale.ROOT,
            "expected [%d] task result%s but was [%d]",
            toExecute.size(),
            toExecute.size() == 1 ? "" : "s",
            batchResult.executionResults.size());
    boolean assertsEnabled = false;
    assert (assertsEnabled = true);
    if (assertsEnabled) {
      for (UpdateTask<T> updateTask : toExecute) {
        assert batchResult.executionResults.containsKey(updateTask.task)
            : "missing task result for " + updateTask.toString(executor);
      }
    }

    ClusterState newClusterState = batchResult.resultingState;
    final ArrayList<UpdateTask<T>> processedListeners = new ArrayList<>();
    // fail all tasks that have failed and extract those that are waiting for results
    for (UpdateTask<T> updateTask : toExecute) {
      assert batchResult.executionResults.containsKey(updateTask.task)
          : "missing " + updateTask.toString(executor);
      final ClusterStateTaskExecutor.TaskResult executionResult =
          batchResult.executionResults.get(updateTask.task);
      executionResult.handle(
          () -> processedListeners.add(updateTask),
          ex -> {
            logger.debug(
                (Supplier<?>)
                    () ->
                        new ParameterizedMessage(
                            "cluster state update task {} failed", updateTask.toString(executor)),
                ex);
            updateTask.listener.onFailure(updateTask.source, ex);
          });
    }

    if (previousClusterState == newClusterState) {
      for (UpdateTask<T> task : processedListeners) {
        if (task.listener instanceof AckedClusterStateTaskListener) {
          // no need to wait for ack if nothing changed, the update can be counted as acknowledged
          ((AckedClusterStateTaskListener) task.listener).onAllNodesAcked(null);
        }
        task.listener.clusterStateProcessed(task.source, previousClusterState, newClusterState);
      }
      TimeValue executionTime =
          TimeValue.timeValueMillis(
              Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS)));
      logger.debug(
          "processing [{}]: took [{}] no change in cluster_state", tasksSummary, executionTime);
      warnAboutSlowTaskIfNeeded(executionTime, tasksSummary);
      return;
    }

    try {
      ArrayList<Discovery.AckListener> ackListeners = new ArrayList<>();

      if (newClusterState.nodes().isLocalNodeElectedMaster()) {
        // only the master controls the version numbers
        Builder builder = ClusterState.builder(newClusterState).incrementVersion();
        if (previousClusterState.routingTable() != newClusterState.routingTable()) {
          builder.routingTable(
              RoutingTable.builder(newClusterState.routingTable())
                  .version(newClusterState.routingTable().version() + 1)
                  .build());
        }
        if (previousClusterState.metaData() != newClusterState.metaData()) {
          builder.metaData(
              MetaData.builder(newClusterState.metaData())
                  .version(newClusterState.metaData().version() + 1));
        }
        newClusterState = builder.build();

        for (UpdateTask<T> task : processedListeners) {
          if (task.listener instanceof AckedClusterStateTaskListener) {
            final AckedClusterStateTaskListener ackedListener =
                (AckedClusterStateTaskListener) task.listener;
            if (ackedListener.ackTimeout() == null || ackedListener.ackTimeout().millis() == 0) {
              ackedListener.onAckTimeout();
            } else {
              try {
                ackListeners.add(
                    new AckCountDownListener(
                        ackedListener,
                        newClusterState.version(),
                        newClusterState.nodes(),
                        threadPool));
              } catch (EsRejectedExecutionException ex) {
                if (logger.isDebugEnabled()) {
                  logger.debug(
                      "Couldn't schedule timeout thread - node might be shutting down", ex);
                }
                // timeout straightaway, otherwise we could wait forever as the timeout thread has
                // not started
                ackedListener.onAckTimeout();
              }
            }
          }
        }
      }
      final Discovery.AckListener ackListener = new DelegatingAckListener(ackListeners);

      newClusterState.status(ClusterState.ClusterStateStatus.BEING_APPLIED);

      if (logger.isTraceEnabled()) {
        logger.trace(
            "cluster state updated, source [{}]\n{}",
            tasksSummary,
            newClusterState.prettyPrint());
      } else if (logger.isDebugEnabled()) {
        logger.debug(
            "cluster state updated, version [{}], source [{}]",
            newClusterState.version(),
            tasksSummary);
      }

      ClusterChangedEvent clusterChangedEvent =
          new ClusterChangedEvent(tasksSummary, newClusterState, previousClusterState);
      // new cluster state, notify all listeners
      final DiscoveryNodes.Delta nodesDelta = clusterChangedEvent.nodesDelta();
      if (nodesDelta.hasChanges() && logger.isInfoEnabled()) {
        String summary = nodesDelta.shortSummary();
        if (summary.length() > 0) {
          logger.info("{}, reason: {}", summary, tasksSummary);
        }
      }

      nodeConnectionsService.connectToAddedNodes(clusterChangedEvent);

      // if we are the master, publish the new state to all nodes
      // we publish here before we send a notification to all the listeners, since if it fails
      // we don't want to notify
      if (newClusterState.nodes().isLocalNodeElectedMaster()) {
        logger.debug("publishing cluster state version [{}]", newClusterState.version());
        try {
          clusterStatePublisher.accept(clusterChangedEvent, ackListener);
        } catch (Discovery.FailedToCommitClusterStateException t) {
          final long version = newClusterState.version();
          logger.warn(
              (Supplier<?>)
                  () ->
                      new ParameterizedMessage(
                          "failing [{}]: failed to commit cluster state version [{}]",
                          tasksSummary,
                          version),
              t);
          processedListeners.forEach(task -> task.listener.onFailure(task.source, t));
          return;
        }
      }

      // update the current cluster state
      clusterState = newClusterState;
      logger.debug("set local cluster state to version {}", newClusterState.version());
      try {
        // nothing to do until we actually recover from the gateway or any other block indicates
        // we need to disable persistency
        if (clusterChangedEvent.state().blocks().disableStatePersistence() == false
            && clusterChangedEvent.metaDataChanged()) {
          final Settings incomingSettings = clusterChangedEvent.state().metaData().settings();
          clusterSettings.applySettings(incomingSettings);
        }
      } catch (Exception ex) {
        logger.warn("failed to apply cluster settings", ex);
      }
      for (ClusterStateListener listener : preAppliedListeners) {
        try {
          listener.clusterChanged(clusterChangedEvent);
        } catch (Exception ex) {
          logger.warn("failed to notify ClusterStateListener", ex);
        }
      }

      nodeConnectionsService.disconnectFromRemovedNodes(clusterChangedEvent);

      newClusterState.status(ClusterState.ClusterStateStatus.APPLIED);

      for (ClusterStateListener listener : postAppliedListeners) {
        try {
          listener.clusterChanged(clusterChangedEvent);
        } catch (Exception ex) {
          logger.warn("failed to notify ClusterStateListener", ex);
        }
      }

      // manual ack only from the master at the end of the publish
      if (newClusterState.nodes().isLocalNodeElectedMaster()) {
        try {
          ackListener.onNodeAck(newClusterState.nodes().getLocalNode(), null);
        } catch (Exception e) {
          final DiscoveryNode localNode = newClusterState.nodes().getLocalNode();
          logger.debug(
              (Supplier<?>)
                  () ->
                      new ParameterizedMessage(
                          "error while processing ack for master node [{}]", localNode),
              e);
        }
      }

      for (UpdateTask<T> task : processedListeners) {
        task.listener.clusterStateProcessed(task.source, previousClusterState, newClusterState);
      }

      try {
        executor.clusterStatePublished(clusterChangedEvent);
      } catch (Exception e) {
        logger.error(
            (Supplier<?>)
                () ->
                    new ParameterizedMessage(
                        "exception thrown while notifying executor of new cluster state publication [{}]",
                        tasksSummary),
            e);
      }

      TimeValue executionTime =
          TimeValue.timeValueMillis(
              Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS)));
      logger.debug(
          "processing [{}]: took [{}] done applying updated cluster_state (version: {}, uuid: {})",
          tasksSummary,
          executionTime,
          newClusterState.version(),
          newClusterState.stateUUID());
      warnAboutSlowTaskIfNeeded(executionTime, tasksSummary);
    } catch (Exception e) {
      TimeValue executionTime =
          TimeValue.timeValueMillis(
              Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS)));
      final long version = newClusterState.version();
      final String stateUUID = newClusterState.stateUUID();
      final String prettyPrint = newClusterState.prettyPrint();
      logger.warn(
          (Supplier<?>)
              () ->
                  new ParameterizedMessage(
                      "failed to apply updated cluster state in [{}]:\nversion [{}], uuid [{}], source [{}]\n{}",
                      executionTime,
                      version,
                      stateUUID,
                      tasksSummary,
                      prettyPrint),
          e);
      // TODO: do we want to call updateTask.onFailure here?
    }
  }

  // this one is overridden in tests so we can control time
  protected long currentTimeInNanos() {
    return System.nanoTime();
  }

  private static SafeClusterStateTaskListener safe(
      ClusterStateTaskListener listener, Logger logger) {
    if (listener instanceof AckedClusterStateTaskListener) {
      return new SafeAckedClusterStateTaskListener(
          (AckedClusterStateTaskListener) listener, logger);
    } else {
      return new SafeClusterStateTaskListener(listener, logger);
    }
  }

  private static class SafeClusterStateTaskListener implements ClusterStateTaskListener {
    private final ClusterStateTaskListener listener;
    private final Logger logger;

    public SafeClusterStateTaskListener(ClusterStateTaskListener listener, Logger logger) {
      this.listener = listener;
      this.logger = logger;
    }

    @Override
    public void onFailure(String source, Exception e) {
      try {
        listener.onFailure(source, e);
      } catch (Exception inner) {
        inner.addSuppressed(e);
        logger.error(
            (Supplier<?>)
                () ->
                    new ParameterizedMessage(
                        "exception thrown by listener notifying of failure from [{}]", source),
            inner);
      }
    }

    @Override
    public void onNoLongerMaster(String source) {
      try {
        listener.onNoLongerMaster(source);
      } catch (Exception e) {
        logger.error(
            (Supplier<?>)
                () ->
                    new ParameterizedMessage(
                        "exception thrown by listener while notifying no longer master from [{}]",
                        source),
            e);
      }
    }

    @Override
    public void clusterStateProcessed(
        String source, ClusterState oldState, ClusterState newState) {
      try {
        listener.clusterStateProcessed(source, oldState, newState);
      } catch (Exception e) {
        logger.error(
            (Supplier<?>)
                () ->
                    new ParameterizedMessage(
                        "exception thrown by listener while notifying of cluster state processed from [{}], old cluster state:\n"
                            + "{}\nnew cluster state:\n{}",
                        source,
                        oldState.prettyPrint(),
                        newState.prettyPrint()),
            e);
      }
    }
  }

  private static class SafeAckedClusterStateTaskListener extends SafeClusterStateTaskListener
      implements AckedClusterStateTaskListener {
    private final AckedClusterStateTaskListener listener;
    private final Logger logger;

    public SafeAckedClusterStateTaskListener(
        AckedClusterStateTaskListener listener, Logger logger) {
      super(listener, logger);
      this.listener = listener;
      this.logger = logger;
    }

    @Override
    public boolean mustAck(DiscoveryNode discoveryNode) {
      return listener.mustAck(discoveryNode);
    }

    @Override
    public void onAllNodesAcked(@Nullable Exception e) {
      try {
        listener.onAllNodesAcked(e);
      } catch (Exception inner) {
        inner.addSuppressed(e);
        logger.error("exception thrown by listener while notifying on all nodes acked", inner);
      }
    }

    @Override
    public void onAckTimeout() {
      try {
        listener.onAckTimeout();
      } catch (Exception e) {
        logger.error("exception thrown by listener while notifying on ack timeout", e);
      }
    }

    @Override
    public TimeValue ackTimeout() {
      return listener.ackTimeout();
    }
  }

  class UpdateTask<T> extends SourcePrioritizedRunnable {
    public final T task;
    public final ClusterStateTaskConfig config;
    public final ClusterStateTaskExecutor<T> executor;
    public final ClusterStateTaskListener listener;
    public final AtomicBoolean processed = new AtomicBoolean();

    UpdateTask(
        String source,
        T task,
        ClusterStateTaskConfig config,
        ClusterStateTaskExecutor<T> executor,
        ClusterStateTaskListener listener) {
      super(config.priority(), source);
      this.task = task;
      this.config = config;
      this.executor = executor;
      this.listener = listener;
    }

    @Override
    public void run() {
      // if this task is already processed, the executor shouldn't execute other tasks (that
      // arrived later), to give other executors a chance to execute their tasks.
      if (processed.get() == false) {
        runTasksForExecutor(executor);
      }
    }

    public String toString(ClusterStateTaskExecutor<T> executor) {
      String taskDescription = executor.describeTasks(Collections.singletonList(task));
      if (taskDescription.isEmpty()) {
        return "[" + source + "]";
      } else {
        return "[" + source + "[" + taskDescription + "]]";
      }
    }
  }

  private void warnAboutSlowTaskIfNeeded(TimeValue executionTime, String source) {
    if (executionTime.getMillis() > slowTaskLoggingThreshold.getMillis()) {
      logger.warn(
          "cluster state update task [{}] took [{}] above the warn threshold of {}",
          source,
          executionTime,
          slowTaskLoggingThreshold);
    }
  }

  class NotifyTimeout implements Runnable {
    final TimeoutClusterStateListener listener;
    final TimeValue timeout;
    volatile ScheduledFuture future;

    NotifyTimeout(TimeoutClusterStateListener listener, TimeValue timeout) {
      this.listener = listener;
      this.timeout = timeout;
    }

    public void cancel() {
      FutureUtils.cancel(future);
    }

    @Override
    public void run() {
      if (future != null && future.isCancelled()) {
        return;
      }
      if (lifecycle.stoppedOrClosed()) {
        listener.onClose();
      } else {
        listener.onTimeout(this.timeout);
      }
      // note, we rely on the listener to remove itself in case of timeout if needed
    }
  }

  private static class LocalNodeMasterListeners implements ClusterStateListener {

    private final List<LocalNodeMasterListener> listeners = new CopyOnWriteArrayList<>();
    private final ThreadPool threadPool;
    private volatile boolean master = false;

    private LocalNodeMasterListeners(ThreadPool threadPool) {
      this.threadPool = threadPool;
    }

    @Override
    public void clusterChanged(ClusterChangedEvent event) {
      if (!master && event.localNodeMaster()) {
        master = true;
        for (LocalNodeMasterListener listener : listeners) {
          Executor executor = threadPool.executor(listener.executorName());
          executor.execute(new OnMasterRunnable(listener));
        }
        return;
      }

      if (master && !event.localNodeMaster()) {
        master = false;
        for (LocalNodeMasterListener listener : listeners) {
          Executor executor = threadPool.executor(listener.executorName());
          executor.execute(new OffMasterRunnable(listener));
        }
      }
    }

    private void add(LocalNodeMasterListener listener) {
      listeners.add(listener);
    }

    private void remove(LocalNodeMasterListener listener) {
      listeners.remove(listener);
    }

    private void clear() {
      listeners.clear();
    }
  }

  private static class OnMasterRunnable implements Runnable {

    private final LocalNodeMasterListener listener;

    private OnMasterRunnable(LocalNodeMasterListener listener) {
      this.listener = listener;
    }

    @Override
    public void run() {
      listener.onMaster();
    }
  }

  private static class OffMasterRunnable implements Runnable {

    private final LocalNodeMasterListener listener;

    private OffMasterRunnable(LocalNodeMasterListener listener) {
      this.listener = listener;
    }

    @Override
    public void run() {
      listener.offMaster();
    }
  }

  private static class DelegatingAckListener implements Discovery.AckListener {

    private final List<Discovery.AckListener> listeners;

    private DelegatingAckListener(List<Discovery.AckListener> listeners) {
      this.listeners = listeners;
    }

    @Override
    public void onNodeAck(DiscoveryNode node, @Nullable Exception e) {
      for (Discovery.AckListener listener : listeners) {
        listener.onNodeAck(node, e);
      }
    }

    @Override
    public void onTimeout() {
      throw new UnsupportedOperationException("no timeout delegation");
    }
  }

  private static class AckCountDownListener implements Discovery.AckListener {

    private static final Logger logger = Loggers.getLogger(AckCountDownListener.class);

    private final AckedClusterStateTaskListener ackedTaskListener;
    private final CountDown countDown;
    private final DiscoveryNodes nodes;
    private final long clusterStateVersion;
    private final Future<?> ackTimeoutCallback;
    private Exception lastFailure;

    AckCountDownListener(
        AckedClusterStateTaskListener ackedTaskListener,
        long clusterStateVersion,
        DiscoveryNodes nodes,
        ThreadPool threadPool) {
      this.ackedTaskListener = ackedTaskListener;
      this.clusterStateVersion = clusterStateVersion;
      this.nodes = nodes;
      int countDown = 0;
      for (DiscoveryNode node : nodes) {
        if (ackedTaskListener.mustAck(node)) {
          countDown++;
        }
      }
      // we always wait for at least 1 node (the master)
      countDown = Math.max(1, countDown);
      logger.trace(
          "expecting {} acknowledgements for cluster_state update (version: {})",
          countDown,
          clusterStateVersion);
      this.countDown = new CountDown(countDown);
      this.ackTimeoutCallback =
          threadPool.schedule(
              ackedTaskListener.ackTimeout(),
              ThreadPool.Names.GENERIC,
              new Runnable() {
                @Override
                public void run() {
                  onTimeout();
                }
              });
    }

    @Override
    public void onNodeAck(DiscoveryNode node, @Nullable Exception e) {
      if (!ackedTaskListener.mustAck(node)) {
        // we always wait for the master ack anyway
        if (!node.equals(nodes.getMasterNode())) {
          return;
        }
      }
      if (e == null) {
        logger.trace(
            "ack received from node [{}], cluster_state update (version: {})",
            node,
            clusterStateVersion);
      } else {
        this.lastFailure = e;
        logger.debug(
            (Supplier<?>)
                () ->
                    new ParameterizedMessage(
                        "ack received from node [{}], cluster_state update (version: {})",
                        node,
                        clusterStateVersion),
            e);
      }

      if (countDown.countDown()) {
        logger.trace(
            "all expected nodes acknowledged cluster_state update (version: {})",
            clusterStateVersion);
        FutureUtils.cancel(ackTimeoutCallback);
        ackedTaskListener.onAllNodesAcked(lastFailure);
      }
    }

    @Override
    public void onTimeout() {
      if (countDown.fastForward()) {
        logger.trace(
            "timeout waiting for acknowledgement for cluster_state update (version: {})",
            clusterStateVersion);
        ackedTaskListener.onAckTimeout();
      }
    }
  }

  public ClusterSettings getClusterSettings() {
    return clusterSettings;
  }

  public Settings getSettings() {
    return settings;
  }
}
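/*
 * Illustrative sketch (not part of the original source): the identity-based duplicate check
 * used by submitStateUpdateTasks above. Tasks are deduplicated by object identity, not
 * equals(), so two equal-but-distinct task objects may be queued together, while re-submitting
 * the very same instance trips the IllegalStateException. The class and variable names below
 * are hypothetical.
 */
class IdentityDedupExample {
  public static void main(String[] args) {
    java.util.IdentityHashMap<String, Object> queued = new java.util.IdentityHashMap<>();
    String first = new String("reroute");
    String second = new String("reroute"); // equals(first), but a different instance

    queued.put(first, new Object());
    // identity semantics: an equal value under a different identity is NOT a duplicate
    System.out.println(queued.containsKey(second)); // false -> second may be queued too
    // the same instance IS a duplicate and would be rejected as "already queued"
    System.out.println(queued.containsKey(first)); // true
  }
}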
public class RecoverySettings extends AbstractComponent {

  public static final Setting<ByteSizeValue> INDICES_RECOVERY_MAX_BYTES_PER_SEC_SETTING =
      Setting.byteSizeSetting(
          "indices.recovery.max_bytes_per_sec",
          new ByteSizeValue(40, ByteSizeUnit.MB),
          Property.Dynamic,
          Property.NodeScope);

  /**
   * how long to wait before retrying after issues caused by cluster state syncing between nodes,
   * i.e., local node is not yet known on remote node, remote shard not yet started, etc.
   */
  public static final Setting<TimeValue> INDICES_RECOVERY_RETRY_DELAY_STATE_SYNC_SETTING =
      Setting.positiveTimeSetting(
          "indices.recovery.retry_delay_state_sync",
          TimeValue.timeValueMillis(500),
          Property.Dynamic,
          Property.NodeScope);

  /** how long to wait before retrying after network related issues */
  public static final Setting<TimeValue> INDICES_RECOVERY_RETRY_DELAY_NETWORK_SETTING =
      Setting.positiveTimeSetting(
          "indices.recovery.retry_delay_network",
          TimeValue.timeValueSeconds(5),
          Property.Dynamic,
          Property.NodeScope);

  /** timeout value to use for requests made as part of the recovery process */
  public static final Setting<TimeValue> INDICES_RECOVERY_INTERNAL_ACTION_TIMEOUT_SETTING =
      Setting.positiveTimeSetting(
          "indices.recovery.internal_action_timeout",
          TimeValue.timeValueMinutes(15),
          Property.Dynamic,
          Property.NodeScope);

  /**
   * timeout value to use for requests made as part of the recovery process that are expected to
   * take a long time. defaults to twice `indices.recovery.internal_action_timeout`.
   */
  public static final Setting<TimeValue> INDICES_RECOVERY_INTERNAL_LONG_ACTION_TIMEOUT_SETTING =
      Setting.timeSetting(
          "indices.recovery.internal_action_long_timeout",
          (s) ->
              TimeValue.timeValueMillis(
                  INDICES_RECOVERY_INTERNAL_ACTION_TIMEOUT_SETTING.get(s).millis() * 2),
          TimeValue.timeValueSeconds(0),
          Property.Dynamic,
          Property.NodeScope);

  /**
   * recoveries that don't show any activity for more than this interval will be failed. defaults
   * to `indices.recovery.internal_action_long_timeout`
   */
  public static final Setting<TimeValue> INDICES_RECOVERY_ACTIVITY_TIMEOUT_SETTING =
      Setting.timeSetting(
          "indices.recovery.recovery_activity_timeout",
          INDICES_RECOVERY_INTERNAL_LONG_ACTION_TIMEOUT_SETTING::get,
          TimeValue.timeValueSeconds(0),
          Property.Dynamic,
          Property.NodeScope);

  public static final ByteSizeValue DEFAULT_CHUNK_SIZE = new ByteSizeValue(512, ByteSizeUnit.KB);

  private volatile ByteSizeValue maxBytesPerSec;
  private volatile SimpleRateLimiter rateLimiter;
  private volatile TimeValue retryDelayStateSync;
  private volatile TimeValue retryDelayNetwork;
  private volatile TimeValue activityTimeout;
  private volatile TimeValue internalActionTimeout;
  private volatile TimeValue internalActionLongTimeout;

  private volatile ByteSizeValue chunkSize = DEFAULT_CHUNK_SIZE;

  @Inject
  public RecoverySettings(Settings settings, ClusterSettings clusterSettings) {
    super(settings);
    this.retryDelayStateSync = INDICES_RECOVERY_RETRY_DELAY_STATE_SYNC_SETTING.get(settings);
    // doesn't have to be fast as nodes are reconnected every 10s by default (see
    // InternalClusterService.ReconnectToNodes) and we want to give the master time to remove a
    // faulty node
    this.retryDelayNetwork = INDICES_RECOVERY_RETRY_DELAY_NETWORK_SETTING.get(settings);

    this.internalActionTimeout = INDICES_RECOVERY_INTERNAL_ACTION_TIMEOUT_SETTING.get(settings);
    this.internalActionLongTimeout =
        INDICES_RECOVERY_INTERNAL_LONG_ACTION_TIMEOUT_SETTING.get(settings);

    this.activityTimeout = INDICES_RECOVERY_ACTIVITY_TIMEOUT_SETTING.get(settings);
    this.maxBytesPerSec = INDICES_RECOVERY_MAX_BYTES_PER_SEC_SETTING.get(settings);
    if (maxBytesPerSec.getBytes() <= 0) {
      rateLimiter = null;
    } else {
      rateLimiter = new SimpleRateLimiter(maxBytesPerSec.getMbFrac());
    }

    logger.debug("using max_bytes_per_sec[{}]", maxBytesPerSec);

    clusterSettings.addSettingsUpdateConsumer(
        INDICES_RECOVERY_MAX_BYTES_PER_SEC_SETTING, this::setMaxBytesPerSec);
    clusterSettings.addSettingsUpdateConsumer(
        INDICES_RECOVERY_RETRY_DELAY_STATE_SYNC_SETTING, this::setRetryDelayStateSync);
    clusterSettings.addSettingsUpdateConsumer(
        INDICES_RECOVERY_RETRY_DELAY_NETWORK_SETTING, this::setRetryDelayNetwork);
    clusterSettings.addSettingsUpdateConsumer(
        INDICES_RECOVERY_INTERNAL_ACTION_TIMEOUT_SETTING, this::setInternalActionTimeout);
    clusterSettings.addSettingsUpdateConsumer(
        INDICES_RECOVERY_INTERNAL_LONG_ACTION_TIMEOUT_SETTING, this::setInternalActionLongTimeout);
    clusterSettings.addSettingsUpdateConsumer(
        INDICES_RECOVERY_ACTIVITY_TIMEOUT_SETTING, this::setActivityTimeout);
  }

  public RateLimiter rateLimiter() {
    return rateLimiter;
  }

  public TimeValue retryDelayNetwork() {
    return retryDelayNetwork;
  }

  public TimeValue retryDelayStateSync() {
    return retryDelayStateSync;
  }

  public TimeValue activityTimeout() {
    return activityTimeout;
  }

  public TimeValue internalActionTimeout() {
    return internalActionTimeout;
  }

  public TimeValue internalActionLongTimeout() {
    return internalActionLongTimeout;
  }

  public ByteSizeValue getChunkSize() {
    return chunkSize;
  }

  void setChunkSize(ByteSizeValue chunkSize) { // only settable for tests
    if (chunkSize.bytesAsInt() <= 0) {
      throw new IllegalArgumentException("chunkSize must be > 0");
    }
    this.chunkSize = chunkSize;
  }

  public void setRetryDelayStateSync(TimeValue retryDelayStateSync) {
    this.retryDelayStateSync = retryDelayStateSync;
  }

  public void setRetryDelayNetwork(TimeValue retryDelayNetwork) {
    this.retryDelayNetwork = retryDelayNetwork;
  }

  public void setActivityTimeout(TimeValue activityTimeout) {
    this.activityTimeout = activityTimeout;
  }

  public void setInternalActionTimeout(TimeValue internalActionTimeout) {
    this.internalActionTimeout = internalActionTimeout;
  }

  public void setInternalActionLongTimeout(TimeValue internalActionLongTimeout) {
    this.internalActionLongTimeout = internalActionLongTimeout;
  }

  private void setMaxBytesPerSec(ByteSizeValue maxBytesPerSec) {
    this.maxBytesPerSec = maxBytesPerSec;
    if (maxBytesPerSec.getBytes() <= 0) {
      rateLimiter = null;
    } else if (rateLimiter != null) {
      rateLimiter.setMBPerSec(maxBytesPerSec.getMbFrac());
    } else {
      rateLimiter = new SimpleRateLimiter(maxBytesPerSec.getMbFrac());
    }
  }
}
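/*
 * Illustrative sketch (not part of the original source): the derived default of
 * indices.recovery.internal_action_long_timeout declared above, which falls back to twice
 * indices.recovery.internal_action_timeout when not set explicitly. The class name and the
 * builder value are hypothetical.
 */
class RecoveryTimeoutDefaultsExample {
  public static void main(String[] args) {
    Settings settings =
        Settings.builder().put("indices.recovery.internal_action_timeout", "10m").build();
    TimeValue base =
        RecoverySettings.INDICES_RECOVERY_INTERNAL_ACTION_TIMEOUT_SETTING.get(settings);
    TimeValue derived =
        RecoverySettings.INDICES_RECOVERY_INTERNAL_LONG_ACTION_TIMEOUT_SETTING.get(settings);
    // with no explicit long timeout, derived == 2 * base (20 minutes here)
    System.out.println(base + " -> " + derived);
  }
}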
/**
 * A base class for {@link org.elasticsearch.discovery.zen.fd.MasterFaultDetection} and {@link
 * org.elasticsearch.discovery.zen.fd.NodesFaultDetection}, making sure both use the same
 * settings.
 */
public abstract class FaultDetection extends AbstractComponent {

  public static final Setting<Boolean> CONNECT_ON_NETWORK_DISCONNECT_SETTING =
      Setting.boolSetting(
          "discovery.zen.fd.connect_on_network_disconnect", false, Property.NodeScope);
  public static final Setting<TimeValue> PING_INTERVAL_SETTING =
      Setting.positiveTimeSetting(
          "discovery.zen.fd.ping_interval", timeValueSeconds(1), Property.NodeScope);
  public static final Setting<TimeValue> PING_TIMEOUT_SETTING =
      Setting.timeSetting(
          "discovery.zen.fd.ping_timeout", timeValueSeconds(30), Property.NodeScope);
  public static final Setting<Integer> PING_RETRIES_SETTING =
      Setting.intSetting("discovery.zen.fd.ping_retries", 3, Property.NodeScope);
  public static final Setting<Boolean> REGISTER_CONNECTION_LISTENER_SETTING =
      Setting.boolSetting(
          "discovery.zen.fd.register_connection_listener", true, Property.NodeScope);

  protected final ThreadPool threadPool;
  protected final ClusterName clusterName;
  protected final TransportService transportService;

  // used mainly for testing, should always be true
  protected final boolean registerConnectionListener;
  protected final FDConnectionListener connectionListener;
  protected final boolean connectOnNetworkDisconnect;

  protected final TimeValue pingInterval;
  protected final TimeValue pingRetryTimeout;
  protected final int pingRetryCount;

  public FaultDetection(
      Settings settings,
      ThreadPool threadPool,
      TransportService transportService,
      ClusterName clusterName) {
    super(settings);
    this.threadPool = threadPool;
    this.transportService = transportService;
    this.clusterName = clusterName;

    this.connectOnNetworkDisconnect = CONNECT_ON_NETWORK_DISCONNECT_SETTING.get(settings);
    this.pingInterval = PING_INTERVAL_SETTING.get(settings);
    this.pingRetryTimeout = PING_TIMEOUT_SETTING.get(settings);
    this.pingRetryCount = PING_RETRIES_SETTING.get(settings);
    this.registerConnectionListener = REGISTER_CONNECTION_LISTENER_SETTING.get(settings);

    this.connectionListener = new FDConnectionListener();
    if (registerConnectionListener) {
      transportService.addConnectionListener(connectionListener);
    }
  }

  public void close() {
    transportService.removeConnectionListener(connectionListener);
  }

  /**
   * This method will be called when the {@link org.elasticsearch.transport.TransportService}
   * raised a node disconnected event
   */
  abstract void handleTransportDisconnect(DiscoveryNode node);

  private class FDConnectionListener implements TransportConnectionListener {
    @Override
    public void onNodeConnected(DiscoveryNode node) {}

    @Override
    public void onNodeDisconnected(DiscoveryNode node) {
      handleTransportDisconnect(node);
    }
  }
}
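/*
 * Illustrative sketch (not part of the original source): a minimal FaultDetection subclass
 * showing the contract above — the FDConnectionListener routes transport disconnect events
 * into handleTransportDisconnect. LoggingFaultDetection is a hypothetical name; a real
 * implementation (e.g. NodesFaultDetection) would re-ping the node or notify listeners
 * instead of only logging.
 */
class LoggingFaultDetection extends FaultDetection {

  LoggingFaultDetection(
      Settings settings,
      ThreadPool threadPool,
      TransportService transportService,
      ClusterName clusterName) {
    super(settings, threadPool, transportService, clusterName);
  }

  @Override
  void handleTransportDisconnect(DiscoveryNode node) {
    // a real subclass would decide here whether to reconnect (connectOnNetworkDisconnect)
    // or fail the node after pingRetryCount unanswered pings
    logger.warn(
        "node [{}] disconnected, ping_interval [{}], ping_retries [{}]",
        node,
        pingInterval,
        pingRetryCount);
  }
}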