private void applyCleanedIndices(final ClusterChangedEvent event) {
    // handle closed indices, since they are not allocated on a node once they are closed
    // so applyDeletedIndices might not take them into account
    for (IndexService indexService : indicesService) {
        String index = indexService.index().getName();
        IndexMetaData indexMetaData = event.state().metaData().index(index);
        if (indexMetaData != null && indexMetaData.state() == IndexMetaData.State.CLOSE) {
            for (Integer shardId : indexService.shardIds()) {
                logger.debug("[{}][{}] removing shard (index is closed)", index, shardId);
                try {
                    indexService.removeShard(shardId, "removing shard (index is closed)");
                } catch (Throwable e) {
                    logger.warn("[{}] failed to remove shard (index is closed)", e, index);
                }
            }
        }
    }
    for (IndexService indexService : indicesService) {
        String index = indexService.index().getName();
        if (indexService.shardIds().isEmpty()) {
            if (logger.isDebugEnabled()) {
                logger.debug("[{}] cleaning index (no shards allocated)", index);
            }
            // clean the index
            removeIndex(index, "removing index (no shards allocated)");
        }
    }
}
private void applyDeletedIndices(final ClusterChangedEvent event) {
    final ClusterState previousState = event.previousState();
    final String localNodeId = event.state().nodes().localNodeId();
    assert localNodeId != null;
    for (IndexService indexService : indicesService) {
        IndexMetaData indexMetaData = event.state().metaData().index(indexService.index().name());
        if (indexMetaData != null) {
            if (!indexMetaData.isSameUUID(indexService.indexUUID())) {
                logger.debug("[{}] mismatch on index UUIDs between cluster state and local state, cleaning the index so it will be recreated",
                    indexMetaData.index());
                deleteIndex(indexMetaData.index(),
                    "mismatch on index UUIDs between cluster state and local state, cleaning the index so it will be recreated");
            }
        }
    }
    for (String index : event.indicesDeleted()) {
        if (logger.isDebugEnabled()) {
            logger.debug("[{}] cleaning index, no longer part of the metadata", index);
        }
        final Settings indexSettings;
        final IndexService idxService = indicesService.indexService(index);
        if (idxService != null) {
            indexSettings = idxService.getIndexSettings();
            deleteIndex(index, "index no longer part of the metadata");
        } else {
            final IndexMetaData metaData = previousState.metaData().index(index);
            assert metaData != null;
            indexSettings = metaData.settings();
            indicesService.deleteClosedIndex("closed index no longer part of the metadata", metaData, event.state());
        }
        try {
            nodeIndexDeletedAction.nodeIndexDeleted(event.state(), index, indexSettings, localNodeId);
        } catch (Throwable e) {
            logger.debug("failed to send to master index {} deleted event", e, index);
        }
    }
}
private void cleanFailedShards(final ClusterChangedEvent event) {
    RoutingTable routingTable = event.state().routingTable();
    RoutingNodes.RoutingNodeIterator routingNode =
        event.state().readOnlyRoutingNodes().routingNodeIter(event.state().nodes().localNodeId());
    if (routingNode == null) {
        failedShards.clear();
        return;
    }
    DiscoveryNodes nodes = event.state().nodes();
    long now = System.currentTimeMillis();
    String localNodeId = nodes.localNodeId();
    Iterator<Map.Entry<ShardId, FailedShard>> iterator = failedShards.entrySet().iterator();
    shards:
    while (iterator.hasNext()) {
        Map.Entry<ShardId, FailedShard> entry = iterator.next();
        FailedShard failedShard = entry.getValue();
        IndexRoutingTable indexRoutingTable = routingTable.index(entry.getKey().getIndex());
        if (indexRoutingTable != null) {
            IndexShardRoutingTable shardRoutingTable = indexRoutingTable.shard(entry.getKey().id());
            if (shardRoutingTable != null) {
                for (ShardRouting shardRouting : shardRoutingTable.assignedShards()) {
                    if (localNodeId.equals(shardRouting.currentNodeId())) {
                        // we have a timeout here just to make sure we don't keep dangling failed shards around
                        // for some reason; it's just another safety layer
                        if (shardRouting.version() == failedShard.version
                                && ((now - failedShard.timestamp) < TimeValue.timeValueMinutes(60).millis())) {
                            // it's the same failed shard - keep it if it hasn't timed out
                            continue shards;
                        } else {
                            // different version or expired, remove it
                            break;
                        }
                    }
                }
            }
        }
        iterator.remove();
    }
}
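The labeled-loop plus iterator-removal pattern above (keep an entry only while a matching, unexpired routing still exists, otherwise drop it) is easy to get wrong. Below is a minimal, self-contained sketch of the same idea on plain JDK collections; the `FailedEntry` and `Routing` records and the 60-minute cutoff are illustrative stand-ins, not the actual Elasticsearch types.

```java
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;

class FailedEntryCleaner {
    // illustrative stand-ins for the FailedShard bookkeeping and a routing entry
    record FailedEntry(long version, long timestampMillis) {}
    record Routing(String nodeId, long version) {}

    static final long TIMEOUT_MILLIS = TimeUnit.MINUTES.toMillis(60);

    /** Keep an entry only while a matching, unexpired routing for this node still exists. */
    static void clean(Map<String, FailedEntry> failed, Map<String, List<Routing>> assigned, String localNodeId) {
        long now = System.currentTimeMillis();
        Iterator<Map.Entry<String, FailedEntry>> it = failed.entrySet().iterator();
        entries:
        while (it.hasNext()) {
            Map.Entry<String, FailedEntry> entry = it.next();
            for (Routing routing : assigned.getOrDefault(entry.getKey(), List.of())) {
                if (localNodeId.equals(routing.nodeId())
                        && routing.version() == entry.getValue().version()
                        && (now - entry.getValue().timestampMillis()) < TIMEOUT_MILLIS) {
                    // the same failure is still tracked by the routing table and has not timed out - keep it
                    continue entries;
                }
            }
            // no matching routing, different version, or expired - drop the entry
            it.remove();
        }
    }
}
```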
private void applyDeletedShards(final ClusterChangedEvent event) {
    RoutingNodes.RoutingNodeIterator routingNode =
        event.state().readOnlyRoutingNodes().routingNodeIter(event.state().nodes().localNodeId());
    if (routingNode == null) {
        return;
    }
    IntHashSet newShardIds = new IntHashSet();
    for (IndexService indexService : indicesService) {
        String index = indexService.index().name();
        IndexMetaData indexMetaData = event.state().metaData().index(index);
        if (indexMetaData == null) {
            continue;
        }
        // now, go over and delete shards that need to be deleted
        newShardIds.clear();
        for (ShardRouting shard : routingNode) {
            if (shard.index().equals(index)) {
                newShardIds.add(shard.id());
            }
        }
        for (Integer existingShardId : indexService.shardIds()) {
            if (!newShardIds.contains(existingShardId)) {
                if (indexMetaData.state() == IndexMetaData.State.CLOSE) {
                    if (logger.isDebugEnabled()) {
                        logger.debug("[{}][{}] removing shard (index is closed)", index, existingShardId);
                    }
                    indexService.removeShard(existingShardId, "removing shard (index is closed)");
                } else {
                    // we can just remove the shard, without cleaning it locally, since we will clean it
                    // when all shards are allocated in the IndicesStore
                    if (logger.isDebugEnabled()) {
                        logger.debug("[{}][{}] removing shard (not allocated)", index, existingShardId);
                    }
                    indexService.removeShard(existingShardId, "removing shard (not allocated)");
                }
            }
        }
    }
}
@Override
public void clusterChanged(ClusterChangedEvent event) {
    if (!VALID_CLUSTER_STATES.contains(event.state().status())) {
        // only process fully applied cluster states
        return;
    }
    if (eventBus != null) {
        final List<String> indicesDeleted = event.indicesDeleted();
        if (!indicesDeleted.isEmpty()) {
            eventBus.post(IndicesDeletedEvent.create(indicesDeleted));
        }
    }
}
private void applyNewIndices(final ClusterChangedEvent event) {
    // we only create indices for shards that are allocated
    RoutingNodes.RoutingNodeIterator routingNode =
        event.state().readOnlyRoutingNodes().routingNodeIter(event.state().nodes().localNodeId());
    if (routingNode == null) {
        return;
    }
    for (ShardRouting shard : routingNode) {
        if (!indicesService.hasIndex(shard.index())) {
            final IndexMetaData indexMetaData = event.state().metaData().index(shard.index());
            if (logger.isDebugEnabled()) {
                logger.debug("[{}] creating index", indexMetaData.index());
            }
            try {
                indicesService.createIndex(indexMetaData.index(), indexMetaData.settings(), event.state().nodes().localNode().id());
            } catch (Throwable e) {
                sendFailShard(shard, indexMetaData.getIndexUUID(), "failed to create index", e);
            }
        }
    }
}
@Override
public void clusterChanged(ClusterChangedEvent event) {
    if (event.state().blocks().disableStatePersistence()) {
        // reset the current metadata, we need to start fresh...
        this.currentMetaData = null;
        return;
    }
    MetaData newMetaData = event.state().metaData();
    // delete indices that were there before, but are deleted now
    // we need to do it so they won't be detected as dangling
    if (currentMetaData != null) {
        // only delete indices when we already received a state (currentMetaData != null)
        for (IndexMetaData current : currentMetaData) {
            if (!newMetaData.hasIndex(current.index())) {
                logger.debug("[{}] deleting index that is no longer part of the metadata (indices: [{}])",
                    current.index(), newMetaData.indices().keys());
                if (nodeEnv.hasNodeFile()) {
                    FileSystemUtils.deleteRecursively(nodeEnv.indexLocations(new Index(current.index())));
                }
                try {
                    nodeIndexDeletedAction.nodeIndexStoreDeleted(event.state(), current.index(), event.state().nodes().localNodeId());
                } catch (Exception e) {
                    logger.debug("[{}] failed to notify master on local index store deletion", e, current.index());
                }
            }
        }
    }
    currentMetaData = newMetaData;
}
private void applyAliases(ClusterChangedEvent event) {
    // check if aliases changed
    if (aliasesChanged(event)) {
        // go over and update aliases
        for (IndexMetaData indexMetaData : event.state().metaData()) {
            String index = indexMetaData.index();
            IndexService indexService = indicesService.indexService(index);
            if (indexService == null) {
                // we only create / update here
                continue;
            }
            IndexAliasesService indexAliasesService = indexService.aliasesService();
            indexAliasesService.setAliases(indexMetaData.getAliases());
        }
    }
}
@Override
public void clusterChanged(final ClusterChangedEvent event) {
    if (!indicesService.changesAllowed()) {
        return;
    }
    if (!lifecycle.started()) {
        return;
    }
    synchronized (mutex) {
        // we need to clean the shards and indices we have on this node, since we
        // are going to recover them again once state persistence is disabled (no master / not recovered)
        // TODO: this feels a bit hacky here, a block disables state persistence, and then we clean
        // the allocated shards, maybe another flag in blocks?
        if (event.state().blocks().disableStatePersistence()) {
            for (IndexService indexService : indicesService) {
                String index = indexService.index().getName();
                for (Integer shardId : indexService.shardIds()) {
                    logger.debug("[{}][{}] removing shard (disabled block persistence)", index, shardId);
                    try {
                        indexService.removeShard(shardId, "removing shard (disabled block persistence)");
                    } catch (Throwable e) {
                        logger.warn("[{}] failed to remove shard (disabled block persistence)", e, index);
                    }
                }
                removeIndex(index, "cleaning index (disabled block persistence)");
            }
            return;
        }
        cleanFailedShards(event);
        applyDeletedIndices(event);
        applyNewIndices(event);
        applyMappings(event);
        applyAliases(event);
        applyNewOrUpdatedShards(event);
        applyDeletedShards(event);
        applyCleanedIndices(event);
        applySettings(event);
    }
}
@Override
public void clusterChanged(ClusterChangedEvent event) {
    if (event.source().equals(CLUSTER_UPDATE_TASK_SOURCE)) {
        // that's us, ignore this event
        return;
    }
    if (event.state().nodes().localNodeMaster()) {
        // we are master, schedule the routing table updater
        if (scheduledRoutingTableFuture == null) {
            // a new master (us), make sure we reroute shards
            routingTableDirty = true;
            scheduledRoutingTableFuture = threadPool.scheduleWithFixedDelay(new RoutingTableUpdater(), schedule);
        }
        if (event.nodesRemoved()) {
            // if nodes were removed, we don't want to wait for the scheduled task
            // since we want to get primary election as fast as possible
            routingTableDirty = true;
            reroute();
            // commented out since we make sure to reroute whenever shards change state or metadata changes
            // } else if (event.routingTableChanged()) {
            //     routingTableDirty = true;
            //     reroute();
        } else {
            if (event.nodesAdded()) {
                for (DiscoveryNode node : event.nodesDelta().addedNodes()) {
                    if (node.dataNode()) {
                        routingTableDirty = true;
                        break;
                    }
                }
            }
        }
    } else {
        FutureUtils.cancel(scheduledRoutingTableFuture);
        scheduledRoutingTableFuture = null;
    }
}
private void applySettings(ClusterChangedEvent event) {
    if (!event.metaDataChanged()) {
        return;
    }
    for (IndexMetaData indexMetaData : event.state().metaData()) {
        if (!indicesService.hasIndex(indexMetaData.index())) {
            // we only create / update here
            continue;
        }
        // if the index metadata didn't change, no need to check for refreshed settings
        if (!event.indexMetaDataChanged(indexMetaData)) {
            continue;
        }
        String index = indexMetaData.index();
        IndexService indexService = indicesService.indexService(index);
        if (indexService == null) {
            // already deleted on us, ignore it
            continue;
        }
        IndexSettingsService indexSettingsService = indexService.injector().getInstance(IndexSettingsService.class);
        indexSettingsService.refreshSettings(indexMetaData.settings());
    }
}
private void applyNewOrUpdatedShards(final ClusterChangedEvent event) {
    if (!indicesService.changesAllowed()) {
        return;
    }
    RoutingTable routingTable = event.state().routingTable();
    RoutingNodes.RoutingNodeIterator routingNode =
        event.state().readOnlyRoutingNodes().routingNodeIter(event.state().nodes().localNodeId());
    if (routingNode == null) {
        failedShards.clear();
        return;
    }
    DiscoveryNodes nodes = event.state().nodes();
    for (final ShardRouting shardRouting : routingNode) {
        final IndexService indexService = indicesService.indexService(shardRouting.index());
        if (indexService == null) {
            // got deleted on us, ignore
            continue;
        }
        final IndexMetaData indexMetaData = event.state().metaData().index(shardRouting.index());
        if (indexMetaData == null) {
            // the index got deleted from the metadata, we will clean it later in the apply deleted method call
            continue;
        }
        final int shardId = shardRouting.id();
        if (!indexService.hasShard(shardId) && shardRouting.started()) {
            if (failedShards.containsKey(shardRouting.shardId())) {
                if (nodes.masterNode() != null) {
                    shardStateAction.resendShardFailed(shardRouting, indexMetaData.getIndexUUID(),
                        "master " + nodes.masterNode() + " marked shard as started, but shard has previous failed. resending shard failure.",
                        nodes.masterNode());
                }
            } else {
                // the master thinks we are started, but we don't have this shard at all, mark it as failed
                sendFailShard(shardRouting, indexMetaData.getIndexUUID(),
                    "master [" + nodes.masterNode() + "] marked shard as started, but shard has not been created, mark shard as failed",
                    null);
            }
            continue;
        }
        IndexShard indexShard = indexService.shard(shardId);
        if (indexShard != null) {
            ShardRouting currentRoutingEntry = indexShard.routingEntry();
            // if the current and global routing are initializing, but are still not the same, it's a
            // different "shard" being allocated. for example: a shard that recovers from one node and now
            // needs to recover to another node, or a replica allocated and then allocating a primary
            // because the primary failed on another node
            boolean shardHasBeenRemoved = false;
            if (currentRoutingEntry.initializing() && shardRouting.initializing() && !currentRoutingEntry.equals(shardRouting)) {
                logger.debug("[{}][{}] removing shard (different instance of it allocated on this node, current [{}], global [{}])",
                    shardRouting.index(), shardRouting.id(), currentRoutingEntry, shardRouting);
                // closing the shard will also cancel any ongoing recovery.
                indexService.removeShard(shardRouting.id(), "removing shard (different instance of it allocated on this node)");
                shardHasBeenRemoved = true;
            } else if (isPeerRecovery(shardRouting)) {
                final DiscoveryNode sourceNode = findSourceNodeForPeerRecovery(routingTable, nodes, shardRouting);
                // check if there is an existing recovery going on, and if so, and the source node is not
                // the same, cancel the recovery to restart it
                final Predicate<RecoveryStatus> shouldCancel = new Predicate<RecoveryStatus>() {
                    @Override
                    public boolean apply(@Nullable RecoveryStatus status) {
                        return status.sourceNode().equals(sourceNode) == false;
                    }
                };
                if (recoveryTarget.cancelRecoveriesForShard(indexShard.shardId(), "recovery source node changed", shouldCancel)) {
                    logger.debug("[{}][{}] removing shard (recovery source changed), current [{}], global [{}])",
                        shardRouting.index(), shardRouting.id(), currentRoutingEntry, shardRouting);
                    // closing the shard will also cancel any ongoing recovery.
                    indexService.removeShard(shardRouting.id(), "removing shard (recovery source node changed)");
                    shardHasBeenRemoved = true;
                }
            }
            if (shardHasBeenRemoved == false
                    && (shardRouting.equals(indexShard.routingEntry()) == false
                        || shardRouting.version() > indexShard.routingEntry().version())) {
                if (shardRouting.primary() && indexShard.routingEntry().primary() == false
                        && shardRouting.initializing() && indexShard.allowsPrimaryPromotion() == false) {
                    logger.debug("{} reinitialize shard on primary promotion", indexShard.shardId());
                    indexService.removeShard(shardId, "promoted to primary");
                } else {
                    // if we happen to remove the shardRouting by id above we don't need to jump in here!
                    indexShard.updateRoutingEntry(shardRouting, event.state().blocks().disableStatePersistence() == false);
                }
            }
        }
        if (shardRouting.initializing()) {
            applyInitializingShard(event.state(), indexMetaData, shardRouting);
        }
    }
}
private boolean aliasesChanged(ClusterChangedEvent event) {
    return !event.state().metaData().aliases().equals(event.previousState().metaData().aliases())
        || !event.state().routingTable().equals(event.previousState().routingTable());
}
private void applyMappings(ClusterChangedEvent event) {
    // go over and update mappings
    for (IndexMetaData indexMetaData : event.state().metaData()) {
        if (!indicesService.hasIndex(indexMetaData.index())) {
            // we only create / update here
            continue;
        }
        List<String> typesToRefresh = Lists.newArrayList();
        String index = indexMetaData.index();
        IndexService indexService = indicesService.indexService(index);
        if (indexService == null) {
            // got deleted on us, ignore (closing the node)
            return;
        }
        try {
            MapperService mapperService = indexService.mapperService();
            // first, go over and update the _default_ mapping (if it exists)
            if (indexMetaData.mappings().containsKey(MapperService.DEFAULT_MAPPING)) {
                boolean requireRefresh = processMapping(index, mapperService, MapperService.DEFAULT_MAPPING,
                    indexMetaData.mapping(MapperService.DEFAULT_MAPPING).source());
                if (requireRefresh) {
                    typesToRefresh.add(MapperService.DEFAULT_MAPPING);
                }
            }
            // go over and add the relevant mappings (or update them)
            for (ObjectCursor<MappingMetaData> cursor : indexMetaData.mappings().values()) {
                MappingMetaData mappingMd = cursor.value;
                String mappingType = mappingMd.type();
                CompressedXContent mappingSource = mappingMd.source();
                if (mappingType.equals(MapperService.DEFAULT_MAPPING)) {
                    // we processed _default_ first
                    continue;
                }
                boolean requireRefresh = processMapping(index, mapperService, mappingType, mappingSource);
                if (requireRefresh) {
                    typesToRefresh.add(mappingType);
                }
            }
            if (!typesToRefresh.isEmpty() && sendRefreshMapping) {
                nodeMappingRefreshAction.nodeMappingRefresh(event.state(),
                    new NodeMappingRefreshAction.NodeMappingRefreshRequest(index, indexMetaData.indexUUID(),
                        typesToRefresh.toArray(new String[typesToRefresh.size()]),
                        event.state().nodes().localNodeId()));
            }
        } catch (Throwable t) {
            // if we failed the mappings anywhere, we need to fail the shards for this index. note, we
            // safeguard by processing the mappings on the master, or on the node the mapping was introduced on,
            // so this failure typically means wrong node level configuration or something similar
            for (IndexShard indexShard : indexService) {
                ShardRouting shardRouting = indexShard.routingEntry();
                failAndRemoveShard(shardRouting, indexService, true, "failed to update mappings", t);
            }
        }
    }
}
private ClusterState applyUpdate(ClusterState currentState, ClusterChangedEvent task) {
    boolean clusterStateChanged = false;
    ClusterState tribeState = task.state();
    DiscoveryNodes.Builder nodes = DiscoveryNodes.builder(currentState.nodes());
    // -- merge nodes
    // go over existing nodes, and see if they need to be removed
    for (DiscoveryNode discoNode : currentState.nodes()) {
        String markedTribeName = discoNode.attributes().get(TRIBE_NAME);
        if (markedTribeName != null && markedTribeName.equals(tribeName)) {
            if (tribeState.nodes().get(discoNode.id()) == null) {
                clusterStateChanged = true;
                logger.info("[{}] removing node [{}]", tribeName, discoNode);
                nodes.remove(discoNode.id());
            }
        }
    }
    // go over tribe nodes, and see if they need to be added
    for (DiscoveryNode tribe : tribeState.nodes()) {
        if (currentState.nodes().get(tribe.id()) == null) {
            // a new node, add it, but also add the tribe name to the attributes
            Map<String, String> tribeAttr = new HashMap<>();
            for (ObjectObjectCursor<String, String> attr : tribe.attributes()) {
                tribeAttr.put(attr.key, attr.value);
            }
            tribeAttr.put(TRIBE_NAME, tribeName);
            DiscoveryNode discoNode = new DiscoveryNode(tribe.name(), tribe.id(), tribe.getHostName(), tribe.getHostAddress(),
                tribe.address(), unmodifiableMap(tribeAttr), tribe.version());
            clusterStateChanged = true;
            logger.info("[{}] adding node [{}]", tribeName, discoNode);
            nodes.put(discoNode);
        }
    }
    // -- merge metadata
    ClusterBlocks.Builder blocks = ClusterBlocks.builder().blocks(currentState.blocks());
    MetaData.Builder metaData = MetaData.builder(currentState.metaData());
    RoutingTable.Builder routingTable = RoutingTable.builder(currentState.routingTable());
    // go over existing indices, and see if they need to be removed
    for (IndexMetaData index : currentState.metaData()) {
        String markedTribeName = index.getSettings().get(TRIBE_NAME);
        if (markedTribeName != null && markedTribeName.equals(tribeName)) {
            IndexMetaData tribeIndex = tribeState.metaData().index(index.getIndex());
            clusterStateChanged = true;
            if (tribeIndex == null || tribeIndex.getState() == IndexMetaData.State.CLOSE) {
                logger.info("[{}] removing index [{}]", tribeName, index.getIndex());
                removeIndex(blocks, metaData, routingTable, index);
            } else {
                // always make sure to update the metadata and routing table, in case
                // there are changes in them (new mapping, shards moving from initializing to started)
                routingTable.add(tribeState.routingTable().index(index.getIndex()));
                Settings tribeSettings = Settings.builder().put(tribeIndex.getSettings()).put(TRIBE_NAME, tribeName).build();
                metaData.put(IndexMetaData.builder(tribeIndex).settings(tribeSettings));
            }
        }
    }
    // go over tribe indices, and see if they need to be added
    for (IndexMetaData tribeIndex : tribeState.metaData()) {
        // if there is no routing table yet, do nothing with it...
        IndexRoutingTable table = tribeState.routingTable().index(tribeIndex.getIndex());
        if (table == null) {
            continue;
        }
        final IndexMetaData indexMetaData = currentState.metaData().index(tribeIndex.getIndex());
        if (indexMetaData == null) {
            if (!droppedIndices.contains(tribeIndex.getIndex())) {
                // a new index, add it, and add the tribe name as a setting
                clusterStateChanged = true;
                logger.info("[{}] adding index [{}]", tribeName, tribeIndex.getIndex());
                addNewIndex(tribeState, blocks, metaData, routingTable, tribeIndex);
            }
        } else {
            String existingFromTribe = indexMetaData.getSettings().get(TRIBE_NAME);
            if (!tribeName.equals(existingFromTribe)) {
                // we have a potential conflict on index names, decide what to do...
                if (ON_CONFLICT_ANY.equals(onConflict)) {
                    // we chose any tribe, carry on
                } else if (ON_CONFLICT_DROP.equals(onConflict)) {
                    // drop the indices, there is a conflict
                    clusterStateChanged = true;
                    logger.info("[{}] dropping index [{}] due to conflict with [{}]", tribeName, tribeIndex.getIndex(), existingFromTribe);
                    removeIndex(blocks, metaData, routingTable, tribeIndex);
                    droppedIndices.add(tribeIndex.getIndex());
                } else if (onConflict.startsWith(ON_CONFLICT_PREFER)) {
                    // on conflict, prefer a tribe...
                    String preferredTribeName = onConflict.substring(ON_CONFLICT_PREFER.length());
                    if (tribeName.equals(preferredTribeName)) {
                        // the new one is the preferred one, replace...
                        clusterStateChanged = true;
                        logger.info("[{}] adding index [{}], preferred over [{}]", tribeName, tribeIndex.getIndex(), existingFromTribe);
                        removeIndex(blocks, metaData, routingTable, tribeIndex);
                        addNewIndex(tribeState, blocks, metaData, routingTable, tribeIndex);
                    }
                    // else: either the existing one is the preferred one, or we haven't seen one, carry on
                }
            }
        }
    }
    if (!clusterStateChanged) {
        return currentState;
    } else {
        return ClusterState.builder(currentState)
            .incrementVersion()
            .blocks(blocks)
            .nodes(nodes)
            .metaData(metaData)
            .routingTable(routingTable.build())
            .build();
    }
}
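The merge above boils down to a two-way diff keyed on a tribe marker: drop entries previously imported from this tribe that the tribe no longer reports, and import new entries with the marker attached. A minimal sketch of that diff on plain maps; the `TRIBE_NAME` key and the string-attribute representation here are simplified stand-ins, not the Elasticsearch types.

```java
import java.util.HashMap;
import java.util.Map;

class TribeMerge {
    static final String TRIBE_NAME = "tribe.name"; // simplified marker attribute

    /**
     * Remove entries we previously imported from this tribe that the tribe no longer has,
     * then add new tribe entries tagged with the marker. Returns true if anything changed.
     */
    static boolean merge(Map<String, Map<String, String>> current,
                         Map<String, Map<String, String>> tribe,
                         String tribeName) {
        boolean changed = false;
        // remove entries marked with our tribe name that disappeared from the tribe's view
        changed |= current.entrySet().removeIf(e ->
            tribeName.equals(e.getValue().get(TRIBE_NAME)) && !tribe.containsKey(e.getKey()));
        // add new tribe entries, tagging them with the tribe name
        for (Map.Entry<String, Map<String, String>> e : tribe.entrySet()) {
            if (!current.containsKey(e.getKey())) {
                Map<String, String> attrs = new HashMap<>(e.getValue());
                attrs.put(TRIBE_NAME, tribeName);
                current.put(e.getKey(), attrs);
                changed = true;
            }
        }
        return changed;
    }
}
```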
<T> void runTasksForExecutor(ClusterStateTaskExecutor<T> executor) {
    final ArrayList<UpdateTask<T>> toExecute = new ArrayList<>();
    final Map<String, ArrayList<T>> processTasksBySource = new HashMap<>();
    synchronized (updateTasksPerExecutor) {
        List<UpdateTask> pending = updateTasksPerExecutor.remove(executor);
        if (pending != null) {
            for (UpdateTask<T> task : pending) {
                if (task.processed.getAndSet(true) == false) {
                    logger.trace("will process {}", task.toString(executor));
                    toExecute.add(task);
                    processTasksBySource.computeIfAbsent(task.source, s -> new ArrayList<>()).add(task.task);
                } else {
                    logger.trace("skipping {}, already processed", task.toString(executor));
                }
            }
        }
    }
    if (toExecute.isEmpty()) {
        return;
    }
    final String tasksSummary = processTasksBySource.entrySet().stream()
        .map(entry -> {
            String tasks = executor.describeTasks(entry.getValue());
            return tasks.isEmpty() ? entry.getKey() : entry.getKey() + "[" + tasks + "]";
        })
        .reduce((s1, s2) -> s1 + ", " + s2)
        .orElse("");
    if (!lifecycle.started()) {
        logger.debug("processing [{}]: ignoring, cluster_service not started", tasksSummary);
        return;
    }
    logger.debug("processing [{}]: execute", tasksSummary);
    ClusterState previousClusterState = clusterState;
    if (!previousClusterState.nodes().isLocalNodeElectedMaster() && executor.runOnlyOnMaster()) {
        logger.debug("failing [{}]: local node is no longer master", tasksSummary);
        toExecute.stream().forEach(task -> task.listener.onNoLongerMaster(task.source));
        return;
    }
    ClusterStateTaskExecutor.BatchResult<T> batchResult;
    long startTimeNS = currentTimeInNanos();
    try {
        List<T> inputs = toExecute.stream().map(tUpdateTask -> tUpdateTask.task).collect(Collectors.toList());
        batchResult = executor.execute(previousClusterState, inputs);
    } catch (Exception e) {
        TimeValue executionTime = TimeValue.timeValueMillis(Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS)));
        if (logger.isTraceEnabled()) {
            logger.trace(
                (Supplier<?>) () -> new ParameterizedMessage(
                    "failed to execute cluster state update in [{}], state:\nversion [{}], source [{}]\n{}{}{}",
                    executionTime,
                    previousClusterState.version(),
                    tasksSummary,
                    previousClusterState.nodes().prettyPrint(),
                    previousClusterState.routingTable().prettyPrint(),
                    previousClusterState.getRoutingNodes().prettyPrint()),
                e);
        }
        warnAboutSlowTaskIfNeeded(executionTime, tasksSummary);
        batchResult = ClusterStateTaskExecutor.BatchResult.<T>builder()
            .failures(toExecute.stream().map(updateTask -> updateTask.task)::iterator, e)
            .build(previousClusterState);
    }

    assert batchResult.executionResults != null;
    assert batchResult.executionResults.size() == toExecute.size()
        : String.format(Locale.ROOT, "expected [%d] task result%s but was [%d]",
            toExecute.size(), toExecute.size() == 1 ? "" : "s", batchResult.executionResults.size());
    boolean assertsEnabled = false;
    assert (assertsEnabled = true);
    if (assertsEnabled) {
        for (UpdateTask<T> updateTask : toExecute) {
            assert batchResult.executionResults.containsKey(updateTask.task)
                : "missing task result for " + updateTask.toString(executor);
        }
    }

    ClusterState newClusterState = batchResult.resultingState;
    final ArrayList<UpdateTask<T>> processedListeners = new ArrayList<>();
    // fail all tasks that have failed and extract those that are waiting for results
    for (UpdateTask<T> updateTask : toExecute) {
        assert batchResult.executionResults.containsKey(updateTask.task) : "missing " + updateTask.toString(executor);
        final ClusterStateTaskExecutor.TaskResult executionResult = batchResult.executionResults.get(updateTask.task);
        executionResult.handle(
            () -> processedListeners.add(updateTask),
            ex -> {
                logger.debug(
                    (Supplier<?>) () -> new ParameterizedMessage("cluster state update task {} failed", updateTask.toString(executor)), ex);
                updateTask.listener.onFailure(updateTask.source, ex);
            });
    }

    if (previousClusterState == newClusterState) {
        for (UpdateTask<T> task : processedListeners) {
            if (task.listener instanceof AckedClusterStateTaskListener) {
                // no need to wait for ack if nothing changed, the update can be counted as acknowledged
                ((AckedClusterStateTaskListener) task.listener).onAllNodesAcked(null);
            }
            task.listener.clusterStateProcessed(task.source, previousClusterState, newClusterState);
        }
        TimeValue executionTime = TimeValue.timeValueMillis(Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS)));
        logger.debug("processing [{}]: took [{}] no change in cluster_state", tasksSummary, executionTime);
        warnAboutSlowTaskIfNeeded(executionTime, tasksSummary);
        return;
    }

    try {
        ArrayList<Discovery.AckListener> ackListeners = new ArrayList<>();
        if (newClusterState.nodes().isLocalNodeElectedMaster()) {
            // only the master controls the version numbers
            Builder builder = ClusterState.builder(newClusterState).incrementVersion();
            if (previousClusterState.routingTable() != newClusterState.routingTable()) {
                builder.routingTable(RoutingTable.builder(newClusterState.routingTable())
                    .version(newClusterState.routingTable().version() + 1)
                    .build());
            }
            if (previousClusterState.metaData() != newClusterState.metaData()) {
                builder.metaData(MetaData.builder(newClusterState.metaData()).version(newClusterState.metaData().version() + 1));
            }
            newClusterState = builder.build();
            for (UpdateTask<T> task : processedListeners) {
                if (task.listener instanceof AckedClusterStateTaskListener) {
                    final AckedClusterStateTaskListener ackedListener = (AckedClusterStateTaskListener) task.listener;
                    if (ackedListener.ackTimeout() == null || ackedListener.ackTimeout().millis() == 0) {
                        ackedListener.onAckTimeout();
                    } else {
                        try {
                            ackListeners.add(new AckCountDownListener(ackedListener, newClusterState.version(),
                                newClusterState.nodes(), threadPool));
                        } catch (EsRejectedExecutionException ex) {
                            if (logger.isDebugEnabled()) {
                                logger.debug("Couldn't schedule timeout thread - node might be shutting down", ex);
                            }
                            // timeout straightaway, otherwise we could wait forever as the timeout thread has not started
                            ackedListener.onAckTimeout();
                        }
                    }
                }
            }
        }
        final Discovery.AckListener ackListener = new DelegetingAckListener(ackListeners);

        newClusterState.status(ClusterState.ClusterStateStatus.BEING_APPLIED);

        if (logger.isTraceEnabled()) {
            logger.trace("cluster state updated, source [{}]\n{}", tasksSummary, newClusterState.prettyPrint());
        } else if (logger.isDebugEnabled()) {
            logger.debug("cluster state updated, version [{}], source [{}]", newClusterState.version(), tasksSummary);
        }

        ClusterChangedEvent clusterChangedEvent = new ClusterChangedEvent(tasksSummary, newClusterState, previousClusterState);
        // new cluster state, notify all listeners
        final DiscoveryNodes.Delta nodesDelta = clusterChangedEvent.nodesDelta();
        if (nodesDelta.hasChanges() && logger.isInfoEnabled()) {
            String summary = nodesDelta.shortSummary();
            if (summary.length() > 0) {
                logger.info("{}, reason: {}", summary, tasksSummary);
            }
        }

        nodeConnectionsService.connectToAddedNodes(clusterChangedEvent);

        // if we are the master, publish the new state to all nodes
        // we publish here before we send a notification to all the listeners, since if it fails
        // we don't want to notify
        if (newClusterState.nodes().isLocalNodeElectedMaster()) {
            logger.debug("publishing cluster state version [{}]", newClusterState.version());
            try {
                clusterStatePublisher.accept(clusterChangedEvent, ackListener);
            } catch (Discovery.FailedToCommitClusterStateException t) {
                final long version = newClusterState.version();
                logger.warn(
                    (Supplier<?>) () -> new ParameterizedMessage(
                        "failing [{}]: failed to commit cluster state version [{}]", tasksSummary, version),
                    t);
                processedListeners.forEach(task -> task.listener.onFailure(task.source, t));
                return;
            }
        }

        // update the current cluster state
        clusterState = newClusterState;
        logger.debug("set local cluster state to version {}", newClusterState.version());
        try {
            // nothing to do until we actually recover from the gateway or any other block indicates we
            // need to disable persistency
            if (clusterChangedEvent.state().blocks().disableStatePersistence() == false && clusterChangedEvent.metaDataChanged()) {
                final Settings incomingSettings = clusterChangedEvent.state().metaData().settings();
                clusterSettings.applySettings(incomingSettings);
            }
        } catch (Exception ex) {
            logger.warn("failed to apply cluster settings", ex);
        }
        for (ClusterStateListener listener : preAppliedListeners) {
            try {
                listener.clusterChanged(clusterChangedEvent);
            } catch (Exception ex) {
                logger.warn("failed to notify ClusterStateListener", ex);
            }
        }

        nodeConnectionsService.disconnectFromRemovedNodes(clusterChangedEvent);

        newClusterState.status(ClusterState.ClusterStateStatus.APPLIED);

        for (ClusterStateListener listener : postAppliedListeners) {
            try {
                listener.clusterChanged(clusterChangedEvent);
            } catch (Exception ex) {
                logger.warn("failed to notify ClusterStateListener", ex);
            }
        }

        // manual ack only from the master at the end of the publish
        if (newClusterState.nodes().isLocalNodeElectedMaster()) {
            try {
                ackListener.onNodeAck(newClusterState.nodes().getLocalNode(), null);
            } catch (Exception e) {
                final DiscoveryNode localNode = newClusterState.nodes().getLocalNode();
                logger.debug(
                    (Supplier<?>) () -> new ParameterizedMessage("error while processing ack for master node [{}]", localNode),
                    e);
            }
        }

        for (UpdateTask<T> task : processedListeners) {
            task.listener.clusterStateProcessed(task.source, previousClusterState, newClusterState);
        }

        try {
            executor.clusterStatePublished(clusterChangedEvent);
        } catch (Exception e) {
            logger.error(
                (Supplier<?>) () -> new ParameterizedMessage(
                    "exception thrown while notifying executor of new cluster state publication [{}]", tasksSummary),
                e);
        }

        TimeValue executionTime = TimeValue.timeValueMillis(Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS)));
        logger.debug("processing [{}]: took [{}] done applying updated cluster_state (version: {}, uuid: {})",
            tasksSummary, executionTime, newClusterState.version(), newClusterState.stateUUID());
        warnAboutSlowTaskIfNeeded(executionTime, tasksSummary);
    } catch (Exception e) {
        TimeValue executionTime = TimeValue.timeValueMillis(Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS)));
        final long version = newClusterState.version();
        final String stateUUID = newClusterState.stateUUID();
        final String prettyPrint = newClusterState.prettyPrint();
        logger.warn(
            (Supplier<?>) () -> new ParameterizedMessage(
                "failed to apply updated cluster state in [{}]:\nversion [{}], uuid [{}], source [{}]\n{}",
                executionTime, version, stateUUID, tasksSummary, prettyPrint),
            e);
        // TODO: do we want to call updateTask.onFailure here?
    }
}
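The draining step at the top of runTasksForExecutor relies on two small idioms: each pending task is claimed exactly once via AtomicBoolean.getAndSet(true), and claimed tasks are grouped by their source string with computeIfAbsent to build the batch summary. Below is a stripped-down, self-contained sketch of just that step, using plain JDK types; the `Task` class is a hypothetical stand-in for UpdateTask, not the real one.

```java
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;

class TaskBatcher {
    // hypothetical stand-in for UpdateTask: a source label plus a claim flag
    static final class Task {
        final String source;
        final AtomicBoolean processed = new AtomicBoolean();
        Task(String source) { this.source = source; }
    }

    /** Claim each pending task at most once and group the claimed ones by source. */
    static Map<String, List<Task>> drain(List<Task> pending) {
        Map<String, List<Task>> bySource = new HashMap<>();
        for (Task task : pending) {
            // getAndSet(true) returns false only for the first caller, so a task is never batched twice
            if (task.processed.getAndSet(true) == false) {
                bySource.computeIfAbsent(task.source, s -> new ArrayList<>()).add(task);
            }
        }
        return bySource;
    }

    /** Build a "source[n tasks]" style summary, mirroring how tasksSummary is assembled. */
    static String summary(Map<String, List<Task>> bySource) {
        return bySource.entrySet().stream()
            .map(e -> e.getKey() + "[" + e.getValue().size() + " tasks]")
            .reduce((a, b) -> a + ", " + b)
            .orElse("");
    }
}
```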
@Override
public void clusterChanged(final ClusterChangedEvent event) {
    if (lifecycle.stoppedOrClosed()) {
        return;
    }
    final ClusterState state = event.state();
    if (state.nodes().isLocalNodeElectedMaster() == false) {
        // not our job to recover
        return;
    }
    if (state.blocks().hasGlobalBlock(STATE_NOT_RECOVERED_BLOCK) == false) {
        // already recovered
        return;
    }
    DiscoveryNodes nodes = state.nodes();
    if (state.nodes().getMasterNodeId() == null) {
        logger.debug("not recovering from gateway, no master elected yet");
    } else if (recoverAfterNodes != -1 && (nodes.getMasterAndDataNodes().size()) < recoverAfterNodes) {
        logger.debug("not recovering from gateway, nodes_size (data+master) [{}] < recover_after_nodes [{}]",
            nodes.getMasterAndDataNodes().size(), recoverAfterNodes);
    } else if (recoverAfterDataNodes != -1 && nodes.getDataNodes().size() < recoverAfterDataNodes) {
        logger.debug("not recovering from gateway, nodes_size (data) [{}] < recover_after_data_nodes [{}]",
            nodes.getDataNodes().size(), recoverAfterDataNodes);
    } else if (recoverAfterMasterNodes != -1 && nodes.getMasterNodes().size() < recoverAfterMasterNodes) {
        logger.debug("not recovering from gateway, nodes_size (master) [{}] < recover_after_master_nodes [{}]",
            nodes.getMasterNodes().size(), recoverAfterMasterNodes);
    } else {
        boolean enforceRecoverAfterTime;
        String reason;
        if (expectedNodes == -1 && expectedMasterNodes == -1 && expectedDataNodes == -1) {
            // no expected number of nodes is set, honor the recover_after settings if they are there
            enforceRecoverAfterTime = true;
            reason = "recover_after_time was set to [" + recoverAfterTime + "]";
        } else {
            // one of the expected numbers is set, see if all of them are met, and ignore the timeout in this case
            enforceRecoverAfterTime = false;
            reason = "";
            if (expectedNodes != -1 && (nodes.getMasterAndDataNodes().size() < expectedNodes)) {
                // does not meet the expected...
                enforceRecoverAfterTime = true;
                reason = "expecting [" + expectedNodes + "] nodes, but only have [" + nodes.getMasterAndDataNodes().size() + "]";
            } else if (expectedDataNodes != -1 && (nodes.getDataNodes().size() < expectedDataNodes)) {
                // does not meet the expected...
                enforceRecoverAfterTime = true;
                reason = "expecting [" + expectedDataNodes + "] data nodes, but only have [" + nodes.getDataNodes().size() + "]";
            } else if (expectedMasterNodes != -1 && (nodes.getMasterNodes().size() < expectedMasterNodes)) {
                // does not meet the expected...
                enforceRecoverAfterTime = true;
                reason = "expecting [" + expectedMasterNodes + "] master nodes, but only have [" + nodes.getMasterNodes().size() + "]";
            }
        }
        performStateRecovery(enforceRecoverAfterTime, reason);
    }
}
@Override
public void clusterChanged(ClusterChangedEvent event) {
    if (event.state().blocks().disableStatePersistence()) {
        // reset the current metadata, we need to start fresh...
        this.currentMetaData = null;
        return;
    }
    MetaData newMetaData = event.state().metaData();
    // we don't check if metaData changed, since we might be called several times and we need to check dangling...
    boolean success = true;
    // only applies to the master node, writing the global and index level states
    if (event.state().nodes().localNode().masterNode()) {
        // check if the global state changed?
        if (currentMetaData == null || !MetaData.isGlobalStateEquals(currentMetaData, newMetaData)) {
            try {
                writeGlobalState("changed", newMetaData, currentMetaData);
            } catch (Exception e) {
                success = false;
            }
        }
        // check and write changes in indices
        for (IndexMetaData indexMetaData : newMetaData) {
            String writeReason = null;
            IndexMetaData currentIndexMetaData;
            if (currentMetaData == null) {
                // a new event..., check from the state stored
                currentIndexMetaData = loadIndex(indexMetaData.index());
            } else {
                currentIndexMetaData = currentMetaData.index(indexMetaData.index());
            }
            if (currentIndexMetaData == null) {
                writeReason = "freshly created";
            } else if (currentIndexMetaData.version() != indexMetaData.version()) {
                writeReason = "version changed from [" + currentIndexMetaData.version() + "] to [" + indexMetaData.version() + "]";
            }
            // we update the writeReason only if we really need to write it
            if (writeReason == null) {
                continue;
            }
            try {
                writeIndex(writeReason, indexMetaData, currentIndexMetaData);
            } catch (Exception e) {
                success = false;
            }
        }
    }

    // delete indices that were there before, but are deleted now
    // we need to do it so they won't be detected as dangling
    if (nodeEnv.hasNodeFile()) {
        if (currentMetaData != null) {
            // only delete indices when we already received a state (currentMetaData != null)
            // and we had a go at processing dangling indices at least once
            // this will also delete the _state of the index itself
            for (IndexMetaData current : currentMetaData) {
                if (danglingIndices.containsKey(current.index())) {
                    continue;
                }
                if (!newMetaData.hasIndex(current.index())) {
                    logger.debug("[{}] deleting index that is no longer part of the metadata (indices: [{}])",
                        current.index(), newMetaData.indices().keySet());
                    FileSystemUtils.deleteRecursively(nodeEnv.indexLocations(new Index(current.index())));
                }
            }
        }
    }

    // handle dangling indices, we handle those for all nodes that have a node file (data or master)
    if (nodeEnv.hasNodeFile()) {
        if (danglingTimeout.millis() >= 0) {
            synchronized (danglingMutex) {
                for (String danglingIndex : danglingIndices.keySet()) {
                    if (newMetaData.hasIndex(danglingIndex)) {
                        logger.debug("[{}] no longer dangling (created), removing", danglingIndex);
                        DanglingIndex removed = danglingIndices.remove(danglingIndex);
                        removed.future.cancel(false);
                    }
                }
                // delete indices that are no longer part of the metadata
                try {
                    for (String indexName : nodeEnv.findAllIndices()) {
                        // if we have the index in the metadata, don't delete it
                        if (newMetaData.hasIndex(indexName)) {
                            continue;
                        }
                        if (danglingIndices.containsKey(indexName)) {
                            // already dangling, continue
                            continue;
                        }
                        IndexMetaData indexMetaData = loadIndex(indexName);
                        if (indexMetaData != null) {
                            if (danglingTimeout.millis() == 0) {
                                logger.info("[{}] dangling index, exists on local file system, but not in cluster metadata, timeout set to 0, deleting now",
                                    indexName);
                                FileSystemUtils.deleteRecursively(nodeEnv.indexLocations(new Index(indexName)));
                            } else {
                                logger.info("[{}] dangling index, exists on local file system, but not in cluster metadata, scheduling to delete in [{}], auto import to cluster state [{}]",
                                    indexName, danglingTimeout, autoImportDangled);
                                danglingIndices.put(indexName,
                                    new DanglingIndex(indexName,
                                        threadPool.schedule(danglingTimeout, ThreadPool.Names.SAME, new RemoveDanglingIndex(indexName))));
                            }
                        }
                    }
                } catch (Exception e) {
                    logger.warn("failed to find dangling indices", e);
                }
            }
        }
        if (autoImportDangled.shouldImport() && !danglingIndices.isEmpty()) {
            final List<IndexMetaData> dangled = Lists.newArrayList();
            for (String indexName : danglingIndices.keySet()) {
                IndexMetaData indexMetaData = loadIndex(indexName);
                if (indexMetaData == null) {
                    logger.debug("failed to find state for dangling index [{}]", indexName);
                    continue;
                }
                // we might have someone copying over an index, renaming the directory, handle that
                if (!indexMetaData.index().equals(indexName)) {
                    logger.info("dangled index directory name is [{}], state name is [{}], renaming to directory name",
                        indexName, indexMetaData.index());
                    indexMetaData = IndexMetaData.newIndexMetaDataBuilder(indexMetaData).index(indexName).build();
                }
                if (autoImportDangled == AutoImportDangledState.CLOSED) {
                    indexMetaData = IndexMetaData.newIndexMetaDataBuilder(indexMetaData).state(IndexMetaData.State.CLOSE).build();
                }
                if (indexMetaData != null) {
                    dangled.add(indexMetaData);
                }
            }
            IndexMetaData[] dangledIndices = dangled.toArray(new IndexMetaData[dangled.size()]);
            try {
                allocateDangledIndices.allocateDangled(dangledIndices, new LocalAllocateDangledIndices.Listener() {
                    @Override
                    public void onResponse(LocalAllocateDangledIndices.AllocateDangledResponse response) {
                        logger.trace("allocated dangled");
                    }

                    @Override
                    public void onFailure(Throwable e) {
                        logger.info("failed to send allocated dangled", e);
                    }
                });
            } catch (Exception e) {
                logger.warn("failed to send allocate dangled", e);
            }
        }
    }

    if (success) {
        currentMetaData = newMetaData;
    }
}