private boolean moveShards(RoutingAllocation allocation) {
    boolean changed = false;

    // create a copy of the shards interleaving between nodes, and check if they can remain
    List<MutableShardRouting> shards = new ArrayList<>();
    int index = 0;
    boolean found = true;
    final RoutingNodes routingNodes = allocation.routingNodes();
    while (found) {
        found = false;
        for (RoutingNode routingNode : routingNodes) {
            if (index >= routingNode.size()) {
                continue;
            }
            found = true;
            shards.add(routingNode.get(index));
        }
        index++;
    }
    for (int i = 0; i < shards.size(); i++) {
        MutableShardRouting shardRouting = shards.get(i);
        // we can only move started shards...
        if (!shardRouting.started()) {
            continue;
        }
        final RoutingNode routingNode = routingNodes.node(shardRouting.currentNodeId());
        Decision decision = allocation.deciders().canRemain(shardRouting, routingNode, allocation);
        if (decision.type() == Decision.Type.NO) {
            logger.debug("[{}][{}] allocated on [{}], but can no longer be allocated on it, moving...",
                    shardRouting.index(), shardRouting.id(), routingNode.node());
            boolean moved = shardsAllocators.move(shardRouting, routingNode, allocation);
            if (!moved) {
                logger.debug("[{}][{}] can't move", shardRouting.index(), shardRouting.id());
            } else {
                changed = true;
            }
        }
    }
    return changed;
}
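// A minimal, self-contained sketch (not Elasticsearch code; InterleaveSketch, the
// List<List<String>> stand-in for RoutingNodes, and the sample shard names are all
// illustrative assumptions) of the interleaving traversal used by moveShards above:
// shards are collected column by column across nodes, so the move loop visits nodes
// round-robin instead of draining one node before looking at the next.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

class InterleaveSketch {

    static List<String> interleave(List<List<String>> nodes) {
        List<String> shards = new ArrayList<>();
        int index = 0;
        boolean found = true;
        // keep sweeping while at least one node still has a shard at this column index
        while (found) {
            found = false;
            for (List<String> node : nodes) {
                if (index >= node.size()) {
                    continue;
                }
                found = true;
                shards.add(node.get(index));
            }
            index++;
        }
        return shards;
    }

    public static void main(String[] args) {
        // node1 holds [a0, a1], node2 holds [b0] -> traversal order is a0, b0, a1
        System.out.println(interleave(Arrays.asList(
                Arrays.asList("a0", "a1"),
                Arrays.asList("b0"))));
    }
}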
@Override
public void execute(RoutingAllocation allocation) throws ElasticSearchException {
    DiscoveryNode discoNode = allocation.nodes().resolveNode(node);
    boolean found = false;
    for (RoutingNodes.RoutingNodeIterator it = allocation.routingNodes().routingNodeIter(discoNode.id()); it.hasNext(); ) {
        MutableShardRouting shardRouting = it.next();
        if (!shardRouting.shardId().equals(shardId)) {
            continue;
        }
        found = true;
        if (shardRouting.relocatingNodeId() != null) {
            if (shardRouting.initializing()) {
                // the shard is initializing and recovering from another node, simply cancel the recovery
                it.remove();
                // and cancel the relocating state on the shard it's being relocated from
                RoutingNode relocatingFromNode = allocation.routingNodes().node(shardRouting.relocatingNodeId());
                if (relocatingFromNode != null) {
                    for (MutableShardRouting fromShardRouting : relocatingFromNode) {
                        if (fromShardRouting.shardId().equals(shardRouting.shardId())
                                && fromShardRouting.state() == RELOCATING) {
                            allocation.routingNodes().cancelRelocation(fromShardRouting);
                            break;
                        }
                    }
                }
            } else if (shardRouting.relocating()) {
                // the shard is relocating to another node, cancel the recovery on the other node,
                // and deallocate this one
                if (!allowPrimary && shardRouting.primary()) {
                    // can't cancel a primary shard while it is relocating
                    throw new ElasticSearchIllegalArgumentException("[cancel_allocation] can't cancel "
                            + shardId + " on node " + discoNode
                            + ", shard is primary and initializing its state");
                }
                it.moveToUnassigned();
                // now, go and find the shard that is initializing on the target node, and cancel it as well...
                RoutingNodes.RoutingNodeIterator initializingNode =
                        allocation.routingNodes().routingNodeIter(shardRouting.relocatingNodeId());
                if (initializingNode != null) {
                    while (initializingNode.hasNext()) {
                        MutableShardRouting initializingShardRouting = initializingNode.next();
                        if (initializingShardRouting.shardId().equals(shardRouting.shardId())
                                && initializingShardRouting.state() == INITIALIZING) {
                            initializingNode.remove();
                        }
                    }
                }
            }
        } else {
            // the shard is not relocating, it's either started or initializing, just cancel it and move on...
            if (!allowPrimary && shardRouting.primary()) {
                // can't cancel a started primary shard
                throw new ElasticSearchIllegalArgumentException("[cancel_allocation] can't cancel "
                        + shardId + " on node " + discoNode + ", shard is primary and started");
            }
            it.remove();
            allocation.routingNodes().unassigned().add(new MutableShardRouting(
                    shardRouting.index(), shardRouting.id(), null, shardRouting.primary(),
                    ShardRoutingState.UNASSIGNED, shardRouting.version() + 1));
        }
    }
    if (!found) {
        throw new ElasticSearchIllegalArgumentException("[cancel_allocation] can't cancel " + shardId
                + ", failed to find it on node " + discoNode);
    }
}
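// A toy model (hypothetical Copy/State types, not the RoutingNodes API) of why the cancel
// command above has to handle both sides of a relocation: the source node carries a
// RELOCATING copy and the target node carries an INITIALIZING copy, so cancelling the
// recovery removes the target copy and flips the source copy back to STARTED, mirroring
// it.remove() plus cancelRelocation() in the real code.
import java.util.ArrayList;
import java.util.List;

class CancelRelocationSketch {

    enum State { STARTED, INITIALIZING, RELOCATING }

    static class Copy {
        final String node;
        State state;

        Copy(String node, State state) {
            this.node = node;
            this.state = state;
        }
    }

    // cancel the copy that is initializing on the relocation target and restore the source
    static void cancelOnTarget(List<Copy> copies, String targetNode) {
        copies.removeIf(c -> c.node.equals(targetNode) && c.state == State.INITIALIZING);
        for (Copy c : copies) {
            if (c.state == State.RELOCATING) {
                c.state = State.STARTED;
            }
        }
    }

    public static void main(String[] args) {
        List<Copy> shard = new ArrayList<>();
        shard.add(new Copy("node1", State.RELOCATING));   // relocation source
        shard.add(new Copy("node2", State.INITIALIZING)); // relocation target
        cancelOnTarget(shard, "node2");
        // left with a single STARTED copy on node1
        shard.forEach(c -> System.out.println(c.node + " " + c.state));
    }
}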
public boolean allocateUnassigned(RoutingAllocation allocation) {
    boolean changed = false;
    DiscoveryNodes nodes = allocation.nodes();
    RoutingNodes routingNodes = allocation.routingNodes();

    // First, handle primaries, they must find a place to be allocated on here
    Iterator<MutableShardRouting> unassignedIterator = routingNodes.unassigned().iterator();
    while (unassignedIterator.hasNext()) {
        MutableShardRouting shard = unassignedIterator.next();
        if (!shard.primary()) {
            continue;
        }

        // this is an API allocation, ignore since we know there is no data...
        if (!routingNodes.routingTable().index(shard.index()).shard(shard.id()).primaryAllocatedPostApi()) {
            continue;
        }

        ObjectLongOpenHashMap<DiscoveryNode> nodesState = buildShardStates(nodes, shard);

        int numberOfAllocationsFound = 0;
        long highestVersion = -1;
        Set<DiscoveryNode> nodesWithHighestVersion = Sets.newHashSet();
        final boolean[] states = nodesState.allocated;
        final Object[] keys = nodesState.keys;
        final long[] values = nodesState.values;
        for (int i = 0; i < states.length; i++) {
            if (!states[i]) {
                continue;
            }
            DiscoveryNode node = (DiscoveryNode) keys[i];
            long version = values[i];
            // since we don't check in NO allocation, we need to double check here
            if (allocation.shouldIgnoreShardForNode(shard.shardId(), node.id())) {
                continue;
            }
            if (version != -1) {
                numberOfAllocationsFound++;
                if (highestVersion == -1) {
                    nodesWithHighestVersion.add(node);
                    highestVersion = version;
                } else {
                    if (version > highestVersion) {
                        nodesWithHighestVersion.clear();
                        nodesWithHighestVersion.add(node);
                        highestVersion = version;
                    } else if (version == highestVersion) {
                        nodesWithHighestVersion.add(node);
                    }
                }
            }
        }

        // check if the counts meet the minimum set
        int requiredAllocation = 1;
        // if we restore from a repository one copy is more than enough
        if (shard.restoreSource() == null) {
            // (a standalone sketch of this initial_shards mapping appears after this method)
            String initialShards = null;
            try {
                IndexMetaData indexMetaData = routingNodes.metaData().index(shard.index());
                initialShards = indexMetaData.settings().get(INDEX_RECOVERY_INITIAL_SHARDS,
                        settings.get(INDEX_RECOVERY_INITIAL_SHARDS, this.initialShards));
                if ("quorum".equals(initialShards)) {
                    if (indexMetaData.numberOfReplicas() > 1) {
                        requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2) + 1;
                    }
                } else if ("quorum-1".equals(initialShards) || "half".equals(initialShards)) {
                    if (indexMetaData.numberOfReplicas() > 2) {
                        requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2);
                    }
                } else if ("one".equals(initialShards)) {
                    requiredAllocation = 1;
                } else if ("full".equals(initialShards) || "all".equals(initialShards)) {
                    requiredAllocation = indexMetaData.numberOfReplicas() + 1;
                } else if ("full-1".equals(initialShards) || "all-1".equals(initialShards)) {
                    if (indexMetaData.numberOfReplicas() > 1) {
                        requiredAllocation = indexMetaData.numberOfReplicas();
                    }
                } else {
                    requiredAllocation = Integer.parseInt(initialShards);
                }
            } catch (Exception e) {
                logger.warn("[{}][{}] failed to derive initial_shards from value {}, ignore allocation for {}",
                        shard.index(), shard.id(), initialShards, shard);
            }
        }

        // not enough found for this shard, continue...
        if (numberOfAllocationsFound < requiredAllocation) {
            // if we are restoring this shard we still can allocate
            if (shard.restoreSource() == null) {
                // we can't really allocate, so ignore it and continue
                unassignedIterator.remove();
                routingNodes.ignoredUnassigned().add(shard);
                if (logger.isDebugEnabled()) {
                    logger.debug("[{}][{}]: not allocating, number_of_allocated_shards_found [{}], required_number [{}]",
                            shard.index(), shard.id(), numberOfAllocationsFound, requiredAllocation);
                }
            } else if (logger.isDebugEnabled()) {
                logger.debug("[{}][{}]: missing local data, will restore from [{}]",
                        shard.index(), shard.id(), shard.restoreSource());
            }
            continue;
        }

        Set<DiscoveryNode> throttledNodes = Sets.newHashSet();
        Set<DiscoveryNode> noNodes = Sets.newHashSet();
        for (DiscoveryNode discoNode : nodesWithHighestVersion) {
            RoutingNode node = routingNodes.node(discoNode.id());
            if (node == null) {
                continue;
            }
            Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
            if (decision.type() == Decision.Type.THROTTLE) {
                throttledNodes.add(discoNode);
            } else if (decision.type() == Decision.Type.NO) {
                noNodes.add(discoNode);
            } else {
                if (logger.isDebugEnabled()) {
                    logger.debug("[{}][{}]: allocating [{}] to [{}] on primary allocation",
                            shard.index(), shard.id(), shard, discoNode);
                }
                // we found a match
                changed = true;
                // make sure we create one with the version from the recovered state
                allocation.routingNodes().assign(new MutableShardRouting(shard, highestVersion), node.nodeId());
                unassignedIterator.remove();

                // found a node, so no throttling, no "no", and break out of the loop
                throttledNodes.clear();
                noNodes.clear();
                break;
            }
        }
        if (throttledNodes.isEmpty()) {
            // if we have a node that we "can't" allocate to, force allocation, since this is our master data!
            if (!noNodes.isEmpty()) {
                DiscoveryNode discoNode = noNodes.iterator().next();
                RoutingNode node = routingNodes.node(discoNode.id());
                if (logger.isDebugEnabled()) {
                    logger.debug("[{}][{}]: forcing allocating [{}] to [{}] on primary allocation",
                            shard.index(), shard.id(), shard, discoNode);
                }
                // we found a match
                changed = true;
                // make sure we create one with the version from the recovered state
                allocation.routingNodes().assign(new MutableShardRouting(shard, highestVersion), node.nodeId());
                unassignedIterator.remove();
            }
        } else {
            if (logger.isDebugEnabled()) {
                logger.debug("[{}][{}]: throttling allocation [{}] to [{}] on primary allocation",
                        shard.index(), shard.id(), shard, throttledNodes);
            }
            // we are throttling this, but we have enough to allocate to this node, ignore it for now
            unassignedIterator.remove();
            routingNodes.ignoredUnassigned().add(shard);
        }
    }

    if (!routingNodes.hasUnassigned()) {
        return changed;
    }

    // Now, handle replicas, try to assign them to nodes that are similar to the one the primary was allocated on
    unassignedIterator = routingNodes.unassigned().iterator();
    while (unassignedIterator.hasNext()) {
        MutableShardRouting shard = unassignedIterator.next();

        // pre-check if it can be allocated to any node that currently exists,
        // so we won't list the store for it for nothing
        boolean canBeAllocatedToAtLeastOneNode = false;
        for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
            RoutingNode node = routingNodes.node(cursor.value.id());
            if (node == null) {
                continue;
            }
            // if we can't allocate it on a node, ignore it, for example, this handles
            // cases for only allocating a replica after a primary
            Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
            if (decision.type() == Decision.Type.YES) {
                canBeAllocatedToAtLeastOneNode = true;
                break;
            }
        }
        if (!canBeAllocatedToAtLeastOneNode) {
            continue;
        }

        Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> shardStores =
                buildShardStores(nodes, shard);

        long lastSizeMatched = 0;
        DiscoveryNode lastDiscoNodeMatched = null;
        RoutingNode lastNodeMatched = null;

        for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> nodeStoreEntry : shardStores.entrySet()) {
            DiscoveryNode discoNode = nodeStoreEntry.getKey();
            TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData = nodeStoreEntry.getValue();
            logger.trace("{}: checking node [{}]", shard, discoNode);

            if (storeFilesMetaData == null) {
                // already allocated on that node...
                continue;
            }

            RoutingNode node = routingNodes.node(discoNode.id());
            if (node == null) {
                continue;
            }

            // check if we can allocate on that node...
            // we only check for NO, since if this node is THROTTLING and it has enough "same data"
            // then we will try and assign it next time
            Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
            if (decision.type() == Decision.Type.NO) {
                continue;
            }

            // if it is already allocated, we can't assign to it...
            if (storeFilesMetaData.allocated()) {
                continue;
            }

            if (!shard.primary()) {
                MutableShardRouting primaryShard = routingNodes.activePrimary(shard);
                if (primaryShard != null) {
                    assert primaryShard.active();
                    DiscoveryNode primaryNode = nodes.get(primaryShard.currentNodeId());
                    if (primaryNode != null) {
                        TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore = shardStores.get(primaryNode);
                        if (primaryNodeStore != null && primaryNodeStore.allocated()) {
                            long sizeMatched = 0;
                            for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
                                if (primaryNodeStore.fileExists(storeFileMetaData.name())
                                        && primaryNodeStore.file(storeFileMetaData.name()).isSame(storeFileMetaData)) {
                                    sizeMatched += storeFileMetaData.length();
                                }
                            }
                            logger.trace("{}: node [{}] has [{}/{}] bytes of re-usable data",
                                    shard, discoNode.name(), new ByteSizeValue(sizeMatched), sizeMatched);
                            if (sizeMatched > lastSizeMatched) {
                                lastSizeMatched = sizeMatched;
                                lastDiscoNodeMatched = discoNode;
                                lastNodeMatched = node;
                            }
                        }
                    }
                }
            }
        }

        if (lastNodeMatched != null) {
            // we only check on THROTTLE since we checked before on NO
            Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation);
            if (decision.type() == Decision.Type.THROTTLE) {
                if (logger.isDebugEnabled()) {
                    logger.debug("[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                            shard.index(), shard.id(), shard, lastDiscoNodeMatched, new ByteSizeValue(lastSizeMatched));
                }
                // we are throttling this, but we have enough to allocate to this node, ignore it for now
                unassignedIterator.remove();
                routingNodes.ignoredUnassigned().add(shard);
            } else {
                if (logger.isDebugEnabled()) {
                    logger.debug("[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                            shard.index(), shard.id(), shard, lastDiscoNodeMatched, new ByteSizeValue(lastSizeMatched));
                }
                // we found a match
                changed = true;
                allocation.routingNodes().assign(shard, lastNodeMatched.nodeId());
                unassignedIterator.remove();
            }
        }
    }
    return changed;
}
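// A standalone sketch (hypothetical InitialShardsSketch class, not part of the allocator)
// of how the initial_shards value read in allocateUnassigned above maps to the number of
// shard copies that must report on-disk state before the primary is allocated;
// numberOfReplicas stands for the index's configured replica count.
final class InitialShardsSketch {

    static int requiredAllocation(String initialShards, int numberOfReplicas) {
        int required = 1;
        if ("quorum".equals(initialShards)) {
            if (numberOfReplicas > 1) {
                required = ((1 + numberOfReplicas) / 2) + 1;
            }
        } else if ("quorum-1".equals(initialShards) || "half".equals(initialShards)) {
            if (numberOfReplicas > 2) {
                required = (1 + numberOfReplicas) / 2;
            }
        } else if ("one".equals(initialShards)) {
            required = 1;
        } else if ("full".equals(initialShards) || "all".equals(initialShards)) {
            required = numberOfReplicas + 1;
        } else if ("full-1".equals(initialShards) || "all-1".equals(initialShards)) {
            if (numberOfReplicas > 1) {
                required = numberOfReplicas;
            }
        } else {
            // any other value is parsed as an explicit copy count (may throw, like the real code)
            required = Integer.parseInt(initialShards);
        }
        return required;
    }

    public static void main(String[] args) {
        // "quorum" with 2 replicas (3 copies total) -> 2 copies must be found
        System.out.println(requiredAllocation("quorum", 2));
    }
}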