private Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> buildShardStores(DiscoveryNodes nodes, MutableShardRouting shard) { Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> shardStores = cachedStores.get(shard.shardId()); ObjectOpenHashSet<String> nodesIds; if (shardStores == null) { shardStores = Maps.newHashMap(); cachedStores.put(shard.shardId(), shardStores); nodesIds = ObjectOpenHashSet.from(nodes.dataNodes().keys()); } else { nodesIds = ObjectOpenHashSet.newInstance(); // clean nodes that have failed for (Iterator<DiscoveryNode> it = shardStores.keySet().iterator(); it.hasNext(); ) { DiscoveryNode node = it.next(); if (!nodes.nodeExists(node.id())) { it.remove(); } } for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) { DiscoveryNode node = cursor.value; if (!shardStores.containsKey(node)) { nodesIds.add(node.id()); } } } if (!nodesIds.isEmpty()) { String[] nodesIdsArray = nodesIds.toArray(String.class); TransportNodesListShardStoreMetaData.NodesStoreFilesMetaData nodesStoreFilesMetaData = listShardStoreMetaData .list(shard.shardId(), false, nodesIdsArray, listTimeout) .actionGet(); if (logger.isTraceEnabled()) { if (nodesStoreFilesMetaData.failures().length > 0) { StringBuilder sb = new StringBuilder(shard + ": failures when trying to list stores on nodes:"); for (int i = 0; i < nodesStoreFilesMetaData.failures().length; i++) { Throwable cause = ExceptionsHelper.unwrapCause(nodesStoreFilesMetaData.failures()[i]); if (cause instanceof ConnectTransportException) { continue; } sb.append("\n -> ") .append(nodesStoreFilesMetaData.failures()[i].getDetailedMessage()); } logger.trace(sb.toString()); } } for (TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData nodeStoreFilesMetaData : nodesStoreFilesMetaData) { if (nodeStoreFilesMetaData.storeFilesMetaData() != null) { shardStores.put( nodeStoreFilesMetaData.getNode(), nodeStoreFilesMetaData.storeFilesMetaData()); } } } return shardStores; }
@Override public boolean canRebalance(ShardRouting shardRouting, RoutingAllocation allocation) { if (clusterConcurrentRebalance == -1) { return true; } int rebalance = 0; for (RoutingNode node : allocation.routingNodes()) { for (MutableShardRouting shard : node) { if (shard.state() == ShardRoutingState.RELOCATING) { rebalance++; } } } if (rebalance >= clusterConcurrentRebalance) { return false; } return true; }
private boolean applyStartedShards( RoutingNodes routingNodes, Iterable<? extends ShardRouting> startedShardEntries) { boolean dirty = false; // apply shards might be called several times with the same shard, ignore it for (ShardRouting startedShard : startedShardEntries) { assert startedShard.state() == INITIALIZING; // retrieve the relocating node id before calling startedShard(). String relocatingNodeId = null; RoutingNodes.RoutingNodeIterator currentRoutingNode = routingNodes.routingNodeIter(startedShard.currentNodeId()); if (currentRoutingNode != null) { for (MutableShardRouting shard : currentRoutingNode) { if (shard.shardId().equals(startedShard.shardId())) { relocatingNodeId = shard.relocatingNodeId(); if (!shard.started()) { dirty = true; routingNodes.started(shard); } break; } } } // startedShard is the current state of the shard (post relocation for example) // this means that after relocation, the state will be started and the currentNodeId will be // the node we relocated to if (relocatingNodeId == null) { continue; } RoutingNodes.RoutingNodeIterator sourceRoutingNode = routingNodes.routingNodeIter(relocatingNodeId); if (sourceRoutingNode != null) { while (sourceRoutingNode.hasNext()) { MutableShardRouting shard = sourceRoutingNode.next(); if (shard.shardId().equals(startedShard.shardId())) { if (shard.relocating()) { dirty = true; sourceRoutingNode.remove(); break; } } } } } return dirty; }
private boolean moveShards(RoutingAllocation allocation) { boolean changed = false; // create a copy of the shards interleaving between nodes, and check if they can remain List<MutableShardRouting> shards = new ArrayList<>(); int index = 0; boolean found = true; final RoutingNodes routingNodes = allocation.routingNodes(); while (found) { found = false; for (RoutingNode routingNode : routingNodes) { if (index >= routingNode.size()) { continue; } found = true; shards.add(routingNode.get(index)); } index++; } for (int i = 0; i < shards.size(); i++) { MutableShardRouting shardRouting = shards.get(i); // we can only move started shards... if (!shardRouting.started()) { continue; } final RoutingNode routingNode = routingNodes.node(shardRouting.currentNodeId()); Decision decision = allocation.deciders().canRemain(shardRouting, routingNode, allocation); if (decision.type() == Decision.Type.NO) { logger.debug( "[{}][{}] allocated on [{}], but can no longer be allocated on it, moving...", shardRouting.index(), shardRouting.id(), routingNode.node()); boolean moved = shardsAllocators.move(shardRouting, routingNode, allocation); if (!moved) { logger.debug("[{}][{}] can't move", shardRouting.index(), shardRouting.id()); } else { changed = true; } } } return changed; }
@Override public void execute(RoutingAllocation allocation) throws ElasticSearchException { DiscoveryNode discoNode = allocation.nodes().resolveNode(node); MutableShardRouting shardRouting = null; for (MutableShardRouting routing : allocation.routingNodes().unassigned()) { if (routing.shardId().equals(shardId)) { // prefer primaries first to allocate if (shardRouting == null || routing.primary()) { shardRouting = routing; } } } if (shardRouting == null) { throw new ElasticSearchIllegalArgumentException( "[allocate] failed to find " + shardId + " on the list of unassigned shards"); } if (shardRouting.primary() && !allowPrimary) { throw new ElasticSearchIllegalArgumentException( "[allocate] trying to allocate a primary shard " + shardId + "], which is disabled"); } RoutingNode routingNode = allocation.routingNodes().node(discoNode.id()); allocation.addIgnoreDisable(shardRouting.shardId(), routingNode.nodeId()); if (!allocation.deciders().canAllocate(shardRouting, routingNode, allocation).allowed()) { throw new ElasticSearchIllegalArgumentException( "[allocate] allocation of " + shardId + " on node " + discoNode + " is not allowed"); } // go over and remove it from the unassigned for (Iterator<MutableShardRouting> it = allocation.routingNodes().unassigned().iterator(); it.hasNext(); ) { if (it.next() != shardRouting) { continue; } it.remove(); routingNode.add(shardRouting); break; } }
@Test public void indexLevelShardsLimitRemain() { AllocationService strategy = new AllocationService( settingsBuilder() .put("cluster.routing.allocation.concurrent_recoveries", 10) .put("cluster.routing.allocation.node_initial_primaries_recoveries", 10) .put("cluster.routing.allocation.cluster_concurrent_rebalance", -1) .put("cluster.routing.allocation.balance.index", 0.0f) .put("cluster.routing.allocation.balance.replica", 1.0f) .put("cluster.routing.allocation.balance.primary", 0.0f) .build()); logger.info("Building initial routing table"); MetaData metaData = newMetaDataBuilder() .put( newIndexMetaDataBuilder("test") .settings( ImmutableSettings.settingsBuilder() .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 5) .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0))) .build(); RoutingTable routingTable = routingTable().addAsNew(metaData.index("test")).build(); ClusterState clusterState = newClusterStateBuilder().metaData(metaData).routingTable(routingTable).build(); logger.info("Adding one node and reroute"); clusterState = newClusterStateBuilder() .state(clusterState) .nodes(newNodesBuilder().put(newNode("node1"))) .build(); routingTable = strategy.reroute(clusterState).routingTable(); clusterState = newClusterStateBuilder().state(clusterState).routingTable(routingTable).build(); logger.info("Start the primary shards"); RoutingNodes routingNodes = clusterState.routingNodes(); routingTable = strategy .applyStartedShards(clusterState, routingNodes.shardsWithState(INITIALIZING)) .routingTable(); clusterState = newClusterStateBuilder().state(clusterState).routingTable(routingTable).build(); assertThat(clusterState.readOnlyRoutingNodes().numberOfShardsOfType(STARTED), equalTo(5)); logger.info("add another index with 5 shards"); metaData = newMetaDataBuilder() .metaData(metaData) .put( newIndexMetaDataBuilder("test1") .settings( ImmutableSettings.settingsBuilder() .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 5) .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0))) .build(); routingTable = routingTable().routingTable(routingTable).addAsNew(metaData.index("test1")).build(); clusterState = newClusterStateBuilder() .state(clusterState) .metaData(metaData) .routingTable(routingTable) .build(); logger.info("Add another one node and reroute"); clusterState = newClusterStateBuilder() .state(clusterState) .nodes(newNodesBuilder().putAll(clusterState.nodes()).put(newNode("node2"))) .build(); routingTable = strategy.reroute(clusterState).routingTable(); clusterState = newClusterStateBuilder().state(clusterState).routingTable(routingTable).build(); routingNodes = clusterState.routingNodes(); routingTable = strategy .applyStartedShards(clusterState, routingNodes.shardsWithState(INITIALIZING)) .routingTable(); clusterState = newClusterStateBuilder().state(clusterState).routingTable(routingTable).build(); assertThat(clusterState.readOnlyRoutingNodes().numberOfShardsOfType(STARTED), equalTo(10)); for (MutableShardRouting shardRouting : clusterState.readOnlyRoutingNodes().node("node1")) { assertThat(shardRouting.index(), equalTo("test")); } for (MutableShardRouting shardRouting : clusterState.readOnlyRoutingNodes().node("node2")) { assertThat(shardRouting.index(), equalTo("test1")); } logger.info( "update " + ShardsLimitAllocationDecider.INDEX_TOTAL_SHARDS_PER_NODE + " for test, see that things move"); metaData = newMetaDataBuilder() .metaData(metaData) .put( newIndexMetaDataBuilder("test") .settings( ImmutableSettings.settingsBuilder() .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 5) .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0) .put(ShardsLimitAllocationDecider.INDEX_TOTAL_SHARDS_PER_NODE, 3))) .build(); clusterState = newClusterStateBuilder().state(clusterState).metaData(metaData).build(); logger.info("reroute after setting"); routingTable = strategy.reroute(clusterState).routingTable(); clusterState = newClusterStateBuilder().state(clusterState).routingTable(routingTable).build(); assertThat( clusterState.readOnlyRoutingNodes().node("node1").numberOfShardsWithState(STARTED), equalTo(3)); assertThat( clusterState.readOnlyRoutingNodes().node("node1").numberOfShardsWithState(RELOCATING), equalTo(2)); assertThat( clusterState.readOnlyRoutingNodes().node("node2").numberOfShardsWithState(RELOCATING), equalTo(2)); assertThat( clusterState.readOnlyRoutingNodes().node("node2").numberOfShardsWithState(STARTED), equalTo(3)); // the first move will destroy the balance and the balancer will move 2 shards from node2 to // node one right after // moving the nodes to node2 since we consider INITIALIZING nodes during rebalance routingNodes = clusterState.routingNodes(); routingTable = strategy .applyStartedShards(clusterState, routingNodes.shardsWithState(INITIALIZING)) .routingTable(); clusterState = newClusterStateBuilder().state(clusterState).routingTable(routingTable).build(); // now we are done compared to EvenShardCountAllocator since the Balancer is not soely based on // the average assertThat( clusterState.readOnlyRoutingNodes().node("node1").numberOfShardsWithState(STARTED), equalTo(5)); assertThat( clusterState.readOnlyRoutingNodes().node("node2").numberOfShardsWithState(STARTED), equalTo(5)); }
@Test public void sameHost() { AllocationService strategy = new AllocationService( settingsBuilder().put(SameShardAllocationDecider.SAME_HOST_SETTING, true).build()); MetaData metaData = newMetaDataBuilder() .put(newIndexMetaDataBuilder("test").numberOfShards(2).numberOfReplicas(1)) .build(); RoutingTable routingTable = routingTable().addAsNew(metaData.index("test")).build(); ClusterState clusterState = newClusterStateBuilder().metaData(metaData).routingTable(routingTable).build(); logger.info("--> adding two nodes with the same host"); clusterState = newClusterStateBuilder() .state(clusterState) .nodes( newNodesBuilder() .put(newNode("node1", new InetSocketTransportAddress("test1", 80))) .put(newNode("node2", new InetSocketTransportAddress("test1", 80)))) .build(); routingTable = strategy.reroute(clusterState).routingTable(); clusterState = newClusterStateBuilder().state(clusterState).routingTable(routingTable).build(); assertThat( clusterState.readOnlyRoutingNodes().numberOfShardsOfType(ShardRoutingState.INITIALIZING), equalTo(2)); logger.info( "--> start all primary shards, no replica will be started since its on the same host"); routingTable = strategy .applyStartedShards( clusterState, clusterState.readOnlyRoutingNodes().shardsWithState(INITIALIZING)) .routingTable(); clusterState = newClusterStateBuilder().state(clusterState).routingTable(routingTable).build(); assertThat( clusterState.readOnlyRoutingNodes().numberOfShardsOfType(ShardRoutingState.STARTED), equalTo(2)); assertThat( clusterState.readOnlyRoutingNodes().numberOfShardsOfType(ShardRoutingState.INITIALIZING), equalTo(0)); logger.info("--> add another node, with a different host, replicas will be allocating"); clusterState = newClusterStateBuilder() .state(clusterState) .nodes( newNodesBuilder() .putAll(clusterState.nodes()) .put(newNode("node3", new InetSocketTransportAddress("test2", 80)))) .build(); routingTable = strategy.reroute(clusterState).routingTable(); clusterState = newClusterStateBuilder().state(clusterState).routingTable(routingTable).build(); assertThat( clusterState.readOnlyRoutingNodes().numberOfShardsOfType(ShardRoutingState.STARTED), equalTo(2)); assertThat( clusterState.readOnlyRoutingNodes().numberOfShardsOfType(ShardRoutingState.INITIALIZING), equalTo(2)); for (MutableShardRouting shardRouting : clusterState.readOnlyRoutingNodes().shardsWithState(INITIALIZING)) { assertThat(shardRouting.currentNodeId(), equalTo("node3")); } }
@Test public void testMultiIndexEvenDistribution() { AllocationService strategy = createAllocationService( settingsBuilder() .put("cluster.routing.allocation.concurrent_recoveries", 10) .put("cluster.routing.allocation.allow_rebalance", "always") .put("cluster.routing.allocation.cluster_concurrent_rebalance", -1) .build()); final int numberOfIndices = 50; logger.info("Building initial routing table with " + numberOfIndices + " indices"); MetaData.Builder metaDataBuilder = MetaData.builder(); for (int i = 0; i < numberOfIndices; i++) { metaDataBuilder.put(IndexMetaData.builder("test" + i).numberOfShards(1).numberOfReplicas(0)); } MetaData metaData = metaDataBuilder.build(); RoutingTable.Builder routingTableBuilder = RoutingTable.builder(); for (int i = 0; i < numberOfIndices; i++) { routingTableBuilder.addAsNew(metaData.index("test" + i)); } RoutingTable routingTable = routingTableBuilder.build(); ClusterState clusterState = ClusterState.builder().metaData(metaData).routingTable(routingTable).build(); assertThat(routingTable.indicesRouting().size(), equalTo(numberOfIndices)); for (int i = 0; i < numberOfIndices; i++) { assertThat(routingTable.index("test" + i).shards().size(), equalTo(1)); assertThat(routingTable.index("test" + i).shard(0).size(), equalTo(1)); assertThat(routingTable.index("test" + i).shard(0).shards().size(), equalTo(1)); assertThat( routingTable.index("test" + i).shard(0).shards().get(0).state(), equalTo(UNASSIGNED)); assertThat( routingTable.index("test" + i).shard(0).shards().get(0).currentNodeId(), nullValue()); } logger.info("Adding " + (numberOfIndices / 2) + " nodes"); DiscoveryNodes.Builder nodesBuilder = DiscoveryNodes.builder(); List<DiscoveryNode> nodes = newArrayList(); for (int i = 0; i < (numberOfIndices / 2); i++) { nodesBuilder.put(newNode("node" + i)); } RoutingTable prevRoutingTable = routingTable; clusterState = ClusterState.builder(clusterState).nodes(nodesBuilder).build(); routingTable = strategy.reroute(clusterState).routingTable(); clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build(); assertThat(prevRoutingTable != routingTable, equalTo(true)); for (int i = 0; i < numberOfIndices; i++) { assertThat(routingTable.index("test" + i).shards().size(), equalTo(1)); assertThat(routingTable.index("test" + i).shard(0).size(), equalTo(1)); assertThat(routingTable.index("test" + i).shard(0).shards().size(), equalTo(1)); assertThat( routingTable.index("test" + i).shard(0).shards().get(0).unassigned(), equalTo(false)); assertThat( routingTable.index("test" + i).shard(0).shards().get(0).state(), equalTo(INITIALIZING)); assertThat(routingTable.index("test" + i).shard(0).shards().get(0).primary(), equalTo(true)); // make sure we still have 2 shards initializing per node on the first 25 nodes String nodeId = routingTable.index("test" + i).shard(0).shards().get(0).currentNodeId(); int nodeIndex = Integer.parseInt(nodeId.substring("node".length())); assertThat(nodeIndex, lessThan(25)); } RoutingNodes routingNodes = clusterState.routingNodes(); Set<String> encounteredIndices = newHashSet(); for (RoutingNode routingNode : routingNodes) { assertThat(routingNode.numberOfShardsWithState(STARTED), equalTo(0)); assertThat(routingNode.size(), equalTo(2)); // make sure we still have 2 shards initializing per node on the only 25 nodes int nodeIndex = Integer.parseInt(routingNode.nodeId().substring("node".length())); assertThat(nodeIndex, lessThan(25)); // check that we don't have a shard associated with a node with the same index name (we have a // single shard) for (MutableShardRouting shardRoutingEntry : routingNode) { assertThat(encounteredIndices, not(hasItem(shardRoutingEntry.index()))); encounteredIndices.add(shardRoutingEntry.index()); } } logger.info("Adding additional " + (numberOfIndices / 2) + " nodes, nothing should change"); nodesBuilder = DiscoveryNodes.builder(clusterState.nodes()); for (int i = (numberOfIndices / 2); i < numberOfIndices; i++) { nodesBuilder.put(newNode("node" + i)); } prevRoutingTable = routingTable; clusterState = ClusterState.builder(clusterState).nodes(nodesBuilder).build(); routingTable = strategy.reroute(clusterState).routingTable(); clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build(); assertThat(prevRoutingTable != routingTable, equalTo(false)); logger.info("Marking the shard as started"); prevRoutingTable = routingTable; routingTable = strategy .applyStartedShards(clusterState, routingNodes.shardsWithState(INITIALIZING)) .routingTable(); clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build(); assertThat(prevRoutingTable != routingTable, equalTo(true)); int numberOfRelocatingShards = 0; int numberOfStartedShards = 0; for (int i = 0; i < numberOfIndices; i++) { assertThat(routingTable.index("test" + i).shards().size(), equalTo(1)); assertThat(routingTable.index("test" + i).shard(0).size(), equalTo(1)); assertThat(routingTable.index("test" + i).shard(0).shards().size(), equalTo(1)); assertThat( routingTable.index("test" + i).shard(0).shards().get(0).unassigned(), equalTo(false)); assertThat( routingTable.index("test" + i).shard(0).shards().get(0).state(), anyOf(equalTo(STARTED), equalTo(RELOCATING))); if (routingTable.index("test" + i).shard(0).shards().get(0).state() == STARTED) { numberOfStartedShards++; } else if (routingTable.index("test" + i).shard(0).shards().get(0).state() == RELOCATING) { numberOfRelocatingShards++; } assertThat(routingTable.index("test" + i).shard(0).shards().get(0).primary(), equalTo(true)); // make sure we still have 2 shards either relocating or started on the first 25 nodes (still) String nodeId = routingTable.index("test" + i).shard(0).shards().get(0).currentNodeId(); int nodeIndex = Integer.parseInt(nodeId.substring("node".length())); assertThat(nodeIndex, lessThan(25)); } assertThat(numberOfRelocatingShards, equalTo(25)); assertThat(numberOfStartedShards, equalTo(25)); }
@Override public void execute(RoutingAllocation allocation) throws ElasticSearchException { DiscoveryNode discoNode = allocation.nodes().resolveNode(node); boolean found = false; for (RoutingNodes.RoutingNodeIterator it = allocation.routingNodes().routingNodeIter(discoNode.id()); it.hasNext(); ) { MutableShardRouting shardRouting = it.next(); if (!shardRouting.shardId().equals(shardId)) { continue; } found = true; if (shardRouting.relocatingNodeId() != null) { if (shardRouting.initializing()) { // the shard is initializing and recovering from another node, simply cancel the recovery it.remove(); // and cancel the relocating state from the shard its being relocated from RoutingNode relocatingFromNode = allocation.routingNodes().node(shardRouting.relocatingNodeId()); if (relocatingFromNode != null) { for (MutableShardRouting fromShardRouting : relocatingFromNode) { if (fromShardRouting.shardId().equals(shardRouting.shardId()) && fromShardRouting.state() == RELOCATING) { allocation.routingNodes().cancelRelocation(fromShardRouting); break; } } } } else if (shardRouting.relocating()) { // the shard is relocating to another node, cancel the recovery on the other node, and // deallocate this one if (!allowPrimary && shardRouting.primary()) { // can't cancel a primary shard being initialized throw new ElasticSearchIllegalArgumentException( "[cancel_allocation] can't cancel " + shardId + " on node " + discoNode + ", shard is primary and initializing its state"); } it.moveToUnassigned(); // now, go and find the shard that is initializing on the target node, and cancel it as // well... RoutingNodes.RoutingNodeIterator initializingNode = allocation.routingNodes().routingNodeIter(shardRouting.relocatingNodeId()); if (initializingNode != null) { while (initializingNode.hasNext()) { MutableShardRouting initializingShardRouting = initializingNode.next(); if (initializingShardRouting.shardId().equals(shardRouting.shardId()) && initializingShardRouting.state() == INITIALIZING) { initializingNode.remove(); } } } } } else { // the shard is not relocating, its either started, or initializing, just cancel it and move // on... if (!allowPrimary && shardRouting.primary()) { // can't cancel a primary shard being initialized throw new ElasticSearchIllegalArgumentException( "[cancel_allocation] can't cancel " + shardId + " on node " + discoNode + ", shard is primary and started"); } it.remove(); allocation .routingNodes() .unassigned() .add( new MutableShardRouting( shardRouting.index(), shardRouting.id(), null, shardRouting.primary(), ShardRoutingState.UNASSIGNED, shardRouting.version() + 1)); } } if (!found) { throw new ElasticSearchIllegalArgumentException( "[cancel_allocation] can't cancel " + shardId + ", failed to find it on node " + discoNode); } }
/** * Applies the relevant logic to handle a failed shard. Returns <tt>true</tt> if changes happened * that require relocation. */ private boolean applyFailedShard( RoutingAllocation allocation, ShardRouting failedShard, boolean addToIgnoreList) { // create a copy of the failed shard, since we assume we can change possible references to it // without // changing the state of failed shard failedShard = new ImmutableShardRouting(failedShard); IndexRoutingTable indexRoutingTable = allocation.routingTable().index(failedShard.index()); if (indexRoutingTable == null) { return false; } RoutingNodes routingNodes = allocation.routingNodes(); boolean dirty = false; if (failedShard.relocatingNodeId() != null) { // the shard is relocating, either in initializing (recovery from another node) or relocating // (moving to another node) if (failedShard.state() == INITIALIZING) { // the shard is initializing and recovering from another node // first, we need to cancel the current node that is being initialized RoutingNodes.RoutingNodeIterator initializingNode = routingNodes.routingNodeIter(failedShard.currentNodeId()); if (initializingNode != null) { while (initializingNode.hasNext()) { MutableShardRouting shardRouting = initializingNode.next(); if (shardRouting.equals(failedShard)) { dirty = true; initializingNode.remove(); if (addToIgnoreList) { // make sure we ignore this shard on the relevant node allocation.addIgnoreShardForNode( failedShard.shardId(), failedShard.currentNodeId()); } break; } } } if (dirty) { // now, find the node that we are relocating *from*, and cancel its relocation RoutingNode relocatingFromNode = routingNodes.node(failedShard.relocatingNodeId()); if (relocatingFromNode != null) { for (MutableShardRouting shardRouting : relocatingFromNode) { if (shardRouting.shardId().equals(failedShard.shardId()) && shardRouting.relocating()) { dirty = true; routingNodes.cancelRelocation(shardRouting); break; } } } } else { logger.debug("failed shard {} not found in routingNodes, ignoring it", failedShard); } return dirty; } else if (failedShard.state() == RELOCATING) { // the shard is relocating, meaning its the source the shard is relocating from // first, we need to cancel the current relocation from the current node // now, find the node that we are recovering from, cancel the relocation, remove it from the // node // and add it to the unassigned shards list... RoutingNodes.RoutingNodeIterator relocatingFromNode = routingNodes.routingNodeIter(failedShard.currentNodeId()); if (relocatingFromNode != null) { while (relocatingFromNode.hasNext()) { MutableShardRouting shardRouting = relocatingFromNode.next(); if (shardRouting.equals(failedShard)) { dirty = true; relocatingFromNode.remove(); if (addToIgnoreList) { // make sure we ignore this shard on the relevant node allocation.addIgnoreShardForNode( failedShard.shardId(), failedShard.currentNodeId()); } routingNodes .unassigned() .add( new MutableShardRouting( failedShard.index(), failedShard.id(), null, failedShard.primary(), ShardRoutingState.UNASSIGNED, failedShard.version() + 1)); break; } } } if (dirty) { // next, we need to find the target initializing shard that is recovering from, and remove // it... RoutingNodes.RoutingNodeIterator initializingNode = routingNodes.routingNodeIter(failedShard.relocatingNodeId()); if (initializingNode != null) { while (initializingNode.hasNext()) { MutableShardRouting shardRouting = initializingNode.next(); if (shardRouting.shardId().equals(failedShard.shardId()) && shardRouting.state() == INITIALIZING) { dirty = true; initializingNode.remove(); } } } } else { logger.debug("failed shard {} not found in routingNodes, ignoring it", failedShard); } } else { throw new ElasticsearchIllegalStateException( "illegal state for a failed shard, relocating node id is set, but state does not match: " + failedShard); } } else { // the shard is not relocating, its either started, or initializing, just cancel it and move // on... RoutingNodes.RoutingNodeIterator node = routingNodes.routingNodeIter(failedShard.currentNodeId()); if (node != null) { while (node.hasNext()) { MutableShardRouting shardRouting = node.next(); if (shardRouting.equals(failedShard)) { dirty = true; if (addToIgnoreList) { // make sure we ignore this shard on the relevant node allocation.addIgnoreShardForNode(failedShard.shardId(), failedShard.currentNodeId()); } node.remove(); // move all the shards matching the failed shard to the end of the unassigned list // so we give a chance for other allocations and won't create poison failed allocations // that can keep other shards from being allocated (because of limits applied on how // many // shards we can start per node) List<MutableShardRouting> shardsToMove = Lists.newArrayList(); for (Iterator<MutableShardRouting> unassignedIt = routingNodes.unassigned().iterator(); unassignedIt.hasNext(); ) { MutableShardRouting unassignedShardRouting = unassignedIt.next(); if (unassignedShardRouting.shardId().equals(failedShard.shardId())) { unassignedIt.remove(); shardsToMove.add(unassignedShardRouting); } } if (!shardsToMove.isEmpty()) { routingNodes.unassigned().addAll(shardsToMove); } routingNodes .unassigned() .add( new MutableShardRouting( failedShard.index(), failedShard.id(), null, null, failedShard.restoreSource(), failedShard.primary(), ShardRoutingState.UNASSIGNED, failedShard.version() + 1)); break; } } } if (!dirty) { logger.debug("failed shard {} not found in routingNodes, ignoring it", failedShard); } } return dirty; }
private boolean electPrimariesAndUnassignedDanglingReplicas(RoutingAllocation allocation) { boolean changed = false; RoutingNodes routingNodes = allocation.routingNodes(); if (!routingNodes.hasUnassignedPrimaries()) { // move out if we don't have unassigned primaries return changed; } // go over and remove dangling replicas that are initializing for primary shards List<ShardRouting> shardsToFail = Lists.newArrayList(); for (MutableShardRouting shardEntry : routingNodes.unassigned()) { if (shardEntry.primary()) { for (MutableShardRouting routing : routingNodes.assignedShards(shardEntry)) { if (!routing.primary() && routing.initializing()) { shardsToFail.add(routing); } } } } for (ShardRouting shardToFail : shardsToFail) { changed |= applyFailedShard(allocation, shardToFail, false); } // now, go over and elect a new primary if possible, not, from this code block on, if one is // elected, // routingNodes.hasUnassignedPrimaries() will potentially be false for (MutableShardRouting shardEntry : routingNodes.unassigned()) { if (shardEntry.primary()) { MutableShardRouting candidate = allocation.routingNodes().activeReplica(shardEntry); if (candidate != null) { routingNodes.swapPrimaryFlag(shardEntry, candidate); if (candidate.relocatingNodeId() != null) { changed = true; // its also relocating, make sure to move the other routing to primary RoutingNode node = routingNodes.node(candidate.relocatingNodeId()); if (node != null) { for (MutableShardRouting shardRouting : node) { if (shardRouting.shardId().equals(candidate.shardId()) && !shardRouting.primary()) { routingNodes.swapPrimaryFlag(shardRouting); break; } } } } } } } return changed; }
private ObjectLongOpenHashMap<DiscoveryNode> buildShardStates( final DiscoveryNodes nodes, MutableShardRouting shard) { ObjectLongOpenHashMap<DiscoveryNode> shardStates = cachedShardsState.get(shard.shardId()); ObjectOpenHashSet<String> nodeIds; if (shardStates == null) { shardStates = new ObjectLongOpenHashMap<>(); cachedShardsState.put(shard.shardId(), shardStates); nodeIds = ObjectOpenHashSet.from(nodes.dataNodes().keys()); } else { // clean nodes that have failed shardStates .keys() .removeAll( new ObjectPredicate<DiscoveryNode>() { @Override public boolean apply(DiscoveryNode node) { return !nodes.nodeExists(node.id()); } }); nodeIds = ObjectOpenHashSet.newInstance(); // we have stored cached from before, see if the nodes changed, if they have, go fetch again for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) { DiscoveryNode node = cursor.value; if (!shardStates.containsKey(node)) { nodeIds.add(node.id()); } } } if (nodeIds.isEmpty()) { return shardStates; } String[] nodesIdsArray = nodeIds.toArray(String.class); TransportNodesListGatewayStartedShards.NodesGatewayStartedShards response = listGatewayStartedShards.list(shard.shardId(), nodesIdsArray, listTimeout).actionGet(); if (logger.isDebugEnabled()) { if (response.failures().length > 0) { StringBuilder sb = new StringBuilder(shard + ": failures when trying to list shards on nodes:"); for (int i = 0; i < response.failures().length; i++) { Throwable cause = ExceptionsHelper.unwrapCause(response.failures()[i]); if (cause instanceof ConnectTransportException) { continue; } sb.append("\n -> ").append(response.failures()[i].getDetailedMessage()); } logger.debug(sb.toString()); } } for (TransportNodesListGatewayStartedShards.NodeGatewayStartedShards nodeShardState : response) { // -1 version means it does not exists, which is what the API returns, and what we expect to logger.trace( "[{}] on node [{}] has version [{}] of shard", shard, nodeShardState.getNode(), nodeShardState.version()); shardStates.put(nodeShardState.getNode(), nodeShardState.version()); } return shardStates; }
public boolean allocateUnassigned(RoutingAllocation allocation) { boolean changed = false; DiscoveryNodes nodes = allocation.nodes(); RoutingNodes routingNodes = allocation.routingNodes(); // First, handle primaries, they must find a place to be allocated on here Iterator<MutableShardRouting> unassignedIterator = routingNodes.unassigned().iterator(); while (unassignedIterator.hasNext()) { MutableShardRouting shard = unassignedIterator.next(); if (!shard.primary()) { continue; } // this is an API allocation, ignore since we know there is no data... if (!routingNodes .routingTable() .index(shard.index()) .shard(shard.id()) .primaryAllocatedPostApi()) { continue; } ObjectLongOpenHashMap<DiscoveryNode> nodesState = buildShardStates(nodes, shard); int numberOfAllocationsFound = 0; long highestVersion = -1; Set<DiscoveryNode> nodesWithHighestVersion = Sets.newHashSet(); final boolean[] states = nodesState.allocated; final Object[] keys = nodesState.keys; final long[] values = nodesState.values; for (int i = 0; i < states.length; i++) { if (!states[i]) { continue; } DiscoveryNode node = (DiscoveryNode) keys[i]; long version = values[i]; // since we don't check in NO allocation, we need to double check here if (allocation.shouldIgnoreShardForNode(shard.shardId(), node.id())) { continue; } if (version != -1) { numberOfAllocationsFound++; if (highestVersion == -1) { nodesWithHighestVersion.add(node); highestVersion = version; } else { if (version > highestVersion) { nodesWithHighestVersion.clear(); nodesWithHighestVersion.add(node); highestVersion = version; } else if (version == highestVersion) { nodesWithHighestVersion.add(node); } } } } // check if the counts meets the minimum set int requiredAllocation = 1; // if we restore from a repository one copy is more then enough if (shard.restoreSource() == null) { try { IndexMetaData indexMetaData = routingNodes.metaData().index(shard.index()); String initialShards = indexMetaData .settings() .get( INDEX_RECOVERY_INITIAL_SHARDS, settings.get(INDEX_RECOVERY_INITIAL_SHARDS, this.initialShards)); if ("quorum".equals(initialShards)) { if (indexMetaData.numberOfReplicas() > 1) { requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2) + 1; } } else if ("quorum-1".equals(initialShards) || "half".equals(initialShards)) { if (indexMetaData.numberOfReplicas() > 2) { requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2); } } else if ("one".equals(initialShards)) { requiredAllocation = 1; } else if ("full".equals(initialShards) || "all".equals(initialShards)) { requiredAllocation = indexMetaData.numberOfReplicas() + 1; } else if ("full-1".equals(initialShards) || "all-1".equals(initialShards)) { if (indexMetaData.numberOfReplicas() > 1) { requiredAllocation = indexMetaData.numberOfReplicas(); } } else { requiredAllocation = Integer.parseInt(initialShards); } } catch (Exception e) { logger.warn( "[{}][{}] failed to derived initial_shards from value {}, ignore allocation for {}", shard.index(), shard.id(), initialShards, shard); } } // not enough found for this shard, continue... if (numberOfAllocationsFound < requiredAllocation) { // if we are restoring this shard we still can allocate if (shard.restoreSource() == null) { // we can't really allocate, so ignore it and continue unassignedIterator.remove(); routingNodes.ignoredUnassigned().add(shard); if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: not allocating, number_of_allocated_shards_found [{}], required_number [{}]", shard.index(), shard.id(), numberOfAllocationsFound, requiredAllocation); } } else if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: missing local data, will restore from [{}]", shard.index(), shard.id(), shard.restoreSource()); } continue; } Set<DiscoveryNode> throttledNodes = Sets.newHashSet(); Set<DiscoveryNode> noNodes = Sets.newHashSet(); for (DiscoveryNode discoNode : nodesWithHighestVersion) { RoutingNode node = routingNodes.node(discoNode.id()); if (node == null) { continue; } Decision decision = allocation.deciders().canAllocate(shard, node, allocation); if (decision.type() == Decision.Type.THROTTLE) { throttledNodes.add(discoNode); } else if (decision.type() == Decision.Type.NO) { noNodes.add(discoNode); } else { if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: allocating [{}] to [{}] on primary allocation", shard.index(), shard.id(), shard, discoNode); } // we found a match changed = true; // make sure we create one with the version from the recovered state allocation .routingNodes() .assign(new MutableShardRouting(shard, highestVersion), node.nodeId()); unassignedIterator.remove(); // found a node, so no throttling, no "no", and break out of the loop throttledNodes.clear(); noNodes.clear(); break; } } if (throttledNodes.isEmpty()) { // if we have a node that we "can't" allocate to, force allocation, since this is our master // data! if (!noNodes.isEmpty()) { DiscoveryNode discoNode = noNodes.iterator().next(); RoutingNode node = routingNodes.node(discoNode.id()); if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: forcing allocating [{}] to [{}] on primary allocation", shard.index(), shard.id(), shard, discoNode); } // we found a match changed = true; // make sure we create one with the version from the recovered state allocation .routingNodes() .assign(new MutableShardRouting(shard, highestVersion), node.nodeId()); unassignedIterator.remove(); } } else { if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: throttling allocation [{}] to [{}] on primary allocation", shard.index(), shard.id(), shard, throttledNodes); } // we are throttling this, but we have enough to allocate to this node, ignore it for now unassignedIterator.remove(); routingNodes.ignoredUnassigned().add(shard); } } if (!routingNodes.hasUnassigned()) { return changed; } // Now, handle replicas, try to assign them to nodes that are similar to the one the primary was // allocated on unassignedIterator = routingNodes.unassigned().iterator(); while (unassignedIterator.hasNext()) { MutableShardRouting shard = unassignedIterator.next(); // pre-check if it can be allocated to any node that currently exists, so we won't list the // store for it for nothing boolean canBeAllocatedToAtLeastOneNode = false; for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) { RoutingNode node = routingNodes.node(cursor.value.id()); if (node == null) { continue; } // if we can't allocate it on a node, ignore it, for example, this handles // cases for only allocating a replica after a primary Decision decision = allocation.deciders().canAllocate(shard, node, allocation); if (decision.type() == Decision.Type.YES) { canBeAllocatedToAtLeastOneNode = true; break; } } if (!canBeAllocatedToAtLeastOneNode) { continue; } Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> shardStores = buildShardStores(nodes, shard); long lastSizeMatched = 0; DiscoveryNode lastDiscoNodeMatched = null; RoutingNode lastNodeMatched = null; for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> nodeStoreEntry : shardStores.entrySet()) { DiscoveryNode discoNode = nodeStoreEntry.getKey(); TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData = nodeStoreEntry.getValue(); logger.trace("{}: checking node [{}]", shard, discoNode); if (storeFilesMetaData == null) { // already allocated on that node... continue; } RoutingNode node = routingNodes.node(discoNode.id()); if (node == null) { continue; } // check if we can allocate on that node... // we only check for NO, since if this node is THROTTLING and it has enough "same data" // then we will try and assign it next time Decision decision = allocation.deciders().canAllocate(shard, node, allocation); if (decision.type() == Decision.Type.NO) { continue; } // if it is already allocated, we can't assign to it... if (storeFilesMetaData.allocated()) { continue; } if (!shard.primary()) { MutableShardRouting primaryShard = routingNodes.activePrimary(shard); if (primaryShard != null) { assert primaryShard.active(); DiscoveryNode primaryNode = nodes.get(primaryShard.currentNodeId()); if (primaryNode != null) { TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore = shardStores.get(primaryNode); if (primaryNodeStore != null && primaryNodeStore.allocated()) { long sizeMatched = 0; for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) { if (primaryNodeStore.fileExists(storeFileMetaData.name()) && primaryNodeStore .file(storeFileMetaData.name()) .isSame(storeFileMetaData)) { sizeMatched += storeFileMetaData.length(); } } logger.trace( "{}: node [{}] has [{}/{}] bytes of re-usable data", shard, discoNode.name(), new ByteSizeValue(sizeMatched), sizeMatched); if (sizeMatched > lastSizeMatched) { lastSizeMatched = sizeMatched; lastDiscoNodeMatched = discoNode; lastNodeMatched = node; } } } } } } if (lastNodeMatched != null) { // we only check on THROTTLE since we checked before before on NO Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation); if (decision.type() == Decision.Type.THROTTLE) { if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]", shard.index(), shard.id(), shard, lastDiscoNodeMatched, new ByteSizeValue(lastSizeMatched)); } // we are throttling this, but we have enough to allocate to this node, ignore it for now unassignedIterator.remove(); routingNodes.ignoredUnassigned().add(shard); } else { if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]", shard.index(), shard.id(), shard, lastDiscoNodeMatched, new ByteSizeValue(lastSizeMatched)); } // we found a match changed = true; allocation.routingNodes().assign(shard, lastNodeMatched.nodeId()); unassignedIterator.remove(); } } } return changed; }