/** * Process existing recoveries of replicas and see if we need to cancel them if we find a better * match. Today, a better match is one that has full sync id match compared to not having one in * the previous recovery. */ public boolean processExistingRecoveries(RoutingAllocation allocation) { boolean changed = false; MetaData metaData = allocation.metaData(); for (RoutingNodes.RoutingNodesIterator nodes = allocation.routingNodes().nodes(); nodes.hasNext(); ) { nodes.next(); for (RoutingNodes.RoutingNodeIterator it = nodes.nodeShards(); it.hasNext(); ) { ShardRouting shard = it.next(); if (shard.primary() == true) { continue; } if (shard.initializing() == false) { continue; } if (shard.relocatingNodeId() != null) { continue; } // if we are allocating a replica because of index creation, no need to go and find a copy, // there isn't one... IndexMetaData indexMetaData = metaData.index(shard.getIndexName()); if (shard.allocatedPostIndexCreate(indexMetaData) == false) { continue; } AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> shardStores = fetchData(shard, allocation); if (shardStores.hasData() == false) { logger.trace("{}: fetching new stores for initializing shard", shard); continue; // still fetching } ShardRouting primaryShard = allocation.routingNodes().activePrimary(shard); assert primaryShard != null : "the replica shard can be allocated on at least one node, so there must be an active primary"; TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryStore = findStore(primaryShard, allocation, shardStores); if (primaryStore == null || primaryStore.allocated() == false) { // if we can't find the primary data, it is probably because the primary shard is // corrupted (and listing failed) // just let the recovery find it out, no need to do anything about it for the initializing // shard logger.trace( "{}: no primary shard store found or allocated, letting actual allocation figure it out", shard); continue; } MatchingNodes matchingNodes = findMatchingNodes(shard, allocation, primaryStore, shardStores); if (matchingNodes.getNodeWithHighestMatch() != null) { DiscoveryNode currentNode = allocation.nodes().get(shard.currentNodeId()); DiscoveryNode nodeWithHighestMatch = matchingNodes.getNodeWithHighestMatch(); if (currentNode.equals(nodeWithHighestMatch) == false && matchingNodes.isNodeMatchBySyncID(currentNode) == false && matchingNodes.isNodeMatchBySyncID(nodeWithHighestMatch) == true) { // we found a better match that has a full sync id match, the existing allocation is not // fully synced // so we found a better one, cancel this one it.moveToUnassigned( new UnassignedInfo( UnassignedInfo.Reason.REALLOCATED_REPLICA, "existing allocation of replica to [" + currentNode + "] cancelled, sync id match found on node [" + nodeWithHighestMatch + "]", null, allocation.getCurrentNanoTime(), System.currentTimeMillis())); changed = true; } } } } return changed; }
public boolean allocateUnassigned(RoutingAllocation allocation) { boolean changed = false; final RoutingNodes routingNodes = allocation.routingNodes(); final RoutingNodes.UnassignedShards.UnassignedIterator unassignedIterator = routingNodes.unassigned().iterator(); MetaData metaData = allocation.metaData(); while (unassignedIterator.hasNext()) { ShardRouting shard = unassignedIterator.next(); if (shard.primary()) { continue; } // if we are allocating a replica because of index creation, no need to go and find a copy, // there isn't one... IndexMetaData indexMetaData = metaData.index(shard.getIndexName()); if (shard.allocatedPostIndexCreate(indexMetaData) == false) { continue; } // pre-check if it can be allocated to any node that currently exists, so we won't list the // store for it for nothing if (canBeAllocatedToAtLeastOneNode(shard, allocation) == false) { logger.trace("{}: ignoring allocation, can't be allocated on any node", shard); unassignedIterator.removeAndIgnore(); continue; } AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> shardStores = fetchData(shard, allocation); if (shardStores.hasData() == false) { logger.trace("{}: ignoring allocation, still fetching shard stores", shard); allocation.setHasPendingAsyncFetch(); unassignedIterator.removeAndIgnore(); continue; // still fetching } ShardRouting primaryShard = routingNodes.activePrimary(shard); assert primaryShard != null : "the replica shard can be allocated on at least one node, so there must be an active primary"; TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryStore = findStore(primaryShard, allocation, shardStores); if (primaryStore == null || primaryStore.allocated() == false) { // if we can't find the primary data, it is probably because the primary shard is corrupted // (and listing failed) // we want to let the replica be allocated in order to expose the actual problem with the // primary that the replica // will try and recover from // Note, this is the existing behavior, as exposed in running // CorruptFileTest#testNoPrimaryData logger.trace( "{}: no primary shard store found or allocated, letting actual allocation figure it out", shard); continue; } MatchingNodes matchingNodes = findMatchingNodes(shard, allocation, primaryStore, shardStores); if (matchingNodes.getNodeWithHighestMatch() != null) { RoutingNode nodeWithHighestMatch = allocation.routingNodes().node(matchingNodes.getNodeWithHighestMatch().id()); // we only check on THROTTLE since we checked before before on NO Decision decision = allocation.deciders().canAllocate(shard, nodeWithHighestMatch, allocation); if (decision.type() == Decision.Type.THROTTLE) { logger.debug( "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store", shard.index(), shard.id(), shard, nodeWithHighestMatch.node()); // we are throttling this, but we have enough to allocate to this node, ignore it for now unassignedIterator.removeAndIgnore(); } else { logger.debug( "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store", shard.index(), shard.id(), shard, nodeWithHighestMatch.node()); // we found a match changed = true; unassignedIterator.initialize( nodeWithHighestMatch.nodeId(), null, allocation .clusterInfo() .getShardSize(shard, ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE)); } } else if (matchingNodes.hasAnyData() == false) { // if we didn't manage to find *any* data (regardless of matching sizes), check if the // allocation of the replica shard needs to be delayed changed |= ignoreUnassignedIfDelayed(unassignedIterator, shard); } } return changed; }