/** * Check if the allocation of the replica is to be delayed. Compute the delay and if it is * delayed, add it to the ignore unassigned list Note: we only care about replica in delayed * allocation, since if we have an unassigned primary it will anyhow wait to find an existing copy * of the shard to be allocated Note: the other side of the equation is scheduling a reroute in a * timely manner, which happens in the RoutingService * * <p>PUBLIC FOR TESTS! * * @param unassignedIterator iterator over unassigned shards * @param shard the shard which might be delayed * @return true iff allocation is delayed for this shard */ public boolean ignoreUnassignedIfDelayed( RoutingNodes.UnassignedShards.UnassignedIterator unassignedIterator, ShardRouting shard) { // calculate delay and store it in UnassignedInfo to be used by RoutingService long delay = shard.unassignedInfo().getLastComputedLeftDelayNanos(); if (delay > 0) { logger.debug( "[{}][{}]: delaying allocation of [{}] for [{}]", shard.index(), shard.id(), shard, TimeValue.timeValueNanos(delay)); /** * mark it as changed, since we want to kick a publishing to schedule future allocation, see * {@link * org.elasticsearch.cluster.routing.RoutingService#clusterChanged(ClusterChangedEvent)}). */ unassignedIterator.removeAndIgnore(); return true; } return false; }
public boolean allocateUnassigned(RoutingAllocation allocation) { boolean changed = false; final RoutingNodes routingNodes = allocation.routingNodes(); final MetaData metaData = routingNodes.metaData(); final RoutingNodes.UnassignedShards.UnassignedIterator unassignedIterator = routingNodes.unassigned().iterator(); while (unassignedIterator.hasNext()) { ShardRouting shard = unassignedIterator.next(); if (shard.primary()) { continue; } // pre-check if it can be allocated to any node that currently exists, so we won't list the // store for it for nothing boolean canBeAllocatedToAtLeastOneNode = false; for (ObjectCursor<DiscoveryNode> cursor : allocation.nodes().dataNodes().values()) { RoutingNode node = routingNodes.node(cursor.value.id()); if (node == null) { continue; } // if we can't allocate it on a node, ignore it, for example, this handles // cases for only allocating a replica after a primary Decision decision = allocation.deciders().canAllocate(shard, node, allocation); if (decision.type() == Decision.Type.YES) { canBeAllocatedToAtLeastOneNode = true; break; } } if (!canBeAllocatedToAtLeastOneNode) { logger.trace("{}: ignoring allocation, can't be allocated on any node", shard); unassignedIterator.removeAndIgnore(); continue; } AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> shardStores = fetchData(shard, allocation); if (shardStores.hasData() == false) { logger.trace("{}: ignoring allocation, still fetching shard stores", shard); unassignedIterator.removeAndIgnore(); continue; // still fetching } long lastSizeMatched = 0; DiscoveryNode lastDiscoNodeMatched = null; RoutingNode lastNodeMatched = null; boolean hasReplicaData = false; IndexMetaData indexMetaData = metaData.index(shard.getIndex()); for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> nodeStoreEntry : shardStores.getData().entrySet()) { DiscoveryNode discoNode = nodeStoreEntry.getKey(); TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData = nodeStoreEntry.getValue().storeFilesMetaData(); logger.trace("{}: checking node [{}]", shard, discoNode); if (storeFilesMetaData == null) { // already allocated on that node... continue; } RoutingNode node = routingNodes.node(discoNode.id()); if (node == null) { continue; } // check if we can allocate on that node... // we only check for NO, since if this node is THROTTLING and it has enough "same data" // then we will try and assign it next time Decision decision = allocation.deciders().canAllocate(shard, node, allocation); if (decision.type() == Decision.Type.NO) { continue; } // if it is already allocated, we can't assign to it... if (storeFilesMetaData.allocated()) { continue; } if (!shard.primary()) { hasReplicaData |= storeFilesMetaData.iterator().hasNext(); ShardRouting primaryShard = routingNodes.activePrimary(shard); if (primaryShard != null) { assert primaryShard.active(); DiscoveryNode primaryNode = allocation.nodes().get(primaryShard.currentNodeId()); if (primaryNode != null) { TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData primaryNodeFilesStore = shardStores.getData().get(primaryNode); if (primaryNodeFilesStore != null) { TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore = primaryNodeFilesStore.storeFilesMetaData(); if (primaryNodeStore != null && primaryNodeStore.allocated()) { long sizeMatched = 0; String primarySyncId = primaryNodeStore.syncId(); String replicaSyncId = storeFilesMetaData.syncId(); // see if we have a sync id we can make use of if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) { logger.trace( "{}: node [{}] has same sync id {} as primary", shard, discoNode.name(), replicaSyncId); lastNodeMatched = node; lastSizeMatched = Long.MAX_VALUE; lastDiscoNodeMatched = discoNode; } else { for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) { String metaDataFileName = storeFileMetaData.name(); if (primaryNodeStore.fileExists(metaDataFileName) && primaryNodeStore.file(metaDataFileName).isSame(storeFileMetaData)) { sizeMatched += storeFileMetaData.length(); } } logger.trace( "{}: node [{}] has [{}/{}] bytes of re-usable data", shard, discoNode.name(), new ByteSizeValue(sizeMatched), sizeMatched); if (sizeMatched > lastSizeMatched) { lastSizeMatched = sizeMatched; lastDiscoNodeMatched = discoNode; lastNodeMatched = node; } } } } } } } } if (lastNodeMatched != null) { // we only check on THROTTLE since we checked before before on NO Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation); if (decision.type() == Decision.Type.THROTTLE) { if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]", shard.index(), shard.id(), shard, lastDiscoNodeMatched, new ByteSizeValue(lastSizeMatched)); } // we are throttling this, but we have enough to allocate to this node, ignore it for now unassignedIterator.removeAndIgnore(); } else { if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]", shard.index(), shard.id(), shard, lastDiscoNodeMatched, new ByteSizeValue(lastSizeMatched)); } // we found a match changed = true; unassignedIterator.initialize(lastNodeMatched.nodeId()); } } else if (hasReplicaData == false) { // if we didn't manage to find *any* data (regardless of matching sizes), check if the // allocation // of the replica shard needs to be delayed, and if so, add it to the ignore unassigned list // note: we only care about replica in delayed allocation, since if we have an unassigned // primary it // will anyhow wait to find an existing copy of the shard to be allocated // note: the other side of the equation is scheduling a reroute in a timely manner, which // happens in the RoutingService long delay = shard .unassignedInfo() .getDelayAllocationExpirationIn(settings, indexMetaData.getSettings()); if (delay > 0) { logger.debug( "[{}][{}]: delaying allocation of [{}] for [{}]", shard.index(), shard.id(), shard, TimeValue.timeValueMillis(delay)); /** * mark it as changed, since we want to kick a publishing to schedule future allocation, * see {@link * org.elasticsearch.cluster.routing.RoutingService#clusterChanged(ClusterChangedEvent)}). */ changed = true; unassignedIterator.removeAndIgnore(); } } } return changed; }
public boolean allocateUnassigned(RoutingAllocation allocation) { boolean changed = false; final RoutingNodes routingNodes = allocation.routingNodes(); final RoutingNodes.UnassignedShards.UnassignedIterator unassignedIterator = routingNodes.unassigned().iterator(); MetaData metaData = allocation.metaData(); while (unassignedIterator.hasNext()) { ShardRouting shard = unassignedIterator.next(); if (shard.primary()) { continue; } // if we are allocating a replica because of index creation, no need to go and find a copy, // there isn't one... IndexMetaData indexMetaData = metaData.index(shard.getIndexName()); if (shard.allocatedPostIndexCreate(indexMetaData) == false) { continue; } // pre-check if it can be allocated to any node that currently exists, so we won't list the // store for it for nothing if (canBeAllocatedToAtLeastOneNode(shard, allocation) == false) { logger.trace("{}: ignoring allocation, can't be allocated on any node", shard); unassignedIterator.removeAndIgnore(); continue; } AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> shardStores = fetchData(shard, allocation); if (shardStores.hasData() == false) { logger.trace("{}: ignoring allocation, still fetching shard stores", shard); allocation.setHasPendingAsyncFetch(); unassignedIterator.removeAndIgnore(); continue; // still fetching } ShardRouting primaryShard = routingNodes.activePrimary(shard); assert primaryShard != null : "the replica shard can be allocated on at least one node, so there must be an active primary"; TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryStore = findStore(primaryShard, allocation, shardStores); if (primaryStore == null || primaryStore.allocated() == false) { // if we can't find the primary data, it is probably because the primary shard is corrupted // (and listing failed) // we want to let the replica be allocated in order to expose the actual problem with the // primary that the replica // will try and recover from // Note, this is the existing behavior, as exposed in running // CorruptFileTest#testNoPrimaryData logger.trace( "{}: no primary shard store found or allocated, letting actual allocation figure it out", shard); continue; } MatchingNodes matchingNodes = findMatchingNodes(shard, allocation, primaryStore, shardStores); if (matchingNodes.getNodeWithHighestMatch() != null) { RoutingNode nodeWithHighestMatch = allocation.routingNodes().node(matchingNodes.getNodeWithHighestMatch().id()); // we only check on THROTTLE since we checked before before on NO Decision decision = allocation.deciders().canAllocate(shard, nodeWithHighestMatch, allocation); if (decision.type() == Decision.Type.THROTTLE) { logger.debug( "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store", shard.index(), shard.id(), shard, nodeWithHighestMatch.node()); // we are throttling this, but we have enough to allocate to this node, ignore it for now unassignedIterator.removeAndIgnore(); } else { logger.debug( "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store", shard.index(), shard.id(), shard, nodeWithHighestMatch.node()); // we found a match changed = true; unassignedIterator.initialize( nodeWithHighestMatch.nodeId(), null, allocation .clusterInfo() .getShardSize(shard, ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE)); } } else if (matchingNodes.hasAnyData() == false) { // if we didn't manage to find *any* data (regardless of matching sizes), check if the // allocation of the replica shard needs to be delayed changed |= ignoreUnassignedIfDelayed(unassignedIterator, shard); } } return changed; }
@Override public RerouteExplanation execute(RoutingAllocation allocation, boolean explain) { final DiscoveryNode discoNode = allocation.nodes().resolveNode(node); final RoutingNodes routingNodes = allocation.routingNodes(); ShardRouting shardRouting = null; for (ShardRouting routing : routingNodes.unassigned()) { if (routing.shardId().equals(shardId)) { // prefer primaries first to allocate if (shardRouting == null || routing.primary()) { shardRouting = routing; } } } if (shardRouting == null) { if (explain) { return new RerouteExplanation( this, allocation.decision( Decision.NO, "allocate_allocation_command", "failed to find " + shardId + " on the list of unassigned shards")); } throw new IllegalArgumentException( "[allocate] failed to find " + shardId + " on the list of unassigned shards"); } if (shardRouting.primary() && !allowPrimary) { if (explain) { return new RerouteExplanation( this, allocation.decision( Decision.NO, "allocate_allocation_command", "trying to allocate a primary shard " + shardId + ", which is disabled")); } throw new IllegalArgumentException( "[allocate] trying to allocate a primary shard " + shardId + ", which is disabled"); } RoutingNode routingNode = routingNodes.node(discoNode.id()); if (routingNode == null) { if (!discoNode.dataNode()) { if (explain) { return new RerouteExplanation( this, allocation.decision( Decision.NO, "allocate_allocation_command", "Allocation can only be done on data nodes, not [" + node + "]")); } throw new IllegalArgumentException( "Allocation can only be done on data nodes, not [" + node + "]"); } else { if (explain) { return new RerouteExplanation( this, allocation.decision( Decision.NO, "allocate_allocation_command", "Could not find [" + node + "] among the routing nodes")); } throw new IllegalStateException("Could not find [" + node + "] among the routing nodes"); } } Decision decision = allocation.deciders().canAllocate(shardRouting, routingNode, allocation); if (decision.type() == Decision.Type.NO) { if (explain) { return new RerouteExplanation(this, decision); } throw new IllegalArgumentException( "[allocate] allocation of " + shardId + " on node " + discoNode + " is not allowed, reason: " + decision); } // go over and remove it from the unassigned for (RoutingNodes.UnassignedShards.UnassignedIterator it = routingNodes.unassigned().iterator(); it.hasNext(); ) { if (it.next() != shardRouting) { continue; } it.initialize(routingNode.nodeId()); if (shardRouting.primary()) { // we need to clear the post allocation flag, since its an explicit allocation of the // primary shard // and we want to force allocate it (and create a new index for it) routingNodes.addClearPostAllocationFlag(shardRouting.shardId()); } break; } return new RerouteExplanation(this, decision); }