/**
 * Checks the node against the index-level require / include / exclude allocation filters.
 *
 * <p>The filters are evaluated in that order and the first one that rejects the node wins.
 *
 * @param indexMd metadata of the index whose filters are applied
 * @param node the candidate node
 * @param allocation the allocation used to build the decision
 * @return a {@code NO} decision for the first rejecting filter, or {@code null} when no
 *     index-level filter rejects the node
 */
private Decision shouldIndexFilter(
    IndexMetaData indexMd, RoutingNode node, RoutingAllocation allocation) {
  // "require": the node must match when the filter is configured.
  if (indexMd.requireFilters() != null && !indexMd.requireFilters().match(node.node())) {
    return allocation.decision(
        Decision.NO,
        NAME,
        "node does not match index setting [%s] filters [%s]",
        IndexMetaData.INDEX_ROUTING_REQUIRE_GROUP_PREFIX,
        indexMd.requireFilters());
  }

  // "include": the node must match when the filter is configured.
  if (indexMd.includeFilters() != null && !indexMd.includeFilters().match(node.node())) {
    return allocation.decision(
        Decision.NO,
        NAME,
        "node does not match index setting [%s] filters [%s]",
        IndexMetaData.INDEX_ROUTING_INCLUDE_GROUP_PREFIX,
        indexMd.includeFilters());
  }

  // "exclude": the node must NOT match when the filter is configured.
  if (indexMd.excludeFilters() != null && indexMd.excludeFilters().match(node.node())) {
    return allocation.decision(
        Decision.NO,
        NAME,
        "node matches index setting [%s] filters [%s]",
        IndexMetaData.INDEX_ROUTING_EXCLUDE_GROUP_SETTING.getKey(),
        indexMd.excludeFilters());
  }

  return null;
}
/**
 * Checks the node against the cluster-level require / include / exclude allocation filters.
 *
 * <p>The filters are evaluated in that order and the first one that rejects the node wins.
 *
 * @param node the candidate node
 * @param allocation the allocation used to build the decision
 * @return a {@code NO} decision for the first rejecting filter, or {@code null} when no
 *     cluster-level filter rejects the node
 */
private Decision shouldClusterFilter(RoutingNode node, RoutingAllocation allocation) {
  // "require": the node must match when the filter is configured.
  if (clusterRequireFilters != null && !clusterRequireFilters.match(node.node())) {
    return allocation.decision(
        Decision.NO,
        NAME,
        "node does not match cluster setting [%s] filters [%s]",
        CLUSTER_ROUTING_REQUIRE_GROUP_PREFIX,
        clusterRequireFilters);
  }

  // "include": the node must match when the filter is configured.
  if (clusterIncludeFilters != null && !clusterIncludeFilters.match(node.node())) {
    // The explanation previously read "node does not cluster setting ..." (missing verb).
    return allocation.decision(
        Decision.NO,
        NAME,
        "node does not match cluster setting [%s] filters [%s]",
        CLUSTER_ROUTING_INCLUDE_GROUP_PREFIX,
        clusterIncludeFilters);
  }

  // "exclude": the node must NOT match when the filter is configured.
  if (clusterExcludeFilters != null && clusterExcludeFilters.match(node.node())) {
    return allocation.decision(
        Decision.NO,
        NAME,
        "node matches cluster setting [%s] filters [%s]",
        CLUSTER_ROUTING_EXCLUDE_GROUP_PREFIX,
        clusterExcludeFilters);
  }

  return null;
}
/**
 * Checks the node against the global (cluster-wide) require / include / exclude filters.
 *
 * @param node the candidate node
 * @param allocation the allocation used to build the decision
 * @return a {@code NO} decision for the first rejecting filter, or {@code null} when no global
 *     filter rejects the node
 */
private Decision shouldClusterFilter(RoutingNode node, RoutingAllocation allocation) {
  // Required filters: a configured filter that does not match rejects the node.
  if (clusterRequireFilters != null && !clusterRequireFilters.match(node.node())) {
    return allocation.decision(
        Decision.NO,
        NAME,
        "node does not match global required filters [%s]",
        clusterRequireFilters);
  }

  // Include filters: a configured filter that does not match rejects the node.
  if (clusterIncludeFilters != null && !clusterIncludeFilters.match(node.node())) {
    return allocation.decision(
        Decision.NO,
        NAME,
        "node does not match global include filters [%s]",
        clusterIncludeFilters);
  }

  // Exclude filters: a configured filter that DOES match rejects the node.
  if (clusterExcludeFilters != null && clusterExcludeFilters.match(node.node())) {
    return allocation.decision(
        Decision.NO, NAME, "node matches global exclude filters [%s]", clusterExcludeFilters);
  }

  return null;
}
/**
 * Checks the node against the index-level require / include / exclude filters.
 *
 * @param indexMd metadata of the index whose filters are applied
 * @param node the candidate node
 * @param allocation the allocation used to build the decision
 * @return a {@code NO} decision for the first rejecting filter, or {@code null} when no
 *     index-level filter rejects the node
 */
private Decision shouldIndexFilter(
    IndexMetaData indexMd, RoutingNode node, RoutingAllocation allocation) {
  // Required filters: a configured filter that does not match rejects the node.
  if (indexMd.requireFilters() != null && !indexMd.requireFilters().match(node.node())) {
    return allocation.decision(
        Decision.NO,
        NAME,
        "node does not match index required filters [%s]",
        indexMd.requireFilters());
  }

  // Include filters: a configured filter that does not match rejects the node.
  if (indexMd.includeFilters() != null && !indexMd.includeFilters().match(node.node())) {
    return allocation.decision(
        Decision.NO,
        NAME,
        "node does not match index include filters [%s]",
        indexMd.includeFilters());
  }

  // Exclude filters: a configured filter that DOES match rejects the node.
  if (indexMd.excludeFilters() != null && indexMd.excludeFilters().match(node.node())) {
    return allocation.decision(
        Decision.NO, NAME, "node matches index exclude filters [%s]", indexMd.excludeFilters());
  }

  return null;
}
@Override public Decision canRemain( ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { if (shardRouting.currentNodeId().equals(node.nodeId()) == false) { throw new IllegalArgumentException( "Shard [" + shardRouting + "] is not allocated on node: [" + node.nodeId() + "]"); } final Decision decision = earlyTerminate(allocation); if (decision != null) { return decision; } ClusterInfo clusterInfo = allocation.clusterInfo(); Map<String, DiskUsage> usages = clusterInfo.getNodeLeastAvailableDiskUsages(); DiskUsage usage = getDiskUsage(node, allocation, usages); // If this node is already above the high threshold, the shard cannot remain (get it off!) double freeDiskPercentage = usage.getFreeDiskAsPercentage(); long freeBytes = usage.getFreeBytes(); if (logger.isDebugEnabled()) { logger.debug( "node [{}] has {}% free disk ({} bytes)", node.nodeId(), freeDiskPercentage, freeBytes); } if (freeBytes < freeBytesThresholdHigh.bytes()) { if (logger.isDebugEnabled()) { logger.debug( "less than the required {} free bytes threshold ({} bytes free) on node {}, shard cannot remain", freeBytesThresholdHigh, freeBytes, node.nodeId()); } return allocation.decision( Decision.NO, NAME, "after allocation less than required [%s] free on node, free: [%s]", freeBytesThresholdHigh, new ByteSizeValue(freeBytes)); } if (freeDiskPercentage < freeDiskThresholdHigh) { if (logger.isDebugEnabled()) { logger.debug( "less than the required {}% free disk threshold ({}% free) on node {}, shard cannot remain", freeDiskThresholdHigh, freeDiskPercentage, node.nodeId()); } return allocation.decision( Decision.NO, NAME, "after allocation less than required [%s%%] free disk on node, free: [%s%%]", freeDiskThresholdHigh, freeDiskPercentage); } return allocation.decision( Decision.YES, NAME, "enough disk for shard to remain on node, free: [%s]", new ByteSizeValue(freeBytes)); }
/**
 * Runs the cluster-level filters and then the filters of the shard's index against the node.
 *
 * @param shardRouting the shard whose index filters are consulted
 * @param node the candidate node
 * @param allocation the current allocation
 * @return the first rejecting decision, or a {@code YES} decision when every filter passes
 */
private Decision shouldFilter(
    ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
  final Decision clusterVerdict = shouldClusterFilter(node, allocation);
  if (clusterVerdict != null) {
    return clusterVerdict;
  }

  final Decision indexVerdict =
      shouldIndexFilter(
          allocation.metaData().getIndexSafe(shardRouting.index()), node, allocation);
  return indexVerdict != null
      ? indexVerdict
      : allocation.decision(Decision.YES, NAME, "node passes include/exclude/require filters");
}
/** Can the shard be allocated on at least one node based on the allocation deciders. */ private boolean canBeAllocatedToAtLeastOneNode(ShardRouting shard, RoutingAllocation allocation) { for (ObjectCursor<DiscoveryNode> cursor : allocation.nodes().dataNodes().values()) { RoutingNode node = allocation.routingNodes().node(cursor.value.id()); if (node == null) { continue; } // if we can't allocate it on a node, ignore it, for example, this handles // cases for only allocating a replica after a primary Decision decision = allocation.deciders().canAllocate(shard, node, allocation); if (decision.type() == Decision.Type.YES) { return true; } } return false; }
/**
 * Runs the cluster-level filters and then the given index's filters against the node.
 *
 * @param indexMd metadata of the index whose filters are consulted
 * @param node the candidate node
 * @param allocation the current allocation
 * @return the first rejecting decision, or a {@code YES} decision when every filter passes
 */
private Decision shouldFilter(
    IndexMetaData indexMd, RoutingNode node, RoutingAllocation allocation) {
  final Decision clusterVerdict = shouldClusterFilter(node, allocation);
  if (clusterVerdict != null) {
    return clusterVerdict;
  }

  final Decision indexVerdict = shouldIndexFilter(indexMd, node, allocation);
  return indexVerdict != null
      ? indexVerdict
      : allocation.decision(Decision.YES, NAME, "node passes include/exclude/require filters");
}
public boolean allocateUnassigned(final RoutingAllocation allocation) { boolean changed = false; RoutingNodes.UnassignedShards unassigned = allocation.routingNodes().unassigned(); unassigned.sort( PriorityComparator.getAllocationComparator(allocation)); // sort for priority ordering changed |= primaryShardAllocator.allocateUnassigned(allocation); changed |= replicaShardAllocator.processExistingRecoveries(allocation); changed |= replicaShardAllocator.allocateUnassigned(allocation); return changed; }
/**
 * Limits the number of concurrently relocating shards to {@code clusterConcurrentRebalance}.
 * A value of {@code -1} disables the limit.
 *
 * @param shardRouting the shard considered for rebalancing (not inspected here)
 * @param allocation the current allocation
 * @return {@code YES} when unlimited or below the limit, {@code NO} once the relocating shard
 *     count has reached the limit
 */
@Override
public Decision canRebalance(ShardRouting shardRouting, RoutingAllocation allocation) {
  if (clusterConcurrentRebalance == -1) {
    return allocation.decision(Decision.YES, NAME, "unlimited concurrent rebalances are allowed");
  }

  final int currentlyRelocating = allocation.routingNodes().getRelocatingShardCount();
  if (currentlyRelocating < clusterConcurrentRebalance) {
    return allocation.decision(
        Decision.YES,
        NAME,
        "below threshold [%d] for concurrent rebalances, current rebalance shard count [%d]",
        clusterConcurrentRebalance,
        currentlyRelocating);
  }

  return allocation.decision(
      Decision.NO,
      NAME,
      "too many shards are concurrently rebalancing [%d], limit: [%d]",
      currentlyRelocating,
      clusterConcurrentRebalance);
}
/**
 * Fetches the shard store metadata for the shard, creating and caching the per-shard async
 * fetcher on first use. When the result already holds data it is applied to the allocation.
 *
 * @param shard the shard whose store metadata is requested
 * @param allocation the current allocation
 * @return the fetch result, which may not hold data yet
 */
@Override
protected AsyncShardFetch.FetchResult<
        TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
    fetchData(ShardRouting shard, RoutingAllocation allocation) {
  AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> fetcher =
      asyncFetchStore.get(shard.shardId());
  if (fetcher == null) {
    // No fetcher registered for this shard id yet: create one and remember it.
    fetcher = new InternalAsyncFetch<>(logger, "shard_store", shard.shardId(), storeAction);
    asyncFetchStore.put(shard.shardId(), fetcher);
  }

  final AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
      result =
          fetcher.fetchData(
              allocation.nodes(),
              allocation.metaData(),
              allocation.getIgnoreNodes(shard.shardId()));
  if (result.hasData()) {
    result.processAllocation(allocation);
  }
  return result;
}
private Decision earlyTerminate(RoutingAllocation allocation) { // Always allow allocation if the decider is disabled if (!enabled) { return allocation.decision(Decision.YES, NAME, "disk threshold decider disabled"); } // Allow allocation regardless if only a single node is available if (allocation.nodes().size() <= 1) { if (logger.isTraceEnabled()) { logger.trace("only a single node is present, allowing allocation"); } return allocation.decision(Decision.YES, NAME, "only a single node is present"); } // Fail open there is no info available final ClusterInfo clusterInfo = allocation.clusterInfo(); if (clusterInfo == null) { if (logger.isTraceEnabled()) { logger.trace("cluster info unavailable for disk threshold decider, allowing allocation."); } return allocation.decision(Decision.YES, NAME, "cluster info unavailable"); } final Map<String, DiskUsage> usages = clusterInfo.getNodeLeastAvailableDiskUsages(); // Fail open if there are no disk usages available if (usages.isEmpty()) { if (logger.isTraceEnabled()) { logger.trace( "unable to determine disk usages for disk-aware allocation, allowing allocation"); } return allocation.decision(Decision.YES, NAME, "disk usages unavailable"); } return null; }
/**
 * Fetches the started-shard state for the shard, creating and caching the per-shard async
 * fetcher on first use. When the result already holds data it is applied to the allocation.
 *
 * @param shard the shard whose started state is requested
 * @param allocation the current allocation
 * @return the fetch result, which may not hold data yet
 */
@Override
protected AsyncShardFetch.FetchResult<
        TransportNodesListGatewayStartedShards.NodeGatewayStartedShards>
    fetchData(ShardRouting shard, RoutingAllocation allocation) {
  AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetcher =
      asyncFetchStarted.get(shard.shardId());
  if (fetcher == null) {
    // No fetcher registered for this shard id yet: create one and remember it.
    fetcher = new InternalAsyncFetch<>(logger, "shard_started", shard.shardId(), startedAction);
    asyncFetchStarted.put(shard.shardId(), fetcher);
  }

  final AsyncShardFetch.FetchResult<
          TransportNodesListGatewayStartedShards.NodeGatewayStartedShards>
      result =
          fetcher.fetchData(
              allocation.nodes(),
              allocation.metaData(),
              allocation.getIgnoreNodes(shard.shardId()));
  if (result.hasData()) {
    result.processAllocation(allocation);
  }
  return result;
}
@Override public Decision canAllocate( ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { if (shardRouting.unassigned()) { // only for unassigned - we filter allocation right after the index creation ie. for shard // shrinking etc. to ensure // that once it has been allocated post API the replicas can be allocated elsewhere without // user interaction // this is a setting that can only be set within the system! IndexMetaData indexMd = allocation.metaData().getIndexSafe(shardRouting.index()); DiscoveryNodeFilters initialRecoveryFilters = indexMd.getInitialRecoveryFilters(); if (initialRecoveryFilters != null && RecoverySource.isInitialRecovery(shardRouting.recoverySource().getType()) && initialRecoveryFilters.match(node.node()) == false) { String explanation = (shardRouting.recoverySource().getType() == RecoverySource.Type.LOCAL_SHARDS) ? "initial allocation of the shrunken index is only allowed on nodes [%s] that hold a copy of every shard in the index" : "initial allocation of the index is only allowed on nodes [%s]"; return allocation.decision(Decision.NO, NAME, explanation, initialRecoveryFilters); } } return shouldFilter(shardRouting, node, allocation); }
/**
 * Looks up, in the fetched data, the store files metadata reported by the node the given shard
 * is currently assigned to.
 *
 * @param shard an assigned shard (its current node id must not be {@code null})
 * @param allocation the current allocation, used to resolve the node
 * @param data the fetched per-node store data
 * @return the store files metadata, or {@code null} if the node is unknown or reported no data
 */
private TransportNodesListShardStoreMetaData.StoreFilesMetaData findStore(
    ShardRouting shard,
    RoutingAllocation allocation,
    AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
        data) {
  assert shard.currentNodeId() != null;

  final DiscoveryNode assignedNode = allocation.nodes().get(shard.currentNodeId());
  if (assignedNode != null) {
    final TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData nodeStore =
        data.getData().get(assignedNode);
    if (nodeStore != null) {
      return nodeStore.storeFilesMetaData();
    }
  }
  return null;
}
private DiskUsage getDiskUsage( RoutingNode node, RoutingAllocation allocation, Map<String, DiskUsage> usages) { ClusterInfo clusterInfo = allocation.clusterInfo(); DiskUsage usage = usages.get(node.nodeId()); if (usage == null) { // If there is no usage, and we have other nodes in the cluster, // use the average usage for all nodes as the usage for this node usage = averageUsage(node, usages); if (logger.isDebugEnabled()) { logger.debug( "unable to determine disk usage for {}, defaulting to average across nodes [{} total] [{} free] [{}% free]", node.nodeId(), usage.getTotalBytes(), usage.getFreeBytes(), usage.getFreeDiskAsPercentage()); } } if (includeRelocations) { long relocatingShardsSize = sizeOfRelocatingShards(node, clusterInfo, true); DiskUsage usageIncludingRelocations = new DiskUsage( node.nodeId(), node.node().name(), "_na_", usage.getTotalBytes(), usage.getFreeBytes() - relocatingShardsSize); if (logger.isTraceEnabled()) { logger.trace("usage without relocations: {}", usage); logger.trace( "usage with relocations: [{} bytes] {}", relocatingShardsSize, usageIncludingRelocations); } usage = usageIncludingRelocations; } return usage; }
@Override public void execute(RoutingAllocation allocation) throws ElasticSearchException { DiscoveryNode discoNode = allocation.nodes().resolveNode(node); MutableShardRouting shardRouting = null; for (MutableShardRouting routing : allocation.routingNodes().unassigned()) { if (routing.shardId().equals(shardId)) { // prefer primaries first to allocate if (shardRouting == null || routing.primary()) { shardRouting = routing; } } } if (shardRouting == null) { throw new ElasticSearchIllegalArgumentException( "[allocate] failed to find " + shardId + " on the list of unassigned shards"); } if (shardRouting.primary() && !allowPrimary) { throw new ElasticSearchIllegalArgumentException( "[allocate] trying to allocate a primary shard " + shardId + "], which is disabled"); } RoutingNode routingNode = allocation.routingNodes().node(discoNode.id()); allocation.addIgnoreDisable(shardRouting.shardId(), routingNode.nodeId()); if (!allocation.deciders().canAllocate(shardRouting, routingNode, allocation).allowed()) { throw new ElasticSearchIllegalArgumentException( "[allocate] allocation of " + shardId + " on node " + discoNode + " is not allowed"); } // go over and remove it from the unassigned for (Iterator<MutableShardRouting> it = allocation.routingNodes().unassigned().iterator(); it.hasNext(); ) { if (it.next() != shardRouting) { continue; } it.remove(); routingNode.add(shardRouting); break; } }
/**
 * Executes the "cancel allocation" command for {@code shardId} on the node resolved from
 * {@code node}.
 *
 * <p>Every copy of the shard found on that node is handled according to its state:
 *
 * <ul>
 *   <li>initializing as a relocation target: the copy is removed and the relocation of the
 *       matching source copy is cancelled;
 *   <li>relocating away: the copy is moved to unassigned and the relocation target on the other
 *       node is removed;
 *   <li>not relocating at all: the copy is moved to unassigned.
 * </ul>
 *
 * <p>In the last two cases a primary is only cancelled when {@code allowPrimary} is set.
 *
 * @param allocation the current allocation
 * @param explain when {@code true}, failures are returned as a {@code NO} explanation instead
 *     of being thrown
 * @return the explanation of the outcome
 * @throws IllegalArgumentException when {@code explain} is {@code false} and the shard cannot be
 *     cancelled or was not found on the node
 */
@Override
public RerouteExplanation execute(RoutingAllocation allocation, boolean explain) {
  DiscoveryNode discoNode = allocation.nodes().resolveNode(node);
  boolean found = false;
  for (RoutingNodes.RoutingNodeIterator it =
          allocation.routingNodes().routingNodeIter(discoNode.id());
      it.hasNext(); ) {
    ShardRouting shardRouting = it.next();
    if (!shardRouting.shardId().equals(shardId)) {
      continue;
    }
    found = true;
    if (shardRouting.relocatingNodeId() != null) {
      if (shardRouting.initializing()) {
        // the shard is initializing and recovering from another node, simply cancel the recovery
        it.remove();
        // and cancel the relocating state of the shard it is being relocated from
        RoutingNode relocatingFromNode =
            allocation.routingNodes().node(shardRouting.relocatingNodeId());
        if (relocatingFromNode != null) {
          for (ShardRouting fromShardRouting : relocatingFromNode) {
            if (fromShardRouting.isSameShard(shardRouting)
                && fromShardRouting.state() == RELOCATING) {
              allocation.routingNodes().cancelRelocation(fromShardRouting);
              break;
            }
          }
        }
      } else if (shardRouting.relocating()) {
        // the shard is relocating to another node, cancel the recovery on the other node, and
        // deallocate this one
        if (!allowPrimary && shardRouting.primary()) {
          // primaries are only cancelled when allowPrimary is set
          if (explain) {
            return new RerouteExplanation(
                this,
                allocation.decision(
                    Decision.NO,
                    "cancel_allocation_command",
                    "can't cancel "
                        + shardId
                        + " on node "
                        + discoNode
                        + ", shard is primary and initializing its state"));
          }
          throw new IllegalArgumentException(
              "[cancel_allocation] can't cancel "
                  + shardId
                  + " on node "
                  + discoNode
                  + ", shard is primary and initializing its state");
        }
        it.moveToUnassigned(new UnassignedInfo(UnassignedInfo.Reason.REROUTE_CANCELLED, null));
        // now, go and find the shard that is initializing on the target node, and cancel it as
        // well...
        RoutingNodes.RoutingNodeIterator initializingNode =
            allocation.routingNodes().routingNodeIter(shardRouting.relocatingNodeId());
        if (initializingNode != null) {
          while (initializingNode.hasNext()) {
            ShardRouting initializingShardRouting = initializingNode.next();
            if (initializingShardRouting.isRelocationTargetOf(shardRouting)) {
              initializingNode.remove();
            }
          }
        }
      }
    } else {
      // the shard is not relocating, it is either started or initializing, just cancel it and
      // move on...
      if (!allowPrimary && shardRouting.primary()) {
        // primaries are only cancelled when allowPrimary is set
        if (explain) {
          return new RerouteExplanation(
              this,
              allocation.decision(
                  Decision.NO,
                  "cancel_allocation_command",
                  "can't cancel "
                      + shardId
                      + " on node "
                      + discoNode
                      + ", shard is primary and started"));
        }
        throw new IllegalArgumentException(
            "[cancel_allocation] can't cancel "
                + shardId
                + " on node "
                + discoNode
                + ", shard is primary and started");
      }
      it.moveToUnassigned(new UnassignedInfo(UnassignedInfo.Reason.REROUTE_CANCELLED, null));
    }
  }
  if (!found) {
    // no copy of the shard lives on the requested node
    if (explain) {
      return new RerouteExplanation(
          this,
          allocation.decision(
              Decision.NO,
              "cancel_allocation_command",
              "can't cancel " + shardId + ", failed to find it on node " + discoNode));
    }
    throw new IllegalArgumentException(
        "[cancel_allocation] can't cancel " + shardId + ", failed to find it on node " + discoNode);
  }
  return new RerouteExplanation(
      this,
      allocation.decision(
          Decision.YES,
          "cancel_allocation_command",
          "shard " + shardId + " on node " + discoNode + " can be cancelled"));
}
@Override public Decision canAllocate( ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { final Decision decision = earlyTerminate(allocation); if (decision != null) { return decision; } final double usedDiskThresholdLow = 100.0 - DiskThresholdDecider.this.freeDiskThresholdLow; final double usedDiskThresholdHigh = 100.0 - DiskThresholdDecider.this.freeDiskThresholdHigh; ClusterInfo clusterInfo = allocation.clusterInfo(); Map<String, DiskUsage> usages = clusterInfo.getNodeMostAvailableDiskUsages(); DiskUsage usage = getDiskUsage(node, allocation, usages); // First, check that the node currently over the low watermark double freeDiskPercentage = usage.getFreeDiskAsPercentage(); // Cache the used disk percentage for displaying disk percentages consistent with documentation double usedDiskPercentage = usage.getUsedDiskAsPercentage(); long freeBytes = usage.getFreeBytes(); if (logger.isTraceEnabled()) { logger.trace("node [{}] has {}% used disk", node.nodeId(), usedDiskPercentage); } // a flag for whether the primary shard has been previously allocated boolean primaryHasBeenAllocated = shardRouting.primary() && shardRouting.allocatedPostIndexCreate(); // checks for exact byte comparisons if (freeBytes < freeBytesThresholdLow.bytes()) { // If the shard is a replica or has a primary that has already been allocated before, check // the low threshold if (!shardRouting.primary() || (shardRouting.primary() && primaryHasBeenAllocated)) { if (logger.isDebugEnabled()) { logger.debug( "less than the required {} free bytes threshold ({} bytes free) on node {}, preventing allocation", freeBytesThresholdLow, freeBytes, node.nodeId()); } return allocation.decision( Decision.NO, NAME, "less than required [%s] free on node, free: [%s]", freeBytesThresholdLow, new ByteSizeValue(freeBytes)); } else if (freeBytes > freeBytesThresholdHigh.bytes()) { // Allow the shard to be allocated because it is primary that // has never been allocated if it's under the high 
watermark if (logger.isDebugEnabled()) { logger.debug( "less than the required {} free bytes threshold ({} bytes free) on node {}, " + "but allowing allocation because primary has never been allocated", freeBytesThresholdLow, freeBytes, node.nodeId()); } return allocation.decision(Decision.YES, NAME, "primary has never been allocated before"); } else { // Even though the primary has never been allocated, the node is // above the high watermark, so don't allow allocating the shard if (logger.isDebugEnabled()) { logger.debug( "less than the required {} free bytes threshold ({} bytes free) on node {}, " + "preventing allocation even though primary has never been allocated", freeBytesThresholdHigh, freeBytes, node.nodeId()); } return allocation.decision( Decision.NO, NAME, "less than required [%s] free on node, free: [%s]", freeBytesThresholdHigh, new ByteSizeValue(freeBytes)); } } // checks for percentage comparisons if (freeDiskPercentage < freeDiskThresholdLow) { // If the shard is a replica or has a primary that has already been allocated before, check // the low threshold if (!shardRouting.primary() || (shardRouting.primary() && primaryHasBeenAllocated)) { if (logger.isDebugEnabled()) { logger.debug( "more than the allowed {} used disk threshold ({} used) on node [{}], preventing allocation", Strings.format1Decimals(usedDiskThresholdLow, "%"), Strings.format1Decimals(usedDiskPercentage, "%"), node.nodeId()); } return allocation.decision( Decision.NO, NAME, "more than allowed [%s%%] used disk on node, free: [%s%%]", usedDiskThresholdLow, freeDiskPercentage); } else if (freeDiskPercentage > freeDiskThresholdHigh) { // Allow the shard to be allocated because it is primary that // has never been allocated if it's under the high watermark if (logger.isDebugEnabled()) { logger.debug( "more than the allowed {} used disk threshold ({} used) on node [{}], " + "but allowing allocation because primary has never been allocated", Strings.format1Decimals(usedDiskThresholdLow, 
"%"), Strings.format1Decimals(usedDiskPercentage, "%"), node.nodeId()); } return allocation.decision(Decision.YES, NAME, "primary has never been allocated before"); } else { // Even though the primary has never been allocated, the node is // above the high watermark, so don't allow allocating the shard if (logger.isDebugEnabled()) { logger.debug( "less than the required {} free bytes threshold ({} bytes free) on node {}, " + "preventing allocation even though primary has never been allocated", Strings.format1Decimals(freeDiskThresholdHigh, "%"), Strings.format1Decimals(freeDiskPercentage, "%"), node.nodeId()); } return allocation.decision( Decision.NO, NAME, "more than allowed [%s%%] used disk on node, free: [%s%%]", usedDiskThresholdHigh, freeDiskPercentage); } } // Secondly, check that allocating the shard to this node doesn't put it above the high // watermark final long shardSize = getShardSize(shardRouting, allocation.clusterInfo()); double freeSpaceAfterShard = freeDiskPercentageAfterShardAssigned(usage, shardSize); long freeBytesAfterShard = freeBytes - shardSize; if (freeBytesAfterShard < freeBytesThresholdHigh.bytes()) { logger.warn( "after allocating, node [{}] would have less than the required {} free bytes threshold ({} bytes free), preventing allocation", node.nodeId(), freeBytesThresholdHigh, freeBytesAfterShard); return allocation.decision( Decision.NO, NAME, "after allocation less than required [%s] free on node, free: [%s]", freeBytesThresholdLow, new ByteSizeValue(freeBytesAfterShard)); } if (freeSpaceAfterShard < freeDiskThresholdHigh) { logger.warn( "after allocating, node [{}] would have more than the allowed {} free disk threshold ({} free), preventing allocation", node.nodeId(), Strings.format1Decimals(freeDiskThresholdHigh, "%"), Strings.format1Decimals(freeSpaceAfterShard, "%")); return allocation.decision( Decision.NO, NAME, "after allocation more than allowed [%s%%] used disk on node, free: [%s%%]", usedDiskThresholdLow, 
freeSpaceAfterShard); } return allocation.decision( Decision.YES, NAME, "enough disk for shard on node, free: [%s]", new ByteSizeValue(freeBytes)); }
/**
 * Tries to allocate unassigned shards in two passes.
 *
 * <p>Pass 1 handles primaries that were already allocated after the index-creation API call: the
 * per-node shard state versions are collected, and if enough copies were found (see the
 * {@code initial_shards} handling below) the primary is assigned to a node holding the highest
 * version. Throttled primaries, and primaries without enough copies that are not being
 * restored, are moved to the ignored-unassigned list.
 *
 * <p>Pass 2 handles the shards still unassigned: for a replica whose active primary has an
 * allocated store, the node whose existing store shares the most bytes with the primary's store
 * is chosen so that data can be reused.
 *
 * @param allocation the current allocation
 * @return {@code true} if at least one shard was assigned
 */
public boolean allocateUnassigned(RoutingAllocation allocation) {
  boolean changed = false;
  DiscoveryNodes nodes = allocation.nodes();
  RoutingNodes routingNodes = allocation.routingNodes();
  // First, handle primaries, they must find a place to be allocated on here
  Iterator<MutableShardRouting> unassignedIterator = routingNodes.unassigned().iterator();
  while (unassignedIterator.hasNext()) {
    MutableShardRouting shard = unassignedIterator.next();
    if (!shard.primary()) {
      continue;
    }
    // this is an API allocation, ignore since we know there is no data...
    if (!routingNodes
        .routingTable()
        .index(shard.index())
        .shard(shard.id())
        .primaryAllocatedPostApi()) {
      continue;
    }
    ObjectLongOpenHashMap<DiscoveryNode> nodesState = buildShardStates(nodes, shard);
    int numberOfAllocationsFound = 0;
    long highestVersion = -1;
    Set<DiscoveryNode> nodesWithHighestVersion = Sets.newHashSet();
    // Walk the map's backing arrays directly; only slots flagged in `allocated` are read.
    final boolean[] states = nodesState.allocated;
    final Object[] keys = nodesState.keys;
    final long[] values = nodesState.values;
    for (int i = 0; i < states.length; i++) {
      if (!states[i]) {
        continue;
      }
      DiscoveryNode node = (DiscoveryNode) keys[i];
      long version = values[i];
      // since we don't check in NO allocation, we need to double check here
      if (allocation.shouldIgnoreShardForNode(shard.shardId(), node.id())) {
        continue;
      }
      // a version of -1 is not counted as a found copy
      if (version != -1) {
        numberOfAllocationsFound++;
        if (highestVersion == -1) {
          nodesWithHighestVersion.add(node);
          highestVersion = version;
        } else {
          if (version > highestVersion) {
            // strictly newer copy: it replaces all candidates collected so far
            nodesWithHighestVersion.clear();
            nodesWithHighestVersion.add(node);
            highestVersion = version;
          } else if (version == highestVersion) {
            nodesWithHighestVersion.add(node);
          }
        }
      }
    }
    // check if the count meets the configured minimum
    int requiredAllocation = 1;
    // if we restore from a repository one copy is more than enough
    if (shard.restoreSource() == null) {
      try {
        IndexMetaData indexMetaData = routingNodes.metaData().index(shard.index());
        // index-level setting first, then the node-level setting, then this.initialShards
        String initialShards =
            indexMetaData
                .settings()
                .get(
                    INDEX_RECOVERY_INITIAL_SHARDS,
                    settings.get(INDEX_RECOVERY_INITIAL_SHARDS, this.initialShards));
        if ("quorum".equals(initialShards)) {
          if (indexMetaData.numberOfReplicas() > 1) {
            requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2) + 1;
          }
        } else if ("quorum-1".equals(initialShards) || "half".equals(initialShards)) {
          if (indexMetaData.numberOfReplicas() > 2) {
            requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2);
          }
        } else if ("one".equals(initialShards)) {
          requiredAllocation = 1;
        } else if ("full".equals(initialShards) || "all".equals(initialShards)) {
          requiredAllocation = indexMetaData.numberOfReplicas() + 1;
        } else if ("full-1".equals(initialShards) || "all-1".equals(initialShards)) {
          if (indexMetaData.numberOfReplicas() > 1) {
            requiredAllocation = indexMetaData.numberOfReplicas();
          }
        } else {
          // anything else is expected to be a plain number
          requiredAllocation = Integer.parseInt(initialShards);
        }
      } catch (Exception e) {
        // NOTE(review): the local `initialShards` is scoped to the try block, so the name below
        // refers to the field this.initialShards, not the value that failed to parse. The
        // exception itself is not logged either. Confirm whether that is intended.
        logger.warn(
            "[{}][{}] failed to derived initial_shards from value {}, ignore allocation for {}",
            shard.index(),
            shard.id(),
            initialShards,
            shard);
      }
    }
    // not enough found for this shard, continue...
    if (numberOfAllocationsFound < requiredAllocation) {
      if (shard.restoreSource() == null) {
        // not a restore: we can't really allocate, so ignore it and continue
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
        if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: not allocating, number_of_allocated_shards_found [{}], required_number [{}]",
              shard.index(),
              shard.id(),
              numberOfAllocationsFound,
              requiredAllocation);
        }
      } else if (logger.isDebugEnabled()) {
        // restore case: the shard stays in the unassigned list; this method does not assign it
        logger.debug(
            "[{}][{}]: missing local data, will restore from [{}]",
            shard.index(),
            shard.id(),
            shard.restoreSource());
      }
      continue;
    }
    Set<DiscoveryNode> throttledNodes = Sets.newHashSet();
    Set<DiscoveryNode> noNodes = Sets.newHashSet();
    for (DiscoveryNode discoNode : nodesWithHighestVersion) {
      RoutingNode node = routingNodes.node(discoNode.id());
      if (node == null) {
        continue;
      }
      Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
      if (decision.type() == Decision.Type.THROTTLE) {
        throttledNodes.add(discoNode);
      } else if (decision.type() == Decision.Type.NO) {
        noNodes.add(discoNode);
      } else {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: allocating [{}] to [{}] on primary allocation",
              shard.index(),
              shard.id(),
              shard,
              discoNode);
        }
        // we found a match
        changed = true;
        // make sure we create one with the version from the recovered state
        allocation
            .routingNodes()
            .assign(new MutableShardRouting(shard, highestVersion), node.nodeId());
        unassignedIterator.remove();
        // found a node, so no throttling, no "no", and break out of the loop
        throttledNodes.clear();
        noNodes.clear();
        break;
      }
    }
    if (throttledNodes.isEmpty()) {
      // if we have a node that we "can't" allocate to, force allocation, since this is our master
      // data!
      if (!noNodes.isEmpty()) {
        DiscoveryNode discoNode = noNodes.iterator().next();
        // only nodes with a non-null routing node were added to noNodes above
        RoutingNode node = routingNodes.node(discoNode.id());
        if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: forcing allocating [{}] to [{}] on primary allocation",
              shard.index(),
              shard.id(),
              shard,
              discoNode);
        }
        // we found a match
        changed = true;
        // make sure we create one with the version from the recovered state
        allocation
            .routingNodes()
            .assign(new MutableShardRouting(shard, highestVersion), node.nodeId());
        unassignedIterator.remove();
      }
    } else {
      if (logger.isDebugEnabled()) {
        logger.debug(
            "[{}][{}]: throttling allocation [{}] to [{}] on primary allocation",
            shard.index(),
            shard.id(),
            shard,
            throttledNodes);
      }
      // we are throttling this, but we have enough to allocate to this node, ignore it for now
      unassignedIterator.remove();
      routingNodes.ignoredUnassigned().add(shard);
    }
  }
  if (!routingNodes.hasUnassigned()) {
    return changed;
  }
  // Now, handle replicas, try to assign them to nodes that are similar to the one the primary was
  // allocated on
  unassignedIterator = routingNodes.unassigned().iterator();
  while (unassignedIterator.hasNext()) {
    MutableShardRouting shard = unassignedIterator.next();
    // pre-check if it can be allocated to any node that currently exists, so we won't list the
    // store for it for nothing
    boolean canBeAllocatedToAtLeastOneNode = false;
    for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
      RoutingNode node = routingNodes.node(cursor.value.id());
      if (node == null) {
        continue;
      }
      // if we can't allocate it on a node, ignore it, for example, this handles
      // cases for only allocating a replica after a primary
      Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
      if (decision.type() == Decision.Type.YES) {
        canBeAllocatedToAtLeastOneNode = true;
        break;
      }
    }
    if (!canBeAllocatedToAtLeastOneNode) {
      continue;
    }
    Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> shardStores =
        buildShardStores(nodes, shard);
    // best candidate so far: the node whose store shares the most bytes with the primary's store
    long lastSizeMatched = 0;
    DiscoveryNode lastDiscoNodeMatched = null;
    RoutingNode lastNodeMatched = null;
    for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData>
        nodeStoreEntry : shardStores.entrySet()) {
      DiscoveryNode discoNode = nodeStoreEntry.getKey();
      TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData =
          nodeStoreEntry.getValue();
      logger.trace("{}: checking node [{}]", shard, discoNode);
      if (storeFilesMetaData == null) {
        // already allocated on that node...
        continue;
      }
      RoutingNode node = routingNodes.node(discoNode.id());
      if (node == null) {
        continue;
      }
      // check if we can allocate on that node...
      // we only check for NO, since if this node is THROTTLING and it has enough "same data"
      // then we will try and assign it next time
      Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
      if (decision.type() == Decision.Type.NO) {
        continue;
      }
      // if it is already allocated, we can't assign to it...
      if (storeFilesMetaData.allocated()) {
        continue;
      }
      if (!shard.primary()) {
        MutableShardRouting primaryShard = routingNodes.activePrimary(shard);
        if (primaryShard != null) {
          assert primaryShard.active();
          DiscoveryNode primaryNode = nodes.get(primaryShard.currentNodeId());
          if (primaryNode != null) {
            TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore =
                shardStores.get(primaryNode);
            if (primaryNodeStore != null && primaryNodeStore.allocated()) {
              // sum the lengths of the files this node shares with the primary's store
              long sizeMatched = 0;
              for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
                if (primaryNodeStore.fileExists(storeFileMetaData.name())
                    && primaryNodeStore
                        .file(storeFileMetaData.name())
                        .isSame(storeFileMetaData)) {
                  sizeMatched += storeFileMetaData.length();
                }
              }
              logger.trace(
                  "{}: node [{}] has [{}/{}] bytes of re-usable data",
                  shard,
                  discoNode.name(),
                  new ByteSizeValue(sizeMatched),
                  sizeMatched);
              // strict comparison: a node with zero reusable bytes is never selected
              if (sizeMatched > lastSizeMatched) {
                lastSizeMatched = sizeMatched;
                lastDiscoNodeMatched = discoNode;
                lastNodeMatched = node;
              }
            }
          }
        }
      }
    }
    if (lastNodeMatched != null) {
      // we only check on THROTTLE since NO was already checked above
      Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation);
      if (decision.type() == Decision.Type.THROTTLE) {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
              shard.index(),
              shard.id(),
              shard,
              lastDiscoNodeMatched,
              new ByteSizeValue(lastSizeMatched));
        }
        // we are throttling this, but we have enough to allocate to this node, ignore it for now
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
      } else {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
              shard.index(),
              shard.id(),
              shard,
              lastDiscoNodeMatched,
              new ByteSizeValue(lastSizeMatched));
        }
        // we found a match
        changed = true;
        allocation.routingNodes().assign(shard, lastNodeMatched.nodeId());
        unassignedIterator.remove();
      }
    }
  }
  return changed;
}
public boolean allocateUnassigned(RoutingAllocation allocation) { boolean changed = false; DiscoveryNodes nodes = allocation.nodes(); RoutingNodes routingNodes = allocation.routingNodes(); // First, handle primaries, they must find a place to be allocated on here final MetaData metaData = routingNodes.metaData(); RoutingNodes.UnassignedShards unassigned = routingNodes.unassigned(); unassigned.sort( new PriorityComparator() { @Override protected Settings getIndexSettings(String index) { IndexMetaData indexMetaData = metaData.index(index); return indexMetaData.getSettings(); } }); // sort for priority ordering Iterator<ShardRouting> unassignedIterator = unassigned.iterator(); while (unassignedIterator.hasNext()) { ShardRouting shard = unassignedIterator.next(); if (!shard.primary()) { continue; } // this is an API allocation, ignore since we know there is no data... if (!routingNodes .routingTable() .index(shard.index()) .shard(shard.id()) .primaryAllocatedPostApi()) { continue; } AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetch = asyncFetchStarted.get(shard.shardId()); if (fetch == null) { fetch = new InternalAsyncFetch<>(logger, "shard_started", shard.shardId(), startedAction); asyncFetchStarted.put(shard.shardId(), fetch); } AsyncShardFetch.FetchResult<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> shardState = fetch.fetchData(nodes, metaData, allocation.getIgnoreNodes(shard.shardId())); if (shardState.hasData() == false) { logger.trace("{}: ignoring allocation, still fetching shard started state", shard); unassignedIterator.remove(); routingNodes.ignoredUnassigned().add(shard); continue; } shardState.processAllocation(allocation); IndexMetaData indexMetaData = metaData.index(shard.getIndex()); /** * Build a map of DiscoveryNodes to shard state number for the given shard. 
A state of -1 * means the shard does not exist on the node, where any shard state >= 0 is the state version * of the shard on that node's disk. * * <p>A shard on shared storage will return at least shard state 0 for all nodes, indicating * that the shard can be allocated to any node. */ ObjectLongHashMap<DiscoveryNode> nodesState = new ObjectLongHashMap<>(); for (TransportNodesListGatewayStartedShards.NodeGatewayStartedShards nodeShardState : shardState.getData().values()) { long version = nodeShardState.version(); // -1 version means it does not exists, which is what the API returns, and what we expect to logger.trace( "[{}] on node [{}] has version [{}] of shard", shard, nodeShardState.getNode(), version); nodesState.put(nodeShardState.getNode(), version); } int numberOfAllocationsFound = 0; long highestVersion = -1; final Map<DiscoveryNode, Long> nodesWithVersion = Maps.newHashMap(); assert !nodesState.containsKey(null); final Object[] keys = nodesState.keys; final long[] values = nodesState.values; Settings idxSettings = indexMetaData.settings(); for (int i = 0; i < keys.length; i++) { if (keys[i] == null) { continue; } DiscoveryNode node = (DiscoveryNode) keys[i]; long version = values[i]; // since we don't check in NO allocation, we need to double check here if (allocation.shouldIgnoreShardForNode(shard.shardId(), node.id())) { continue; } if (recoverOnAnyNode(idxSettings)) { numberOfAllocationsFound++; if (version > highestVersion) { highestVersion = version; } // We always put the node without clearing the map nodesWithVersion.put(node, version); } else if (version != -1) { numberOfAllocationsFound++; // If we've found a new "best" candidate, clear the // current candidates and add it if (version > highestVersion) { highestVersion = version; nodesWithVersion.clear(); nodesWithVersion.put(node, version); } else if (version == highestVersion) { // If the candidate is the same, add it to the // list, but keep the current candidate nodesWithVersion.put(node, 
version); } } } // Now that we have a map of nodes to versions along with the // number of allocations found (and not ignored), we need to sort // it so the node with the highest version is at the beginning List<DiscoveryNode> nodesWithHighestVersion = Lists.newArrayList(); nodesWithHighestVersion.addAll(nodesWithVersion.keySet()); CollectionUtil.timSort( nodesWithHighestVersion, new Comparator<DiscoveryNode>() { @Override public int compare(DiscoveryNode o1, DiscoveryNode o2) { return Long.compare(nodesWithVersion.get(o2), nodesWithVersion.get(o1)); } }); if (logger.isDebugEnabled()) { logger.debug( "[{}][{}] found {} allocations of {}, highest version: [{}]", shard.index(), shard.id(), numberOfAllocationsFound, shard, highestVersion); } if (logger.isTraceEnabled()) { StringBuilder sb = new StringBuilder("["); for (DiscoveryNode n : nodesWithHighestVersion) { sb.append("["); sb.append(n.getName()); sb.append("]"); sb.append(" -> "); sb.append(nodesWithVersion.get(n)); sb.append(", "); } sb.append("]"); logger.trace("{} candidates for allocation: {}", shard, sb.toString()); } // check if the counts meets the minimum set int requiredAllocation = 1; // if we restore from a repository one copy is more then enough if (shard.restoreSource() == null) { try { String initialShards = indexMetaData .settings() .get( INDEX_RECOVERY_INITIAL_SHARDS, settings.get(INDEX_RECOVERY_INITIAL_SHARDS, this.initialShards)); if ("quorum".equals(initialShards)) { if (indexMetaData.numberOfReplicas() > 1) { requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2) + 1; } } else if ("quorum-1".equals(initialShards) || "half".equals(initialShards)) { if (indexMetaData.numberOfReplicas() > 2) { requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2); } } else if ("one".equals(initialShards)) { requiredAllocation = 1; } else if ("full".equals(initialShards) || "all".equals(initialShards)) { requiredAllocation = indexMetaData.numberOfReplicas() + 1; } else if 
("full-1".equals(initialShards) || "all-1".equals(initialShards)) { if (indexMetaData.numberOfReplicas() > 1) { requiredAllocation = indexMetaData.numberOfReplicas(); } } else { requiredAllocation = Integer.parseInt(initialShards); } } catch (Exception e) { logger.warn( "[{}][{}] failed to derived initial_shards from value {}, ignore allocation for {}", shard.index(), shard.id(), initialShards, shard); } } // not enough found for this shard, continue... if (numberOfAllocationsFound < requiredAllocation) { // if we are restoring this shard we still can allocate if (shard.restoreSource() == null) { // we can't really allocate, so ignore it and continue unassignedIterator.remove(); routingNodes.ignoredUnassigned().add(shard); if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: not allocating, number_of_allocated_shards_found [{}], required_number [{}]", shard.index(), shard.id(), numberOfAllocationsFound, requiredAllocation); } } else if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: missing local data, will restore from [{}]", shard.index(), shard.id(), shard.restoreSource()); } continue; } Set<DiscoveryNode> throttledNodes = Sets.newHashSet(); Set<DiscoveryNode> noNodes = Sets.newHashSet(); for (DiscoveryNode discoNode : nodesWithHighestVersion) { RoutingNode node = routingNodes.node(discoNode.id()); if (node == null) { continue; } Decision decision = allocation.deciders().canAllocate(shard, node, allocation); if (decision.type() == Decision.Type.THROTTLE) { throttledNodes.add(discoNode); } else if (decision.type() == Decision.Type.NO) { noNodes.add(discoNode); } else { if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: allocating [{}] to [{}] on primary allocation", shard.index(), shard.id(), shard, discoNode); } // we found a match changed = true; // make sure we create one with the version from the recovered state routingNodes.initialize(new ShardRouting(shard, highestVersion), node.nodeId()); unassignedIterator.remove(); // found a node, so no 
throttling, no "no", and break out of the loop throttledNodes.clear(); noNodes.clear(); break; } } if (throttledNodes.isEmpty()) { // if we have a node that we "can't" allocate to, force allocation, since this is our master // data! if (!noNodes.isEmpty()) { DiscoveryNode discoNode = noNodes.iterator().next(); RoutingNode node = routingNodes.node(discoNode.id()); if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: forcing allocating [{}] to [{}] on primary allocation", shard.index(), shard.id(), shard, discoNode); } // we found a match changed = true; // make sure we create one with the version from the recovered state routingNodes.initialize(new ShardRouting(shard, highestVersion), node.nodeId()); unassignedIterator.remove(); } } else { if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: throttling allocation [{}] to [{}] on primary allocation", shard.index(), shard.id(), shard, throttledNodes); } // we are throttling this, but we have enough to allocate to this node, ignore it for now unassignedIterator.remove(); routingNodes.ignoredUnassigned().add(shard); } } if (!routingNodes.hasUnassigned()) { return changed; } // Now, handle replicas, try to assign them to nodes that are similar to the one the primary was // allocated on unassignedIterator = unassigned.iterator(); while (unassignedIterator.hasNext()) { ShardRouting shard = unassignedIterator.next(); if (shard.primary()) { continue; } // pre-check if it can be allocated to any node that currently exists, so we won't list the // store for it for nothing boolean canBeAllocatedToAtLeastOneNode = false; for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) { RoutingNode node = routingNodes.node(cursor.value.id()); if (node == null) { continue; } // if we can't allocate it on a node, ignore it, for example, this handles // cases for only allocating a replica after a primary Decision decision = allocation.deciders().canAllocate(shard, node, allocation); if (decision.type() == 
Decision.Type.YES) { canBeAllocatedToAtLeastOneNode = true; break; } } if (!canBeAllocatedToAtLeastOneNode) { logger.trace("{}: ignoring allocation, can't be allocated on any node", shard); unassignedIterator.remove(); routingNodes.ignoredUnassigned().add(shard); continue; } AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> fetch = asyncFetchStore.get(shard.shardId()); if (fetch == null) { fetch = new InternalAsyncFetch<>(logger, "shard_store", shard.shardId(), storeAction); asyncFetchStore.put(shard.shardId(), fetch); } AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> shardStores = fetch.fetchData(nodes, metaData, allocation.getIgnoreNodes(shard.shardId())); if (shardStores.hasData() == false) { logger.trace("{}: ignoring allocation, still fetching shard stores", shard); unassignedIterator.remove(); routingNodes.ignoredUnassigned().add(shard); continue; // still fetching } shardStores.processAllocation(allocation); long lastSizeMatched = 0; DiscoveryNode lastDiscoNodeMatched = null; RoutingNode lastNodeMatched = null; boolean hasReplicaData = false; IndexMetaData indexMetaData = metaData.index(shard.getIndex()); for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> nodeStoreEntry : shardStores.getData().entrySet()) { DiscoveryNode discoNode = nodeStoreEntry.getKey(); TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData = nodeStoreEntry.getValue().storeFilesMetaData(); logger.trace("{}: checking node [{}]", shard, discoNode); if (storeFilesMetaData == null) { // already allocated on that node... continue; } RoutingNode node = routingNodes.node(discoNode.id()); if (node == null) { continue; } // check if we can allocate on that node... 
// we only check for NO, since if this node is THROTTLING and it has enough "same data" // then we will try and assign it next time Decision decision = allocation.deciders().canAllocate(shard, node, allocation); if (decision.type() == Decision.Type.NO) { continue; } // if it is already allocated, we can't assign to it... if (storeFilesMetaData.allocated()) { continue; } if (!shard.primary()) { hasReplicaData |= storeFilesMetaData.iterator().hasNext(); ShardRouting primaryShard = routingNodes.activePrimary(shard); if (primaryShard != null) { assert primaryShard.active(); DiscoveryNode primaryNode = nodes.get(primaryShard.currentNodeId()); if (primaryNode != null) { TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData primaryNodeFilesStore = shardStores.getData().get(primaryNode); if (primaryNodeFilesStore != null) { TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore = primaryNodeFilesStore.storeFilesMetaData(); if (primaryNodeStore != null && primaryNodeStore.allocated()) { long sizeMatched = 0; String primarySyncId = primaryNodeStore.syncId(); String replicaSyncId = storeFilesMetaData.syncId(); // see if we have a sync id we can make use of if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) { logger.trace( "{}: node [{}] has same sync id {} as primary", shard, discoNode.name(), replicaSyncId); lastNodeMatched = node; lastSizeMatched = Long.MAX_VALUE; lastDiscoNodeMatched = discoNode; } else { for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) { String metaDataFileName = storeFileMetaData.name(); if (primaryNodeStore.fileExists(metaDataFileName) && primaryNodeStore.file(metaDataFileName).isSame(storeFileMetaData)) { sizeMatched += storeFileMetaData.length(); } } logger.trace( "{}: node [{}] has [{}/{}] bytes of re-usable data", shard, discoNode.name(), new ByteSizeValue(sizeMatched), sizeMatched); if (sizeMatched > lastSizeMatched) { lastSizeMatched = sizeMatched; lastDiscoNodeMatched = discoNode; 
lastNodeMatched = node; } } } } } } } } if (lastNodeMatched != null) { // we only check on THROTTLE since we checked before before on NO Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation); if (decision.type() == Decision.Type.THROTTLE) { if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]", shard.index(), shard.id(), shard, lastDiscoNodeMatched, new ByteSizeValue(lastSizeMatched)); } // we are throttling this, but we have enough to allocate to this node, ignore it for now unassignedIterator.remove(); routingNodes.ignoredUnassigned().add(shard); } else { if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]", shard.index(), shard.id(), shard, lastDiscoNodeMatched, new ByteSizeValue(lastSizeMatched)); } // we found a match changed = true; routingNodes.initialize(shard, lastNodeMatched.nodeId()); unassignedIterator.remove(); } } else if (hasReplicaData == false) { // if we didn't manage to find *any* data (regardless of matching sizes), check if the // allocation // of the replica shard needs to be delayed, and if so, add it to the ignore unassigned list // note: we only care about replica in delayed allocation, since if we have an unassigned // primary it // will anyhow wait to find an existing copy of the shard to be allocated // note: the other side of the equation is scheduling a reroute in a timely manner, which // happens in the RoutingService long delay = shard .unassignedInfo() .getDelayAllocationExpirationIn(settings, indexMetaData.getSettings()); if (delay > 0) { logger.debug( "[{}][{}]: delaying allocation of [{}] for [{}]", shard.index(), shard.id(), shard, TimeValue.timeValueMillis(delay)); /** * mark it as changed, since we want to kick a publishing to schedule future allocation, * see {@link * 
org.elasticsearch.cluster.routing.RoutingService#clusterChanged(ClusterChangedEvent)}). */ changed = true; unassignedIterator.remove(); routingNodes.ignoredUnassigned().add(shard); } } } return changed; }
public boolean allocateUnassigned(RoutingAllocation allocation) { boolean changed = false; final RoutingNodes routingNodes = allocation.routingNodes(); final MetaData metaData = routingNodes.metaData(); final RoutingNodes.UnassignedShards.UnassignedIterator unassignedIterator = routingNodes.unassigned().iterator(); while (unassignedIterator.hasNext()) { ShardRouting shard = unassignedIterator.next(); if (shard.primary()) { continue; } // pre-check if it can be allocated to any node that currently exists, so we won't list the // store for it for nothing boolean canBeAllocatedToAtLeastOneNode = false; for (ObjectCursor<DiscoveryNode> cursor : allocation.nodes().dataNodes().values()) { RoutingNode node = routingNodes.node(cursor.value.id()); if (node == null) { continue; } // if we can't allocate it on a node, ignore it, for example, this handles // cases for only allocating a replica after a primary Decision decision = allocation.deciders().canAllocate(shard, node, allocation); if (decision.type() == Decision.Type.YES) { canBeAllocatedToAtLeastOneNode = true; break; } } if (!canBeAllocatedToAtLeastOneNode) { logger.trace("{}: ignoring allocation, can't be allocated on any node", shard); unassignedIterator.removeAndIgnore(); continue; } AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> shardStores = fetchData(shard, allocation); if (shardStores.hasData() == false) { logger.trace("{}: ignoring allocation, still fetching shard stores", shard); unassignedIterator.removeAndIgnore(); continue; // still fetching } long lastSizeMatched = 0; DiscoveryNode lastDiscoNodeMatched = null; RoutingNode lastNodeMatched = null; boolean hasReplicaData = false; IndexMetaData indexMetaData = metaData.index(shard.getIndex()); for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> nodeStoreEntry : shardStores.getData().entrySet()) { DiscoveryNode discoNode = nodeStoreEntry.getKey(); 
TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData = nodeStoreEntry.getValue().storeFilesMetaData(); logger.trace("{}: checking node [{}]", shard, discoNode); if (storeFilesMetaData == null) { // already allocated on that node... continue; } RoutingNode node = routingNodes.node(discoNode.id()); if (node == null) { continue; } // check if we can allocate on that node... // we only check for NO, since if this node is THROTTLING and it has enough "same data" // then we will try and assign it next time Decision decision = allocation.deciders().canAllocate(shard, node, allocation); if (decision.type() == Decision.Type.NO) { continue; } // if it is already allocated, we can't assign to it... if (storeFilesMetaData.allocated()) { continue; } if (!shard.primary()) { hasReplicaData |= storeFilesMetaData.iterator().hasNext(); ShardRouting primaryShard = routingNodes.activePrimary(shard); if (primaryShard != null) { assert primaryShard.active(); DiscoveryNode primaryNode = allocation.nodes().get(primaryShard.currentNodeId()); if (primaryNode != null) { TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData primaryNodeFilesStore = shardStores.getData().get(primaryNode); if (primaryNodeFilesStore != null) { TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore = primaryNodeFilesStore.storeFilesMetaData(); if (primaryNodeStore != null && primaryNodeStore.allocated()) { long sizeMatched = 0; String primarySyncId = primaryNodeStore.syncId(); String replicaSyncId = storeFilesMetaData.syncId(); // see if we have a sync id we can make use of if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) { logger.trace( "{}: node [{}] has same sync id {} as primary", shard, discoNode.name(), replicaSyncId); lastNodeMatched = node; lastSizeMatched = Long.MAX_VALUE; lastDiscoNodeMatched = discoNode; } else { for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) { String metaDataFileName = storeFileMetaData.name(); if 
(primaryNodeStore.fileExists(metaDataFileName) && primaryNodeStore.file(metaDataFileName).isSame(storeFileMetaData)) { sizeMatched += storeFileMetaData.length(); } } logger.trace( "{}: node [{}] has [{}/{}] bytes of re-usable data", shard, discoNode.name(), new ByteSizeValue(sizeMatched), sizeMatched); if (sizeMatched > lastSizeMatched) { lastSizeMatched = sizeMatched; lastDiscoNodeMatched = discoNode; lastNodeMatched = node; } } } } } } } } if (lastNodeMatched != null) { // we only check on THROTTLE since we checked before before on NO Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation); if (decision.type() == Decision.Type.THROTTLE) { if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]", shard.index(), shard.id(), shard, lastDiscoNodeMatched, new ByteSizeValue(lastSizeMatched)); } // we are throttling this, but we have enough to allocate to this node, ignore it for now unassignedIterator.removeAndIgnore(); } else { if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]", shard.index(), shard.id(), shard, lastDiscoNodeMatched, new ByteSizeValue(lastSizeMatched)); } // we found a match changed = true; unassignedIterator.initialize(lastNodeMatched.nodeId()); } } else if (hasReplicaData == false) { // if we didn't manage to find *any* data (regardless of matching sizes), check if the // allocation // of the replica shard needs to be delayed, and if so, add it to the ignore unassigned list // note: we only care about replica in delayed allocation, since if we have an unassigned // primary it // will anyhow wait to find an existing copy of the shard to be allocated // note: the other side of the equation is scheduling a reroute in a timely manner, which // happens in the RoutingService long delay = shard .unassignedInfo() 
.getDelayAllocationExpirationIn(settings, indexMetaData.getSettings()); if (delay > 0) { logger.debug( "[{}][{}]: delaying allocation of [{}] for [{}]", shard.index(), shard.id(), shard, TimeValue.timeValueMillis(delay)); /** * mark it as changed, since we want to kick a publishing to schedule future allocation, * see {@link * org.elasticsearch.cluster.routing.RoutingService#clusterChanged(ClusterChangedEvent)}). */ changed = true; unassignedIterator.removeAndIgnore(); } } } return changed; }
@Override public void execute(RoutingAllocation allocation) throws ElasticSearchException { DiscoveryNode discoNode = allocation.nodes().resolveNode(node); boolean found = false; for (RoutingNodes.RoutingNodeIterator it = allocation.routingNodes().routingNodeIter(discoNode.id()); it.hasNext(); ) { MutableShardRouting shardRouting = it.next(); if (!shardRouting.shardId().equals(shardId)) { continue; } found = true; if (shardRouting.relocatingNodeId() != null) { if (shardRouting.initializing()) { // the shard is initializing and recovering from another node, simply cancel the recovery it.remove(); // and cancel the relocating state from the shard its being relocated from RoutingNode relocatingFromNode = allocation.routingNodes().node(shardRouting.relocatingNodeId()); if (relocatingFromNode != null) { for (MutableShardRouting fromShardRouting : relocatingFromNode) { if (fromShardRouting.shardId().equals(shardRouting.shardId()) && fromShardRouting.state() == RELOCATING) { allocation.routingNodes().cancelRelocation(fromShardRouting); break; } } } } else if (shardRouting.relocating()) { // the shard is relocating to another node, cancel the recovery on the other node, and // deallocate this one if (!allowPrimary && shardRouting.primary()) { // can't cancel a primary shard being initialized throw new ElasticSearchIllegalArgumentException( "[cancel_allocation] can't cancel " + shardId + " on node " + discoNode + ", shard is primary and initializing its state"); } it.moveToUnassigned(); // now, go and find the shard that is initializing on the target node, and cancel it as // well... 
RoutingNodes.RoutingNodeIterator initializingNode = allocation.routingNodes().routingNodeIter(shardRouting.relocatingNodeId()); if (initializingNode != null) { while (initializingNode.hasNext()) { MutableShardRouting initializingShardRouting = initializingNode.next(); if (initializingShardRouting.shardId().equals(shardRouting.shardId()) && initializingShardRouting.state() == INITIALIZING) { initializingNode.remove(); } } } } } else { // the shard is not relocating, its either started, or initializing, just cancel it and move // on... if (!allowPrimary && shardRouting.primary()) { // can't cancel a primary shard being initialized throw new ElasticSearchIllegalArgumentException( "[cancel_allocation] can't cancel " + shardId + " on node " + discoNode + ", shard is primary and started"); } it.remove(); allocation .routingNodes() .unassigned() .add( new MutableShardRouting( shardRouting.index(), shardRouting.id(), null, shardRouting.primary(), ShardRoutingState.UNASSIGNED, shardRouting.version() + 1)); } } if (!found) { throw new ElasticSearchIllegalArgumentException( "[cancel_allocation] can't cancel " + shardId + ", failed to find it on node " + discoNode); } }
@Override public RerouteExplanation execute(RoutingAllocation allocation, boolean explain) { final DiscoveryNode discoNode = allocation.nodes().resolveNode(node); final RoutingNodes routingNodes = allocation.routingNodes(); ShardRouting shardRouting = null; for (ShardRouting routing : routingNodes.unassigned()) { if (routing.shardId().equals(shardId)) { // prefer primaries first to allocate if (shardRouting == null || routing.primary()) { shardRouting = routing; } } } if (shardRouting == null) { if (explain) { return new RerouteExplanation( this, allocation.decision( Decision.NO, "allocate_allocation_command", "failed to find " + shardId + " on the list of unassigned shards")); } throw new IllegalArgumentException( "[allocate] failed to find " + shardId + " on the list of unassigned shards"); } if (shardRouting.primary() && !allowPrimary) { if (explain) { return new RerouteExplanation( this, allocation.decision( Decision.NO, "allocate_allocation_command", "trying to allocate a primary shard " + shardId + ", which is disabled")); } throw new IllegalArgumentException( "[allocate] trying to allocate a primary shard " + shardId + ", which is disabled"); } RoutingNode routingNode = routingNodes.node(discoNode.id()); if (routingNode == null) { if (!discoNode.dataNode()) { if (explain) { return new RerouteExplanation( this, allocation.decision( Decision.NO, "allocate_allocation_command", "Allocation can only be done on data nodes, not [" + node + "]")); } throw new IllegalArgumentException( "Allocation can only be done on data nodes, not [" + node + "]"); } else { if (explain) { return new RerouteExplanation( this, allocation.decision( Decision.NO, "allocate_allocation_command", "Could not find [" + node + "] among the routing nodes")); } throw new IllegalStateException("Could not find [" + node + "] among the routing nodes"); } } Decision decision = allocation.deciders().canAllocate(shardRouting, routingNode, allocation); if (decision.type() == Decision.Type.NO) { if 
(explain) { return new RerouteExplanation(this, decision); } throw new IllegalArgumentException( "[allocate] allocation of " + shardId + " on node " + discoNode + " is not allowed, reason: " + decision); } // go over and remove it from the unassigned for (RoutingNodes.UnassignedShards.UnassignedIterator it = routingNodes.unassigned().iterator(); it.hasNext(); ) { if (it.next() != shardRouting) { continue; } it.initialize(routingNode.nodeId()); if (shardRouting.primary()) { // we need to clear the post allocation flag, since its an explicit allocation of the // primary shard // and we want to force allocate it (and create a new index for it) routingNodes.addClearPostAllocationFlag(shardRouting.shardId()); } break; } return new RerouteExplanation(this, decision); }
public boolean allocateUnassigned(RoutingAllocation allocation) { boolean changed = false; final RoutingNodes routingNodes = allocation.routingNodes(); final RoutingNodes.UnassignedShards.UnassignedIterator unassignedIterator = routingNodes.unassigned().iterator(); MetaData metaData = allocation.metaData(); while (unassignedIterator.hasNext()) { ShardRouting shard = unassignedIterator.next(); if (shard.primary()) { continue; } // if we are allocating a replica because of index creation, no need to go and find a copy, // there isn't one... IndexMetaData indexMetaData = metaData.index(shard.getIndexName()); if (shard.allocatedPostIndexCreate(indexMetaData) == false) { continue; } // pre-check if it can be allocated to any node that currently exists, so we won't list the // store for it for nothing if (canBeAllocatedToAtLeastOneNode(shard, allocation) == false) { logger.trace("{}: ignoring allocation, can't be allocated on any node", shard); unassignedIterator.removeAndIgnore(); continue; } AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> shardStores = fetchData(shard, allocation); if (shardStores.hasData() == false) { logger.trace("{}: ignoring allocation, still fetching shard stores", shard); allocation.setHasPendingAsyncFetch(); unassignedIterator.removeAndIgnore(); continue; // still fetching } ShardRouting primaryShard = routingNodes.activePrimary(shard); assert primaryShard != null : "the replica shard can be allocated on at least one node, so there must be an active primary"; TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryStore = findStore(primaryShard, allocation, shardStores); if (primaryStore == null || primaryStore.allocated() == false) { // if we can't find the primary data, it is probably because the primary shard is corrupted // (and listing failed) // we want to let the replica be allocated in order to expose the actual problem with the // primary that the replica // will try and recover from // 
Note, this is the existing behavior, as exposed in running // CorruptFileTest#testNoPrimaryData logger.trace( "{}: no primary shard store found or allocated, letting actual allocation figure it out", shard); continue; } MatchingNodes matchingNodes = findMatchingNodes(shard, allocation, primaryStore, shardStores); if (matchingNodes.getNodeWithHighestMatch() != null) { RoutingNode nodeWithHighestMatch = allocation.routingNodes().node(matchingNodes.getNodeWithHighestMatch().id()); // we only check on THROTTLE since we checked before before on NO Decision decision = allocation.deciders().canAllocate(shard, nodeWithHighestMatch, allocation); if (decision.type() == Decision.Type.THROTTLE) { logger.debug( "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store", shard.index(), shard.id(), shard, nodeWithHighestMatch.node()); // we are throttling this, but we have enough to allocate to this node, ignore it for now unassignedIterator.removeAndIgnore(); } else { logger.debug( "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store", shard.index(), shard.id(), shard, nodeWithHighestMatch.node()); // we found a match changed = true; unassignedIterator.initialize( nodeWithHighestMatch.nodeId(), null, allocation .clusterInfo() .getShardSize(shard, ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE)); } } else if (matchingNodes.hasAnyData() == false) { // if we didn't manage to find *any* data (regardless of matching sizes), check if the // allocation of the replica shard needs to be delayed changed |= ignoreUnassignedIfDelayed(unassignedIterator, shard); } } return changed; }
/**
 * Decides whether a shard may stay on {@code node} given the node's disk usage and the high
 * watermark thresholds.
 *
 * <p>Returns {@code YES} without further checks when the decider is disabled, when at most one
 * node is present, or when no cluster info / disk usage data is available. Otherwise the node's
 * usage (or, if unknown, the average over the known usages) is compared against {@code
 * freeBytesThresholdHigh} and {@code freeDiskThresholdHigh}; falling below either yields {@code
 * NO}.
 *
 * @param shardRouting the shard in question (not inspected by this check)
 * @param node the node the shard currently resides on
 * @param allocation the current allocation round, used for cluster info and to build decisions
 * @return {@code YES} if the shard can remain, {@code NO} if the node is below a high threshold
 */
public Decision canRemain(
    ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
  if (!enabled) {
    return allocation.decision(Decision.YES, "disk threshold decider disabled");
  }
  // Allow allocation regardless if only a single node is available
  if (allocation.nodes().size() <= 1) {
    return allocation.decision(Decision.YES, "only a single node is present");
  }

  ClusterInfo clusterInfo = allocation.clusterInfo();
  if (clusterInfo == null) {
    if (logger.isTraceEnabled()) {
      logger.trace("Cluster info unavailable for disk threshold decider, allowing allocation.");
    }
    return allocation.decision(Decision.YES, "cluster info unavailable");
  }

  Map<String, DiskUsage> usages = clusterInfo.getNodeDiskUsages();
  if (usages.isEmpty()) {
    if (logger.isTraceEnabled()) {
      logger.trace(
          "Unable to determine disk usages for disk-aware allocation, allowing allocation");
    }
    return allocation.decision(Decision.YES, "disk usages unavailable");
  }

  DiskUsage usage = usages.get(node.nodeId());
  if (usage == null) {
    // If there is no usage, and we have other nodes in the cluster,
    // use the average usage for all nodes as the usage for this node
    usage = averageUsage(node, usages);
    if (logger.isDebugEnabled()) {
      logger.debug(
          "Unable to determine disk usage for {}, defaulting to average across nodes [{} total] [{} free] [{}% free]",
          node.nodeId(),
          usage.getTotalBytes(),
          usage.getFreeBytes(),
          usage.getFreeDiskAsPercentage());
    }
  }

  // If this node is already above the high threshold, the shard cannot remain (get it off!)
  double freeDiskPercentage = usage.getFreeDiskAsPercentage();
  long freeBytes = usage.getFreeBytes();
  if (logger.isDebugEnabled()) {
    logger.debug(
        "Node [{}] has {}% free disk ({} bytes)", node.nodeId(), freeDiskPercentage, freeBytes);
  }
  if (freeBytes < freeBytesThresholdHigh.bytes()) {
    if (logger.isDebugEnabled()) {
      logger.debug(
          "Less than the required {} free bytes threshold ({} bytes free) on node {}, shard cannot remain",
          freeBytesThresholdHigh,
          freeBytes,
          node.nodeId());
    }
    return allocation.decision(
        Decision.NO,
        "after allocation less than required [%s] free on node, free: [%s]",
        freeBytesThresholdHigh,
        new ByteSizeValue(freeBytes));
  }
  if (freeDiskPercentage < freeDiskThresholdHigh) {
    if (logger.isDebugEnabled()) {
      logger.debug(
          "Less than the required {}% free disk threshold ({}% free) on node {}, shard cannot remain",
          freeDiskThresholdHigh,
          freeDiskPercentage,
          node.nodeId());
    }
    // %s instead of %d: freeDiskPercentage is a double, and %d only accepts integral arguments
    // (it would fail with IllegalFormatConversionException when the explanation is rendered).
    return allocation.decision(
        Decision.NO,
        "after allocation less than required [%s%%] free disk on node, free: [%s%%]",
        freeDiskThresholdHigh,
        freeDiskPercentage);
  }

  return allocation.decision(
      Decision.YES,
      "enough disk for shard to remain on node, free: [%s]",
      new ByteSizeValue(freeBytes));
}
public Decision canAllocate( ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { if (!enabled) { return allocation.decision(Decision.YES, "disk threshold decider disabled"); } // Allow allocation regardless if only a single node is available if (allocation.nodes().size() <= 1) { return allocation.decision(Decision.YES, "only a single node is present"); } ClusterInfo clusterInfo = allocation.clusterInfo(); if (clusterInfo == null) { if (logger.isTraceEnabled()) { logger.trace("Cluster info unavailable for disk threshold decider, allowing allocation."); } return allocation.decision(Decision.YES, "cluster info unavailable"); } Map<String, DiskUsage> usages = clusterInfo.getNodeDiskUsages(); Map<String, Long> shardSizes = clusterInfo.getShardSizes(); if (usages.isEmpty()) { if (logger.isTraceEnabled()) { logger.trace( "Unable to determine disk usages for disk-aware allocation, allowing allocation"); } return allocation.decision(Decision.YES, "disk usages unavailable"); } DiskUsage usage = usages.get(node.nodeId()); if (usage == null) { // If there is no usage, and we have other nodes in the cluster, // use the average usage for all nodes as the usage for this node usage = averageUsage(node, usages); if (logger.isDebugEnabled()) { logger.debug( "Unable to determine disk usage for [{}], defaulting to average across nodes [{} total] [{} free] [{}% free]", node.nodeId(), usage.getTotalBytes(), usage.getFreeBytes(), usage.getFreeDiskAsPercentage()); } } // First, check that the node currently over the low watermark double freeDiskPercentage = usage.getFreeDiskAsPercentage(); long freeBytes = usage.getFreeBytes(); if (logger.isDebugEnabled()) { logger.debug("Node [{}] has {}% free disk", node.nodeId(), freeDiskPercentage); } if (freeBytes < freeBytesThresholdLow.bytes()) { if (logger.isDebugEnabled()) { logger.debug( "Less than the required {} free bytes threshold ({} bytes free) on node {}, preventing allocation", freeBytesThresholdLow, freeBytes, 
node.nodeId()); } return allocation.decision( Decision.NO, "less than required [%s] free on node, free: [%s]", freeBytesThresholdLow, new ByteSizeValue(freeBytes)); } if (freeDiskPercentage < freeDiskThresholdLow) { if (logger.isDebugEnabled()) { logger.debug( "Less than the required {}% free disk threshold ({}% free) on node [{}], preventing allocation", freeDiskThresholdLow, freeDiskPercentage, node.nodeId()); } return allocation.decision( Decision.NO, "less than required [%d%%] free disk on node, free: [%d%%]", freeDiskThresholdLow, freeDiskThresholdLow); } // Secondly, check that allocating the shard to this node doesn't put it above the high // watermark Long shardSize = shardSizes.get(shardIdentifierFromRouting(shardRouting)); shardSize = shardSize == null ? 0 : shardSize; double freeSpaceAfterShard = this.freeDiskPercentageAfterShardAssigned(usage, shardSize); long freeBytesAfterShard = freeBytes - shardSize; if (freeBytesAfterShard < freeBytesThresholdHigh.bytes()) { logger.warn( "After allocating, node [{}] would have less than the required {} free bytes threshold ({} bytes free), preventing allocation", node.nodeId(), freeBytesThresholdHigh, freeBytesAfterShard); return allocation.decision( Decision.NO, "after allocation less than required [%s] free on node, free: [%s]", freeBytesThresholdLow, new ByteSizeValue(freeBytesAfterShard)); } if (freeSpaceAfterShard < freeDiskThresholdHigh) { logger.warn( "After allocating, node [{}] would have less than the required {}% free disk threshold ({}% free), preventing allocation", node.nodeId(), freeDiskThresholdHigh, freeSpaceAfterShard); return allocation.decision( Decision.NO, "after allocation less than required [%d%%] free disk on node, free: [%d%%]", freeDiskThresholdLow, freeSpaceAfterShard); } return allocation.decision( Decision.YES, "enough disk for shard on node, free: [%s]", new ByteSizeValue(freeBytes)); }
/** * Process existing recoveries of replicas and see if we need to cancel them if we find a better * match. Today, a better match is one that has full sync id match compared to not having one in * the previous recovery. */ public boolean processExistingRecoveries(RoutingAllocation allocation) { boolean changed = false; MetaData metaData = allocation.metaData(); for (RoutingNodes.RoutingNodesIterator nodes = allocation.routingNodes().nodes(); nodes.hasNext(); ) { nodes.next(); for (RoutingNodes.RoutingNodeIterator it = nodes.nodeShards(); it.hasNext(); ) { ShardRouting shard = it.next(); if (shard.primary() == true) { continue; } if (shard.initializing() == false) { continue; } if (shard.relocatingNodeId() != null) { continue; } // if we are allocating a replica because of index creation, no need to go and find a copy, // there isn't one... IndexMetaData indexMetaData = metaData.index(shard.getIndexName()); if (shard.allocatedPostIndexCreate(indexMetaData) == false) { continue; } AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> shardStores = fetchData(shard, allocation); if (shardStores.hasData() == false) { logger.trace("{}: fetching new stores for initializing shard", shard); continue; // still fetching } ShardRouting primaryShard = allocation.routingNodes().activePrimary(shard); assert primaryShard != null : "the replica shard can be allocated on at least one node, so there must be an active primary"; TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryStore = findStore(primaryShard, allocation, shardStores); if (primaryStore == null || primaryStore.allocated() == false) { // if we can't find the primary data, it is probably because the primary shard is // corrupted (and listing failed) // just let the recovery find it out, no need to do anything about it for the initializing // shard logger.trace( "{}: no primary shard store found or allocated, letting actual allocation figure it out", shard); continue; } 
MatchingNodes matchingNodes = findMatchingNodes(shard, allocation, primaryStore, shardStores); if (matchingNodes.getNodeWithHighestMatch() != null) { DiscoveryNode currentNode = allocation.nodes().get(shard.currentNodeId()); DiscoveryNode nodeWithHighestMatch = matchingNodes.getNodeWithHighestMatch(); if (currentNode.equals(nodeWithHighestMatch) == false && matchingNodes.isNodeMatchBySyncID(currentNode) == false && matchingNodes.isNodeMatchBySyncID(nodeWithHighestMatch) == true) { // we found a better match that has a full sync id match, the existing allocation is not // fully synced // so we found a better one, cancel this one it.moveToUnassigned( new UnassignedInfo( UnassignedInfo.Reason.REALLOCATED_REPLICA, "existing allocation of replica to [" + currentNode + "] cancelled, sync id match found on node [" + nodeWithHighestMatch + "]", null, allocation.getCurrentNanoTime(), System.currentTimeMillis())); changed = true; } } } } return changed; }
private MatchingNodes findMatchingNodes( ShardRouting shard, RoutingAllocation allocation, TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryStore, AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> data) { ObjectLongMap<DiscoveryNode> nodesToSize = new ObjectLongHashMap<>(); for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> nodeStoreEntry : data.getData().entrySet()) { DiscoveryNode discoNode = nodeStoreEntry.getKey(); TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData = nodeStoreEntry.getValue().storeFilesMetaData(); if (storeFilesMetaData == null) { // already allocated on that node... continue; } RoutingNode node = allocation.routingNodes().node(discoNode.id()); if (node == null) { continue; } // check if we can allocate on that node... // we only check for NO, since if this node is THROTTLING and it has enough "same data" // then we will try and assign it next time Decision decision = allocation.deciders().canAllocate(shard, node, allocation); if (decision.type() == Decision.Type.NO) { continue; } // if it is already allocated, we can't assign to it... 
(and it might be primary as well) if (storeFilesMetaData.allocated()) { continue; } // we don't have any files at all, it is an empty index if (storeFilesMetaData.iterator().hasNext() == false) { continue; } String primarySyncId = primaryStore.syncId(); String replicaSyncId = storeFilesMetaData.syncId(); // see if we have a sync id we can make use of if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) { logger.trace( "{}: node [{}] has same sync id {} as primary", shard, discoNode.name(), replicaSyncId); nodesToSize.put(discoNode, Long.MAX_VALUE); } else { long sizeMatched = 0; for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) { String metaDataFileName = storeFileMetaData.name(); if (primaryStore.fileExists(metaDataFileName) && primaryStore.file(metaDataFileName).isSame(storeFileMetaData)) { sizeMatched += storeFileMetaData.length(); } } logger.trace( "{}: node [{}] has [{}/{}] bytes of re-usable data", shard, discoNode.name(), new ByteSizeValue(sizeMatched), sizeMatched); nodesToSize.put(discoNode, sizeMatched); } } return new MatchingNodes(nodesToSize); }