private Decision earlyTerminate(RoutingAllocation allocation) { // Always allow allocation if the decider is disabled if (!enabled) { return allocation.decision(Decision.YES, NAME, "disk threshold decider disabled"); } // Allow allocation regardless if only a single node is available if (allocation.nodes().size() <= 1) { if (logger.isTraceEnabled()) { logger.trace("only a single node is present, allowing allocation"); } return allocation.decision(Decision.YES, NAME, "only a single node is present"); } // Fail open there is no info available final ClusterInfo clusterInfo = allocation.clusterInfo(); if (clusterInfo == null) { if (logger.isTraceEnabled()) { logger.trace("cluster info unavailable for disk threshold decider, allowing allocation."); } return allocation.decision(Decision.YES, NAME, "cluster info unavailable"); } final Map<String, DiskUsage> usages = clusterInfo.getNodeLeastAvailableDiskUsages(); // Fail open if there are no disk usages available if (usages.isEmpty()) { if (logger.isTraceEnabled()) { logger.trace( "unable to determine disk usages for disk-aware allocation, allowing allocation"); } return allocation.decision(Decision.YES, NAME, "disk usages unavailable"); } return null; }
@Override public Decision canRemain( ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { if (shardRouting.currentNodeId().equals(node.nodeId()) == false) { throw new IllegalArgumentException( "Shard [" + shardRouting + "] is not allocated on node: [" + node.nodeId() + "]"); } final Decision decision = earlyTerminate(allocation); if (decision != null) { return decision; } ClusterInfo clusterInfo = allocation.clusterInfo(); Map<String, DiskUsage> usages = clusterInfo.getNodeLeastAvailableDiskUsages(); DiskUsage usage = getDiskUsage(node, allocation, usages); // If this node is already above the high threshold, the shard cannot remain (get it off!) double freeDiskPercentage = usage.getFreeDiskAsPercentage(); long freeBytes = usage.getFreeBytes(); if (logger.isDebugEnabled()) { logger.debug( "node [{}] has {}% free disk ({} bytes)", node.nodeId(), freeDiskPercentage, freeBytes); } if (freeBytes < freeBytesThresholdHigh.bytes()) { if (logger.isDebugEnabled()) { logger.debug( "less than the required {} free bytes threshold ({} bytes free) on node {}, shard cannot remain", freeBytesThresholdHigh, freeBytes, node.nodeId()); } return allocation.decision( Decision.NO, NAME, "after allocation less than required [%s] free on node, free: [%s]", freeBytesThresholdHigh, new ByteSizeValue(freeBytes)); } if (freeDiskPercentage < freeDiskThresholdHigh) { if (logger.isDebugEnabled()) { logger.debug( "less than the required {}% free disk threshold ({}% free) on node {}, shard cannot remain", freeDiskThresholdHigh, freeDiskPercentage, node.nodeId()); } return allocation.decision( Decision.NO, NAME, "after allocation less than required [%s%%] free disk on node, free: [%s%%]", freeDiskThresholdHigh, freeDiskPercentage); } return allocation.decision( Decision.YES, NAME, "enough disk for shard to remain on node, free: [%s]", new ByteSizeValue(freeBytes)); }
private DiskUsage getDiskUsage( RoutingNode node, RoutingAllocation allocation, Map<String, DiskUsage> usages) { ClusterInfo clusterInfo = allocation.clusterInfo(); DiskUsage usage = usages.get(node.nodeId()); if (usage == null) { // If there is no usage, and we have other nodes in the cluster, // use the average usage for all nodes as the usage for this node usage = averageUsage(node, usages); if (logger.isDebugEnabled()) { logger.debug( "unable to determine disk usage for {}, defaulting to average across nodes [{} total] [{} free] [{}% free]", node.nodeId(), usage.getTotalBytes(), usage.getFreeBytes(), usage.getFreeDiskAsPercentage()); } } if (includeRelocations) { long relocatingShardsSize = sizeOfRelocatingShards(node, clusterInfo, true); DiskUsage usageIncludingRelocations = new DiskUsage( node.nodeId(), node.node().name(), "_na_", usage.getTotalBytes(), usage.getFreeBytes() - relocatingShardsSize); if (logger.isTraceEnabled()) { logger.trace("usage without relocations: {}", usage); logger.trace( "usage with relocations: [{} bytes] {}", relocatingShardsSize, usageIncludingRelocations); } usage = usageIncludingRelocations; } return usage; }
@Override public Decision canAllocate( ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { final Decision decision = earlyTerminate(allocation); if (decision != null) { return decision; } final double usedDiskThresholdLow = 100.0 - DiskThresholdDecider.this.freeDiskThresholdLow; final double usedDiskThresholdHigh = 100.0 - DiskThresholdDecider.this.freeDiskThresholdHigh; ClusterInfo clusterInfo = allocation.clusterInfo(); Map<String, DiskUsage> usages = clusterInfo.getNodeMostAvailableDiskUsages(); DiskUsage usage = getDiskUsage(node, allocation, usages); // First, check that the node currently over the low watermark double freeDiskPercentage = usage.getFreeDiskAsPercentage(); // Cache the used disk percentage for displaying disk percentages consistent with documentation double usedDiskPercentage = usage.getUsedDiskAsPercentage(); long freeBytes = usage.getFreeBytes(); if (logger.isTraceEnabled()) { logger.trace("node [{}] has {}% used disk", node.nodeId(), usedDiskPercentage); } // a flag for whether the primary shard has been previously allocated boolean primaryHasBeenAllocated = shardRouting.primary() && shardRouting.allocatedPostIndexCreate(); // checks for exact byte comparisons if (freeBytes < freeBytesThresholdLow.bytes()) { // If the shard is a replica or has a primary that has already been allocated before, check // the low threshold if (!shardRouting.primary() || (shardRouting.primary() && primaryHasBeenAllocated)) { if (logger.isDebugEnabled()) { logger.debug( "less than the required {} free bytes threshold ({} bytes free) on node {}, preventing allocation", freeBytesThresholdLow, freeBytes, node.nodeId()); } return allocation.decision( Decision.NO, NAME, "less than required [%s] free on node, free: [%s]", freeBytesThresholdLow, new ByteSizeValue(freeBytes)); } else if (freeBytes > freeBytesThresholdHigh.bytes()) { // Allow the shard to be allocated because it is primary that // has never been allocated if it's under the high 
watermark if (logger.isDebugEnabled()) { logger.debug( "less than the required {} free bytes threshold ({} bytes free) on node {}, " + "but allowing allocation because primary has never been allocated", freeBytesThresholdLow, freeBytes, node.nodeId()); } return allocation.decision(Decision.YES, NAME, "primary has never been allocated before"); } else { // Even though the primary has never been allocated, the node is // above the high watermark, so don't allow allocating the shard if (logger.isDebugEnabled()) { logger.debug( "less than the required {} free bytes threshold ({} bytes free) on node {}, " + "preventing allocation even though primary has never been allocated", freeBytesThresholdHigh, freeBytes, node.nodeId()); } return allocation.decision( Decision.NO, NAME, "less than required [%s] free on node, free: [%s]", freeBytesThresholdHigh, new ByteSizeValue(freeBytes)); } } // checks for percentage comparisons if (freeDiskPercentage < freeDiskThresholdLow) { // If the shard is a replica or has a primary that has already been allocated before, check // the low threshold if (!shardRouting.primary() || (shardRouting.primary() && primaryHasBeenAllocated)) { if (logger.isDebugEnabled()) { logger.debug( "more than the allowed {} used disk threshold ({} used) on node [{}], preventing allocation", Strings.format1Decimals(usedDiskThresholdLow, "%"), Strings.format1Decimals(usedDiskPercentage, "%"), node.nodeId()); } return allocation.decision( Decision.NO, NAME, "more than allowed [%s%%] used disk on node, free: [%s%%]", usedDiskThresholdLow, freeDiskPercentage); } else if (freeDiskPercentage > freeDiskThresholdHigh) { // Allow the shard to be allocated because it is primary that // has never been allocated if it's under the high watermark if (logger.isDebugEnabled()) { logger.debug( "more than the allowed {} used disk threshold ({} used) on node [{}], " + "but allowing allocation because primary has never been allocated", Strings.format1Decimals(usedDiskThresholdLow, 
"%"), Strings.format1Decimals(usedDiskPercentage, "%"), node.nodeId()); } return allocation.decision(Decision.YES, NAME, "primary has never been allocated before"); } else { // Even though the primary has never been allocated, the node is // above the high watermark, so don't allow allocating the shard if (logger.isDebugEnabled()) { logger.debug( "less than the required {} free bytes threshold ({} bytes free) on node {}, " + "preventing allocation even though primary has never been allocated", Strings.format1Decimals(freeDiskThresholdHigh, "%"), Strings.format1Decimals(freeDiskPercentage, "%"), node.nodeId()); } return allocation.decision( Decision.NO, NAME, "more than allowed [%s%%] used disk on node, free: [%s%%]", usedDiskThresholdHigh, freeDiskPercentage); } } // Secondly, check that allocating the shard to this node doesn't put it above the high // watermark final long shardSize = getShardSize(shardRouting, allocation.clusterInfo()); double freeSpaceAfterShard = freeDiskPercentageAfterShardAssigned(usage, shardSize); long freeBytesAfterShard = freeBytes - shardSize; if (freeBytesAfterShard < freeBytesThresholdHigh.bytes()) { logger.warn( "after allocating, node [{}] would have less than the required {} free bytes threshold ({} bytes free), preventing allocation", node.nodeId(), freeBytesThresholdHigh, freeBytesAfterShard); return allocation.decision( Decision.NO, NAME, "after allocation less than required [%s] free on node, free: [%s]", freeBytesThresholdLow, new ByteSizeValue(freeBytesAfterShard)); } if (freeSpaceAfterShard < freeDiskThresholdHigh) { logger.warn( "after allocating, node [{}] would have more than the allowed {} free disk threshold ({} free), preventing allocation", node.nodeId(), Strings.format1Decimals(freeDiskThresholdHigh, "%"), Strings.format1Decimals(freeSpaceAfterShard, "%")); return allocation.decision( Decision.NO, NAME, "after allocation more than allowed [%s%%] used disk on node, free: [%s%%]", usedDiskThresholdLow, 
freeSpaceAfterShard); } return allocation.decision( Decision.YES, NAME, "enough disk for shard on node, free: [%s]", new ByteSizeValue(freeBytes)); }
/**
 * Decides whether a shard may stay on {@code node}, judged against the high disk watermark (both
 * the absolute free-bytes form and the free-percentage form).
 *
 * <p>The decision is YES without further checks when the decider is disabled, when at most one
 * node is present, or when no cluster info / disk usages are available. If the node has no usage
 * entry, the result of {@code averageUsage} is used in its place.
 *
 * @param shardRouting the shard under evaluation
 * @param node the node holding the shard
 * @param allocation the current allocation context
 * @return NO when the node's free space is below the high threshold, otherwise YES
 */
public Decision canRemain(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
    if (!enabled) {
        return allocation.decision(Decision.YES, "disk threshold decider disabled");
    }
    // Allow allocation regardless if only a single node is available
    if (allocation.nodes().size() <= 1) {
        return allocation.decision(Decision.YES, "only a single node is present");
    }

    // Fail open if there is no cluster info available
    ClusterInfo clusterInfo = allocation.clusterInfo();
    if (clusterInfo == null) {
        if (logger.isTraceEnabled()) {
            logger.trace("Cluster info unavailable for disk threshold decider, allowing allocation.");
        }
        return allocation.decision(Decision.YES, "cluster info unavailable");
    }

    // Fail open if there are no disk usages available
    Map<String, DiskUsage> usages = clusterInfo.getNodeDiskUsages();
    if (usages.isEmpty()) {
        if (logger.isTraceEnabled()) {
            logger.trace("Unable to determine disk usages for disk-aware allocation, allowing allocation");
        }
        return allocation.decision(Decision.YES, "disk usages unavailable");
    }

    DiskUsage usage = usages.get(node.nodeId());
    if (usage == null) {
        // If there is no usage, and we have other nodes in the cluster,
        // use the average usage for all nodes as the usage for this node
        usage = averageUsage(node, usages);
        if (logger.isDebugEnabled()) {
            logger.debug(
                "Unable to determine disk usage for {}, defaulting to average across nodes [{} total] [{} free] [{}% free]",
                node.nodeId(),
                usage.getTotalBytes(),
                usage.getFreeBytes(),
                usage.getFreeDiskAsPercentage());
        }
    }

    // If this node is already above the high threshold, the shard cannot remain (get it off!)
    double freeDiskPercentage = usage.getFreeDiskAsPercentage();
    long freeBytes = usage.getFreeBytes();
    if (logger.isDebugEnabled()) {
        logger.debug("Node [{}] has {}% free disk ({} bytes)", node.nodeId(), freeDiskPercentage, freeBytes);
    }
    if (freeBytes < freeBytesThresholdHigh.bytes()) {
        if (logger.isDebugEnabled()) {
            logger.debug(
                "Less than the required {} free bytes threshold ({} bytes free) on node {}, shard cannot remain",
                freeBytesThresholdHigh,
                freeBytes,
                node.nodeId());
        }
        return allocation.decision(
            Decision.NO,
            "after allocation less than required [%s] free on node, free: [%s]",
            freeBytesThresholdHigh,
            new ByteSizeValue(freeBytes));
    }
    if (freeDiskPercentage < freeDiskThresholdHigh) {
        if (logger.isDebugEnabled()) {
            logger.debug(
                "Less than the required {}% free disk threshold ({}% free) on node {}, shard cannot remain",
                freeDiskThresholdHigh,
                freeDiskPercentage,
                node.nodeId());
        }
        // %s, not %d: freeDiskPercentage is a double, and java.util.Formatter rejects %d for
        // floating-point arguments (assuming the explanation is rendered with Formatter syntax,
        // as its %s / %% placeholders suggest).
        return allocation.decision(
            Decision.NO,
            "after allocation less than required [%s%%] free disk on node, free: [%s%%]",
            freeDiskThresholdHigh,
            freeDiskPercentage);
    }

    return allocation.decision(
        Decision.YES,
        "enough disk for shard to remain on node, free: [%s]",
        new ByteSizeValue(freeBytes));
}
public Decision canAllocate( ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { if (!enabled) { return allocation.decision(Decision.YES, "disk threshold decider disabled"); } // Allow allocation regardless if only a single node is available if (allocation.nodes().size() <= 1) { return allocation.decision(Decision.YES, "only a single node is present"); } ClusterInfo clusterInfo = allocation.clusterInfo(); if (clusterInfo == null) { if (logger.isTraceEnabled()) { logger.trace("Cluster info unavailable for disk threshold decider, allowing allocation."); } return allocation.decision(Decision.YES, "cluster info unavailable"); } Map<String, DiskUsage> usages = clusterInfo.getNodeDiskUsages(); Map<String, Long> shardSizes = clusterInfo.getShardSizes(); if (usages.isEmpty()) { if (logger.isTraceEnabled()) { logger.trace( "Unable to determine disk usages for disk-aware allocation, allowing allocation"); } return allocation.decision(Decision.YES, "disk usages unavailable"); } DiskUsage usage = usages.get(node.nodeId()); if (usage == null) { // If there is no usage, and we have other nodes in the cluster, // use the average usage for all nodes as the usage for this node usage = averageUsage(node, usages); if (logger.isDebugEnabled()) { logger.debug( "Unable to determine disk usage for [{}], defaulting to average across nodes [{} total] [{} free] [{}% free]", node.nodeId(), usage.getTotalBytes(), usage.getFreeBytes(), usage.getFreeDiskAsPercentage()); } } // First, check that the node currently over the low watermark double freeDiskPercentage = usage.getFreeDiskAsPercentage(); long freeBytes = usage.getFreeBytes(); if (logger.isDebugEnabled()) { logger.debug("Node [{}] has {}% free disk", node.nodeId(), freeDiskPercentage); } if (freeBytes < freeBytesThresholdLow.bytes()) { if (logger.isDebugEnabled()) { logger.debug( "Less than the required {} free bytes threshold ({} bytes free) on node {}, preventing allocation", freeBytesThresholdLow, freeBytes, 
node.nodeId()); } return allocation.decision( Decision.NO, "less than required [%s] free on node, free: [%s]", freeBytesThresholdLow, new ByteSizeValue(freeBytes)); } if (freeDiskPercentage < freeDiskThresholdLow) { if (logger.isDebugEnabled()) { logger.debug( "Less than the required {}% free disk threshold ({}% free) on node [{}], preventing allocation", freeDiskThresholdLow, freeDiskPercentage, node.nodeId()); } return allocation.decision( Decision.NO, "less than required [%d%%] free disk on node, free: [%d%%]", freeDiskThresholdLow, freeDiskThresholdLow); } // Secondly, check that allocating the shard to this node doesn't put it above the high // watermark Long shardSize = shardSizes.get(shardIdentifierFromRouting(shardRouting)); shardSize = shardSize == null ? 0 : shardSize; double freeSpaceAfterShard = this.freeDiskPercentageAfterShardAssigned(usage, shardSize); long freeBytesAfterShard = freeBytes - shardSize; if (freeBytesAfterShard < freeBytesThresholdHigh.bytes()) { logger.warn( "After allocating, node [{}] would have less than the required {} free bytes threshold ({} bytes free), preventing allocation", node.nodeId(), freeBytesThresholdHigh, freeBytesAfterShard); return allocation.decision( Decision.NO, "after allocation less than required [%s] free on node, free: [%s]", freeBytesThresholdLow, new ByteSizeValue(freeBytesAfterShard)); } if (freeSpaceAfterShard < freeDiskThresholdHigh) { logger.warn( "After allocating, node [{}] would have less than the required {}% free disk threshold ({}% free), preventing allocation", node.nodeId(), freeDiskThresholdHigh, freeSpaceAfterShard); return allocation.decision( Decision.NO, "after allocation less than required [%d%%] free disk on node, free: [%d%%]", freeDiskThresholdLow, freeSpaceAfterShard); } return allocation.decision( Decision.YES, "enough disk for shard on node, free: [%s]", new ByteSizeValue(freeBytes)); }
public boolean allocateUnassigned(RoutingAllocation allocation) { boolean changed = false; final RoutingNodes routingNodes = allocation.routingNodes(); final RoutingNodes.UnassignedShards.UnassignedIterator unassignedIterator = routingNodes.unassigned().iterator(); MetaData metaData = allocation.metaData(); while (unassignedIterator.hasNext()) { ShardRouting shard = unassignedIterator.next(); if (shard.primary()) { continue; } // if we are allocating a replica because of index creation, no need to go and find a copy, // there isn't one... IndexMetaData indexMetaData = metaData.index(shard.getIndexName()); if (shard.allocatedPostIndexCreate(indexMetaData) == false) { continue; } // pre-check if it can be allocated to any node that currently exists, so we won't list the // store for it for nothing if (canBeAllocatedToAtLeastOneNode(shard, allocation) == false) { logger.trace("{}: ignoring allocation, can't be allocated on any node", shard); unassignedIterator.removeAndIgnore(); continue; } AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> shardStores = fetchData(shard, allocation); if (shardStores.hasData() == false) { logger.trace("{}: ignoring allocation, still fetching shard stores", shard); allocation.setHasPendingAsyncFetch(); unassignedIterator.removeAndIgnore(); continue; // still fetching } ShardRouting primaryShard = routingNodes.activePrimary(shard); assert primaryShard != null : "the replica shard can be allocated on at least one node, so there must be an active primary"; TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryStore = findStore(primaryShard, allocation, shardStores); if (primaryStore == null || primaryStore.allocated() == false) { // if we can't find the primary data, it is probably because the primary shard is corrupted // (and listing failed) // we want to let the replica be allocated in order to expose the actual problem with the // primary that the replica // will try and recover from // 
Note, this is the existing behavior, as exposed in running // CorruptFileTest#testNoPrimaryData logger.trace( "{}: no primary shard store found or allocated, letting actual allocation figure it out", shard); continue; } MatchingNodes matchingNodes = findMatchingNodes(shard, allocation, primaryStore, shardStores); if (matchingNodes.getNodeWithHighestMatch() != null) { RoutingNode nodeWithHighestMatch = allocation.routingNodes().node(matchingNodes.getNodeWithHighestMatch().id()); // we only check on THROTTLE since we checked before before on NO Decision decision = allocation.deciders().canAllocate(shard, nodeWithHighestMatch, allocation); if (decision.type() == Decision.Type.THROTTLE) { logger.debug( "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store", shard.index(), shard.id(), shard, nodeWithHighestMatch.node()); // we are throttling this, but we have enough to allocate to this node, ignore it for now unassignedIterator.removeAndIgnore(); } else { logger.debug( "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store", shard.index(), shard.id(), shard, nodeWithHighestMatch.node()); // we found a match changed = true; unassignedIterator.initialize( nodeWithHighestMatch.nodeId(), null, allocation .clusterInfo() .getShardSize(shard, ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE)); } } else if (matchingNodes.hasAnyData() == false) { // if we didn't manage to find *any* data (regardless of matching sizes), check if the // allocation of the replica shard needs to be delayed changed |= ignoreUnassignedIfDelayed(unassignedIterator, shard); } } return changed; }