/** Warn about the given disk usage if the low or high watermark has been passed */
private void warnAboutDiskIfNeeded(DiskUsage usage) {
  // Check absolute disk values
  if (usage.getFreeBytes() < DiskThresholdDecider.this.freeBytesThresholdHigh.bytes()) {
    logger.warn(
        "high disk watermark [{}] exceeded on {}, shards will be relocated away from this node",
        DiskThresholdDecider.this.freeBytesThresholdHigh,
        usage);
  } else if (usage.getFreeBytes() < DiskThresholdDecider.this.freeBytesThresholdLow.bytes()) {
    logger.info(
        "low disk watermark [{}] exceeded on {}, replicas will not be assigned to this node",
        DiskThresholdDecider.this.freeBytesThresholdLow,
        usage);
  }

  // Check percentage disk values
  if (usage.getFreeDiskAsPercentage() < DiskThresholdDecider.this.freeDiskThresholdHigh) {
    logger.warn(
        "high disk watermark [{}] exceeded on {}, shards will be relocated away from this node",
        Strings.format1Decimals(100.0 - DiskThresholdDecider.this.freeDiskThresholdHigh, "%"),
        usage);
  } else if (usage.getFreeDiskAsPercentage() < DiskThresholdDecider.this.freeDiskThresholdLow) {
    logger.info(
        "low disk watermark [{}] exceeded on {}, replicas will not be assigned to this node",
        Strings.format1Decimals(100.0 - DiskThresholdDecider.this.freeDiskThresholdLow, "%"),
        usage);
  }
}

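// Note (an assumption inferred from the 100.0 - threshold arithmetic above, not stated in the
// original source): freeDiskThresholdLow and freeDiskThresholdHigh hold *free*-disk percentages,
// so a high watermark configured as "90%" used disk would correspond to
// freeDiskThresholdHigh == 10.0 and be logged back to the user as "90%".
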
/**
 * Returns a {@link DiskUsage} for the {@link RoutingNode} using the average usage of other nodes
 * in the disk usage map.
 *
 * @param node Node to return an averaged DiskUsage object for
 * @param usages Map of nodeId to DiskUsage for all known nodes
 * @return DiskUsage representing given node using the average disk usage
 */
public DiskUsage averageUsage(RoutingNode node, Map<String, DiskUsage> usages) {
  long totalBytes = 0;
  long freeBytes = 0;
  for (DiskUsage du : usages.values()) {
    totalBytes += du.getTotalBytes();
    freeBytes += du.getFreeBytes();
  }
  return new DiskUsage(node.nodeId(), totalBytes / usages.size(), freeBytes / usages.size());
}

@Override
public Decision canRemain(
    ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
  if (shardRouting.currentNodeId().equals(node.nodeId()) == false) {
    throw new IllegalArgumentException(
        "Shard [" + shardRouting + "] is not allocated on node: [" + node.nodeId() + "]");
  }
  final Decision decision = earlyTerminate(allocation);
  if (decision != null) {
    return decision;
  }
  ClusterInfo clusterInfo = allocation.clusterInfo();
  Map<String, DiskUsage> usages = clusterInfo.getNodeLeastAvailableDiskUsages();
  DiskUsage usage = getDiskUsage(node, allocation, usages);
  // If this node is already above the high threshold, the shard cannot remain (get it off!)
  double freeDiskPercentage = usage.getFreeDiskAsPercentage();
  long freeBytes = usage.getFreeBytes();
  if (logger.isDebugEnabled()) {
    logger.debug(
        "node [{}] has {}% free disk ({} bytes)", node.nodeId(), freeDiskPercentage, freeBytes);
  }
  if (freeBytes < freeBytesThresholdHigh.bytes()) {
    if (logger.isDebugEnabled()) {
      logger.debug(
          "less than the required {} free bytes threshold ({} bytes free) on node {}, shard cannot remain",
          freeBytesThresholdHigh,
          freeBytes,
          node.nodeId());
    }
    return allocation.decision(
        Decision.NO,
        NAME,
        "after allocation less than required [%s] free on node, free: [%s]",
        freeBytesThresholdHigh,
        new ByteSizeValue(freeBytes));
  }
  if (freeDiskPercentage < freeDiskThresholdHigh) {
    if (logger.isDebugEnabled()) {
      logger.debug(
          "less than the required {}% free disk threshold ({}% free) on node {}, shard cannot remain",
          freeDiskThresholdHigh,
          freeDiskPercentage,
          node.nodeId());
    }
    return allocation.decision(
        Decision.NO,
        NAME,
        "after allocation less than required [%s%%] free disk on node, free: [%s%%]",
        freeDiskThresholdHigh,
        freeDiskPercentage);
  }
  return allocation.decision(
      Decision.YES,
      NAME,
      "enough disk for shard to remain on node, free: [%s]",
      new ByteSizeValue(freeBytes));
}

/**
 * Given the DiskUsage for a node and the size of the shard, return the percentage of free disk if
 * the shard were to be allocated to the node.
 *
 * @param usage A DiskUsage for the node to have space computed for
 * @param shardSize Size in bytes of the shard
 * @return Percentage of free space after the shard is assigned to the node
 */
public double freeDiskPercentageAfterShardAssigned(DiskUsage usage, Long shardSize) {
  shardSize = (shardSize == null) ? 0 : shardSize;
  DiskUsage newUsage =
      new DiskUsage(
          usage.getNodeId(),
          usage.getNodeName(),
          usage.getPath(),
          usage.getTotalBytes(),
          usage.getFreeBytes() - shardSize);
  return newUsage.getFreeDiskAsPercentage();
}

@Test
public void averageUsageUnitTest() {
  RoutingNode rn = new RoutingNode("node1", newNode("node1"));
  DiskThresholdDecider decider = new DiskThresholdDecider(ImmutableSettings.EMPTY);
  Map<String, DiskUsage> usages = new HashMap<>();
  usages.put("node2", new DiskUsage("node2", 100, 50)); // 50% used
  usages.put("node3", new DiskUsage("node3", 100, 0)); // 100% used
  DiskUsage node1Usage = decider.averageUsage(rn, usages);
  assertThat(node1Usage.getTotalBytes(), equalTo(100L));
  assertThat(node1Usage.getFreeBytes(), equalTo(25L));
}

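// Hedged companion sketch (not part of the original tests): exercises
// freeDiskPercentageAfterShardAssigned in the same style as the test above, assuming the
// five-argument DiskUsage constructor used elsewhere in this file.
@Test
public void freeDiskPercentageAfterShardAssignedUnitTest() {
  DiskThresholdDecider decider = new DiskThresholdDecider(ImmutableSettings.EMPTY);
  // Node starts with 100 total bytes and 40 free bytes (40% free).
  DiskUsage usage = new DiskUsage("node1", "node1", "_na_", 100, 40);
  // Assigning a 30-byte shard leaves 10 free bytes, i.e. 10.0% free disk.
  assertThat(decider.freeDiskPercentageAfterShardAssigned(usage, 30L), equalTo(10.0));
}
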
private DiskUsage getDiskUsage(
    RoutingNode node, RoutingAllocation allocation, Map<String, DiskUsage> usages) {
  ClusterInfo clusterInfo = allocation.clusterInfo();
  DiskUsage usage = usages.get(node.nodeId());
  if (usage == null) {
    // If there is no usage, and we have other nodes in the cluster,
    // use the average usage for all nodes as the usage for this node
    usage = averageUsage(node, usages);
    if (logger.isDebugEnabled()) {
      logger.debug(
          "unable to determine disk usage for {}, defaulting to average across nodes [{} total] [{} free] [{}% free]",
          node.nodeId(),
          usage.getTotalBytes(),
          usage.getFreeBytes(),
          usage.getFreeDiskAsPercentage());
    }
  }

  if (includeRelocations) {
    long relocatingShardsSize = sizeOfRelocatingShards(node, clusterInfo, true);
    DiskUsage usageIncludingRelocations =
        new DiskUsage(
            node.nodeId(),
            node.node().name(),
            "_na_",
            usage.getTotalBytes(),
            usage.getFreeBytes() - relocatingShardsSize);
    if (logger.isTraceEnabled()) {
      logger.trace("usage without relocations: {}", usage);
      logger.trace(
          "usage with relocations: [{} bytes] {}",
          relocatingShardsSize,
          usageIncludingRelocations);
    }
    usage = usageIncludingRelocations;
  }
  return usage;
}

/**
 * Returns a {@link DiskUsage} for the {@link RoutingNode} using the average usage of other nodes
 * in the disk usage map.
 *
 * @param node Node to return an averaged DiskUsage object for
 * @param usages Map of nodeId to DiskUsage for all known nodes
 * @return DiskUsage representing given node using the average disk usage
 */
public DiskUsage averageUsage(RoutingNode node, Map<String, DiskUsage> usages) {
  if (usages.size() == 0) {
    return new DiskUsage(node.nodeId(), node.node().name(), "_na_", 0, 0);
  }
  long totalBytes = 0;
  long freeBytes = 0;
  for (DiskUsage du : usages.values()) {
    totalBytes += du.getTotalBytes();
    freeBytes += du.getFreeBytes();
  }
  return new DiskUsage(
      node.nodeId(),
      node.node().name(),
      "_na_",
      totalBytes / usages.size(),
      freeBytes / usages.size());
}

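// Hedged sketch (not part of the original tests): verifies the empty-map guard in the
// averageUsage overload above, assuming the newNode(...) helper used by averageUsageUnitTest.
@Test
public void averageUsageEmptyMapSketch() {
  RoutingNode rn = new RoutingNode("node1", newNode("node1"));
  DiskThresholdDecider decider = new DiskThresholdDecider(ImmutableSettings.EMPTY);
  // With no known usages, the guard returns a zero-sized DiskUsage instead of dividing by zero.
  DiskUsage node1Usage = decider.averageUsage(rn, new HashMap<String, DiskUsage>());
  assertThat(node1Usage.getTotalBytes(), equalTo(0L));
  assertThat(node1Usage.getFreeBytes(), equalTo(0L));
}
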
@Override
public Decision canAllocate(
    ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
  final Decision decision = earlyTerminate(allocation);
  if (decision != null) {
    return decision;
  }
  final double usedDiskThresholdLow = 100.0 - DiskThresholdDecider.this.freeDiskThresholdLow;
  final double usedDiskThresholdHigh = 100.0 - DiskThresholdDecider.this.freeDiskThresholdHigh;
  ClusterInfo clusterInfo = allocation.clusterInfo();
  Map<String, DiskUsage> usages = clusterInfo.getNodeMostAvailableDiskUsages();
  DiskUsage usage = getDiskUsage(node, allocation, usages);
  // First, check whether the node is currently over the low watermark
  double freeDiskPercentage = usage.getFreeDiskAsPercentage();
  // Cache the used disk percentage for displaying disk percentages consistent with documentation
  double usedDiskPercentage = usage.getUsedDiskAsPercentage();
  long freeBytes = usage.getFreeBytes();
  if (logger.isTraceEnabled()) {
    logger.trace("node [{}] has {}% used disk", node.nodeId(), usedDiskPercentage);
  }

  // a flag for whether the primary shard has been previously allocated
  boolean primaryHasBeenAllocated =
      shardRouting.primary() && shardRouting.allocatedPostIndexCreate();

  // checks for exact byte comparisons
  if (freeBytes < freeBytesThresholdLow.bytes()) {
    // If the shard is a replica or has a primary that has already been allocated before,
    // check the low threshold
    if (!shardRouting.primary() || (shardRouting.primary() && primaryHasBeenAllocated)) {
      if (logger.isDebugEnabled()) {
        logger.debug(
            "less than the required {} free bytes threshold ({} bytes free) on node {}, preventing allocation",
            freeBytesThresholdLow,
            freeBytes,
            node.nodeId());
      }
      return allocation.decision(
          Decision.NO,
          NAME,
          "less than required [%s] free on node, free: [%s]",
          freeBytesThresholdLow,
          new ByteSizeValue(freeBytes));
    } else if (freeBytes > freeBytesThresholdHigh.bytes()) {
      // Allow the shard to be allocated because it is a primary that
      // has never been allocated, and the node is under the high watermark
      if (logger.isDebugEnabled()) {
        logger.debug(
            "less than the required {} free bytes threshold ({} bytes free) on node {}, "
                + "but allowing allocation because primary has never been allocated",
            freeBytesThresholdLow,
            freeBytes,
            node.nodeId());
      }
      return allocation.decision(Decision.YES, NAME, "primary has never been allocated before");
    } else {
      // Even though the primary has never been allocated, the node is
      // above the high watermark, so don't allow allocating the shard
      if (logger.isDebugEnabled()) {
        logger.debug(
            "less than the required {} free bytes threshold ({} bytes free) on node {}, "
                + "preventing allocation even though primary has never been allocated",
            freeBytesThresholdHigh,
            freeBytes,
            node.nodeId());
      }
      return allocation.decision(
          Decision.NO,
          NAME,
          "less than required [%s] free on node, free: [%s]",
          freeBytesThresholdHigh,
          new ByteSizeValue(freeBytes));
    }
  }

  // checks for percentage comparisons
  if (freeDiskPercentage < freeDiskThresholdLow) {
    // If the shard is a replica or has a primary that has already been allocated before,
    // check the low threshold
    if (!shardRouting.primary() || (shardRouting.primary() && primaryHasBeenAllocated)) {
      if (logger.isDebugEnabled()) {
        logger.debug(
            "more than the allowed {} used disk threshold ({} used) on node [{}], preventing allocation",
            Strings.format1Decimals(usedDiskThresholdLow, "%"),
            Strings.format1Decimals(usedDiskPercentage, "%"),
            node.nodeId());
      }
      return allocation.decision(
          Decision.NO,
          NAME,
          "more than allowed [%s%%] used disk on node, free: [%s%%]",
          usedDiskThresholdLow,
          freeDiskPercentage);
    } else if (freeDiskPercentage > freeDiskThresholdHigh) {
      // Allow the shard to be allocated because it is a primary that
      // has never been allocated, and the node is under the high watermark
      if (logger.isDebugEnabled()) {
        logger.debug(
            "more than the allowed {} used disk threshold ({} used) on node [{}], "
                + "but allowing allocation because primary has never been allocated",
            Strings.format1Decimals(usedDiskThresholdLow, "%"),
            Strings.format1Decimals(usedDiskPercentage, "%"),
            node.nodeId());
      }
      return allocation.decision(Decision.YES, NAME, "primary has never been allocated before");
    } else {
      // Even though the primary has never been allocated, the node is
      // above the high watermark, so don't allow allocating the shard
      if (logger.isDebugEnabled()) {
        logger.debug(
            "less than the required {} free disk threshold ({} free) on node {}, "
                + "preventing allocation even though primary has never been allocated",
            Strings.format1Decimals(freeDiskThresholdHigh, "%"),
            Strings.format1Decimals(freeDiskPercentage, "%"),
            node.nodeId());
      }
      return allocation.decision(
          Decision.NO,
          NAME,
          "more than allowed [%s%%] used disk on node, free: [%s%%]",
          usedDiskThresholdHigh,
          freeDiskPercentage);
    }
  }

  // Secondly, check that allocating the shard to this node doesn't put it above the high
  // watermark
  final long shardSize = getShardSize(shardRouting, allocation.clusterInfo());
  double freeSpaceAfterShard = freeDiskPercentageAfterShardAssigned(usage, shardSize);
  long freeBytesAfterShard = freeBytes - shardSize;
  if (freeBytesAfterShard < freeBytesThresholdHigh.bytes()) {
    logger.warn(
        "after allocating, node [{}] would have less than the required {} free bytes threshold ({} bytes free), preventing allocation",
        node.nodeId(),
        freeBytesThresholdHigh,
        freeBytesAfterShard);
    return allocation.decision(
        Decision.NO,
        NAME,
        "after allocation less than required [%s] free on node, free: [%s]",
        freeBytesThresholdHigh,
        new ByteSizeValue(freeBytesAfterShard));
  }
  if (freeSpaceAfterShard < freeDiskThresholdHigh) {
    logger.warn(
        "after allocating, node [{}] would have less than the required {} free disk threshold ({} free), preventing allocation",
        node.nodeId(),
        Strings.format1Decimals(freeDiskThresholdHigh, "%"),
        Strings.format1Decimals(freeSpaceAfterShard, "%"));
    return allocation.decision(
        Decision.NO,
        NAME,
        "after allocation more than allowed [%s%%] used disk on node, free: [%s%%]",
        usedDiskThresholdHigh,
        freeSpaceAfterShard);
  }
  return allocation.decision(
      Decision.YES,
      NAME,
      "enough disk for shard on node, free: [%s]",
      new ByteSizeValue(freeBytes));
}

@Override
public void onNewInfo(ClusterInfo info) {
  Map<String, DiskUsage> usages = info.getNodeLeastAvailableDiskUsages();
  if (usages != null) {
    boolean reroute = false;
    String explanation = "";

    // Garbage collect nodes that have been removed from the cluster
    // from the map that tracks watermark crossing
    Set<String> nodes = usages.keySet();
    for (String node : nodeHasPassedWatermark) {
      if (nodes.contains(node) == false) {
        nodeHasPassedWatermark.remove(node);
      }
    }

    for (Map.Entry<String, DiskUsage> entry : usages.entrySet()) {
      String node = entry.getKey();
      DiskUsage usage = entry.getValue();
      warnAboutDiskIfNeeded(usage);
      if (usage.getFreeBytes() < DiskThresholdDecider.this.freeBytesThresholdHigh.bytes()
          || usage.getFreeDiskAsPercentage() < DiskThresholdDecider.this.freeDiskThresholdHigh) {
        if ((System.nanoTime() - lastRunNS) > DiskThresholdDecider.this.rerouteInterval.nanos()) {
          lastRunNS = System.nanoTime();
          reroute = true;
          explanation = "high disk watermark exceeded on one or more nodes";
        } else {
          logger.debug(
              "high disk watermark exceeded on {} but an automatic reroute has occurred in the last [{}], skipping reroute",
              node,
              DiskThresholdDecider.this.rerouteInterval);
        }
        nodeHasPassedWatermark.add(node);
      } else if (usage.getFreeBytes() < DiskThresholdDecider.this.freeBytesThresholdLow.bytes()
          || usage.getFreeDiskAsPercentage() < DiskThresholdDecider.this.freeDiskThresholdLow) {
        nodeHasPassedWatermark.add(node);
      } else {
        if (nodeHasPassedWatermark.contains(node)) {
          // The node has previously been over the high or
          // low watermark, but is no longer, so we should
          // reroute so any unassigned shards can be allocated
          // if they are able to be
          if ((System.nanoTime() - lastRunNS)
              > DiskThresholdDecider.this.rerouteInterval.nanos()) {
            lastRunNS = System.nanoTime();
            reroute = true;
            explanation = "one or more nodes has gone under the high or low watermark";
            nodeHasPassedWatermark.remove(node);
          } else {
            logger.debug(
                "{} has gone below a disk threshold, but an automatic reroute has occurred in the last [{}], skipping reroute",
                node,
                DiskThresholdDecider.this.rerouteInterval);
          }
        }
      }
    }
    if (reroute) {
      logger.info("rerouting shards: [{}]", explanation);
      // Execute an empty reroute, but don't block on the response
      client.admin().cluster().prepareReroute().execute();
    }
  }
}

public Decision canRemain(
    ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
  if (!enabled) {
    return allocation.decision(Decision.YES, "disk threshold decider disabled");
  }
  // Allow allocation regardless if only a single node is available
  if (allocation.nodes().size() <= 1) {
    return allocation.decision(Decision.YES, "only a single node is present");
  }
  ClusterInfo clusterInfo = allocation.clusterInfo();
  if (clusterInfo == null) {
    if (logger.isTraceEnabled()) {
      logger.trace("Cluster info unavailable for disk threshold decider, allowing allocation.");
    }
    return allocation.decision(Decision.YES, "cluster info unavailable");
  }
  Map<String, DiskUsage> usages = clusterInfo.getNodeDiskUsages();
  if (usages.isEmpty()) {
    if (logger.isTraceEnabled()) {
      logger.trace(
          "Unable to determine disk usages for disk-aware allocation, allowing allocation");
    }
    return allocation.decision(Decision.YES, "disk usages unavailable");
  }
  DiskUsage usage = usages.get(node.nodeId());
  if (usage == null) {
    // If there is no usage, and we have other nodes in the cluster,
    // use the average usage for all nodes as the usage for this node
    usage = averageUsage(node, usages);
    if (logger.isDebugEnabled()) {
      logger.debug(
          "Unable to determine disk usage for {}, defaulting to average across nodes [{} total] [{} free] [{}% free]",
          node.nodeId(),
          usage.getTotalBytes(),
          usage.getFreeBytes(),
          usage.getFreeDiskAsPercentage());
    }
  }
  // If this node is already above the high threshold, the shard cannot remain (get it off!)
  double freeDiskPercentage = usage.getFreeDiskAsPercentage();
  long freeBytes = usage.getFreeBytes();
  if (logger.isDebugEnabled()) {
    logger.debug(
        "Node [{}] has {}% free disk ({} bytes)", node.nodeId(), freeDiskPercentage, freeBytes);
  }
  if (freeBytes < freeBytesThresholdHigh.bytes()) {
    if (logger.isDebugEnabled()) {
      logger.debug(
          "Less than the required {} free bytes threshold ({} bytes free) on node {}, shard cannot remain",
          freeBytesThresholdHigh,
          freeBytes,
          node.nodeId());
    }
    return allocation.decision(
        Decision.NO,
        "after allocation less than required [%s] free on node, free: [%s]",
        freeBytesThresholdHigh,
        new ByteSizeValue(freeBytes));
  }
  if (freeDiskPercentage < freeDiskThresholdHigh) {
    if (logger.isDebugEnabled()) {
      logger.debug(
          "Less than the required {}% free disk threshold ({}% free) on node {}, shard cannot remain",
          freeDiskThresholdHigh,
          freeDiskPercentage,
          node.nodeId());
    }
    return allocation.decision(
        Decision.NO,
        "after allocation less than required [%s%%] free disk on node, free: [%s%%]",
        freeDiskThresholdHigh,
        freeDiskPercentage);
  }
  return allocation.decision(
      Decision.YES,
      "enough disk for shard to remain on node, free: [%s]",
      new ByteSizeValue(freeBytes));
}

public Decision canAllocate(
    ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
  if (!enabled) {
    return allocation.decision(Decision.YES, "disk threshold decider disabled");
  }
  // Allow allocation regardless if only a single node is available
  if (allocation.nodes().size() <= 1) {
    return allocation.decision(Decision.YES, "only a single node is present");
  }
  ClusterInfo clusterInfo = allocation.clusterInfo();
  if (clusterInfo == null) {
    if (logger.isTraceEnabled()) {
      logger.trace("Cluster info unavailable for disk threshold decider, allowing allocation.");
    }
    return allocation.decision(Decision.YES, "cluster info unavailable");
  }
  Map<String, DiskUsage> usages = clusterInfo.getNodeDiskUsages();
  Map<String, Long> shardSizes = clusterInfo.getShardSizes();
  if (usages.isEmpty()) {
    if (logger.isTraceEnabled()) {
      logger.trace(
          "Unable to determine disk usages for disk-aware allocation, allowing allocation");
    }
    return allocation.decision(Decision.YES, "disk usages unavailable");
  }
  DiskUsage usage = usages.get(node.nodeId());
  if (usage == null) {
    // If there is no usage, and we have other nodes in the cluster,
    // use the average usage for all nodes as the usage for this node
    usage = averageUsage(node, usages);
    if (logger.isDebugEnabled()) {
      logger.debug(
          "Unable to determine disk usage for [{}], defaulting to average across nodes [{} total] [{} free] [{}% free]",
          node.nodeId(),
          usage.getTotalBytes(),
          usage.getFreeBytes(),
          usage.getFreeDiskAsPercentage());
    }
  }
  // First, check whether the node is currently over the low watermark
  double freeDiskPercentage = usage.getFreeDiskAsPercentage();
  long freeBytes = usage.getFreeBytes();
  if (logger.isDebugEnabled()) {
    logger.debug("Node [{}] has {}% free disk", node.nodeId(), freeDiskPercentage);
  }
  if (freeBytes < freeBytesThresholdLow.bytes()) {
    if (logger.isDebugEnabled()) {
      logger.debug(
          "Less than the required {} free bytes threshold ({} bytes free) on node {}, preventing allocation",
          freeBytesThresholdLow,
          freeBytes,
          node.nodeId());
    }
    return allocation.decision(
        Decision.NO,
        "less than required [%s] free on node, free: [%s]",
        freeBytesThresholdLow,
        new ByteSizeValue(freeBytes));
  }
  if (freeDiskPercentage < freeDiskThresholdLow) {
    if (logger.isDebugEnabled()) {
      logger.debug(
          "Less than the required {}% free disk threshold ({}% free) on node [{}], preventing allocation",
          freeDiskThresholdLow,
          freeDiskPercentage,
          node.nodeId());
    }
    return allocation.decision(
        Decision.NO,
        "less than required [%s%%] free disk on node, free: [%s%%]",
        freeDiskThresholdLow,
        freeDiskPercentage);
  }

  // Secondly, check that allocating the shard to this node doesn't put it above the high
  // watermark
  Long shardSize = shardSizes.get(shardIdentifierFromRouting(shardRouting));
  shardSize = shardSize == null ? 0 : shardSize;
  double freeSpaceAfterShard = this.freeDiskPercentageAfterShardAssigned(usage, shardSize);
  long freeBytesAfterShard = freeBytes - shardSize;
  if (freeBytesAfterShard < freeBytesThresholdHigh.bytes()) {
    logger.warn(
        "After allocating, node [{}] would have less than the required {} free bytes threshold ({} bytes free), preventing allocation",
        node.nodeId(),
        freeBytesThresholdHigh,
        freeBytesAfterShard);
    return allocation.decision(
        Decision.NO,
        "after allocation less than required [%s] free on node, free: [%s]",
        freeBytesThresholdHigh,
        new ByteSizeValue(freeBytesAfterShard));
  }
  if (freeSpaceAfterShard < freeDiskThresholdHigh) {
    logger.warn(
        "After allocating, node [{}] would have less than the required {}% free disk threshold ({}% free), preventing allocation",
        node.nodeId(),
        freeDiskThresholdHigh,
        freeSpaceAfterShard);
    return allocation.decision(
        Decision.NO,
        "after allocation less than required [%s%%] free disk on node, free: [%s%%]",
        freeDiskThresholdHigh,
        freeSpaceAfterShard);
  }
  return allocation.decision(
      Decision.YES,
      "enough disk for shard on node, free: [%s]",
      new ByteSizeValue(freeBytes));
}