private Decision shouldIndexFilter(
     IndexMetaData indexMd, RoutingNode node, RoutingAllocation allocation) {
   if (indexMd.requireFilters() != null) {
     if (!indexMd.requireFilters().match(node.node())) {
       return allocation.decision(
           Decision.NO,
           NAME,
           "node does not match index setting [%s] filters [%s]",
           IndexMetaData.INDEX_ROUTING_REQUIRE_GROUP_PREFIX,
           indexMd.requireFilters());
     }
   }
   if (indexMd.includeFilters() != null) {
     if (!indexMd.includeFilters().match(node.node())) {
       return allocation.decision(
           Decision.NO,
           NAME,
           "node does not match index setting [%s] filters [%s]",
           IndexMetaData.INDEX_ROUTING_INCLUDE_GROUP_PREFIX,
           indexMd.includeFilters());
     }
   }
   if (indexMd.excludeFilters() != null) {
     if (indexMd.excludeFilters().match(node.node())) {
       return allocation.decision(
           Decision.NO,
           NAME,
           "node matches index setting [%s] filters [%s]",
           IndexMetaData.INDEX_ROUTING_EXCLUDE_GROUP_SETTING.getKey(),
           indexMd.excludeFilters());
     }
   }
   return null;
 }
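The require/include/exclude trio above vetoes in a fixed order: a node must match every require filter, must match the include filters, and must match no exclude filter; returning null means "no veto here". Below is a self-contained sketch of those semantics that collapses ES's DiscoveryNodeFilters (which can combine several attributes with AND/OR) into one predicate per filter class — all names are illustrative, not the ES API:

import java.util.function.Predicate;

final class FilterSemanticsSketch {
  // Hypothetical stand-ins for require/include/exclude filters; null means "not configured".
  static <T> boolean allowed(T node, Predicate<T> require, Predicate<T> include, Predicate<T> exclude) {
    if (require != null && !require.test(node)) return false; // must match require filters
    if (include != null && !include.test(node)) return false; // must match include filters
    if (exclude != null && exclude.test(node)) return false;  // must NOT match exclude filters
    return true; // corresponds to the decider above returning null (no veto)
  }

  public static void main(String[] args) {
    Predicate<String> requireHotTier = n -> n.contains("hot");
    Predicate<String> excludeZoneB = n -> n.contains("zone-b");
    System.out.println(allowed("hot-zone-a", requireHotTier, null, excludeZoneB)); // true
    System.out.println(allowed("hot-zone-b", requireHotTier, null, excludeZoneB)); // false
  }
}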
 private Decision shouldClusterFilter(RoutingNode node, RoutingAllocation allocation) {
   if (clusterRequireFilters != null) {
     if (!clusterRequireFilters.match(node.node())) {
       return allocation.decision(
           Decision.NO,
           NAME,
           "node does not match cluster setting [%s] filters [%s]",
           CLUSTER_ROUTING_REQUIRE_GROUP_PREFIX,
           clusterRequireFilters);
     }
   }
   if (clusterIncludeFilters != null) {
     if (!clusterIncludeFilters.match(node.node())) {
       return allocation.decision(
           Decision.NO,
           NAME,
           "node does not cluster setting [%s] filters [%s]",
           CLUSTER_ROUTING_INCLUDE_GROUP_PREFIX,
           clusterIncludeFilters);
     }
   }
   if (clusterExcludeFilters != null) {
     if (clusterExcludeFilters.match(node.node())) {
       return allocation.decision(
           Decision.NO,
           NAME,
           "node matches cluster setting [%s] filters [%s]",
           CLUSTER_ROUTING_EXCLUDE_GROUP_PREFIX,
           clusterExcludeFilters);
     }
   }
   return null;
 }
 private Decision shouldClusterFilter(RoutingNode node, RoutingAllocation allocation) {
   if (clusterRequireFilters != null) {
     if (!clusterRequireFilters.match(node.node())) {
       return allocation.decision(
           Decision.NO,
           NAME,
           "node does not match global required filters [%s]",
           clusterRequireFilters);
     }
   }
   if (clusterIncludeFilters != null) {
     if (!clusterIncludeFilters.match(node.node())) {
       return allocation.decision(
           Decision.NO,
           NAME,
           "node does not match global include filters [%s]",
           clusterIncludeFilters);
     }
   }
   if (clusterExcludeFilters != null) {
     if (clusterExcludeFilters.match(node.node())) {
       return allocation.decision(
           Decision.NO, NAME, "node matches global exclude filters [%s]", clusterExcludeFilters);
     }
   }
   return null;
 }
 private Decision shouldIndexFilter(
     IndexMetaData indexMd, RoutingNode node, RoutingAllocation allocation) {
   if (indexMd.requireFilters() != null) {
     if (!indexMd.requireFilters().match(node.node())) {
       return allocation.decision(
           Decision.NO,
           NAME,
           "node does not match index required filters [%s]",
           indexMd.requireFilters());
     }
   }
   if (indexMd.includeFilters() != null) {
     if (!indexMd.includeFilters().match(node.node())) {
       return allocation.decision(
           Decision.NO,
           NAME,
           "node does not match index include filters [%s]",
           indexMd.includeFilters());
     }
   }
   if (indexMd.excludeFilters() != null) {
     if (indexMd.excludeFilters().match(node.node())) {
       return allocation.decision(
           Decision.NO, NAME, "node matches index exclude filters [%s]", indexMd.excludeFilters());
     }
   }
   return null;
 }
  @Override
  public Decision canRemain(
      ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
    if (shardRouting.currentNodeId().equals(node.nodeId()) == false) {
      throw new IllegalArgumentException(
          "Shard [" + shardRouting + "] is not allocated on node: [" + node.nodeId() + "]");
    }
    final Decision decision = earlyTerminate(allocation);
    if (decision != null) {
      return decision;
    }
    ClusterInfo clusterInfo = allocation.clusterInfo();
    Map<String, DiskUsage> usages = clusterInfo.getNodeLeastAvailableDiskUsages();
    DiskUsage usage = getDiskUsage(node, allocation, usages);
    // If this node is already above the high threshold, the shard cannot remain (get it off!)
    double freeDiskPercentage = usage.getFreeDiskAsPercentage();
    long freeBytes = usage.getFreeBytes();
    if (logger.isDebugEnabled()) {
      logger.debug(
          "node [{}] has {}% free disk ({} bytes)", node.nodeId(), freeDiskPercentage, freeBytes);
    }
    if (freeBytes < freeBytesThresholdHigh.bytes()) {
      if (logger.isDebugEnabled()) {
        logger.debug(
            "less than the required {} free bytes threshold ({} bytes free) on node {}, shard cannot remain",
            freeBytesThresholdHigh,
            freeBytes,
            node.nodeId());
      }
      return allocation.decision(
          Decision.NO,
          NAME,
          "after allocation less than required [%s] free on node, free: [%s]",
          freeBytesThresholdHigh,
          new ByteSizeValue(freeBytes));
    }
    if (freeDiskPercentage < freeDiskThresholdHigh) {
      if (logger.isDebugEnabled()) {
        logger.debug(
            "less than the required {}% free disk threshold ({}% free) on node {}, shard cannot remain",
            freeDiskThresholdHigh, freeDiskPercentage, node.nodeId());
      }
      return allocation.decision(
          Decision.NO,
          NAME,
          "after allocation less than required [%s%%] free disk on node, free: [%s%%]",
          freeDiskThresholdHigh,
          freeDiskPercentage);
    }

    return allocation.decision(
        Decision.YES,
        NAME,
        "enough disk for shard to remain on node, free: [%s]",
        new ByteSizeValue(freeBytes));
  }
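canRemain deliberately tests only the high watermark, in both of its forms: an absolute free-bytes floor and a free-percentage floor, either of which evicts the shard. A minimal sketch of that dual test, with made-up threshold values rather than the ES settings machinery:

// Illustrative sketch of the dual high-watermark test used by canRemain above;
// the threshold values and parameter names are assumptions, not the ES settings API.
final class HighWatermarkSketch {
  static boolean canRemain(long freeBytes, double freePercent,
                           long freeBytesThresholdHigh, double freeDiskThresholdHigh) {
    // Either an absolute-bytes breach or a percentage breach forces the shard off the node.
    return freeBytes >= freeBytesThresholdHigh && freePercent >= freeDiskThresholdHigh;
  }

  public static void main(String[] args) {
    // e.g. high watermark at 10 GiB free or 10% free, whichever trips first
    System.out.println(canRemain(50L << 30, 25.0, 10L << 30, 10.0)); // true: shard may remain
    System.out.println(canRemain(5L << 30, 25.0, 10L << 30, 10.0));  // false: below byte floor
  }
}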
  private Decision shouldFilter(
      ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
    Decision decision = shouldClusterFilter(node, allocation);
    if (decision != null) return decision;

    decision =
        shouldIndexFilter(
            allocation.metaData().getIndexSafe(shardRouting.index()), node, allocation);
    if (decision != null) return decision;

    return allocation.decision(Decision.YES, NAME, "node passes include/exclude/require filters");
  }
 /** Can the shard be allocated on at least one node based on the allocation deciders. */
 private boolean canBeAllocatedToAtLeastOneNode(ShardRouting shard, RoutingAllocation allocation) {
   for (ObjectCursor<DiscoveryNode> cursor : allocation.nodes().dataNodes().values()) {
     RoutingNode node = allocation.routingNodes().node(cursor.value.id());
     if (node == null) {
       continue;
     }
      // if the shard cannot be allocated on any node, skip it; this handles,
      // for example, allocating a replica only after its primary has been assigned
     Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
     if (decision.type() == Decision.Type.YES) {
       return true;
     }
   }
   return false;
 }
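One detail worth noting in the pre-check above: only a YES decision counts, so a cluster where every node answers THROTTLE is treated the same as one where every node answers NO. A simplified, self-contained restatement (Decision and the node list are stand-ins for the ES types):

import java.util.List;
import java.util.function.BiFunction;

final class AnyNodeSketch {
  enum Decision { YES, NO, THROTTLE }

  static <S, N> boolean canBeAllocatedToAtLeastOneNode(
      S shard, List<N> nodes, BiFunction<S, N, Decision> deciders) {
    for (N node : nodes) {
      if (deciders.apply(shard, node) == Decision.YES) {
        return true; // one YES is enough; remaining nodes are not consulted
      }
    }
    return false; // NO and THROTTLE both count as "not allocatable here"
  }

  public static void main(String[] args) {
    List<String> nodes = List.of("n1", "n2");
    System.out.println(canBeAllocatedToAtLeastOneNode(
        "shard-0", nodes, (s, n) -> n.equals("n2") ? Decision.YES : Decision.THROTTLE)); // true
  }
}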
  private Decision shouldFilter(
      IndexMetaData indexMd, RoutingNode node, RoutingAllocation allocation) {
    Decision decision = shouldClusterFilter(node, allocation);
    if (decision != null) return decision;

    decision = shouldIndexFilter(indexMd, node, allocation);
    if (decision != null) return decision;

    return allocation.decision(Decision.YES, NAME, "node passes include/exclude/require filters");
  }
Example #9
  public boolean allocateUnassigned(final RoutingAllocation allocation) {
    boolean changed = false;

    RoutingNodes.UnassignedShards unassigned = allocation.routingNodes().unassigned();
    unassigned.sort(
        PriorityComparator.getAllocationComparator(allocation)); // sort for priority ordering

    changed |= primaryShardAllocator.allocateUnassigned(allocation);
    changed |= replicaShardAllocator.processExistingRecoveries(allocation);
    changed |= replicaShardAllocator.allocateUnassigned(allocation);
    return changed;
  }
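Note the use of |= rather than ||: the compound assignment does not short-circuit, so all three allocator phases run even once changed is already true, and their flags are merged afterwards. A small demonstration of the difference:

final class OrAssignSketch {
  static boolean phase(String name) {
    System.out.println(name + " ran");
    return true;
  }

  public static void main(String[] args) {
    boolean changed = false;
    changed |= phase("primaries");  // runs
    changed |= phase("recoveries"); // still runs, even though changed is already true
    boolean shortCircuit = phase("a") || phase("b"); // "b" would NOT run here
    System.out.println(changed + " " + shortCircuit);
  }
}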
 @Override
 public Decision canRebalance(ShardRouting shardRouting, RoutingAllocation allocation) {
   if (clusterConcurrentRebalance == -1) {
     return allocation.decision(Decision.YES, NAME, "unlimited concurrent rebalances are allowed");
   }
   int relocatingShards = allocation.routingNodes().getRelocatingShardCount();
   if (relocatingShards >= clusterConcurrentRebalance) {
     return allocation.decision(
         Decision.NO,
         NAME,
         "too many shards are concurrently rebalancing [%d], limit: [%d]",
         relocatingShards,
         clusterConcurrentRebalance);
   }
   return allocation.decision(
       Decision.YES,
       NAME,
       "below threshold [%d] for concurrent rebalances, current rebalance shard count [%d]",
       clusterConcurrentRebalance,
       relocatingShards);
 }
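The rebalance throttle is just a global counter compared against a limit, with -1 meaning unlimited (the cluster-wide default is commonly 2, but treat that as an assumption here). A compact sketch:

// Minimal sketch of the concurrent-rebalance gate; -1 means unlimited.
final class RebalanceGateSketch {
  static boolean canRebalance(int relocatingShards, int clusterConcurrentRebalance) {
    if (clusterConcurrentRebalance == -1) return true;
    return relocatingShards < clusterConcurrentRebalance;
  }

  public static void main(String[] args) {
    System.out.println(canRebalance(1, 2));  // true
    System.out.println(canRebalance(2, 2));  // false: already at the limit
    System.out.println(canRebalance(9, -1)); // true: unlimited
  }
}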
Example #11
 @Override
 protected AsyncShardFetch.FetchResult<
         TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
     fetchData(ShardRouting shard, RoutingAllocation allocation) {
   AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> fetch =
       asyncFetchStore.get(shard.shardId());
   if (fetch == null) {
     fetch = new InternalAsyncFetch<>(logger, "shard_store", shard.shardId(), storeAction);
     asyncFetchStore.put(shard.shardId(), fetch);
   }
   AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
       shardStores =
           fetch.fetchData(
               allocation.nodes(),
               allocation.metaData(),
               allocation.getIgnoreNodes(shard.shardId()));
    if (shardStores.hasData()) {
     shardStores.processAllocation(allocation);
   }
   return shardStores;
 }
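The get-or-create dance on asyncFetchStore is the classic cache idiom; on Java 8+ the same shape can be written with Map.computeIfAbsent, as in this sketch (Fetch and the integer shard id are hypothetical stand-ins; the original code may predate Java 8 or avoid the lambda on purpose):

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

final class FetchCacheSketch {
  record Fetch(String kind, int shardId) {}

  private final Map<Integer, Fetch> asyncFetchStore = new ConcurrentHashMap<>();

  Fetch fetchFor(int shardId) {
    // Atomically create the per-shard fetch on first use, reuse it afterwards.
    return asyncFetchStore.computeIfAbsent(shardId, id -> new Fetch("shard_store", id));
  }
}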
  private Decision earlyTerminate(RoutingAllocation allocation) {
    // Always allow allocation if the decider is disabled
    if (!enabled) {
      return allocation.decision(Decision.YES, NAME, "disk threshold decider disabled");
    }

    // Allow allocation regardless if only a single node is available
    if (allocation.nodes().size() <= 1) {
      if (logger.isTraceEnabled()) {
        logger.trace("only a single node is present, allowing allocation");
      }
      return allocation.decision(Decision.YES, NAME, "only a single node is present");
    }

     // Fail open if there is no info available
    final ClusterInfo clusterInfo = allocation.clusterInfo();
    if (clusterInfo == null) {
      if (logger.isTraceEnabled()) {
        logger.trace("cluster info unavailable for disk threshold decider, allowing allocation.");
      }
      return allocation.decision(Decision.YES, NAME, "cluster info unavailable");
    }

    final Map<String, DiskUsage> usages = clusterInfo.getNodeLeastAvailableDiskUsages();
    // Fail open if there are no disk usages available
    if (usages.isEmpty()) {
      if (logger.isTraceEnabled()) {
        logger.trace(
            "unable to determine disk usages for disk-aware allocation, allowing allocation");
      }
      return allocation.decision(Decision.YES, NAME, "disk usages unavailable");
    }
    return null;
  }
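earlyTerminate is a chain of fail-open guards: when the decider is disabled, the cluster is a single node, or disk information is missing, it answers YES so that a monitoring hiccup can never block allocation, and null means "no early verdict, run the real checks". A condensed sketch of that shape:

// Sketch of the fail-open guard chain: any missing input yields an
// optimistic YES, and null means "no early verdict, run the real checks".
final class FailOpenSketch {
  enum Decision { YES, NO }

  static Decision earlyTerminate(boolean enabled, int nodeCount, Object clusterInfo,
                                 java.util.Map<String, Long> usages) {
    if (!enabled) return Decision.YES;            // decider switched off
    if (nodeCount <= 1) return Decision.YES;      // nowhere else to go anyway
    if (clusterInfo == null) return Decision.YES; // fail open: no cluster info at all
    if (usages.isEmpty()) return Decision.YES;    // fail open: no disk usages yet
    return null;                                  // fall through to the threshold checks
  }

  public static void main(String[] args) {
    System.out.println(earlyTerminate(true, 3, new Object(), java.util.Map.of()));           // YES
    System.out.println(earlyTerminate(true, 3, new Object(), java.util.Map.of("n1", 1L)));   // null
  }
}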
Example #13
    @Override
    protected AsyncShardFetch.FetchResult<
            TransportNodesListGatewayStartedShards.NodeGatewayStartedShards>
        fetchData(ShardRouting shard, RoutingAllocation allocation) {
      AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetch =
          asyncFetchStarted.get(shard.shardId());
      if (fetch == null) {
        fetch = new InternalAsyncFetch<>(logger, "shard_started", shard.shardId(), startedAction);
        asyncFetchStarted.put(shard.shardId(), fetch);
      }
      AsyncShardFetch.FetchResult<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards>
          shardState =
              fetch.fetchData(
                  allocation.nodes(),
                  allocation.metaData(),
                  allocation.getIgnoreNodes(shard.shardId()));

       if (shardState.hasData()) {
        shardState.processAllocation(allocation);
      }
      return shardState;
    }
 @Override
 public Decision canAllocate(
     ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
   if (shardRouting.unassigned()) {
      // Applies only to unassigned shards: we filter allocation right after index
      // creation (e.g. for shard shrinking) to ensure that, once the shard has been
      // allocated post-API, replicas can be allocated elsewhere without user
      // interaction. This is a setting that can only be set within the system!
     IndexMetaData indexMd = allocation.metaData().getIndexSafe(shardRouting.index());
     DiscoveryNodeFilters initialRecoveryFilters = indexMd.getInitialRecoveryFilters();
     if (initialRecoveryFilters != null
         && RecoverySource.isInitialRecovery(shardRouting.recoverySource().getType())
         && initialRecoveryFilters.match(node.node()) == false) {
       String explanation =
           (shardRouting.recoverySource().getType() == RecoverySource.Type.LOCAL_SHARDS)
               ? "initial allocation of the shrunken index is only allowed on nodes [%s] that hold a copy of every shard in the index"
               : "initial allocation of the index is only allowed on nodes [%s]";
       return allocation.decision(Decision.NO, NAME, explanation, initialRecoveryFilters);
     }
   }
   return shouldFilter(shardRouting, node, allocation);
 }
 /** Finds the store for the assigned shard in the fetched data, returns null if none is found. */
 private TransportNodesListShardStoreMetaData.StoreFilesMetaData findStore(
     ShardRouting shard,
     RoutingAllocation allocation,
     AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
         data) {
   assert shard.currentNodeId() != null;
   DiscoveryNode primaryNode = allocation.nodes().get(shard.currentNodeId());
   if (primaryNode == null) {
     return null;
   }
   TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData primaryNodeFilesStore =
       data.getData().get(primaryNode);
   if (primaryNodeFilesStore == null) {
     return null;
   }
   return primaryNodeFilesStore.storeFilesMetaData();
 }
  private DiskUsage getDiskUsage(
      RoutingNode node, RoutingAllocation allocation, Map<String, DiskUsage> usages) {
    ClusterInfo clusterInfo = allocation.clusterInfo();
    DiskUsage usage = usages.get(node.nodeId());
    if (usage == null) {
      // If there is no usage, and we have other nodes in the cluster,
      // use the average usage for all nodes as the usage for this node
      usage = averageUsage(node, usages);
      if (logger.isDebugEnabled()) {
        logger.debug(
            "unable to determine disk usage for {}, defaulting to average across nodes [{} total] [{} free] [{}% free]",
            node.nodeId(),
            usage.getTotalBytes(),
            usage.getFreeBytes(),
            usage.getFreeDiskAsPercentage());
      }
    }

    if (includeRelocations) {
      long relocatingShardsSize = sizeOfRelocatingShards(node, clusterInfo, true);
      DiskUsage usageIncludingRelocations =
          new DiskUsage(
              node.nodeId(),
              node.node().name(),
              "_na_",
              usage.getTotalBytes(),
              usage.getFreeBytes() - relocatingShardsSize);
      if (logger.isTraceEnabled()) {
        logger.trace("usage without relocations: {}", usage);
        logger.trace(
            "usage with relocations: [{} bytes] {}",
            relocatingShardsSize,
            usageIncludingRelocations);
      }
      usage = usageIncludingRelocations;
    }
    return usage;
  }
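averageUsage itself is not shown in this excerpt. A plausible shape, consistent with the debug message above, is the mean of total and free bytes across all reporting nodes — the following sketch is an assumption to that effect, using a simplified DiskUsage record rather than the ES class:

import java.util.Map;

final class AverageUsageSketch {
  record DiskUsage(String nodeId, long totalBytes, long freeBytes) {}

  // Assumed behaviour: when a node has no reported usage, substitute the
  // mean total/free bytes of all nodes that did report.
  static DiskUsage averageUsage(String nodeId, Map<String, DiskUsage> usages) {
    long total = 0, free = 0;
    for (DiskUsage u : usages.values()) {
      total += u.totalBytes();
      free += u.freeBytes();
    }
    int n = Math.max(1, usages.size());
    return new DiskUsage(nodeId, total / n, free / n);
  }

  public static void main(String[] args) {
    Map<String, DiskUsage> usages = Map.of(
        "n1", new DiskUsage("n1", 100L, 40L),
        "n2", new DiskUsage("n2", 200L, 60L));
    System.out.println(averageUsage("n3", usages)); // total=150, free=50
  }
}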
  @Override
  public void execute(RoutingAllocation allocation) throws ElasticSearchException {
    DiscoveryNode discoNode = allocation.nodes().resolveNode(node);

    MutableShardRouting shardRouting = null;
    for (MutableShardRouting routing : allocation.routingNodes().unassigned()) {
      if (routing.shardId().equals(shardId)) {
        // prefer primaries first to allocate
        if (shardRouting == null || routing.primary()) {
          shardRouting = routing;
        }
      }
    }

    if (shardRouting == null) {
      throw new ElasticSearchIllegalArgumentException(
          "[allocate] failed to find " + shardId + " on the list of unassigned shards");
    }

    if (shardRouting.primary() && !allowPrimary) {
      throw new ElasticSearchIllegalArgumentException(
          "[allocate] trying to allocate a primary shard " + shardId + "], which is disabled");
    }

    RoutingNode routingNode = allocation.routingNodes().node(discoNode.id());
    allocation.addIgnoreDisable(shardRouting.shardId(), routingNode.nodeId());
    if (!allocation.deciders().canAllocate(shardRouting, routingNode, allocation).allowed()) {
      throw new ElasticSearchIllegalArgumentException(
          "[allocate] allocation of " + shardId + " on node " + discoNode + " is not allowed");
    }
    // go over and remove it from the unassigned
    for (Iterator<MutableShardRouting> it = allocation.routingNodes().unassigned().iterator();
        it.hasNext(); ) {
      if (it.next() != shardRouting) {
        continue;
      }
      it.remove();
      routingNode.add(shardRouting);
      break;
    }
  }
  @Override
  public RerouteExplanation execute(RoutingAllocation allocation, boolean explain) {
    DiscoveryNode discoNode = allocation.nodes().resolveNode(node);
    boolean found = false;
    for (RoutingNodes.RoutingNodeIterator it =
            allocation.routingNodes().routingNodeIter(discoNode.id());
        it.hasNext(); ) {
      ShardRouting shardRouting = it.next();
      if (!shardRouting.shardId().equals(shardId)) {
        continue;
      }
      found = true;
      if (shardRouting.relocatingNodeId() != null) {
        if (shardRouting.initializing()) {
          // the shard is initializing and recovering from another node, simply cancel the recovery
          it.remove();
           // and cancel the relocating state from the shard it's being relocated from
          RoutingNode relocatingFromNode =
              allocation.routingNodes().node(shardRouting.relocatingNodeId());
          if (relocatingFromNode != null) {
            for (ShardRouting fromShardRouting : relocatingFromNode) {
              if (fromShardRouting.isSameShard(shardRouting)
                  && fromShardRouting.state() == RELOCATING) {
                allocation.routingNodes().cancelRelocation(fromShardRouting);
                break;
              }
            }
          }
        } else if (shardRouting.relocating()) {

          // the shard is relocating to another node, cancel the recovery on the other node, and
          // deallocate this one
          if (!allowPrimary && shardRouting.primary()) {
            // can't cancel a primary shard being initialized
            if (explain) {
              return new RerouteExplanation(
                  this,
                  allocation.decision(
                      Decision.NO,
                      "cancel_allocation_command",
                      "can't cancel "
                          + shardId
                          + " on node "
                          + discoNode
                          + ", shard is primary and initializing its state"));
            }
            throw new IllegalArgumentException(
                "[cancel_allocation] can't cancel "
                    + shardId
                    + " on node "
                    + discoNode
                    + ", shard is primary and initializing its state");
          }
          it.moveToUnassigned(new UnassignedInfo(UnassignedInfo.Reason.REROUTE_CANCELLED, null));
          // now, go and find the shard that is initializing on the target node, and cancel it as
          // well...
          RoutingNodes.RoutingNodeIterator initializingNode =
              allocation.routingNodes().routingNodeIter(shardRouting.relocatingNodeId());
          if (initializingNode != null) {
            while (initializingNode.hasNext()) {
              ShardRouting initializingShardRouting = initializingNode.next();
              if (initializingShardRouting.isRelocationTargetOf(shardRouting)) {
                initializingNode.remove();
              }
            }
          }
        }
      } else {
        // the shard is not relocating; it's either started or initializing, so just
        // cancel it and move on...
        if (!allowPrimary && shardRouting.primary()) {
          // can't cancel a primary shard being initialized
          if (explain) {
            return new RerouteExplanation(
                this,
                allocation.decision(
                    Decision.NO,
                    "cancel_allocation_command",
                    "can't cancel "
                        + shardId
                        + " on node "
                        + discoNode
                        + ", shard is primary and started"));
          }
          throw new IllegalArgumentException(
              "[cancel_allocation] can't cancel "
                  + shardId
                  + " on node "
                  + discoNode
                  + ", shard is primary and started");
        }
        it.moveToUnassigned(new UnassignedInfo(UnassignedInfo.Reason.REROUTE_CANCELLED, null));
      }
    }
    if (!found) {
      if (explain) {
        return new RerouteExplanation(
            this,
            allocation.decision(
                Decision.NO,
                "cancel_allocation_command",
                "can't cancel " + shardId + ", failed to find it on node " + discoNode));
      }
      throw new IllegalArgumentException(
          "[cancel_allocation] can't cancel "
              + shardId
              + ", failed to find it on node "
              + discoNode);
    }
    return new RerouteExplanation(
        this,
        allocation.decision(
            Decision.YES,
            "cancel_allocation_command",
            "shard " + shardId + " on node " + discoNode + " can be cancelled"));
  }
  @Override
  public Decision canAllocate(
      ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
    final Decision decision = earlyTerminate(allocation);
    if (decision != null) {
      return decision;
    }

    final double usedDiskThresholdLow = 100.0 - DiskThresholdDecider.this.freeDiskThresholdLow;
    final double usedDiskThresholdHigh = 100.0 - DiskThresholdDecider.this.freeDiskThresholdHigh;
    ClusterInfo clusterInfo = allocation.clusterInfo();
    Map<String, DiskUsage> usages = clusterInfo.getNodeMostAvailableDiskUsages();
    DiskUsage usage = getDiskUsage(node, allocation, usages);
     // First, check whether the node is currently over the low watermark
    double freeDiskPercentage = usage.getFreeDiskAsPercentage();
    // Cache the used disk percentage for displaying disk percentages consistent with documentation
    double usedDiskPercentage = usage.getUsedDiskAsPercentage();
    long freeBytes = usage.getFreeBytes();
    if (logger.isTraceEnabled()) {
      logger.trace("node [{}] has {}% used disk", node.nodeId(), usedDiskPercentage);
    }

    // a flag for whether the primary shard has been previously allocated
    boolean primaryHasBeenAllocated =
        shardRouting.primary() && shardRouting.allocatedPostIndexCreate();

    // checks for exact byte comparisons
    if (freeBytes < freeBytesThresholdLow.bytes()) {
      // If the shard is a replica or has a primary that has already been allocated before, check
      // the low threshold
      if (!shardRouting.primary() || (shardRouting.primary() && primaryHasBeenAllocated)) {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "less than the required {} free bytes threshold ({} bytes free) on node {}, preventing allocation",
              freeBytesThresholdLow,
              freeBytes,
              node.nodeId());
        }
        return allocation.decision(
            Decision.NO,
            NAME,
            "less than required [%s] free on node, free: [%s]",
            freeBytesThresholdLow,
            new ByteSizeValue(freeBytes));
      } else if (freeBytes > freeBytesThresholdHigh.bytes()) {
        // Allow the shard to be allocated because it is a primary that has never
        // been allocated, as long as the node is under the high watermark
        if (logger.isDebugEnabled()) {
          logger.debug(
              "less than the required {} free bytes threshold ({} bytes free) on node {}, "
                  + "but allowing allocation because primary has never been allocated",
              freeBytesThresholdLow,
              freeBytes,
              node.nodeId());
        }
        return allocation.decision(Decision.YES, NAME, "primary has never been allocated before");
      } else {
        // Even though the primary has never been allocated, the node is
        // above the high watermark, so don't allow allocating the shard
        if (logger.isDebugEnabled()) {
          logger.debug(
              "less than the required {} free bytes threshold ({} bytes free) on node {}, "
                  + "preventing allocation even though primary has never been allocated",
              freeBytesThresholdHigh,
              freeBytes,
              node.nodeId());
        }
        return allocation.decision(
            Decision.NO,
            NAME,
            "less than required [%s] free on node, free: [%s]",
            freeBytesThresholdHigh,
            new ByteSizeValue(freeBytes));
      }
    }

    // checks for percentage comparisons
    if (freeDiskPercentage < freeDiskThresholdLow) {
      // If the shard is a replica or has a primary that has already been allocated before, check
      // the low threshold
      if (!shardRouting.primary() || (shardRouting.primary() && primaryHasBeenAllocated)) {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "more than the allowed {} used disk threshold ({} used) on node [{}], preventing allocation",
              Strings.format1Decimals(usedDiskThresholdLow, "%"),
              Strings.format1Decimals(usedDiskPercentage, "%"),
              node.nodeId());
        }
        return allocation.decision(
            Decision.NO,
            NAME,
            "more than allowed [%s%%] used disk on node, free: [%s%%]",
            usedDiskThresholdLow,
            freeDiskPercentage);
      } else if (freeDiskPercentage > freeDiskThresholdHigh) {
        // Allow the shard to be allocated because it is a primary that has never
        // been allocated, as long as the node is under the high watermark
        if (logger.isDebugEnabled()) {
          logger.debug(
              "more than the allowed {} used disk threshold ({} used) on node [{}], "
                  + "but allowing allocation because primary has never been allocated",
              Strings.format1Decimals(usedDiskThresholdLow, "%"),
              Strings.format1Decimals(usedDiskPercentage, "%"),
              node.nodeId());
        }
        return allocation.decision(Decision.YES, NAME, "primary has never been allocated before");
      } else {
        // Even though the primary has never been allocated, the node is
        // above the high watermark, so don't allow allocating the shard
        if (logger.isDebugEnabled()) {
          logger.debug(
              "less than the required {} free bytes threshold ({} bytes free) on node {}, "
                  + "preventing allocation even though primary has never been allocated",
              Strings.format1Decimals(freeDiskThresholdHigh, "%"),
              Strings.format1Decimals(freeDiskPercentage, "%"),
              node.nodeId());
        }
        return allocation.decision(
            Decision.NO,
            NAME,
            "more than allowed [%s%%] used disk on node, free: [%s%%]",
            usedDiskThresholdHigh,
            freeDiskPercentage);
      }
    }

    // Secondly, check that allocating the shard to this node doesn't put it above the high
    // watermark
    final long shardSize = getShardSize(shardRouting, allocation.clusterInfo());
    double freeSpaceAfterShard = freeDiskPercentageAfterShardAssigned(usage, shardSize);
    long freeBytesAfterShard = freeBytes - shardSize;
    if (freeBytesAfterShard < freeBytesThresholdHigh.bytes()) {
      logger.warn(
          "after allocating, node [{}] would have less than the required {} free bytes threshold ({} bytes free), preventing allocation",
          node.nodeId(),
          freeBytesThresholdHigh,
          freeBytesAfterShard);
      return allocation.decision(
          Decision.NO,
          NAME,
          "after allocation less than required [%s] free on node, free: [%s]",
          freeBytesThresholdHigh,
          new ByteSizeValue(freeBytesAfterShard));
    }
    if (freeSpaceAfterShard < freeDiskThresholdHigh) {
      logger.warn(
          "after allocating, node [{}] would have more than the allowed {} free disk threshold ({} free), preventing allocation",
          node.nodeId(),
          Strings.format1Decimals(freeDiskThresholdHigh, "%"),
          Strings.format1Decimals(freeSpaceAfterShard, "%"));
      return allocation.decision(
          Decision.NO,
          NAME,
          "after allocation more than allowed [%s%%] used disk on node, free: [%s%%]",
          usedDiskThresholdHigh,
          freeSpaceAfterShard);
    }

    return allocation.decision(
        Decision.YES,
        NAME,
        "enough disk for shard on node, free: [%s]",
        new ByteSizeValue(freeBytes));
  }
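The final "would it still fit" step is plain arithmetic: subtract the shard's expected size from the node's free bytes and recompute the free percentage against the total. A worked sketch (freeDiskPercentageAfterShardAssigned is reconstructed here from that description, not copied from ES):

final class WatermarkMathSketch {
  static double freeDiskPercentageAfterShardAssigned(long totalBytes, long freeBytes, long shardSize) {
    if (totalBytes == 0) return 0.0;
    return 100.0 * (freeBytes - shardSize) / totalBytes;
  }

  public static void main(String[] args) {
    long total = 100L << 30, free = 20L << 30, shard = 12L << 30; // 100 GiB disk, 20 GiB free, 12 GiB shard
    double after = freeDiskPercentageAfterShardAssigned(total, free, shard);
    System.out.println(after); // 8.0 -> below a 10% high watermark, so allocation is refused
  }
}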
  public boolean allocateUnassigned(RoutingAllocation allocation) {
    boolean changed = false;
    DiscoveryNodes nodes = allocation.nodes();
    RoutingNodes routingNodes = allocation.routingNodes();

    // First, handle primaries, they must find a place to be allocated on here
    Iterator<MutableShardRouting> unassignedIterator = routingNodes.unassigned().iterator();
    while (unassignedIterator.hasNext()) {
      MutableShardRouting shard = unassignedIterator.next();

      if (!shard.primary()) {
        continue;
      }

      // this is an API allocation, ignore since we know there is no data...
      if (!routingNodes
          .routingTable()
          .index(shard.index())
          .shard(shard.id())
          .primaryAllocatedPostApi()) {
        continue;
      }

      ObjectLongOpenHashMap<DiscoveryNode> nodesState = buildShardStates(nodes, shard);

      int numberOfAllocationsFound = 0;
      long highestVersion = -1;
      Set<DiscoveryNode> nodesWithHighestVersion = Sets.newHashSet();
      final boolean[] states = nodesState.allocated;
      final Object[] keys = nodesState.keys;
      final long[] values = nodesState.values;
      for (int i = 0; i < states.length; i++) {
        if (!states[i]) {
          continue;
        }

        DiscoveryNode node = (DiscoveryNode) keys[i];
        long version = values[i];
        // since we don't check in NO allocation, we need to double check here
        if (allocation.shouldIgnoreShardForNode(shard.shardId(), node.id())) {
          continue;
        }
        if (version != -1) {
          numberOfAllocationsFound++;
          if (highestVersion == -1) {
            nodesWithHighestVersion.add(node);
            highestVersion = version;
          } else {
            if (version > highestVersion) {
              nodesWithHighestVersion.clear();
              nodesWithHighestVersion.add(node);
              highestVersion = version;
            } else if (version == highestVersion) {
              nodesWithHighestVersion.add(node);
            }
          }
        }
      }

      // check if the count meets the required minimum
      int requiredAllocation = 1;
      // if we restore from a repository, one copy is more than enough
      if (shard.restoreSource() == null) {
        try {
          IndexMetaData indexMetaData = routingNodes.metaData().index(shard.index());
          String initialShards =
              indexMetaData
                  .settings()
                  .get(
                      INDEX_RECOVERY_INITIAL_SHARDS,
                      settings.get(INDEX_RECOVERY_INITIAL_SHARDS, this.initialShards));
          if ("quorum".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 1) {
              requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2) + 1;
            }
          } else if ("quorum-1".equals(initialShards) || "half".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 2) {
              requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2);
            }
          } else if ("one".equals(initialShards)) {
            requiredAllocation = 1;
          } else if ("full".equals(initialShards) || "all".equals(initialShards)) {
            requiredAllocation = indexMetaData.numberOfReplicas() + 1;
          } else if ("full-1".equals(initialShards) || "all-1".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 1) {
              requiredAllocation = indexMetaData.numberOfReplicas();
            }
          } else {
            requiredAllocation = Integer.parseInt(initialShards);
          }
        } catch (Exception e) {
          logger.warn(
              "[{}][{}] failed to derived initial_shards from value {}, ignore allocation for {}",
              shard.index(),
              shard.id(),
              initialShards,
              shard);
        }
      }

      // not enough found for this shard, continue...
      if (numberOfAllocationsFound < requiredAllocation) {
        // if we are restoring this shard we still can allocate
        if (shard.restoreSource() == null) {
          // we can't really allocate, so ignore it and continue
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: not allocating, number_of_allocated_shards_found [{}], required_number [{}]",
                shard.index(),
                shard.id(),
                numberOfAllocationsFound,
                requiredAllocation);
          }
        } else if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: missing local data, will restore from [{}]",
              shard.index(),
              shard.id(),
              shard.restoreSource());
        }
        continue;
      }

      Set<DiscoveryNode> throttledNodes = Sets.newHashSet();
      Set<DiscoveryNode> noNodes = Sets.newHashSet();
      for (DiscoveryNode discoNode : nodesWithHighestVersion) {
        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          throttledNodes.add(discoNode);
        } else if (decision.type() == Decision.Type.NO) {
          noNodes.add(discoNode);
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] on primary allocation",
                shard.index(),
                shard.id(),
                shard,
                discoNode);
          }
          // we found a match
          changed = true;
          // make sure we create one with the version from the recovered state
          allocation
              .routingNodes()
              .assign(new MutableShardRouting(shard, highestVersion), node.nodeId());
          unassignedIterator.remove();

          // found a node, so no throttling, no "no", and break out of the loop
          throttledNodes.clear();
          noNodes.clear();
          break;
        }
      }
      if (throttledNodes.isEmpty()) {
        // if we have a node that we "can't" allocate to, force allocation, since this is our master
        // data!
        if (!noNodes.isEmpty()) {
          DiscoveryNode discoNode = noNodes.iterator().next();
          RoutingNode node = routingNodes.node(discoNode.id());
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: forcing allocating [{}] to [{}] on primary allocation",
                shard.index(),
                shard.id(),
                shard,
                discoNode);
          }
          // we found a match
          changed = true;
          // make sure we create one with the version from the recovered state
          allocation
              .routingNodes()
              .assign(new MutableShardRouting(shard, highestVersion), node.nodeId());
          unassignedIterator.remove();
        }
      } else {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: throttling allocation [{}] to [{}] on primary allocation",
              shard.index(),
              shard.id(),
              shard,
              throttledNodes);
        }
        // we are throttling this, but we have enough to allocate to this node, ignore it for now
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
      }
    }

    if (!routingNodes.hasUnassigned()) {
      return changed;
    }

    // Now, handle replicas, try to assign them to nodes that are similar to the one the primary was
    // allocated on
    unassignedIterator = routingNodes.unassigned().iterator();
    while (unassignedIterator.hasNext()) {
      MutableShardRouting shard = unassignedIterator.next();

      // pre-check if it can be allocated to any node that currently exists, so we won't list the
      // store for it for nothing
      boolean canBeAllocatedToAtLeastOneNode = false;
      for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
        RoutingNode node = routingNodes.node(cursor.value.id());
        if (node == null) {
          continue;
        }
        // if the shard cannot be allocated on any node, skip it; this handles,
        // for example, allocating a replica only after its primary has been assigned
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.YES) {
          canBeAllocatedToAtLeastOneNode = true;
          break;
        }
      }

      if (!canBeAllocatedToAtLeastOneNode) {
        continue;
      }

      Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> shardStores =
          buildShardStores(nodes, shard);

      long lastSizeMatched = 0;
      DiscoveryNode lastDiscoNodeMatched = null;
      RoutingNode lastNodeMatched = null;

      for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData>
          nodeStoreEntry : shardStores.entrySet()) {
        DiscoveryNode discoNode = nodeStoreEntry.getKey();
        TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData =
            nodeStoreEntry.getValue();
        logger.trace("{}: checking node [{}]", shard, discoNode);

        if (storeFilesMetaData == null) {
          // already allocated on that node...
          continue;
        }

        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        // check if we can allocate on that node...
        // we only check for NO, since if this node is THROTTLING and it has enough "same data"
        // then we will try and assign it next time
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.NO) {
          continue;
        }

        // if it is already allocated, we can't assign to it...
        if (storeFilesMetaData.allocated()) {
          continue;
        }

        if (!shard.primary()) {
          MutableShardRouting primaryShard = routingNodes.activePrimary(shard);
          if (primaryShard != null) {
            assert primaryShard.active();
            DiscoveryNode primaryNode = nodes.get(primaryShard.currentNodeId());
            if (primaryNode != null) {
              TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore =
                  shardStores.get(primaryNode);
              if (primaryNodeStore != null && primaryNodeStore.allocated()) {
                long sizeMatched = 0;

                for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
                  if (primaryNodeStore.fileExists(storeFileMetaData.name())
                      && primaryNodeStore
                          .file(storeFileMetaData.name())
                          .isSame(storeFileMetaData)) {
                    sizeMatched += storeFileMetaData.length();
                  }
                }
                logger.trace(
                    "{}: node [{}] has [{}/{}] bytes of re-usable data",
                    shard,
                    discoNode.name(),
                    new ByteSizeValue(sizeMatched),
                    sizeMatched);
                if (sizeMatched > lastSizeMatched) {
                  lastSizeMatched = sizeMatched;
                  lastDiscoNodeMatched = discoNode;
                  lastNodeMatched = node;
                }
              }
            }
          }
        }
      }

      if (lastNodeMatched != null) {
        // we only check on THROTTLE since we already checked for NO above
        Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we are throttling this, but we have enough to allocate to this node, ignore it for now
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we found a match
          changed = true;
          allocation.routingNodes().assign(shard, lastNodeMatched.nodeId());
          unassignedIterator.remove();
        }
      }
    }
    return changed;
  }
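The initial_shards handling above boils down to small integer formulas over number_of_replicas: with 2 replicas (3 copies total), "quorum" needs (1 + 2) / 2 + 1 = 2 copies found before the primary is allocated, while "all" needs 3. A compact restatement of the same mapping:

final class InitialShardsSketch {
  static int requiredAllocation(String initialShards, int replicas) {
    switch (initialShards) {
      case "quorum":
        return replicas > 1 ? (1 + replicas) / 2 + 1 : 1;
      case "quorum-1":
      case "half":
        return replicas > 2 ? (1 + replicas) / 2 : 1;
      case "one":
        return 1;
      case "full":
      case "all":
        return replicas + 1;
      case "full-1":
      case "all-1":
        return replicas > 1 ? replicas : 1;
      default:
        return Integer.parseInt(initialShards); // explicit numeric override
    }
  }

  public static void main(String[] args) {
    System.out.println(requiredAllocation("quorum", 2)); // 2
    System.out.println(requiredAllocation("all", 2));    // 3
  }
}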
Example #21
  public boolean allocateUnassigned(RoutingAllocation allocation) {
    boolean changed = false;
    DiscoveryNodes nodes = allocation.nodes();
    RoutingNodes routingNodes = allocation.routingNodes();

    // First, handle primaries, they must find a place to be allocated on here
    final MetaData metaData = routingNodes.metaData();
    RoutingNodes.UnassignedShards unassigned = routingNodes.unassigned();
    unassigned.sort(
        new PriorityComparator() {

          @Override
          protected Settings getIndexSettings(String index) {
            IndexMetaData indexMetaData = metaData.index(index);
            return indexMetaData.getSettings();
          }
        }); // sort for priority ordering
    Iterator<ShardRouting> unassignedIterator = unassigned.iterator();
    while (unassignedIterator.hasNext()) {
      ShardRouting shard = unassignedIterator.next();

      if (!shard.primary()) {
        continue;
      }

      // this is an API allocation, ignore since we know there is no data...
      if (!routingNodes
          .routingTable()
          .index(shard.index())
          .shard(shard.id())
          .primaryAllocatedPostApi()) {
        continue;
      }

      AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetch =
          asyncFetchStarted.get(shard.shardId());
      if (fetch == null) {
        fetch = new InternalAsyncFetch<>(logger, "shard_started", shard.shardId(), startedAction);
        asyncFetchStarted.put(shard.shardId(), fetch);
      }
      AsyncShardFetch.FetchResult<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards>
          shardState = fetch.fetchData(nodes, metaData, allocation.getIgnoreNodes(shard.shardId()));
      if (shardState.hasData() == false) {
        logger.trace("{}: ignoring allocation, still fetching shard started state", shard);
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
        continue;
      }
      shardState.processAllocation(allocation);

      IndexMetaData indexMetaData = metaData.index(shard.getIndex());

      /*
       * Build a map of DiscoveryNodes to shard state number for the given shard. A state
       * of -1 means the shard does not exist on the node, where any shard state >= 0 is
       * the state version of the shard on that node's disk.
       *
       * A shard on shared storage will return at least shard state 0 for all nodes,
       * indicating that the shard can be allocated to any node.
       */
      ObjectLongHashMap<DiscoveryNode> nodesState = new ObjectLongHashMap<>();
      for (TransportNodesListGatewayStartedShards.NodeGatewayStartedShards nodeShardState :
          shardState.getData().values()) {
        long version = nodeShardState.version();
        // a version of -1 means the shard does not exist on the node, which is what the API returns and what we expect
        logger.trace(
            "[{}] on node [{}] has version [{}] of shard",
            shard,
            nodeShardState.getNode(),
            version);
        nodesState.put(nodeShardState.getNode(), version);
      }

      int numberOfAllocationsFound = 0;
      long highestVersion = -1;
      final Map<DiscoveryNode, Long> nodesWithVersion = Maps.newHashMap();

      assert !nodesState.containsKey(null);
      final Object[] keys = nodesState.keys;
      final long[] values = nodesState.values;
      Settings idxSettings = indexMetaData.settings();
      for (int i = 0; i < keys.length; i++) {
        if (keys[i] == null) {
          continue;
        }

        DiscoveryNode node = (DiscoveryNode) keys[i];
        long version = values[i];
        // since we don't check in NO allocation, we need to double check here
        if (allocation.shouldIgnoreShardForNode(shard.shardId(), node.id())) {
          continue;
        }
        if (recoverOnAnyNode(idxSettings)) {
          numberOfAllocationsFound++;
          if (version > highestVersion) {
            highestVersion = version;
          }
          // We always put the node without clearing the map
          nodesWithVersion.put(node, version);
        } else if (version != -1) {
          numberOfAllocationsFound++;
          // If we've found a new "best" candidate, clear the
          // current candidates and add it
          if (version > highestVersion) {
            highestVersion = version;
            nodesWithVersion.clear();
            nodesWithVersion.put(node, version);
          } else if (version == highestVersion) {
            // If the candidate is the same, add it to the
            // list, but keep the current candidate
            nodesWithVersion.put(node, version);
          }
        }
      }
      // Now that we have a map of nodes to versions along with the
      // number of allocations found (and not ignored), we need to sort
      // it so the node with the highest version is at the beginning
      List<DiscoveryNode> nodesWithHighestVersion = Lists.newArrayList();
      nodesWithHighestVersion.addAll(nodesWithVersion.keySet());
      CollectionUtil.timSort(
          nodesWithHighestVersion,
          new Comparator<DiscoveryNode>() {
            @Override
            public int compare(DiscoveryNode o1, DiscoveryNode o2) {
              return Long.compare(nodesWithVersion.get(o2), nodesWithVersion.get(o1));
            }
          });

      if (logger.isDebugEnabled()) {
        logger.debug(
            "[{}][{}] found {} allocations of {}, highest version: [{}]",
            shard.index(),
            shard.id(),
            numberOfAllocationsFound,
            shard,
            highestVersion);
      }
      if (logger.isTraceEnabled()) {
        StringBuilder sb = new StringBuilder("[");
        for (DiscoveryNode n : nodesWithHighestVersion) {
          sb.append("[");
          sb.append(n.getName());
          sb.append("]");
          sb.append(" -> ");
          sb.append(nodesWithVersion.get(n));
          sb.append(", ");
        }
        sb.append("]");
        logger.trace("{} candidates for allocation: {}", shard, sb.toString());
      }

      // check if the count meets the required minimum
      int requiredAllocation = 1;
      // if we restore from a repository, one copy is more than enough
      if (shard.restoreSource() == null) {
        try {
          String initialShards =
              indexMetaData
                  .settings()
                  .get(
                      INDEX_RECOVERY_INITIAL_SHARDS,
                      settings.get(INDEX_RECOVERY_INITIAL_SHARDS, this.initialShards));
          if ("quorum".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 1) {
              requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2) + 1;
            }
          } else if ("quorum-1".equals(initialShards) || "half".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 2) {
              requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2);
            }
          } else if ("one".equals(initialShards)) {
            requiredAllocation = 1;
          } else if ("full".equals(initialShards) || "all".equals(initialShards)) {
            requiredAllocation = indexMetaData.numberOfReplicas() + 1;
          } else if ("full-1".equals(initialShards) || "all-1".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 1) {
              requiredAllocation = indexMetaData.numberOfReplicas();
            }
          } else {
            requiredAllocation = Integer.parseInt(initialShards);
          }
        } catch (Exception e) {
          logger.warn(
              "[{}][{}] failed to derived initial_shards from value {}, ignore allocation for {}",
              shard.index(),
              shard.id(),
              initialShards,
              shard);
        }
      }

      // not enough found for this shard, continue...
      if (numberOfAllocationsFound < requiredAllocation) {
        // if we are restoring this shard we still can allocate
        if (shard.restoreSource() == null) {
          // we can't really allocate, so ignore it and continue
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: not allocating, number_of_allocated_shards_found [{}], required_number [{}]",
                shard.index(),
                shard.id(),
                numberOfAllocationsFound,
                requiredAllocation);
          }
        } else if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: missing local data, will restore from [{}]",
              shard.index(),
              shard.id(),
              shard.restoreSource());
        }
        continue;
      }

      Set<DiscoveryNode> throttledNodes = Sets.newHashSet();
      Set<DiscoveryNode> noNodes = Sets.newHashSet();
      for (DiscoveryNode discoNode : nodesWithHighestVersion) {
        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          throttledNodes.add(discoNode);
        } else if (decision.type() == Decision.Type.NO) {
          noNodes.add(discoNode);
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] on primary allocation",
                shard.index(),
                shard.id(),
                shard,
                discoNode);
          }
          // we found a match
          changed = true;
          // make sure we create one with the version from the recovered state
          routingNodes.initialize(new ShardRouting(shard, highestVersion), node.nodeId());
          unassignedIterator.remove();

          // found a node, so no throttling, no "no", and break out of the loop
          throttledNodes.clear();
          noNodes.clear();
          break;
        }
      }
      if (throttledNodes.isEmpty()) {
        // if we have a node that we "can't" allocate to, force allocation, since this is our master
        // data!
        if (!noNodes.isEmpty()) {
          DiscoveryNode discoNode = noNodes.iterator().next();
          RoutingNode node = routingNodes.node(discoNode.id());
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: forcing allocating [{}] to [{}] on primary allocation",
                shard.index(),
                shard.id(),
                shard,
                discoNode);
          }
          // we found a match
          changed = true;
          // make sure we create one with the version from the recovered state
          routingNodes.initialize(new ShardRouting(shard, highestVersion), node.nodeId());
          unassignedIterator.remove();
        }
      } else {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: throttling allocation [{}] to [{}] on primary allocation",
              shard.index(),
              shard.id(),
              shard,
              throttledNodes);
        }
        // we are throttling this, but we have enough to allocate to this node, ignore it for now
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
      }
    }

    if (!routingNodes.hasUnassigned()) {
      return changed;
    }

    // Now, handle replicas, try to assign them to nodes that are similar to the one the primary was
    // allocated on
    unassignedIterator = unassigned.iterator();
    while (unassignedIterator.hasNext()) {
      ShardRouting shard = unassignedIterator.next();
      if (shard.primary()) {
        continue;
      }

      // pre-check if it can be allocated to any node that currently exists, so we won't list the
      // store for it for nothing
      boolean canBeAllocatedToAtLeastOneNode = false;
      for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
        RoutingNode node = routingNodes.node(cursor.value.id());
        if (node == null) {
          continue;
        }
        // if the shard cannot be allocated on any node, skip it; this handles,
        // for example, allocating a replica only after its primary has been assigned
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.YES) {
          canBeAllocatedToAtLeastOneNode = true;
          break;
        }
      }

      if (!canBeAllocatedToAtLeastOneNode) {
        logger.trace("{}: ignoring allocation, can't be allocated on any node", shard);
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
        continue;
      }

      AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> fetch =
          asyncFetchStore.get(shard.shardId());
      if (fetch == null) {
        fetch = new InternalAsyncFetch<>(logger, "shard_store", shard.shardId(), storeAction);
        asyncFetchStore.put(shard.shardId(), fetch);
      }
      AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          shardStores =
              fetch.fetchData(nodes, metaData, allocation.getIgnoreNodes(shard.shardId()));
      if (shardStores.hasData() == false) {
        logger.trace("{}: ignoring allocation, still fetching shard stores", shard);
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
        continue; // still fetching
      }
      shardStores.processAllocation(allocation);

      long lastSizeMatched = 0;
      DiscoveryNode lastDiscoNodeMatched = null;
      RoutingNode lastNodeMatched = null;
      boolean hasReplicaData = false;
      IndexMetaData indexMetaData = metaData.index(shard.getIndex());

      for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          nodeStoreEntry : shardStores.getData().entrySet()) {
        DiscoveryNode discoNode = nodeStoreEntry.getKey();
        TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData =
            nodeStoreEntry.getValue().storeFilesMetaData();
        logger.trace("{}: checking node [{}]", shard, discoNode);

        if (storeFilesMetaData == null) {
          // already allocated on that node...
          continue;
        }

        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        // check if we can allocate on that node...
        // we only check for NO, since if this node is THROTTLING and it has enough "same data"
        // then we will try and assign it next time
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.NO) {
          continue;
        }

        // if it is already allocated, we can't assign to it...
        if (storeFilesMetaData.allocated()) {
          continue;
        }

        if (!shard.primary()) {
          hasReplicaData |= storeFilesMetaData.iterator().hasNext();
          ShardRouting primaryShard = routingNodes.activePrimary(shard);
          if (primaryShard != null) {
            assert primaryShard.active();
            DiscoveryNode primaryNode = nodes.get(primaryShard.currentNodeId());
            if (primaryNode != null) {
              TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData primaryNodeFilesStore =
                  shardStores.getData().get(primaryNode);
              if (primaryNodeFilesStore != null) {
                TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore =
                    primaryNodeFilesStore.storeFilesMetaData();
                if (primaryNodeStore != null && primaryNodeStore.allocated()) {
                  long sizeMatched = 0;

                  String primarySyncId = primaryNodeStore.syncId();
                  String replicaSyncId = storeFilesMetaData.syncId();
                  // see if we have a sync id we can make use of
                  if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) {
                    logger.trace(
                        "{}: node [{}] has same sync id {} as primary",
                        shard,
                        discoNode.name(),
                        replicaSyncId);
                    lastNodeMatched = node;
                    lastSizeMatched = Long.MAX_VALUE;
                    lastDiscoNodeMatched = discoNode;
                  } else {
                    for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
                      String metaDataFileName = storeFileMetaData.name();
                      if (primaryNodeStore.fileExists(metaDataFileName)
                          && primaryNodeStore.file(metaDataFileName).isSame(storeFileMetaData)) {
                        sizeMatched += storeFileMetaData.length();
                      }
                    }
                    logger.trace(
                        "{}: node [{}] has [{}/{}] bytes of re-usable data",
                        shard,
                        discoNode.name(),
                        new ByteSizeValue(sizeMatched),
                        sizeMatched);
                    if (sizeMatched > lastSizeMatched) {
                      lastSizeMatched = sizeMatched;
                      lastDiscoNodeMatched = discoNode;
                      lastNodeMatched = node;
                    }
                  }
                }
              }
            }
          }
        }
      }

      if (lastNodeMatched != null) {
        // we only check on THROTTLE since we checked before on NO
        Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we are throttling this, but we have enough to allocate to this node, ignore it for now
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we found a match
          changed = true;
          routingNodes.initialize(shard, lastNodeMatched.nodeId());
          unassignedIterator.remove();
        }
      } else if (hasReplicaData == false) {
        // if we didn't manage to find *any* data (regardless of matching sizes), check if the
        // allocation of the replica shard needs to be delayed, and if so, add it to the ignore
        // unassigned list
        // note: we only care about replicas in delayed allocation, since an unassigned primary
        //       will anyhow wait to find an existing copy of the shard to be allocated
        // note: the other side of the equation is scheduling a reroute in a timely manner, which
        //       happens in the RoutingService
        long delay =
            shard
                .unassignedInfo()
                .getDelayAllocationExpirationIn(settings, indexMetaData.getSettings());
        if (delay > 0) {
          logger.debug(
              "[{}][{}]: delaying allocation of [{}] for [{}]",
              shard.index(),
              shard.id(),
              shard,
              TimeValue.timeValueMillis(delay));
          // mark it as changed, since we want to kick a publishing to schedule future allocation,
          // see org.elasticsearch.cluster.routing.RoutingService#clusterChanged(ClusterChangedEvent)
          changed = true;
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
        }
      }
    }
    return changed;
  }
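
// A minimal, self-contained sketch of the store-reuse scoring used above, with plain
// maps of file name -> length standing in for StoreFilesMetaData (a hypothetical
// simplification: the real metadata also compares checksums via isSame(), not just
// lengths). A sync-id match is treated as a perfect match (Long.MAX_VALUE); otherwise
// the score is the number of bytes whose name and length line up with the primary.
import java.util.Map;

final class StoreMatchSketch {
  static long matchScore(
      String primarySyncId,
      Map<String, Long> primaryFiles,
      String replicaSyncId,
      Map<String, Long> replicaFiles) {
    if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) {
      return Long.MAX_VALUE; // identical sync id: nothing needs to be copied
    }
    long sizeMatched = 0;
    for (Map.Entry<String, Long> file : replicaFiles.entrySet()) {
      Long primaryLength = primaryFiles.get(file.getKey());
      if (primaryLength != null && primaryLength.equals(file.getValue())) {
        sizeMatched += file.getValue(); // re-usable bytes already on disk
      }
    }
    return sizeMatched;
  }

  public static void main(String[] args) {
    long score =
        matchScore(
            "sync-1",
            Map.of("_0.cfs", 1024L, "_1.cfs", 2048L),
            null,
            Map.of("_0.cfs", 1024L));
    System.out.println(score); // 1024: only _0.cfs lines up with the primary
  }
}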
  public boolean allocateUnassigned(RoutingAllocation allocation) {
    boolean changed = false;
    final RoutingNodes routingNodes = allocation.routingNodes();
    final MetaData metaData = routingNodes.metaData();

    final RoutingNodes.UnassignedShards.UnassignedIterator unassignedIterator =
        routingNodes.unassigned().iterator();
    while (unassignedIterator.hasNext()) {
      ShardRouting shard = unassignedIterator.next();
      if (shard.primary()) {
        continue;
      }

      // pre-check if it can be allocated to any node that currently exists, so we won't list the
      // store for it for nothing
      boolean canBeAllocatedToAtLeastOneNode = false;
      for (ObjectCursor<DiscoveryNode> cursor : allocation.nodes().dataNodes().values()) {
        RoutingNode node = routingNodes.node(cursor.value.id());
        if (node == null) {
          continue;
        }
        // if we can't allocate it on a node, ignore it, for example, this handles
        // cases for only allocating a replica after a primary
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.YES) {
          canBeAllocatedToAtLeastOneNode = true;
          break;
        }
      }

      if (!canBeAllocatedToAtLeastOneNode) {
        logger.trace("{}: ignoring allocation, can't be allocated on any node", shard);
        unassignedIterator.removeAndIgnore();
        continue;
      }

      AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          shardStores = fetchData(shard, allocation);
      if (shardStores.hasData() == false) {
        logger.trace("{}: ignoring allocation, still fetching shard stores", shard);
        unassignedIterator.removeAndIgnore();
        continue; // still fetching
      }

      long lastSizeMatched = 0;
      DiscoveryNode lastDiscoNodeMatched = null;
      RoutingNode lastNodeMatched = null;
      boolean hasReplicaData = false;
      IndexMetaData indexMetaData = metaData.index(shard.getIndex());

      for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          nodeStoreEntry : shardStores.getData().entrySet()) {
        DiscoveryNode discoNode = nodeStoreEntry.getKey();
        TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData =
            nodeStoreEntry.getValue().storeFilesMetaData();
        logger.trace("{}: checking node [{}]", shard, discoNode);

        if (storeFilesMetaData == null) {
          // already allocated on that node...
          continue;
        }

        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        // check if we can allocate on that node...
        // we only check for NO, since if this node is THROTTLING and it has enough "same data"
        // then we will try and assign it next time
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.NO) {
          continue;
        }

        // if it is already allocated, we can't assign to it...
        if (storeFilesMetaData.allocated()) {
          continue;
        }

        if (!shard.primary()) {
          hasReplicaData |= storeFilesMetaData.iterator().hasNext();
          ShardRouting primaryShard = routingNodes.activePrimary(shard);
          if (primaryShard != null) {
            assert primaryShard.active();
            DiscoveryNode primaryNode = allocation.nodes().get(primaryShard.currentNodeId());
            if (primaryNode != null) {
              TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData primaryNodeFilesStore =
                  shardStores.getData().get(primaryNode);
              if (primaryNodeFilesStore != null) {
                TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore =
                    primaryNodeFilesStore.storeFilesMetaData();
                if (primaryNodeStore != null && primaryNodeStore.allocated()) {
                  long sizeMatched = 0;

                  String primarySyncId = primaryNodeStore.syncId();
                  String replicaSyncId = storeFilesMetaData.syncId();
                  // see if we have a sync id we can make use of
                  if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) {
                    logger.trace(
                        "{}: node [{}] has same sync id {} as primary",
                        shard,
                        discoNode.name(),
                        replicaSyncId);
                    lastNodeMatched = node;
                    lastSizeMatched = Long.MAX_VALUE;
                    lastDiscoNodeMatched = discoNode;
                  } else {
                    for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
                      String metaDataFileName = storeFileMetaData.name();
                      if (primaryNodeStore.fileExists(metaDataFileName)
                          && primaryNodeStore.file(metaDataFileName).isSame(storeFileMetaData)) {
                        sizeMatched += storeFileMetaData.length();
                      }
                    }
                    logger.trace(
                        "{}: node [{}] has [{}/{}] bytes of re-usable data",
                        shard,
                        discoNode.name(),
                        new ByteSizeValue(sizeMatched),
                        sizeMatched);
                    if (sizeMatched > lastSizeMatched) {
                      lastSizeMatched = sizeMatched;
                      lastDiscoNodeMatched = discoNode;
                      lastNodeMatched = node;
                    }
                  }
                }
              }
            }
          }
        }
      }

      if (lastNodeMatched != null) {
        // we only check on THROTTLE since we checked before on NO
        Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we are throttling this, but we have enough to allocate to this node, ignore it for now
          unassignedIterator.removeAndIgnore();
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we found a match
          changed = true;
          unassignedIterator.initialize(lastNodeMatched.nodeId());
        }
      } else if (hasReplicaData == false) {
        // if we didn't manage to find *any* data (regardless of matching sizes), check if the
        // allocation of the replica shard needs to be delayed, and if so, add it to the ignore
        // unassigned list
        // note: we only care about replicas in delayed allocation, since an unassigned primary
        //       will anyhow wait to find an existing copy of the shard to be allocated
        // note: the other side of the equation is scheduling a reroute in a timely manner, which
        //       happens in the RoutingService
        long delay =
            shard
                .unassignedInfo()
                .getDelayAllocationExpirationIn(settings, indexMetaData.getSettings());
        if (delay > 0) {
          logger.debug(
              "[{}][{}]: delaying allocation of [{}] for [{}]",
              shard.index(),
              shard.id(),
              shard,
              TimeValue.timeValueMillis(delay));
          // mark it as changed, since we want to kick a publishing to schedule future allocation,
          // see org.elasticsearch.cluster.routing.RoutingService#clusterChanged(ClusterChangedEvent)
          changed = true;
          unassignedIterator.removeAndIgnore();
        }
      }
    }
    return changed;
  }
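
// A back-of-the-envelope sketch of the delayed-allocation arithmetic behind
// getDelayAllocationExpirationIn(...) above. The helper below is hypothetical (not
// part of this codebase): the remaining delay is the configured timeout minus how
// long the shard has already been unassigned; a non-positive result means the shard
// is eligible for allocation now.
final class DelayedAllocationSketch {
  static long remainingDelayMillis(long unassignedSinceMillis, long nowMillis, long timeoutMillis) {
    long elapsed = nowMillis - unassignedSinceMillis;
    return Math.max(0, timeoutMillis - elapsed); // 0 means allocate now
  }

  public static void main(String[] args) {
    // shard went unassigned 20s ago with a 60s delay: 40s of delay remain
    System.out.println(remainingDelayMillis(0, 20_000, 60_000)); // 40000
  }
}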
  @Override
  public void execute(RoutingAllocation allocation) throws ElasticSearchException {
    DiscoveryNode discoNode = allocation.nodes().resolveNode(node);
    boolean found = false;
    for (RoutingNodes.RoutingNodeIterator it =
            allocation.routingNodes().routingNodeIter(discoNode.id());
        it.hasNext(); ) {
      MutableShardRouting shardRouting = it.next();
      if (!shardRouting.shardId().equals(shardId)) {
        continue;
      }
      found = true;
      if (shardRouting.relocatingNodeId() != null) {
        if (shardRouting.initializing()) {
          // the shard is initializing and recovering from another node, simply cancel the recovery
          it.remove();
          // and cancel the relocating state from the shard it's being relocated from
          RoutingNode relocatingFromNode =
              allocation.routingNodes().node(shardRouting.relocatingNodeId());
          if (relocatingFromNode != null) {
            for (MutableShardRouting fromShardRouting : relocatingFromNode) {
              if (fromShardRouting.shardId().equals(shardRouting.shardId())
                  && fromShardRouting.state() == RELOCATING) {
                allocation.routingNodes().cancelRelocation(fromShardRouting);
                break;
              }
            }
          }
        } else if (shardRouting.relocating()) {

          // the shard is relocating to another node, cancel the recovery on the other node, and
          // deallocate this one
          if (!allowPrimary && shardRouting.primary()) {
            // can't cancel a primary shard while it is relocating
            throw new ElasticSearchIllegalArgumentException(
                "[cancel_allocation] can't cancel "
                    + shardId
                    + " on node "
                    + discoNode
                    + ", shard is primary and initializing its state");
          }
          it.moveToUnassigned();
          // now, go and find the shard that is initializing on the target node, and cancel it as
          // well...
          RoutingNodes.RoutingNodeIterator initializingNode =
              allocation.routingNodes().routingNodeIter(shardRouting.relocatingNodeId());
          if (initializingNode != null) {
            while (initializingNode.hasNext()) {
              MutableShardRouting initializingShardRouting = initializingNode.next();
              if (initializingShardRouting.shardId().equals(shardRouting.shardId())
                  && initializingShardRouting.state() == INITIALIZING) {
                initializingNode.remove();
              }
            }
          }
        }
      } else {
        // the shard is not relocating, it's either started or initializing, so just cancel it
        // and move on...
        if (!allowPrimary && shardRouting.primary()) {
          // can't cancel a primary shard that is started or initializing
          throw new ElasticSearchIllegalArgumentException(
              "[cancel_allocation] can't cancel "
                  + shardId
                  + " on node "
                  + discoNode
                  + ", shard is primary and started");
        }
        it.remove();
        allocation
            .routingNodes()
            .unassigned()
            .add(
                new MutableShardRouting(
                    shardRouting.index(),
                    shardRouting.id(),
                    null,
                    shardRouting.primary(),
                    ShardRoutingState.UNASSIGNED,
                    shardRouting.version() + 1));
      }
    }
    if (!found) {
      throw new ElasticSearchIllegalArgumentException(
          "[cancel_allocation] can't cancel "
              + shardId
              + ", failed to find it on node "
              + discoNode);
    }
  }
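
// A condensed sketch of the three cases the cancel logic above distinguishes, using a
// hypothetical ShardState enum in place of the MutableShardRouting state checks (this
// illustrates the branching only; it is not the actual API). Note that a primary can
// never be cancelled without allowPrimary, except when its copy is merely the target
// of an ongoing relocation.
final class CancelSketch {
  enum ShardState { INITIALIZING_FROM_RELOCATION, RELOCATING, STARTED_OR_INITIALIZING }

  static String actionFor(ShardState state, boolean primary, boolean allowPrimary) {
    if (primary && !allowPrimary && state != ShardState.INITIALIZING_FROM_RELOCATION) {
      throw new IllegalArgumentException("can't cancel a primary shard");
    }
    switch (state) {
      case INITIALIZING_FROM_RELOCATION:
        return "remove the recovering copy and cancel relocation on the source node";
      case RELOCATING:
        return "move the shard back to unassigned and remove the initializing target copy";
      default:
        return "remove the shard and re-add it as unassigned with a bumped version";
    }
  }
}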
  @Override
  public RerouteExplanation execute(RoutingAllocation allocation, boolean explain) {
    final DiscoveryNode discoNode = allocation.nodes().resolveNode(node);
    final RoutingNodes routingNodes = allocation.routingNodes();

    ShardRouting shardRouting = null;
    for (ShardRouting routing : routingNodes.unassigned()) {
      if (routing.shardId().equals(shardId)) {
        // prefer to allocate primaries first
        if (shardRouting == null || routing.primary()) {
          shardRouting = routing;
        }
      }
    }

    if (shardRouting == null) {
      if (explain) {
        return new RerouteExplanation(
            this,
            allocation.decision(
                Decision.NO,
                "allocate_allocation_command",
                "failed to find " + shardId + " on the list of unassigned shards"));
      }
      throw new IllegalArgumentException(
          "[allocate] failed to find " + shardId + " on the list of unassigned shards");
    }

    if (shardRouting.primary() && !allowPrimary) {
      if (explain) {
        return new RerouteExplanation(
            this,
            allocation.decision(
                Decision.NO,
                "allocate_allocation_command",
                "trying to allocate a primary shard " + shardId + ", which is disabled"));
      }
      throw new IllegalArgumentException(
          "[allocate] trying to allocate a primary shard " + shardId + ", which is disabled");
    }

    RoutingNode routingNode = routingNodes.node(discoNode.id());
    if (routingNode == null) {
      if (!discoNode.dataNode()) {
        if (explain) {
          return new RerouteExplanation(
              this,
              allocation.decision(
                  Decision.NO,
                  "allocate_allocation_command",
                  "Allocation can only be done on data nodes, not [" + node + "]"));
        }
        throw new IllegalArgumentException(
            "Allocation can only be done on data nodes, not [" + node + "]");
      } else {
        if (explain) {
          return new RerouteExplanation(
              this,
              allocation.decision(
                  Decision.NO,
                  "allocate_allocation_command",
                  "Could not find [" + node + "] among the routing nodes"));
        }
        throw new IllegalStateException("Could not find [" + node + "] among the routing nodes");
      }
    }

    Decision decision = allocation.deciders().canAllocate(shardRouting, routingNode, allocation);
    if (decision.type() == Decision.Type.NO) {
      if (explain) {
        return new RerouteExplanation(this, decision);
      }
      throw new IllegalArgumentException(
          "[allocate] allocation of "
              + shardId
              + " on node "
              + discoNode
              + " is not allowed, reason: "
              + decision);
    }
    // go over and remove it from the unassigned
    for (RoutingNodes.UnassignedShards.UnassignedIterator it = routingNodes.unassigned().iterator();
        it.hasNext(); ) {
      if (it.next() != shardRouting) {
        continue;
      }
      it.initialize(routingNode.nodeId());
      if (shardRouting.primary()) {
        // we need to clear the post allocation flag, since it's an explicit allocation of the
        // primary shard and we want to force allocate it (and create a new index for it)
        routingNodes.addClearPostAllocationFlag(shardRouting.shardId());
      }
      break;
    }
    return new RerouteExplanation(this, decision);
  }
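
// The command above repeats an "explain or throw" shape for every failure: with
// explain=true the reason is returned to the caller as a NO decision, otherwise it
// becomes an exception. A generic sketch of that shape (the names here are
// illustrative only, not from this codebase):
import java.util.function.Function;

final class ExplainOrThrowSketch {
  static <T> T failure(boolean explain, String reason, Function<String, T> toExplanation) {
    if (explain) {
      return toExplanation.apply(reason); // caller packages the reason, e.g. as a RerouteExplanation
    }
    throw new IllegalArgumentException(reason); // fail fast when no explanation was requested
  }
}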
  public boolean allocateUnassigned(RoutingAllocation allocation) {
    boolean changed = false;
    final RoutingNodes routingNodes = allocation.routingNodes();
    final RoutingNodes.UnassignedShards.UnassignedIterator unassignedIterator =
        routingNodes.unassigned().iterator();
    MetaData metaData = allocation.metaData();
    while (unassignedIterator.hasNext()) {
      ShardRouting shard = unassignedIterator.next();
      if (shard.primary()) {
        continue;
      }

      // if we are allocating a replica because of index creation, no need to go and find a copy,
      // there isn't one...
      IndexMetaData indexMetaData = metaData.index(shard.getIndexName());
      if (shard.allocatedPostIndexCreate(indexMetaData) == false) {
        continue;
      }

      // pre-check if it can be allocated to any node that currently exists, so we won't list the
      // store for it for nothing
      if (canBeAllocatedToAtLeastOneNode(shard, allocation) == false) {
        logger.trace("{}: ignoring allocation, can't be allocated on any node", shard);
        unassignedIterator.removeAndIgnore();
        continue;
      }

      AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          shardStores = fetchData(shard, allocation);
      if (shardStores.hasData() == false) {
        logger.trace("{}: ignoring allocation, still fetching shard stores", shard);
        allocation.setHasPendingAsyncFetch();
        unassignedIterator.removeAndIgnore();
        continue; // still fetching
      }

      ShardRouting primaryShard = routingNodes.activePrimary(shard);
      assert primaryShard != null
          : "the replica shard can be allocated on at least one node, so there must be an active primary";
      TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryStore =
          findStore(primaryShard, allocation, shardStores);
      if (primaryStore == null || primaryStore.allocated() == false) {
        // if we can't find the primary data, it is probably because the primary shard is corrupted
        // (and listing failed)
        // we want to let the replica be allocated in order to expose the actual problem with the
        // primary that the replica
        // will try and recover from
        // Note, this is the existing behavior, as exposed in running
        // CorruptFileTest#testNoPrimaryData
        logger.trace(
            "{}: no primary shard store found or allocated, letting actual allocation figure it out",
            shard);
        continue;
      }

      MatchingNodes matchingNodes = findMatchingNodes(shard, allocation, primaryStore, shardStores);

      if (matchingNodes.getNodeWithHighestMatch() != null) {
        RoutingNode nodeWithHighestMatch =
            allocation.routingNodes().node(matchingNodes.getNodeWithHighestMatch().id());
        // we only check on THROTTLE since we checked before on NO
        Decision decision =
            allocation.deciders().canAllocate(shard, nodeWithHighestMatch, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          logger.debug(
              "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store",
              shard.index(),
              shard.id(),
              shard,
              nodeWithHighestMatch.node());
          // we are throttling this, but we have enough to allocate to this node, ignore it for now
          unassignedIterator.removeAndIgnore();
        } else {
          logger.debug(
              "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store",
              shard.index(),
              shard.id(),
              shard,
              nodeWithHighestMatch.node());
          // we found a match
          changed = true;
          unassignedIterator.initialize(
              nodeWithHighestMatch.nodeId(),
              null,
              allocation
                  .clusterInfo()
                  .getShardSize(shard, ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE));
        }
      } else if (matchingNodes.hasAnyData() == false) {
        // if we didn't manage to find *any* data (regardless of matching sizes), check if the
        // allocation of the replica shard needs to be delayed
        changed |= ignoreUnassignedIfDelayed(unassignedIterator, shard);
      }
    }
    return changed;
  }
  public Decision canRemain(
      ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
    if (!enabled) {
      return allocation.decision(Decision.YES, "disk threshold decider disabled");
    }
    // Allow the shard to remain regardless of disk usage if only a single node is available
    if (allocation.nodes().size() <= 1) {
      return allocation.decision(Decision.YES, "only a single node is present");
    }

    ClusterInfo clusterInfo = allocation.clusterInfo();
    if (clusterInfo == null) {
      if (logger.isTraceEnabled()) {
        logger.trace("Cluster info unavailable for disk threshold decider, allowing allocation.");
      }
      return allocation.decision(Decision.YES, "cluster info unavailable");
    }

    Map<String, DiskUsage> usages = clusterInfo.getNodeDiskUsages();
    if (usages.isEmpty()) {
      if (logger.isTraceEnabled()) {
        logger.trace(
            "Unable to determine disk usages for disk-aware allocation, allowing allocation");
      }
      return allocation.decision(Decision.YES, "disk usages unavailable");
    }

    DiskUsage usage = usages.get(node.nodeId());
    if (usage == null) {
      // If there is no usage, and we have other nodes in the cluster,
      // use the average usage for all nodes as the usage for this node
      usage = averageUsage(node, usages);
      if (logger.isDebugEnabled()) {
        logger.debug(
            "Unable to determine disk usage for {}, defaulting to average across nodes [{} total] [{} free] [{}% free]",
            node.nodeId(),
            usage.getTotalBytes(),
            usage.getFreeBytes(),
            usage.getFreeDiskAsPercentage());
      }
    }

    // If this node is already above the high threshold, the shard cannot remain (get it off!)
    double freeDiskPercentage = usage.getFreeDiskAsPercentage();
    long freeBytes = usage.getFreeBytes();
    if (logger.isDebugEnabled()) {
      logger.debug(
          "Node [{}] has {}% free disk ({} bytes)", node.nodeId(), freeDiskPercentage, freeBytes);
    }
    if (freeBytes < freeBytesThresholdHigh.bytes()) {
      if (logger.isDebugEnabled()) {
        logger.debug(
            "Less than the required {} free bytes threshold ({} bytes free) on node {}, shard cannot remain",
            freeBytesThresholdHigh,
            freeBytes,
            node.nodeId());
      }
      return allocation.decision(
          Decision.NO,
          "less than required [%s] free on node, shard cannot remain, free: [%s]",
          freeBytesThresholdHigh,
          new ByteSizeValue(freeBytes));
    }
    if (freeDiskPercentage < freeDiskThresholdHigh) {
      if (logger.isDebugEnabled()) {
        logger.debug(
            "Less than the required {}% free disk threshold ({}% free) on node {}, shard cannot remain",
            freeDiskThresholdHigh, freeDiskPercentage, node.nodeId());
      }
      return allocation.decision(
          Decision.NO,
          "less than required [%s%%] free disk on node, shard cannot remain, free: [%s%%]",
          freeDiskThresholdHigh,
          freeDiskPercentage);
    }

    return allocation.decision(
        Decision.YES,
        "enough disk for shard to remain on node, free: [%s]",
        new ByteSizeValue(freeBytes));
  }
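
// Worked example of the high-watermark check in canRemain above. The numbers are made
// up for illustration: with 100 GiB total and 4 GiB free, a high watermark of 10% free
// disk fails the percentage check even though an absolute 1 GiB byte threshold passes,
// so the shard could not remain on the node.
final class HighWatermarkSketch {
  public static void main(String[] args) {
    long totalBytes = 100L * 1024 * 1024 * 1024; // 100 GiB total
    long freeBytes = 4L * 1024 * 1024 * 1024;    // 4 GiB free
    double freeDiskPercentage = 100.0 * freeBytes / totalBytes; // 4.0%
    long freeBytesThresholdHigh = 1024L * 1024 * 1024;          // 1 GiB required free
    double freeDiskThresholdHigh = 10.0;                        // 10% required free
    boolean canRemain =
        freeBytes >= freeBytesThresholdHigh && freeDiskPercentage >= freeDiskThresholdHigh;
    System.out.println(canRemain); // false: 4.0% free is under the 10% high watermark
  }
}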
  public Decision canAllocate(
      ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
    if (!enabled) {
      return allocation.decision(Decision.YES, "disk threshold decider disabled");
    }
    // Allow allocation regardless of disk usage if only a single node is available
    if (allocation.nodes().size() <= 1) {
      return allocation.decision(Decision.YES, "only a single node is present");
    }

    ClusterInfo clusterInfo = allocation.clusterInfo();
    if (clusterInfo == null) {
      if (logger.isTraceEnabled()) {
        logger.trace("Cluster info unavailable for disk threshold decider, allowing allocation.");
      }
      return allocation.decision(Decision.YES, "cluster info unavailable");
    }

    Map<String, DiskUsage> usages = clusterInfo.getNodeDiskUsages();
    Map<String, Long> shardSizes = clusterInfo.getShardSizes();
    if (usages.isEmpty()) {
      if (logger.isTraceEnabled()) {
        logger.trace(
            "Unable to determine disk usages for disk-aware allocation, allowing allocation");
      }
      return allocation.decision(Decision.YES, "disk usages unavailable");
    }

    DiskUsage usage = usages.get(node.nodeId());
    if (usage == null) {
      // If there is no usage, and we have other nodes in the cluster,
      // use the average usage for all nodes as the usage for this node
      usage = averageUsage(node, usages);
      if (logger.isDebugEnabled()) {
        logger.debug(
            "Unable to determine disk usage for [{}], defaulting to average across nodes [{} total] [{} free] [{}% free]",
            node.nodeId(),
            usage.getTotalBytes(),
            usage.getFreeBytes(),
            usage.getFreeDiskAsPercentage());
      }
    }

    // First, check whether the node is currently over the low watermark
    double freeDiskPercentage = usage.getFreeDiskAsPercentage();
    long freeBytes = usage.getFreeBytes();
    if (logger.isDebugEnabled()) {
      logger.debug("Node [{}] has {}% free disk", node.nodeId(), freeDiskPercentage);
    }
    if (freeBytes < freeBytesThresholdLow.bytes()) {
      if (logger.isDebugEnabled()) {
        logger.debug(
            "Less than the required {} free bytes threshold ({} bytes free) on node {}, preventing allocation",
            freeBytesThresholdLow,
            freeBytes,
            node.nodeId());
      }
      return allocation.decision(
          Decision.NO,
          "less than required [%s] free on node, free: [%s]",
          freeBytesThresholdLow,
          new ByteSizeValue(freeBytes));
    }
    if (freeDiskPercentage < freeDiskThresholdLow) {
      if (logger.isDebugEnabled()) {
        logger.debug(
            "Less than the required {}% free disk threshold ({}% free) on node [{}], preventing allocation",
            freeDiskThresholdLow, freeDiskPercentage, node.nodeId());
      }
      return allocation.decision(
          Decision.NO,
          "less than required [%s%%] free disk on node, free: [%s%%]",
          freeDiskThresholdLow,
          freeDiskPercentage);
    }

    // Secondly, check that allocating the shard to this node doesn't put it above the high
    // watermark
    Long shardSize = shardSizes.get(shardIdentifierFromRouting(shardRouting));
    shardSize = shardSize == null ? 0 : shardSize;
    double freeSpaceAfterShard = this.freeDiskPercentageAfterShardAssigned(usage, shardSize);
    long freeBytesAfterShard = freeBytes - shardSize;
    if (freeBytesAfterShard < freeBytesThresholdHigh.bytes()) {
      logger.warn(
          "After allocating, node [{}] would have less than the required {} free bytes threshold ({} bytes free), preventing allocation",
          node.nodeId(),
          freeBytesThresholdHigh,
          freeBytesAfterShard);
      return allocation.decision(
          Decision.NO,
          "after allocation less than required [%s] free on node, free: [%s]",
          freeBytesThresholdHigh,
          new ByteSizeValue(freeBytesAfterShard));
    }
    if (freeSpaceAfterShard < freeDiskThresholdHigh) {
      logger.warn(
          "After allocating, node [{}] would have less than the required {}% free disk threshold ({}% free), preventing allocation",
          node.nodeId(), freeDiskThresholdHigh, freeSpaceAfterShard);
      return allocation.decision(
          Decision.NO,
          "after allocation less than required [%s%%] free disk on node, free: [%s%%]",
          freeDiskThresholdHigh,
          freeSpaceAfterShard);
    }

    return allocation.decision(
        Decision.YES, "enough disk for shard on node, free: [%s]", new ByteSizeValue(freeBytes));
  }
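
// Worked example of the post-allocation projection in canAllocate above (illustrative
// numbers): a node that clears the low watermark can still be rejected when placing
// the shard would push it under the high watermark.
final class PostAllocationSketch {
  public static void main(String[] args) {
    long totalBytes = 50L * 1024 * 1024 * 1024; // 50 GiB total
    long freeBytes = 8L * 1024 * 1024 * 1024;   // 16% free: above a 15% low watermark
    long shardSize = 6L * 1024 * 1024 * 1024;   // incoming shard is 6 GiB
    long freeBytesAfterShard = freeBytes - shardSize;
    double freeSpaceAfterShard = 100.0 * freeBytesAfterShard / totalBytes; // 4.0%
    double freeDiskThresholdHigh = 10.0; // 10% required free after allocation
    System.out.println(freeSpaceAfterShard < freeDiskThresholdHigh); // true: prevented
  }
}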
  /**
   * Process existing recoveries of replicas and see if we need to cancel them if we find a better
   * match. Today, a better match is a node with a full sync-id match, where the node currently
   * being recovered to has none.
   */
  public boolean processExistingRecoveries(RoutingAllocation allocation) {
    boolean changed = false;
    MetaData metaData = allocation.metaData();
    for (RoutingNodes.RoutingNodesIterator nodes = allocation.routingNodes().nodes();
        nodes.hasNext(); ) {
      nodes.next();
      for (RoutingNodes.RoutingNodeIterator it = nodes.nodeShards(); it.hasNext(); ) {
        ShardRouting shard = it.next();
        if (shard.primary()) {
          continue;
        }
        if (shard.initializing() == false) {
          continue;
        }
        if (shard.relocatingNodeId() != null) {
          continue;
        }

        // if we are allocating a replica because of index creation, no need to go and find a copy,
        // there isn't one...
        IndexMetaData indexMetaData = metaData.index(shard.getIndexName());
        if (shard.allocatedPostIndexCreate(indexMetaData) == false) {
          continue;
        }

        AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
            shardStores = fetchData(shard, allocation);
        if (shardStores.hasData() == false) {
          logger.trace("{}: fetching new stores for initializing shard", shard);
          continue; // still fetching
        }

        ShardRouting primaryShard = allocation.routingNodes().activePrimary(shard);
        assert primaryShard != null
            : "the replica shard can be allocated on at least one node, so there must be an active primary";
        TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryStore =
            findStore(primaryShard, allocation, shardStores);
        if (primaryStore == null || primaryStore.allocated() == false) {
          // if we can't find the primary data, it is probably because the primary shard is
          // corrupted (and listing failed)
          // just let the recovery find it out, no need to do anything about it for the initializing
          // shard
          logger.trace(
              "{}: no primary shard store found or allocated, letting actual allocation figure it out",
              shard);
          continue;
        }

        MatchingNodes matchingNodes =
            findMatchingNodes(shard, allocation, primaryStore, shardStores);
        if (matchingNodes.getNodeWithHighestMatch() != null) {
          DiscoveryNode currentNode = allocation.nodes().get(shard.currentNodeId());
          DiscoveryNode nodeWithHighestMatch = matchingNodes.getNodeWithHighestMatch();
          if (currentNode.equals(nodeWithHighestMatch) == false
              && matchingNodes.isNodeMatchBySyncID(currentNode) == false
              && matchingNodes.isNodeMatchBySyncID(nodeWithHighestMatch)) {
            // we found a better match that has a full sync id match, while the existing
            // allocation is not fully synced, so cancel the current recovery
            it.moveToUnassigned(
                new UnassignedInfo(
                    UnassignedInfo.Reason.REALLOCATED_REPLICA,
                    "existing allocation of replica to ["
                        + currentNode
                        + "] cancelled, sync id match found on node ["
                        + nodeWithHighestMatch
                        + "]",
                    null,
                    allocation.getCurrentNanoTime(),
                    System.currentTimeMillis()));
            changed = true;
          }
        }
      }
    }
    return changed;
  }
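
// The cancellation condition above, restated as a standalone predicate (a sketch: the
// three booleans stand in for the currentNode/nodeWithHighestMatch comparison and the
// two MatchingNodes.isNodeMatchBySyncID checks).
final class RecoveryCancelSketch {
  static boolean shouldCancel(
      boolean currentIsBestNode, boolean currentMatchesBySyncId, boolean bestMatchesBySyncId) {
    // only cancel when a *different* node offers a sync-id match that the node
    // currently being recovered to does not have
    return !currentIsBestNode && !currentMatchesBySyncId && bestMatchesBySyncId;
  }
}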
  private MatchingNodes findMatchingNodes(
      ShardRouting shard,
      RoutingAllocation allocation,
      TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryStore,
      AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          data) {
    ObjectLongMap<DiscoveryNode> nodesToSize = new ObjectLongHashMap<>();
    for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
        nodeStoreEntry : data.getData().entrySet()) {
      DiscoveryNode discoNode = nodeStoreEntry.getKey();
      TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData =
          nodeStoreEntry.getValue().storeFilesMetaData();
      if (storeFilesMetaData == null) {
        // already allocated on that node...
        continue;
      }

      RoutingNode node = allocation.routingNodes().node(discoNode.id());
      if (node == null) {
        continue;
      }

      // check if we can allocate on that node...
      // we only check for NO, since if this node is THROTTLING and it has enough "same data"
      // then we will try and assign it next time
      Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
      if (decision.type() == Decision.Type.NO) {
        continue;
      }

      // if it is already allocated, we can't assign to it... (and it might be primary as well)
      if (storeFilesMetaData.allocated()) {
        continue;
      }

      // we don't have any files at all, it is an empty index
      if (storeFilesMetaData.iterator().hasNext() == false) {
        continue;
      }

      String primarySyncId = primaryStore.syncId();
      String replicaSyncId = storeFilesMetaData.syncId();
      // see if we have a sync id we can make use of
      if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) {
        logger.trace(
            "{}: node [{}] has same sync id {} as primary", shard, discoNode.name(), replicaSyncId);
        nodesToSize.put(discoNode, Long.MAX_VALUE);
      } else {
        long sizeMatched = 0;
        for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
          String metaDataFileName = storeFileMetaData.name();
          if (primaryStore.fileExists(metaDataFileName)
              && primaryStore.file(metaDataFileName).isSame(storeFileMetaData)) {
            sizeMatched += storeFileMetaData.length();
          }
        }
        logger.trace(
            "{}: node [{}] has [{}/{}] bytes of re-usable data",
            shard,
            discoNode.name(),
            new ByteSizeValue(sizeMatched),
            sizeMatched);
        nodesToSize.put(discoNode, sizeMatched);
      }
    }

    return new MatchingNodes(nodesToSize);
  }
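
// MatchingNodes itself is not shown in this excerpt; presumably it selects the node
// with the largest matched-byte score from the map built above (sync-id matches score
// Long.MAX_VALUE, so they always win). A sketch of that selection under that
// assumption, using a plain HashMap keyed by node name in place of
// ObjectLongMap<DiscoveryNode>:
import java.util.HashMap;
import java.util.Map;

final class MatchingNodesSketch {
  static String nodeWithHighestMatch(Map<String, Long> nodesToSize) {
    String best = null;
    long bestSize = 0; // a node with zero re-usable bytes is not considered a match
    for (Map.Entry<String, Long> e : nodesToSize.entrySet()) {
      if (e.getValue() > bestSize) {
        bestSize = e.getValue();
        best = e.getKey();
      }
    }
    return best; // null when no candidate has any re-usable data
  }

  public static void main(String[] args) {
    Map<String, Long> scores = new HashMap<>();
    scores.put("node1", 1024L);
    scores.put("node2", Long.MAX_VALUE); // sync-id match
    System.out.println(nodeWithHighestMatch(scores)); // node2
  }
}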