/**
  * Check if the allocation of the replica is to be delayed. Compute the delay and if it is
  * delayed, add it to the ignore unassigned list Note: we only care about replica in delayed
  * allocation, since if we have an unassigned primary it will anyhow wait to find an existing copy
  * of the shard to be allocated Note: the other side of the equation is scheduling a reroute in a
  * timely manner, which happens in the RoutingService
  *
  * <p>PUBLIC FOR TESTS!
  *
  * @param unassignedIterator iterator over unassigned shards
  * @param shard the shard which might be delayed
  * @return true iff allocation is delayed for this shard
  */
 public boolean ignoreUnassignedIfDelayed(
     RoutingNodes.UnassignedShards.UnassignedIterator unassignedIterator, ShardRouting shard) {
   // calculate delay and store it in UnassignedInfo to be used by RoutingService
   long delay = shard.unassignedInfo().getLastComputedLeftDelayNanos();
   if (delay > 0) {
     logger.debug(
         "[{}][{}]: delaying allocation of [{}] for [{}]",
         shard.index(),
         shard.id(),
         shard,
         TimeValue.timeValueNanos(delay));
     /**
      * mark it as changed, since we want to kick a publishing to schedule future allocation, see
      * {@link
      * org.elasticsearch.cluster.routing.RoutingService#clusterChanged(ClusterChangedEvent)}).
      */
     unassignedIterator.removeAndIgnore();
     return true;
   }
   return false;
 }
  public boolean allocateUnassigned(RoutingAllocation allocation) {
    boolean changed = false;
    final RoutingNodes routingNodes = allocation.routingNodes();
    final MetaData metaData = routingNodes.metaData();

    final RoutingNodes.UnassignedShards.UnassignedIterator unassignedIterator =
        routingNodes.unassigned().iterator();
    while (unassignedIterator.hasNext()) {
      ShardRouting shard = unassignedIterator.next();
      if (shard.primary()) {
        continue;
      }

      // pre-check if it can be allocated to any node that currently exists, so we won't list the
      // store for it for nothing
      boolean canBeAllocatedToAtLeastOneNode = false;
      for (ObjectCursor<DiscoveryNode> cursor : allocation.nodes().dataNodes().values()) {
        RoutingNode node = routingNodes.node(cursor.value.id());
        if (node == null) {
          continue;
        }
        // if we can't allocate it on a node, ignore it, for example, this handles
        // cases for only allocating a replica after a primary
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.YES) {
          canBeAllocatedToAtLeastOneNode = true;
          break;
        }
      }

      if (!canBeAllocatedToAtLeastOneNode) {
        logger.trace("{}: ignoring allocation, can't be allocated on any node", shard);
        unassignedIterator.removeAndIgnore();
        continue;
      }

      AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          shardStores = fetchData(shard, allocation);
      if (shardStores.hasData() == false) {
        logger.trace("{}: ignoring allocation, still fetching shard stores", shard);
        unassignedIterator.removeAndIgnore();
        continue; // still fetching
      }

      long lastSizeMatched = 0;
      DiscoveryNode lastDiscoNodeMatched = null;
      RoutingNode lastNodeMatched = null;
      boolean hasReplicaData = false;
      IndexMetaData indexMetaData = metaData.index(shard.getIndex());

      for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          nodeStoreEntry : shardStores.getData().entrySet()) {
        DiscoveryNode discoNode = nodeStoreEntry.getKey();
        TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData =
            nodeStoreEntry.getValue().storeFilesMetaData();
        logger.trace("{}: checking node [{}]", shard, discoNode);

        if (storeFilesMetaData == null) {
          // already allocated on that node...
          continue;
        }

        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        // check if we can allocate on that node...
        // we only check for NO, since if this node is THROTTLING and it has enough "same data"
        // then we will try and assign it next time
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.NO) {
          continue;
        }

        // if it is already allocated, we can't assign to it...
        if (storeFilesMetaData.allocated()) {
          continue;
        }

        if (!shard.primary()) {
          hasReplicaData |= storeFilesMetaData.iterator().hasNext();
          ShardRouting primaryShard = routingNodes.activePrimary(shard);
          if (primaryShard != null) {
            assert primaryShard.active();
            DiscoveryNode primaryNode = allocation.nodes().get(primaryShard.currentNodeId());
            if (primaryNode != null) {
              TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData primaryNodeFilesStore =
                  shardStores.getData().get(primaryNode);
              if (primaryNodeFilesStore != null) {
                TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore =
                    primaryNodeFilesStore.storeFilesMetaData();
                if (primaryNodeStore != null && primaryNodeStore.allocated()) {
                  long sizeMatched = 0;

                  String primarySyncId = primaryNodeStore.syncId();
                  String replicaSyncId = storeFilesMetaData.syncId();
                  // see if we have a sync id we can make use of
                  if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) {
                    logger.trace(
                        "{}: node [{}] has same sync id {} as primary",
                        shard,
                        discoNode.name(),
                        replicaSyncId);
                    lastNodeMatched = node;
                    lastSizeMatched = Long.MAX_VALUE;
                    lastDiscoNodeMatched = discoNode;
                  } else {
                    for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
                      String metaDataFileName = storeFileMetaData.name();
                      if (primaryNodeStore.fileExists(metaDataFileName)
                          && primaryNodeStore.file(metaDataFileName).isSame(storeFileMetaData)) {
                        sizeMatched += storeFileMetaData.length();
                      }
                    }
                    logger.trace(
                        "{}: node [{}] has [{}/{}] bytes of re-usable data",
                        shard,
                        discoNode.name(),
                        new ByteSizeValue(sizeMatched),
                        sizeMatched);
                    if (sizeMatched > lastSizeMatched) {
                      lastSizeMatched = sizeMatched;
                      lastDiscoNodeMatched = discoNode;
                      lastNodeMatched = node;
                    }
                  }
                }
              }
            }
          }
        }
      }

      if (lastNodeMatched != null) {
        // we only check on THROTTLE since we checked before before on NO
        Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we are throttling this, but we have enough to allocate to this node, ignore it for now
          unassignedIterator.removeAndIgnore();
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we found a match
          changed = true;
          unassignedIterator.initialize(lastNodeMatched.nodeId());
        }
      } else if (hasReplicaData == false) {
        // if we didn't manage to find *any* data (regardless of matching sizes), check if the
        // allocation
        // of the replica shard needs to be delayed, and if so, add it to the ignore unassigned list
        // note: we only care about replica in delayed allocation, since if we have an unassigned
        // primary it
        //       will anyhow wait to find an existing copy of the shard to be allocated
        // note: the other side of the equation is scheduling a reroute in a timely manner, which
        // happens in the RoutingService
        long delay =
            shard
                .unassignedInfo()
                .getDelayAllocationExpirationIn(settings, indexMetaData.getSettings());
        if (delay > 0) {
          logger.debug(
              "[{}][{}]: delaying allocation of [{}] for [{}]",
              shard.index(),
              shard.id(),
              shard,
              TimeValue.timeValueMillis(delay));
          /**
           * mark it as changed, since we want to kick a publishing to schedule future allocation,
           * see {@link
           * org.elasticsearch.cluster.routing.RoutingService#clusterChanged(ClusterChangedEvent)}).
           */
          changed = true;
          unassignedIterator.removeAndIgnore();
        }
      }
    }
    return changed;
  }
  public boolean allocateUnassigned(RoutingAllocation allocation) {
    boolean changed = false;
    final RoutingNodes routingNodes = allocation.routingNodes();
    final RoutingNodes.UnassignedShards.UnassignedIterator unassignedIterator =
        routingNodes.unassigned().iterator();
    MetaData metaData = allocation.metaData();
    while (unassignedIterator.hasNext()) {
      ShardRouting shard = unassignedIterator.next();
      if (shard.primary()) {
        continue;
      }

      // if we are allocating a replica because of index creation, no need to go and find a copy,
      // there isn't one...
      IndexMetaData indexMetaData = metaData.index(shard.getIndexName());
      if (shard.allocatedPostIndexCreate(indexMetaData) == false) {
        continue;
      }

      // pre-check if it can be allocated to any node that currently exists, so we won't list the
      // store for it for nothing
      if (canBeAllocatedToAtLeastOneNode(shard, allocation) == false) {
        logger.trace("{}: ignoring allocation, can't be allocated on any node", shard);
        unassignedIterator.removeAndIgnore();
        continue;
      }

      AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          shardStores = fetchData(shard, allocation);
      if (shardStores.hasData() == false) {
        logger.trace("{}: ignoring allocation, still fetching shard stores", shard);
        allocation.setHasPendingAsyncFetch();
        unassignedIterator.removeAndIgnore();
        continue; // still fetching
      }

      ShardRouting primaryShard = routingNodes.activePrimary(shard);
      assert primaryShard != null
          : "the replica shard can be allocated on at least one node, so there must be an active primary";
      TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryStore =
          findStore(primaryShard, allocation, shardStores);
      if (primaryStore == null || primaryStore.allocated() == false) {
        // if we can't find the primary data, it is probably because the primary shard is corrupted
        // (and listing failed)
        // we want to let the replica be allocated in order to expose the actual problem with the
        // primary that the replica
        // will try and recover from
        // Note, this is the existing behavior, as exposed in running
        // CorruptFileTest#testNoPrimaryData
        logger.trace(
            "{}: no primary shard store found or allocated, letting actual allocation figure it out",
            shard);
        continue;
      }

      MatchingNodes matchingNodes = findMatchingNodes(shard, allocation, primaryStore, shardStores);

      if (matchingNodes.getNodeWithHighestMatch() != null) {
        RoutingNode nodeWithHighestMatch =
            allocation.routingNodes().node(matchingNodes.getNodeWithHighestMatch().id());
        // we only check on THROTTLE since we checked before before on NO
        Decision decision =
            allocation.deciders().canAllocate(shard, nodeWithHighestMatch, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          logger.debug(
              "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store",
              shard.index(),
              shard.id(),
              shard,
              nodeWithHighestMatch.node());
          // we are throttling this, but we have enough to allocate to this node, ignore it for now
          unassignedIterator.removeAndIgnore();
        } else {
          logger.debug(
              "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store",
              shard.index(),
              shard.id(),
              shard,
              nodeWithHighestMatch.node());
          // we found a match
          changed = true;
          unassignedIterator.initialize(
              nodeWithHighestMatch.nodeId(),
              null,
              allocation
                  .clusterInfo()
                  .getShardSize(shard, ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE));
        }
      } else if (matchingNodes.hasAnyData() == false) {
        // if we didn't manage to find *any* data (regardless of matching sizes), check if the
        // allocation of the replica shard needs to be delayed
        changed |= ignoreUnassignedIfDelayed(unassignedIterator, shard);
      }
    }
    return changed;
  }
  @Override
  public RerouteExplanation execute(RoutingAllocation allocation, boolean explain) {
    final DiscoveryNode discoNode = allocation.nodes().resolveNode(node);
    final RoutingNodes routingNodes = allocation.routingNodes();

    ShardRouting shardRouting = null;
    for (ShardRouting routing : routingNodes.unassigned()) {
      if (routing.shardId().equals(shardId)) {
        // prefer primaries first to allocate
        if (shardRouting == null || routing.primary()) {
          shardRouting = routing;
        }
      }
    }

    if (shardRouting == null) {
      if (explain) {
        return new RerouteExplanation(
            this,
            allocation.decision(
                Decision.NO,
                "allocate_allocation_command",
                "failed to find " + shardId + " on the list of unassigned shards"));
      }
      throw new IllegalArgumentException(
          "[allocate] failed to find " + shardId + " on the list of unassigned shards");
    }

    if (shardRouting.primary() && !allowPrimary) {
      if (explain) {
        return new RerouteExplanation(
            this,
            allocation.decision(
                Decision.NO,
                "allocate_allocation_command",
                "trying to allocate a primary shard " + shardId + ", which is disabled"));
      }
      throw new IllegalArgumentException(
          "[allocate] trying to allocate a primary shard " + shardId + ", which is disabled");
    }

    RoutingNode routingNode = routingNodes.node(discoNode.id());
    if (routingNode == null) {
      if (!discoNode.dataNode()) {
        if (explain) {
          return new RerouteExplanation(
              this,
              allocation.decision(
                  Decision.NO,
                  "allocate_allocation_command",
                  "Allocation can only be done on data nodes, not [" + node + "]"));
        }
        throw new IllegalArgumentException(
            "Allocation can only be done on data nodes, not [" + node + "]");
      } else {
        if (explain) {
          return new RerouteExplanation(
              this,
              allocation.decision(
                  Decision.NO,
                  "allocate_allocation_command",
                  "Could not find [" + node + "] among the routing nodes"));
        }
        throw new IllegalStateException("Could not find [" + node + "] among the routing nodes");
      }
    }

    Decision decision = allocation.deciders().canAllocate(shardRouting, routingNode, allocation);
    if (decision.type() == Decision.Type.NO) {
      if (explain) {
        return new RerouteExplanation(this, decision);
      }
      throw new IllegalArgumentException(
          "[allocate] allocation of "
              + shardId
              + " on node "
              + discoNode
              + " is not allowed, reason: "
              + decision);
    }
    // go over and remove it from the unassigned
    for (RoutingNodes.UnassignedShards.UnassignedIterator it = routingNodes.unassigned().iterator();
        it.hasNext(); ) {
      if (it.next() != shardRouting) {
        continue;
      }
      it.initialize(routingNode.nodeId());
      if (shardRouting.primary()) {
        // we need to clear the post allocation flag, since its an explicit allocation of the
        // primary shard
        // and we want to force allocate it (and create a new index for it)
        routingNodes.addClearPostAllocationFlag(shardRouting.shardId());
      }
      break;
    }
    return new RerouteExplanation(this, decision);
  }