Exemplo n.º 1
0
  private boolean moveShards(RoutingAllocation allocation) {
    boolean changed = false;

    // create a copy of the shards interleaving between nodes, and check if they can remain
    List<MutableShardRouting> shards = new ArrayList<>();
    int index = 0;
    boolean found = true;
    final RoutingNodes routingNodes = allocation.routingNodes();
    while (found) {
      found = false;
      for (RoutingNode routingNode : routingNodes) {
        if (index >= routingNode.size()) {
          continue;
        }
        found = true;
        shards.add(routingNode.get(index));
      }
      index++;
    }
    for (int i = 0; i < shards.size(); i++) {
      MutableShardRouting shardRouting = shards.get(i);
      // we can only move started shards...
      if (!shardRouting.started()) {
        continue;
      }
      final RoutingNode routingNode = routingNodes.node(shardRouting.currentNodeId());
      Decision decision = allocation.deciders().canRemain(shardRouting, routingNode, allocation);
      if (decision.type() == Decision.Type.NO) {
        logger.debug(
            "[{}][{}] allocated on [{}], but can no longer be allocated on it, moving...",
            shardRouting.index(),
            shardRouting.id(),
            routingNode.node());
        boolean moved = shardsAllocators.move(shardRouting, routingNode, allocation);
        if (!moved) {
          logger.debug("[{}][{}] can't move", shardRouting.index(), shardRouting.id());
        } else {
          changed = true;
        }
      }
    }
    return changed;
  }
  @Test
  public void indexLevelShardsLimitRemain() {
    AllocationService strategy =
        new AllocationService(
            settingsBuilder()
                .put("cluster.routing.allocation.concurrent_recoveries", 10)
                .put("cluster.routing.allocation.node_initial_primaries_recoveries", 10)
                .put("cluster.routing.allocation.cluster_concurrent_rebalance", -1)
                .put("cluster.routing.allocation.balance.index", 0.0f)
                .put("cluster.routing.allocation.balance.replica", 1.0f)
                .put("cluster.routing.allocation.balance.primary", 0.0f)
                .build());

    logger.info("Building initial routing table");

    MetaData metaData =
        newMetaDataBuilder()
            .put(
                newIndexMetaDataBuilder("test")
                    .settings(
                        ImmutableSettings.settingsBuilder()
                            .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 5)
                            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)))
            .build();

    RoutingTable routingTable = routingTable().addAsNew(metaData.index("test")).build();

    ClusterState clusterState =
        newClusterStateBuilder().metaData(metaData).routingTable(routingTable).build();
    logger.info("Adding one node and reroute");
    clusterState =
        newClusterStateBuilder()
            .state(clusterState)
            .nodes(newNodesBuilder().put(newNode("node1")))
            .build();
    routingTable = strategy.reroute(clusterState).routingTable();
    clusterState = newClusterStateBuilder().state(clusterState).routingTable(routingTable).build();

    logger.info("Start the primary shards");
    RoutingNodes routingNodes = clusterState.routingNodes();
    routingTable =
        strategy
            .applyStartedShards(clusterState, routingNodes.shardsWithState(INITIALIZING))
            .routingTable();
    clusterState = newClusterStateBuilder().state(clusterState).routingTable(routingTable).build();

    assertThat(clusterState.readOnlyRoutingNodes().numberOfShardsOfType(STARTED), equalTo(5));

    logger.info("add another index with 5 shards");
    metaData =
        newMetaDataBuilder()
            .metaData(metaData)
            .put(
                newIndexMetaDataBuilder("test1")
                    .settings(
                        ImmutableSettings.settingsBuilder()
                            .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 5)
                            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)))
            .build();
    routingTable =
        routingTable().routingTable(routingTable).addAsNew(metaData.index("test1")).build();

    clusterState =
        newClusterStateBuilder()
            .state(clusterState)
            .metaData(metaData)
            .routingTable(routingTable)
            .build();

    logger.info("Add another one node and reroute");
    clusterState =
        newClusterStateBuilder()
            .state(clusterState)
            .nodes(newNodesBuilder().putAll(clusterState.nodes()).put(newNode("node2")))
            .build();
    routingTable = strategy.reroute(clusterState).routingTable();
    clusterState = newClusterStateBuilder().state(clusterState).routingTable(routingTable).build();

    routingNodes = clusterState.routingNodes();
    routingTable =
        strategy
            .applyStartedShards(clusterState, routingNodes.shardsWithState(INITIALIZING))
            .routingTable();
    clusterState = newClusterStateBuilder().state(clusterState).routingTable(routingTable).build();

    assertThat(clusterState.readOnlyRoutingNodes().numberOfShardsOfType(STARTED), equalTo(10));

    for (MutableShardRouting shardRouting : clusterState.readOnlyRoutingNodes().node("node1")) {
      assertThat(shardRouting.index(), equalTo("test"));
    }
    for (MutableShardRouting shardRouting : clusterState.readOnlyRoutingNodes().node("node2")) {
      assertThat(shardRouting.index(), equalTo("test1"));
    }

    logger.info(
        "update "
            + ShardsLimitAllocationDecider.INDEX_TOTAL_SHARDS_PER_NODE
            + " for test, see that things move");
    metaData =
        newMetaDataBuilder()
            .metaData(metaData)
            .put(
                newIndexMetaDataBuilder("test")
                    .settings(
                        ImmutableSettings.settingsBuilder()
                            .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 5)
                            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
                            .put(ShardsLimitAllocationDecider.INDEX_TOTAL_SHARDS_PER_NODE, 3)))
            .build();

    clusterState = newClusterStateBuilder().state(clusterState).metaData(metaData).build();

    logger.info("reroute after setting");
    routingTable = strategy.reroute(clusterState).routingTable();
    clusterState = newClusterStateBuilder().state(clusterState).routingTable(routingTable).build();

    assertThat(
        clusterState.readOnlyRoutingNodes().node("node1").numberOfShardsWithState(STARTED),
        equalTo(3));
    assertThat(
        clusterState.readOnlyRoutingNodes().node("node1").numberOfShardsWithState(RELOCATING),
        equalTo(2));
    assertThat(
        clusterState.readOnlyRoutingNodes().node("node2").numberOfShardsWithState(RELOCATING),
        equalTo(2));
    assertThat(
        clusterState.readOnlyRoutingNodes().node("node2").numberOfShardsWithState(STARTED),
        equalTo(3));
    // the first move will destroy the balance and the balancer will move 2 shards from node2 to
    // node one right after
    // moving the nodes to node2 since we consider INITIALIZING nodes during rebalance
    routingNodes = clusterState.routingNodes();
    routingTable =
        strategy
            .applyStartedShards(clusterState, routingNodes.shardsWithState(INITIALIZING))
            .routingTable();
    clusterState = newClusterStateBuilder().state(clusterState).routingTable(routingTable).build();
    // now we are done compared to EvenShardCountAllocator since the Balancer is not soely based on
    // the average
    assertThat(
        clusterState.readOnlyRoutingNodes().node("node1").numberOfShardsWithState(STARTED),
        equalTo(5));
    assertThat(
        clusterState.readOnlyRoutingNodes().node("node2").numberOfShardsWithState(STARTED),
        equalTo(5));
  }
  @Test
  public void testMultiIndexEvenDistribution() {
    AllocationService strategy =
        createAllocationService(
            settingsBuilder()
                .put("cluster.routing.allocation.concurrent_recoveries", 10)
                .put("cluster.routing.allocation.allow_rebalance", "always")
                .put("cluster.routing.allocation.cluster_concurrent_rebalance", -1)
                .build());

    final int numberOfIndices = 50;
    logger.info("Building initial routing table with " + numberOfIndices + " indices");

    MetaData.Builder metaDataBuilder = MetaData.builder();
    for (int i = 0; i < numberOfIndices; i++) {
      metaDataBuilder.put(IndexMetaData.builder("test" + i).numberOfShards(1).numberOfReplicas(0));
    }
    MetaData metaData = metaDataBuilder.build();

    RoutingTable.Builder routingTableBuilder = RoutingTable.builder();
    for (int i = 0; i < numberOfIndices; i++) {
      routingTableBuilder.addAsNew(metaData.index("test" + i));
    }
    RoutingTable routingTable = routingTableBuilder.build();
    ClusterState clusterState =
        ClusterState.builder().metaData(metaData).routingTable(routingTable).build();

    assertThat(routingTable.indicesRouting().size(), equalTo(numberOfIndices));
    for (int i = 0; i < numberOfIndices; i++) {
      assertThat(routingTable.index("test" + i).shards().size(), equalTo(1));
      assertThat(routingTable.index("test" + i).shard(0).size(), equalTo(1));
      assertThat(routingTable.index("test" + i).shard(0).shards().size(), equalTo(1));
      assertThat(
          routingTable.index("test" + i).shard(0).shards().get(0).state(), equalTo(UNASSIGNED));
      assertThat(
          routingTable.index("test" + i).shard(0).shards().get(0).currentNodeId(), nullValue());
    }

    logger.info("Adding " + (numberOfIndices / 2) + " nodes");
    DiscoveryNodes.Builder nodesBuilder = DiscoveryNodes.builder();
    List<DiscoveryNode> nodes = newArrayList();
    for (int i = 0; i < (numberOfIndices / 2); i++) {
      nodesBuilder.put(newNode("node" + i));
    }
    RoutingTable prevRoutingTable = routingTable;
    clusterState = ClusterState.builder(clusterState).nodes(nodesBuilder).build();
    routingTable = strategy.reroute(clusterState).routingTable();
    clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();

    assertThat(prevRoutingTable != routingTable, equalTo(true));
    for (int i = 0; i < numberOfIndices; i++) {
      assertThat(routingTable.index("test" + i).shards().size(), equalTo(1));
      assertThat(routingTable.index("test" + i).shard(0).size(), equalTo(1));
      assertThat(routingTable.index("test" + i).shard(0).shards().size(), equalTo(1));
      assertThat(
          routingTable.index("test" + i).shard(0).shards().get(0).unassigned(), equalTo(false));
      assertThat(
          routingTable.index("test" + i).shard(0).shards().get(0).state(), equalTo(INITIALIZING));
      assertThat(routingTable.index("test" + i).shard(0).shards().get(0).primary(), equalTo(true));
      // make sure we still have 2 shards initializing per node on the first 25 nodes
      String nodeId = routingTable.index("test" + i).shard(0).shards().get(0).currentNodeId();
      int nodeIndex = Integer.parseInt(nodeId.substring("node".length()));
      assertThat(nodeIndex, lessThan(25));
    }
    RoutingNodes routingNodes = clusterState.routingNodes();
    Set<String> encounteredIndices = newHashSet();
    for (RoutingNode routingNode : routingNodes) {
      assertThat(routingNode.numberOfShardsWithState(STARTED), equalTo(0));
      assertThat(routingNode.size(), equalTo(2));
      // make sure we still have 2 shards initializing per node on the only 25 nodes
      int nodeIndex = Integer.parseInt(routingNode.nodeId().substring("node".length()));
      assertThat(nodeIndex, lessThan(25));
      // check that we don't have a shard associated with a node with the same index name (we have a
      // single shard)
      for (MutableShardRouting shardRoutingEntry : routingNode) {
        assertThat(encounteredIndices, not(hasItem(shardRoutingEntry.index())));
        encounteredIndices.add(shardRoutingEntry.index());
      }
    }

    logger.info("Adding additional " + (numberOfIndices / 2) + " nodes, nothing should change");
    nodesBuilder = DiscoveryNodes.builder(clusterState.nodes());
    for (int i = (numberOfIndices / 2); i < numberOfIndices; i++) {
      nodesBuilder.put(newNode("node" + i));
    }
    prevRoutingTable = routingTable;
    clusterState = ClusterState.builder(clusterState).nodes(nodesBuilder).build();
    routingTable = strategy.reroute(clusterState).routingTable();
    clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();

    assertThat(prevRoutingTable != routingTable, equalTo(false));

    logger.info("Marking the shard as started");
    prevRoutingTable = routingTable;
    routingTable =
        strategy
            .applyStartedShards(clusterState, routingNodes.shardsWithState(INITIALIZING))
            .routingTable();
    clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();

    assertThat(prevRoutingTable != routingTable, equalTo(true));
    int numberOfRelocatingShards = 0;
    int numberOfStartedShards = 0;
    for (int i = 0; i < numberOfIndices; i++) {
      assertThat(routingTable.index("test" + i).shards().size(), equalTo(1));
      assertThat(routingTable.index("test" + i).shard(0).size(), equalTo(1));
      assertThat(routingTable.index("test" + i).shard(0).shards().size(), equalTo(1));
      assertThat(
          routingTable.index("test" + i).shard(0).shards().get(0).unassigned(), equalTo(false));
      assertThat(
          routingTable.index("test" + i).shard(0).shards().get(0).state(),
          anyOf(equalTo(STARTED), equalTo(RELOCATING)));
      if (routingTable.index("test" + i).shard(0).shards().get(0).state() == STARTED) {
        numberOfStartedShards++;
      } else if (routingTable.index("test" + i).shard(0).shards().get(0).state() == RELOCATING) {
        numberOfRelocatingShards++;
      }
      assertThat(routingTable.index("test" + i).shard(0).shards().get(0).primary(), equalTo(true));
      // make sure we still have 2 shards either relocating or started on the first 25 nodes (still)
      String nodeId = routingTable.index("test" + i).shard(0).shards().get(0).currentNodeId();
      int nodeIndex = Integer.parseInt(nodeId.substring("node".length()));
      assertThat(nodeIndex, lessThan(25));
    }
    assertThat(numberOfRelocatingShards, equalTo(25));
    assertThat(numberOfStartedShards, equalTo(25));
  }
  @Override
  public void execute(RoutingAllocation allocation) throws ElasticSearchException {
    DiscoveryNode discoNode = allocation.nodes().resolveNode(node);
    boolean found = false;
    for (RoutingNodes.RoutingNodeIterator it =
            allocation.routingNodes().routingNodeIter(discoNode.id());
        it.hasNext(); ) {
      MutableShardRouting shardRouting = it.next();
      if (!shardRouting.shardId().equals(shardId)) {
        continue;
      }
      found = true;
      if (shardRouting.relocatingNodeId() != null) {
        if (shardRouting.initializing()) {
          // the shard is initializing and recovering from another node, simply cancel the recovery
          it.remove();
          // and cancel the relocating state from the shard its being relocated from
          RoutingNode relocatingFromNode =
              allocation.routingNodes().node(shardRouting.relocatingNodeId());
          if (relocatingFromNode != null) {
            for (MutableShardRouting fromShardRouting : relocatingFromNode) {
              if (fromShardRouting.shardId().equals(shardRouting.shardId())
                  && fromShardRouting.state() == RELOCATING) {
                allocation.routingNodes().cancelRelocation(fromShardRouting);
                break;
              }
            }
          }
        } else if (shardRouting.relocating()) {

          // the shard is relocating to another node, cancel the recovery on the other node, and
          // deallocate this one
          if (!allowPrimary && shardRouting.primary()) {
            // can't cancel a primary shard being initialized
            throw new ElasticSearchIllegalArgumentException(
                "[cancel_allocation] can't cancel "
                    + shardId
                    + " on node "
                    + discoNode
                    + ", shard is primary and initializing its state");
          }
          it.moveToUnassigned();
          // now, go and find the shard that is initializing on the target node, and cancel it as
          // well...
          RoutingNodes.RoutingNodeIterator initializingNode =
              allocation.routingNodes().routingNodeIter(shardRouting.relocatingNodeId());
          if (initializingNode != null) {
            while (initializingNode.hasNext()) {
              MutableShardRouting initializingShardRouting = initializingNode.next();
              if (initializingShardRouting.shardId().equals(shardRouting.shardId())
                  && initializingShardRouting.state() == INITIALIZING) {
                initializingNode.remove();
              }
            }
          }
        }
      } else {
        // the shard is not relocating, its either started, or initializing, just cancel it and move
        // on...
        if (!allowPrimary && shardRouting.primary()) {
          // can't cancel a primary shard being initialized
          throw new ElasticSearchIllegalArgumentException(
              "[cancel_allocation] can't cancel "
                  + shardId
                  + " on node "
                  + discoNode
                  + ", shard is primary and started");
        }
        it.remove();
        allocation
            .routingNodes()
            .unassigned()
            .add(
                new MutableShardRouting(
                    shardRouting.index(),
                    shardRouting.id(),
                    null,
                    shardRouting.primary(),
                    ShardRoutingState.UNASSIGNED,
                    shardRouting.version() + 1));
      }
    }
    if (!found) {
      throw new ElasticSearchIllegalArgumentException(
          "[cancel_allocation] can't cancel "
              + shardId
              + ", failed to find it on node "
              + discoNode);
    }
  }
  public boolean allocateUnassigned(RoutingAllocation allocation) {
    boolean changed = false;
    DiscoveryNodes nodes = allocation.nodes();
    RoutingNodes routingNodes = allocation.routingNodes();

    // First, handle primaries, they must find a place to be allocated on here
    Iterator<MutableShardRouting> unassignedIterator = routingNodes.unassigned().iterator();
    while (unassignedIterator.hasNext()) {
      MutableShardRouting shard = unassignedIterator.next();

      if (!shard.primary()) {
        continue;
      }

      // this is an API allocation, ignore since we know there is no data...
      if (!routingNodes
          .routingTable()
          .index(shard.index())
          .shard(shard.id())
          .primaryAllocatedPostApi()) {
        continue;
      }

      ObjectLongOpenHashMap<DiscoveryNode> nodesState = buildShardStates(nodes, shard);

      int numberOfAllocationsFound = 0;
      long highestVersion = -1;
      Set<DiscoveryNode> nodesWithHighestVersion = Sets.newHashSet();
      final boolean[] states = nodesState.allocated;
      final Object[] keys = nodesState.keys;
      final long[] values = nodesState.values;
      for (int i = 0; i < states.length; i++) {
        if (!states[i]) {
          continue;
        }

        DiscoveryNode node = (DiscoveryNode) keys[i];
        long version = values[i];
        // since we don't check in NO allocation, we need to double check here
        if (allocation.shouldIgnoreShardForNode(shard.shardId(), node.id())) {
          continue;
        }
        if (version != -1) {
          numberOfAllocationsFound++;
          if (highestVersion == -1) {
            nodesWithHighestVersion.add(node);
            highestVersion = version;
          } else {
            if (version > highestVersion) {
              nodesWithHighestVersion.clear();
              nodesWithHighestVersion.add(node);
              highestVersion = version;
            } else if (version == highestVersion) {
              nodesWithHighestVersion.add(node);
            }
          }
        }
      }

      // check if the counts meets the minimum set
      int requiredAllocation = 1;
      // if we restore from a repository one copy is more then enough
      if (shard.restoreSource() == null) {
        try {
          IndexMetaData indexMetaData = routingNodes.metaData().index(shard.index());
          String initialShards =
              indexMetaData
                  .settings()
                  .get(
                      INDEX_RECOVERY_INITIAL_SHARDS,
                      settings.get(INDEX_RECOVERY_INITIAL_SHARDS, this.initialShards));
          if ("quorum".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 1) {
              requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2) + 1;
            }
          } else if ("quorum-1".equals(initialShards) || "half".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 2) {
              requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2);
            }
          } else if ("one".equals(initialShards)) {
            requiredAllocation = 1;
          } else if ("full".equals(initialShards) || "all".equals(initialShards)) {
            requiredAllocation = indexMetaData.numberOfReplicas() + 1;
          } else if ("full-1".equals(initialShards) || "all-1".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 1) {
              requiredAllocation = indexMetaData.numberOfReplicas();
            }
          } else {
            requiredAllocation = Integer.parseInt(initialShards);
          }
        } catch (Exception e) {
          logger.warn(
              "[{}][{}] failed to derived initial_shards from value {}, ignore allocation for {}",
              shard.index(),
              shard.id(),
              initialShards,
              shard);
        }
      }

      // not enough found for this shard, continue...
      if (numberOfAllocationsFound < requiredAllocation) {
        // if we are restoring this shard we still can allocate
        if (shard.restoreSource() == null) {
          // we can't really allocate, so ignore it and continue
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: not allocating, number_of_allocated_shards_found [{}], required_number [{}]",
                shard.index(),
                shard.id(),
                numberOfAllocationsFound,
                requiredAllocation);
          }
        } else if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: missing local data, will restore from [{}]",
              shard.index(),
              shard.id(),
              shard.restoreSource());
        }
        continue;
      }

      Set<DiscoveryNode> throttledNodes = Sets.newHashSet();
      Set<DiscoveryNode> noNodes = Sets.newHashSet();
      for (DiscoveryNode discoNode : nodesWithHighestVersion) {
        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          throttledNodes.add(discoNode);
        } else if (decision.type() == Decision.Type.NO) {
          noNodes.add(discoNode);
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] on primary allocation",
                shard.index(),
                shard.id(),
                shard,
                discoNode);
          }
          // we found a match
          changed = true;
          // make sure we create one with the version from the recovered state
          allocation
              .routingNodes()
              .assign(new MutableShardRouting(shard, highestVersion), node.nodeId());
          unassignedIterator.remove();

          // found a node, so no throttling, no "no", and break out of the loop
          throttledNodes.clear();
          noNodes.clear();
          break;
        }
      }
      if (throttledNodes.isEmpty()) {
        // if we have a node that we "can't" allocate to, force allocation, since this is our master
        // data!
        if (!noNodes.isEmpty()) {
          DiscoveryNode discoNode = noNodes.iterator().next();
          RoutingNode node = routingNodes.node(discoNode.id());
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: forcing allocating [{}] to [{}] on primary allocation",
                shard.index(),
                shard.id(),
                shard,
                discoNode);
          }
          // we found a match
          changed = true;
          // make sure we create one with the version from the recovered state
          allocation
              .routingNodes()
              .assign(new MutableShardRouting(shard, highestVersion), node.nodeId());
          unassignedIterator.remove();
        }
      } else {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: throttling allocation [{}] to [{}] on primary allocation",
              shard.index(),
              shard.id(),
              shard,
              throttledNodes);
        }
        // we are throttling this, but we have enough to allocate to this node, ignore it for now
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
      }
    }

    if (!routingNodes.hasUnassigned()) {
      return changed;
    }

    // Now, handle replicas, try to assign them to nodes that are similar to the one the primary was
    // allocated on
    unassignedIterator = routingNodes.unassigned().iterator();
    while (unassignedIterator.hasNext()) {
      MutableShardRouting shard = unassignedIterator.next();

      // pre-check if it can be allocated to any node that currently exists, so we won't list the
      // store for it for nothing
      boolean canBeAllocatedToAtLeastOneNode = false;
      for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
        RoutingNode node = routingNodes.node(cursor.value.id());
        if (node == null) {
          continue;
        }
        // if we can't allocate it on a node, ignore it, for example, this handles
        // cases for only allocating a replica after a primary
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.YES) {
          canBeAllocatedToAtLeastOneNode = true;
          break;
        }
      }

      if (!canBeAllocatedToAtLeastOneNode) {
        continue;
      }

      Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> shardStores =
          buildShardStores(nodes, shard);

      long lastSizeMatched = 0;
      DiscoveryNode lastDiscoNodeMatched = null;
      RoutingNode lastNodeMatched = null;

      for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData>
          nodeStoreEntry : shardStores.entrySet()) {
        DiscoveryNode discoNode = nodeStoreEntry.getKey();
        TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData =
            nodeStoreEntry.getValue();
        logger.trace("{}: checking node [{}]", shard, discoNode);

        if (storeFilesMetaData == null) {
          // already allocated on that node...
          continue;
        }

        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        // check if we can allocate on that node...
        // we only check for NO, since if this node is THROTTLING and it has enough "same data"
        // then we will try and assign it next time
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.NO) {
          continue;
        }

        // if it is already allocated, we can't assign to it...
        if (storeFilesMetaData.allocated()) {
          continue;
        }

        if (!shard.primary()) {
          MutableShardRouting primaryShard = routingNodes.activePrimary(shard);
          if (primaryShard != null) {
            assert primaryShard.active();
            DiscoveryNode primaryNode = nodes.get(primaryShard.currentNodeId());
            if (primaryNode != null) {
              TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore =
                  shardStores.get(primaryNode);
              if (primaryNodeStore != null && primaryNodeStore.allocated()) {
                long sizeMatched = 0;

                for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
                  if (primaryNodeStore.fileExists(storeFileMetaData.name())
                      && primaryNodeStore
                          .file(storeFileMetaData.name())
                          .isSame(storeFileMetaData)) {
                    sizeMatched += storeFileMetaData.length();
                  }
                }
                logger.trace(
                    "{}: node [{}] has [{}/{}] bytes of re-usable data",
                    shard,
                    discoNode.name(),
                    new ByteSizeValue(sizeMatched),
                    sizeMatched);
                if (sizeMatched > lastSizeMatched) {
                  lastSizeMatched = sizeMatched;
                  lastDiscoNodeMatched = discoNode;
                  lastNodeMatched = node;
                }
              }
            }
          }
        }
      }

      if (lastNodeMatched != null) {
        // we only check on THROTTLE since we checked before before on NO
        Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we are throttling this, but we have enough to allocate to this node, ignore it for now
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we found a match
          changed = true;
          allocation.routingNodes().assign(shard, lastNodeMatched.nodeId());
          unassignedIterator.remove();
        }
      }
    }
    return changed;
  }