private Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData>
      buildShardStores(DiscoveryNodes nodes, MutableShardRouting shard) {
    Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> shardStores =
        cachedStores.get(shard.shardId());
    ObjectOpenHashSet<String> nodesIds;
    if (shardStores == null) {
      shardStores = Maps.newHashMap();
      cachedStores.put(shard.shardId(), shardStores);
      nodesIds = ObjectOpenHashSet.from(nodes.dataNodes().keys());
    } else {
      nodesIds = ObjectOpenHashSet.newInstance();
      // clean nodes that have failed
      for (Iterator<DiscoveryNode> it = shardStores.keySet().iterator(); it.hasNext(); ) {
        DiscoveryNode node = it.next();
        if (!nodes.nodeExists(node.id())) {
          it.remove();
        }
      }

      for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
        DiscoveryNode node = cursor.value;
        if (!shardStores.containsKey(node)) {
          nodesIds.add(node.id());
        }
      }
    }

    if (!nodesIds.isEmpty()) {
      String[] nodesIdsArray = nodesIds.toArray(String.class);
      TransportNodesListShardStoreMetaData.NodesStoreFilesMetaData nodesStoreFilesMetaData =
          listShardStoreMetaData
              .list(shard.shardId(), false, nodesIdsArray, listTimeout)
              .actionGet();
      if (logger.isTraceEnabled()) {
        if (nodesStoreFilesMetaData.failures().length > 0) {
          StringBuilder sb =
              new StringBuilder(shard + ": failures when trying to list stores on nodes:");
          for (int i = 0; i < nodesStoreFilesMetaData.failures().length; i++) {
            Throwable cause = ExceptionsHelper.unwrapCause(nodesStoreFilesMetaData.failures()[i]);
            if (cause instanceof ConnectTransportException) {
              continue;
            }
            sb.append("\n    -> ")
                .append(nodesStoreFilesMetaData.failures()[i].getDetailedMessage());
          }
          logger.trace(sb.toString());
        }
      }

      for (TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData nodeStoreFilesMetaData :
          nodesStoreFilesMetaData) {
        if (nodeStoreFilesMetaData.storeFilesMetaData() != null) {
          shardStores.put(
              nodeStoreFilesMetaData.getNode(), nodeStoreFilesMetaData.storeFilesMetaData());
        }
      }
    }

    return shardStores;
  }
 @Override
 public boolean canRebalance(ShardRouting shardRouting, RoutingAllocation allocation) {
   if (clusterConcurrentRebalance == -1) {
     return true;
   }
   int rebalance = 0;
   for (RoutingNode node : allocation.routingNodes()) {
     for (MutableShardRouting shard : node) {
       if (shard.state() == ShardRoutingState.RELOCATING) {
         rebalance++;
       }
     }
   }
   if (rebalance >= clusterConcurrentRebalance) {
     return false;
   }
   return true;
 }
Пример #3
0
  private boolean applyStartedShards(
      RoutingNodes routingNodes, Iterable<? extends ShardRouting> startedShardEntries) {
    boolean dirty = false;
    // apply shards might be called several times with the same shard, ignore it
    for (ShardRouting startedShard : startedShardEntries) {
      assert startedShard.state() == INITIALIZING;

      // retrieve the relocating node id before calling startedShard().
      String relocatingNodeId = null;

      RoutingNodes.RoutingNodeIterator currentRoutingNode =
          routingNodes.routingNodeIter(startedShard.currentNodeId());
      if (currentRoutingNode != null) {
        for (MutableShardRouting shard : currentRoutingNode) {
          if (shard.shardId().equals(startedShard.shardId())) {
            relocatingNodeId = shard.relocatingNodeId();
            if (!shard.started()) {
              dirty = true;
              routingNodes.started(shard);
            }
            break;
          }
        }
      }

      // startedShard is the current state of the shard (post relocation for example)
      // this means that after relocation, the state will be started and the currentNodeId will be
      // the node we relocated to

      if (relocatingNodeId == null) {
        continue;
      }

      RoutingNodes.RoutingNodeIterator sourceRoutingNode =
          routingNodes.routingNodeIter(relocatingNodeId);
      if (sourceRoutingNode != null) {
        while (sourceRoutingNode.hasNext()) {
          MutableShardRouting shard = sourceRoutingNode.next();
          if (shard.shardId().equals(startedShard.shardId())) {
            if (shard.relocating()) {
              dirty = true;
              sourceRoutingNode.remove();
              break;
            }
          }
        }
      }
    }
    return dirty;
  }
Пример #4
0
  private boolean moveShards(RoutingAllocation allocation) {
    boolean changed = false;

    // create a copy of the shards interleaving between nodes, and check if they can remain
    List<MutableShardRouting> shards = new ArrayList<>();
    int index = 0;
    boolean found = true;
    final RoutingNodes routingNodes = allocation.routingNodes();
    while (found) {
      found = false;
      for (RoutingNode routingNode : routingNodes) {
        if (index >= routingNode.size()) {
          continue;
        }
        found = true;
        shards.add(routingNode.get(index));
      }
      index++;
    }
    for (int i = 0; i < shards.size(); i++) {
      MutableShardRouting shardRouting = shards.get(i);
      // we can only move started shards...
      if (!shardRouting.started()) {
        continue;
      }
      final RoutingNode routingNode = routingNodes.node(shardRouting.currentNodeId());
      Decision decision = allocation.deciders().canRemain(shardRouting, routingNode, allocation);
      if (decision.type() == Decision.Type.NO) {
        logger.debug(
            "[{}][{}] allocated on [{}], but can no longer be allocated on it, moving...",
            shardRouting.index(),
            shardRouting.id(),
            routingNode.node());
        boolean moved = shardsAllocators.move(shardRouting, routingNode, allocation);
        if (!moved) {
          logger.debug("[{}][{}] can't move", shardRouting.index(), shardRouting.id());
        } else {
          changed = true;
        }
      }
    }
    return changed;
  }
  @Override
  public void execute(RoutingAllocation allocation) throws ElasticSearchException {
    DiscoveryNode discoNode = allocation.nodes().resolveNode(node);

    MutableShardRouting shardRouting = null;
    for (MutableShardRouting routing : allocation.routingNodes().unassigned()) {
      if (routing.shardId().equals(shardId)) {
        // prefer primaries first to allocate
        if (shardRouting == null || routing.primary()) {
          shardRouting = routing;
        }
      }
    }

    if (shardRouting == null) {
      throw new ElasticSearchIllegalArgumentException(
          "[allocate] failed to find " + shardId + " on the list of unassigned shards");
    }

    if (shardRouting.primary() && !allowPrimary) {
      throw new ElasticSearchIllegalArgumentException(
          "[allocate] trying to allocate a primary shard " + shardId + "], which is disabled");
    }

    RoutingNode routingNode = allocation.routingNodes().node(discoNode.id());
    allocation.addIgnoreDisable(shardRouting.shardId(), routingNode.nodeId());
    if (!allocation.deciders().canAllocate(shardRouting, routingNode, allocation).allowed()) {
      throw new ElasticSearchIllegalArgumentException(
          "[allocate] allocation of " + shardId + " on node " + discoNode + " is not allowed");
    }
    // go over and remove it from the unassigned
    for (Iterator<MutableShardRouting> it = allocation.routingNodes().unassigned().iterator();
        it.hasNext(); ) {
      if (it.next() != shardRouting) {
        continue;
      }
      it.remove();
      routingNode.add(shardRouting);
      break;
    }
  }
  @Test
  public void indexLevelShardsLimitRemain() {
    AllocationService strategy =
        new AllocationService(
            settingsBuilder()
                .put("cluster.routing.allocation.concurrent_recoveries", 10)
                .put("cluster.routing.allocation.node_initial_primaries_recoveries", 10)
                .put("cluster.routing.allocation.cluster_concurrent_rebalance", -1)
                .put("cluster.routing.allocation.balance.index", 0.0f)
                .put("cluster.routing.allocation.balance.replica", 1.0f)
                .put("cluster.routing.allocation.balance.primary", 0.0f)
                .build());

    logger.info("Building initial routing table");

    MetaData metaData =
        newMetaDataBuilder()
            .put(
                newIndexMetaDataBuilder("test")
                    .settings(
                        ImmutableSettings.settingsBuilder()
                            .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 5)
                            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)))
            .build();

    RoutingTable routingTable = routingTable().addAsNew(metaData.index("test")).build();

    ClusterState clusterState =
        newClusterStateBuilder().metaData(metaData).routingTable(routingTable).build();
    logger.info("Adding one node and reroute");
    clusterState =
        newClusterStateBuilder()
            .state(clusterState)
            .nodes(newNodesBuilder().put(newNode("node1")))
            .build();
    routingTable = strategy.reroute(clusterState).routingTable();
    clusterState = newClusterStateBuilder().state(clusterState).routingTable(routingTable).build();

    logger.info("Start the primary shards");
    RoutingNodes routingNodes = clusterState.routingNodes();
    routingTable =
        strategy
            .applyStartedShards(clusterState, routingNodes.shardsWithState(INITIALIZING))
            .routingTable();
    clusterState = newClusterStateBuilder().state(clusterState).routingTable(routingTable).build();

    assertThat(clusterState.readOnlyRoutingNodes().numberOfShardsOfType(STARTED), equalTo(5));

    logger.info("add another index with 5 shards");
    metaData =
        newMetaDataBuilder()
            .metaData(metaData)
            .put(
                newIndexMetaDataBuilder("test1")
                    .settings(
                        ImmutableSettings.settingsBuilder()
                            .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 5)
                            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)))
            .build();
    routingTable =
        routingTable().routingTable(routingTable).addAsNew(metaData.index("test1")).build();

    clusterState =
        newClusterStateBuilder()
            .state(clusterState)
            .metaData(metaData)
            .routingTable(routingTable)
            .build();

    logger.info("Add another one node and reroute");
    clusterState =
        newClusterStateBuilder()
            .state(clusterState)
            .nodes(newNodesBuilder().putAll(clusterState.nodes()).put(newNode("node2")))
            .build();
    routingTable = strategy.reroute(clusterState).routingTable();
    clusterState = newClusterStateBuilder().state(clusterState).routingTable(routingTable).build();

    routingNodes = clusterState.routingNodes();
    routingTable =
        strategy
            .applyStartedShards(clusterState, routingNodes.shardsWithState(INITIALIZING))
            .routingTable();
    clusterState = newClusterStateBuilder().state(clusterState).routingTable(routingTable).build();

    assertThat(clusterState.readOnlyRoutingNodes().numberOfShardsOfType(STARTED), equalTo(10));

    for (MutableShardRouting shardRouting : clusterState.readOnlyRoutingNodes().node("node1")) {
      assertThat(shardRouting.index(), equalTo("test"));
    }
    for (MutableShardRouting shardRouting : clusterState.readOnlyRoutingNodes().node("node2")) {
      assertThat(shardRouting.index(), equalTo("test1"));
    }

    logger.info(
        "update "
            + ShardsLimitAllocationDecider.INDEX_TOTAL_SHARDS_PER_NODE
            + " for test, see that things move");
    metaData =
        newMetaDataBuilder()
            .metaData(metaData)
            .put(
                newIndexMetaDataBuilder("test")
                    .settings(
                        ImmutableSettings.settingsBuilder()
                            .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 5)
                            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
                            .put(ShardsLimitAllocationDecider.INDEX_TOTAL_SHARDS_PER_NODE, 3)))
            .build();

    clusterState = newClusterStateBuilder().state(clusterState).metaData(metaData).build();

    logger.info("reroute after setting");
    routingTable = strategy.reroute(clusterState).routingTable();
    clusterState = newClusterStateBuilder().state(clusterState).routingTable(routingTable).build();

    assertThat(
        clusterState.readOnlyRoutingNodes().node("node1").numberOfShardsWithState(STARTED),
        equalTo(3));
    assertThat(
        clusterState.readOnlyRoutingNodes().node("node1").numberOfShardsWithState(RELOCATING),
        equalTo(2));
    assertThat(
        clusterState.readOnlyRoutingNodes().node("node2").numberOfShardsWithState(RELOCATING),
        equalTo(2));
    assertThat(
        clusterState.readOnlyRoutingNodes().node("node2").numberOfShardsWithState(STARTED),
        equalTo(3));
    // the first move will destroy the balance and the balancer will move 2 shards from node2 to
    // node one right after
    // moving the nodes to node2 since we consider INITIALIZING nodes during rebalance
    routingNodes = clusterState.routingNodes();
    routingTable =
        strategy
            .applyStartedShards(clusterState, routingNodes.shardsWithState(INITIALIZING))
            .routingTable();
    clusterState = newClusterStateBuilder().state(clusterState).routingTable(routingTable).build();
    // now we are done compared to EvenShardCountAllocator since the Balancer is not soely based on
    // the average
    assertThat(
        clusterState.readOnlyRoutingNodes().node("node1").numberOfShardsWithState(STARTED),
        equalTo(5));
    assertThat(
        clusterState.readOnlyRoutingNodes().node("node2").numberOfShardsWithState(STARTED),
        equalTo(5));
  }
  @Test
  public void sameHost() {
    AllocationService strategy =
        new AllocationService(
            settingsBuilder().put(SameShardAllocationDecider.SAME_HOST_SETTING, true).build());

    MetaData metaData =
        newMetaDataBuilder()
            .put(newIndexMetaDataBuilder("test").numberOfShards(2).numberOfReplicas(1))
            .build();

    RoutingTable routingTable = routingTable().addAsNew(metaData.index("test")).build();
    ClusterState clusterState =
        newClusterStateBuilder().metaData(metaData).routingTable(routingTable).build();

    logger.info("--> adding two nodes with the same host");
    clusterState =
        newClusterStateBuilder()
            .state(clusterState)
            .nodes(
                newNodesBuilder()
                    .put(newNode("node1", new InetSocketTransportAddress("test1", 80)))
                    .put(newNode("node2", new InetSocketTransportAddress("test1", 80))))
            .build();
    routingTable = strategy.reroute(clusterState).routingTable();
    clusterState = newClusterStateBuilder().state(clusterState).routingTable(routingTable).build();

    assertThat(
        clusterState.readOnlyRoutingNodes().numberOfShardsOfType(ShardRoutingState.INITIALIZING),
        equalTo(2));

    logger.info(
        "--> start all primary shards, no replica will be started since its on the same host");
    routingTable =
        strategy
            .applyStartedShards(
                clusterState, clusterState.readOnlyRoutingNodes().shardsWithState(INITIALIZING))
            .routingTable();
    clusterState = newClusterStateBuilder().state(clusterState).routingTable(routingTable).build();

    assertThat(
        clusterState.readOnlyRoutingNodes().numberOfShardsOfType(ShardRoutingState.STARTED),
        equalTo(2));
    assertThat(
        clusterState.readOnlyRoutingNodes().numberOfShardsOfType(ShardRoutingState.INITIALIZING),
        equalTo(0));

    logger.info("--> add another node, with a different host, replicas will be allocating");
    clusterState =
        newClusterStateBuilder()
            .state(clusterState)
            .nodes(
                newNodesBuilder()
                    .putAll(clusterState.nodes())
                    .put(newNode("node3", new InetSocketTransportAddress("test2", 80))))
            .build();
    routingTable = strategy.reroute(clusterState).routingTable();
    clusterState = newClusterStateBuilder().state(clusterState).routingTable(routingTable).build();

    assertThat(
        clusterState.readOnlyRoutingNodes().numberOfShardsOfType(ShardRoutingState.STARTED),
        equalTo(2));
    assertThat(
        clusterState.readOnlyRoutingNodes().numberOfShardsOfType(ShardRoutingState.INITIALIZING),
        equalTo(2));
    for (MutableShardRouting shardRouting :
        clusterState.readOnlyRoutingNodes().shardsWithState(INITIALIZING)) {
      assertThat(shardRouting.currentNodeId(), equalTo("node3"));
    }
  }
  @Test
  public void testMultiIndexEvenDistribution() {
    AllocationService strategy =
        createAllocationService(
            settingsBuilder()
                .put("cluster.routing.allocation.concurrent_recoveries", 10)
                .put("cluster.routing.allocation.allow_rebalance", "always")
                .put("cluster.routing.allocation.cluster_concurrent_rebalance", -1)
                .build());

    final int numberOfIndices = 50;
    logger.info("Building initial routing table with " + numberOfIndices + " indices");

    MetaData.Builder metaDataBuilder = MetaData.builder();
    for (int i = 0; i < numberOfIndices; i++) {
      metaDataBuilder.put(IndexMetaData.builder("test" + i).numberOfShards(1).numberOfReplicas(0));
    }
    MetaData metaData = metaDataBuilder.build();

    RoutingTable.Builder routingTableBuilder = RoutingTable.builder();
    for (int i = 0; i < numberOfIndices; i++) {
      routingTableBuilder.addAsNew(metaData.index("test" + i));
    }
    RoutingTable routingTable = routingTableBuilder.build();
    ClusterState clusterState =
        ClusterState.builder().metaData(metaData).routingTable(routingTable).build();

    assertThat(routingTable.indicesRouting().size(), equalTo(numberOfIndices));
    for (int i = 0; i < numberOfIndices; i++) {
      assertThat(routingTable.index("test" + i).shards().size(), equalTo(1));
      assertThat(routingTable.index("test" + i).shard(0).size(), equalTo(1));
      assertThat(routingTable.index("test" + i).shard(0).shards().size(), equalTo(1));
      assertThat(
          routingTable.index("test" + i).shard(0).shards().get(0).state(), equalTo(UNASSIGNED));
      assertThat(
          routingTable.index("test" + i).shard(0).shards().get(0).currentNodeId(), nullValue());
    }

    logger.info("Adding " + (numberOfIndices / 2) + " nodes");
    DiscoveryNodes.Builder nodesBuilder = DiscoveryNodes.builder();
    List<DiscoveryNode> nodes = newArrayList();
    for (int i = 0; i < (numberOfIndices / 2); i++) {
      nodesBuilder.put(newNode("node" + i));
    }
    RoutingTable prevRoutingTable = routingTable;
    clusterState = ClusterState.builder(clusterState).nodes(nodesBuilder).build();
    routingTable = strategy.reroute(clusterState).routingTable();
    clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();

    assertThat(prevRoutingTable != routingTable, equalTo(true));
    for (int i = 0; i < numberOfIndices; i++) {
      assertThat(routingTable.index("test" + i).shards().size(), equalTo(1));
      assertThat(routingTable.index("test" + i).shard(0).size(), equalTo(1));
      assertThat(routingTable.index("test" + i).shard(0).shards().size(), equalTo(1));
      assertThat(
          routingTable.index("test" + i).shard(0).shards().get(0).unassigned(), equalTo(false));
      assertThat(
          routingTable.index("test" + i).shard(0).shards().get(0).state(), equalTo(INITIALIZING));
      assertThat(routingTable.index("test" + i).shard(0).shards().get(0).primary(), equalTo(true));
      // make sure we still have 2 shards initializing per node on the first 25 nodes
      String nodeId = routingTable.index("test" + i).shard(0).shards().get(0).currentNodeId();
      int nodeIndex = Integer.parseInt(nodeId.substring("node".length()));
      assertThat(nodeIndex, lessThan(25));
    }
    RoutingNodes routingNodes = clusterState.routingNodes();
    Set<String> encounteredIndices = newHashSet();
    for (RoutingNode routingNode : routingNodes) {
      assertThat(routingNode.numberOfShardsWithState(STARTED), equalTo(0));
      assertThat(routingNode.size(), equalTo(2));
      // make sure we still have 2 shards initializing per node on the only 25 nodes
      int nodeIndex = Integer.parseInt(routingNode.nodeId().substring("node".length()));
      assertThat(nodeIndex, lessThan(25));
      // check that we don't have a shard associated with a node with the same index name (we have a
      // single shard)
      for (MutableShardRouting shardRoutingEntry : routingNode) {
        assertThat(encounteredIndices, not(hasItem(shardRoutingEntry.index())));
        encounteredIndices.add(shardRoutingEntry.index());
      }
    }

    logger.info("Adding additional " + (numberOfIndices / 2) + " nodes, nothing should change");
    nodesBuilder = DiscoveryNodes.builder(clusterState.nodes());
    for (int i = (numberOfIndices / 2); i < numberOfIndices; i++) {
      nodesBuilder.put(newNode("node" + i));
    }
    prevRoutingTable = routingTable;
    clusterState = ClusterState.builder(clusterState).nodes(nodesBuilder).build();
    routingTable = strategy.reroute(clusterState).routingTable();
    clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();

    assertThat(prevRoutingTable != routingTable, equalTo(false));

    logger.info("Marking the shard as started");
    prevRoutingTable = routingTable;
    routingTable =
        strategy
            .applyStartedShards(clusterState, routingNodes.shardsWithState(INITIALIZING))
            .routingTable();
    clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();

    assertThat(prevRoutingTable != routingTable, equalTo(true));
    int numberOfRelocatingShards = 0;
    int numberOfStartedShards = 0;
    for (int i = 0; i < numberOfIndices; i++) {
      assertThat(routingTable.index("test" + i).shards().size(), equalTo(1));
      assertThat(routingTable.index("test" + i).shard(0).size(), equalTo(1));
      assertThat(routingTable.index("test" + i).shard(0).shards().size(), equalTo(1));
      assertThat(
          routingTable.index("test" + i).shard(0).shards().get(0).unassigned(), equalTo(false));
      assertThat(
          routingTable.index("test" + i).shard(0).shards().get(0).state(),
          anyOf(equalTo(STARTED), equalTo(RELOCATING)));
      if (routingTable.index("test" + i).shard(0).shards().get(0).state() == STARTED) {
        numberOfStartedShards++;
      } else if (routingTable.index("test" + i).shard(0).shards().get(0).state() == RELOCATING) {
        numberOfRelocatingShards++;
      }
      assertThat(routingTable.index("test" + i).shard(0).shards().get(0).primary(), equalTo(true));
      // make sure we still have 2 shards either relocating or started on the first 25 nodes (still)
      String nodeId = routingTable.index("test" + i).shard(0).shards().get(0).currentNodeId();
      int nodeIndex = Integer.parseInt(nodeId.substring("node".length()));
      assertThat(nodeIndex, lessThan(25));
    }
    assertThat(numberOfRelocatingShards, equalTo(25));
    assertThat(numberOfStartedShards, equalTo(25));
  }
  @Override
  public void execute(RoutingAllocation allocation) throws ElasticSearchException {
    DiscoveryNode discoNode = allocation.nodes().resolveNode(node);
    boolean found = false;
    for (RoutingNodes.RoutingNodeIterator it =
            allocation.routingNodes().routingNodeIter(discoNode.id());
        it.hasNext(); ) {
      MutableShardRouting shardRouting = it.next();
      if (!shardRouting.shardId().equals(shardId)) {
        continue;
      }
      found = true;
      if (shardRouting.relocatingNodeId() != null) {
        if (shardRouting.initializing()) {
          // the shard is initializing and recovering from another node, simply cancel the recovery
          it.remove();
          // and cancel the relocating state from the shard its being relocated from
          RoutingNode relocatingFromNode =
              allocation.routingNodes().node(shardRouting.relocatingNodeId());
          if (relocatingFromNode != null) {
            for (MutableShardRouting fromShardRouting : relocatingFromNode) {
              if (fromShardRouting.shardId().equals(shardRouting.shardId())
                  && fromShardRouting.state() == RELOCATING) {
                allocation.routingNodes().cancelRelocation(fromShardRouting);
                break;
              }
            }
          }
        } else if (shardRouting.relocating()) {

          // the shard is relocating to another node, cancel the recovery on the other node, and
          // deallocate this one
          if (!allowPrimary && shardRouting.primary()) {
            // can't cancel a primary shard being initialized
            throw new ElasticSearchIllegalArgumentException(
                "[cancel_allocation] can't cancel "
                    + shardId
                    + " on node "
                    + discoNode
                    + ", shard is primary and initializing its state");
          }
          it.moveToUnassigned();
          // now, go and find the shard that is initializing on the target node, and cancel it as
          // well...
          RoutingNodes.RoutingNodeIterator initializingNode =
              allocation.routingNodes().routingNodeIter(shardRouting.relocatingNodeId());
          if (initializingNode != null) {
            while (initializingNode.hasNext()) {
              MutableShardRouting initializingShardRouting = initializingNode.next();
              if (initializingShardRouting.shardId().equals(shardRouting.shardId())
                  && initializingShardRouting.state() == INITIALIZING) {
                initializingNode.remove();
              }
            }
          }
        }
      } else {
        // the shard is not relocating, its either started, or initializing, just cancel it and move
        // on...
        if (!allowPrimary && shardRouting.primary()) {
          // can't cancel a primary shard being initialized
          throw new ElasticSearchIllegalArgumentException(
              "[cancel_allocation] can't cancel "
                  + shardId
                  + " on node "
                  + discoNode
                  + ", shard is primary and started");
        }
        it.remove();
        allocation
            .routingNodes()
            .unassigned()
            .add(
                new MutableShardRouting(
                    shardRouting.index(),
                    shardRouting.id(),
                    null,
                    shardRouting.primary(),
                    ShardRoutingState.UNASSIGNED,
                    shardRouting.version() + 1));
      }
    }
    if (!found) {
      throw new ElasticSearchIllegalArgumentException(
          "[cancel_allocation] can't cancel "
              + shardId
              + ", failed to find it on node "
              + discoNode);
    }
  }
Пример #10
0
  /**
   * Applies the relevant logic to handle a failed shard. Returns <tt>true</tt> if changes happened
   * that require relocation.
   */
  private boolean applyFailedShard(
      RoutingAllocation allocation, ShardRouting failedShard, boolean addToIgnoreList) {
    // create a copy of the failed shard, since we assume we can change possible references to it
    // without
    // changing the state of failed shard
    failedShard = new ImmutableShardRouting(failedShard);

    IndexRoutingTable indexRoutingTable = allocation.routingTable().index(failedShard.index());
    if (indexRoutingTable == null) {
      return false;
    }

    RoutingNodes routingNodes = allocation.routingNodes();
    boolean dirty = false;
    if (failedShard.relocatingNodeId() != null) {
      // the shard is relocating, either in initializing (recovery from another node) or relocating
      // (moving to another node)
      if (failedShard.state() == INITIALIZING) {
        // the shard is initializing and recovering from another node
        // first, we need to cancel the current node that is being initialized
        RoutingNodes.RoutingNodeIterator initializingNode =
            routingNodes.routingNodeIter(failedShard.currentNodeId());
        if (initializingNode != null) {
          while (initializingNode.hasNext()) {
            MutableShardRouting shardRouting = initializingNode.next();
            if (shardRouting.equals(failedShard)) {
              dirty = true;
              initializingNode.remove();
              if (addToIgnoreList) {
                // make sure we ignore this shard on the relevant node
                allocation.addIgnoreShardForNode(
                    failedShard.shardId(), failedShard.currentNodeId());
              }

              break;
            }
          }
        }
        if (dirty) {
          // now, find the node that we are relocating *from*, and cancel its relocation
          RoutingNode relocatingFromNode = routingNodes.node(failedShard.relocatingNodeId());
          if (relocatingFromNode != null) {
            for (MutableShardRouting shardRouting : relocatingFromNode) {
              if (shardRouting.shardId().equals(failedShard.shardId())
                  && shardRouting.relocating()) {
                dirty = true;
                routingNodes.cancelRelocation(shardRouting);
                break;
              }
            }
          }
        } else {
          logger.debug("failed shard {} not found in routingNodes, ignoring it", failedShard);
        }
        return dirty;
      } else if (failedShard.state() == RELOCATING) {
        // the shard is relocating, meaning its the source the shard is relocating from
        // first, we need to cancel the current relocation from the current node
        // now, find the node that we are recovering from, cancel the relocation, remove it from the
        // node
        // and add it to the unassigned shards list...
        RoutingNodes.RoutingNodeIterator relocatingFromNode =
            routingNodes.routingNodeIter(failedShard.currentNodeId());
        if (relocatingFromNode != null) {
          while (relocatingFromNode.hasNext()) {
            MutableShardRouting shardRouting = relocatingFromNode.next();
            if (shardRouting.equals(failedShard)) {
              dirty = true;
              relocatingFromNode.remove();
              if (addToIgnoreList) {
                // make sure we ignore this shard on the relevant node
                allocation.addIgnoreShardForNode(
                    failedShard.shardId(), failedShard.currentNodeId());
              }

              routingNodes
                  .unassigned()
                  .add(
                      new MutableShardRouting(
                          failedShard.index(),
                          failedShard.id(),
                          null,
                          failedShard.primary(),
                          ShardRoutingState.UNASSIGNED,
                          failedShard.version() + 1));
              break;
            }
          }
        }
        if (dirty) {
          // next, we need to find the target initializing shard that is recovering from, and remove
          // it...
          RoutingNodes.RoutingNodeIterator initializingNode =
              routingNodes.routingNodeIter(failedShard.relocatingNodeId());
          if (initializingNode != null) {
            while (initializingNode.hasNext()) {
              MutableShardRouting shardRouting = initializingNode.next();
              if (shardRouting.shardId().equals(failedShard.shardId())
                  && shardRouting.state() == INITIALIZING) {
                dirty = true;
                initializingNode.remove();
              }
            }
          }
        } else {
          logger.debug("failed shard {} not found in routingNodes, ignoring it", failedShard);
        }
      } else {
        throw new ElasticsearchIllegalStateException(
            "illegal state for a failed shard, relocating node id is set, but state does not match: "
                + failedShard);
      }
    } else {
      // the shard is not relocating, its either started, or initializing, just cancel it and move
      // on...
      RoutingNodes.RoutingNodeIterator node =
          routingNodes.routingNodeIter(failedShard.currentNodeId());
      if (node != null) {
        while (node.hasNext()) {
          MutableShardRouting shardRouting = node.next();
          if (shardRouting.equals(failedShard)) {
            dirty = true;
            if (addToIgnoreList) {
              // make sure we ignore this shard on the relevant node
              allocation.addIgnoreShardForNode(failedShard.shardId(), failedShard.currentNodeId());
            }
            node.remove();
            // move all the shards matching the failed shard to the end of the unassigned list
            // so we give a chance for other allocations and won't create poison failed allocations
            // that can keep other shards from being allocated (because of limits applied on how
            // many
            // shards we can start per node)
            List<MutableShardRouting> shardsToMove = Lists.newArrayList();
            for (Iterator<MutableShardRouting> unassignedIt = routingNodes.unassigned().iterator();
                unassignedIt.hasNext(); ) {
              MutableShardRouting unassignedShardRouting = unassignedIt.next();
              if (unassignedShardRouting.shardId().equals(failedShard.shardId())) {
                unassignedIt.remove();
                shardsToMove.add(unassignedShardRouting);
              }
            }
            if (!shardsToMove.isEmpty()) {
              routingNodes.unassigned().addAll(shardsToMove);
            }

            routingNodes
                .unassigned()
                .add(
                    new MutableShardRouting(
                        failedShard.index(),
                        failedShard.id(),
                        null,
                        null,
                        failedShard.restoreSource(),
                        failedShard.primary(),
                        ShardRoutingState.UNASSIGNED,
                        failedShard.version() + 1));

            break;
          }
        }
      }
      if (!dirty) {
        logger.debug("failed shard {} not found in routingNodes, ignoring it", failedShard);
      }
    }
    return dirty;
  }
Пример #11
0
  private boolean electPrimariesAndUnassignedDanglingReplicas(RoutingAllocation allocation) {
    boolean changed = false;
    RoutingNodes routingNodes = allocation.routingNodes();
    if (!routingNodes.hasUnassignedPrimaries()) {
      // move out if we don't have unassigned primaries
      return changed;
    }

    // go over and remove dangling replicas that are initializing for primary shards
    List<ShardRouting> shardsToFail = Lists.newArrayList();
    for (MutableShardRouting shardEntry : routingNodes.unassigned()) {
      if (shardEntry.primary()) {
        for (MutableShardRouting routing : routingNodes.assignedShards(shardEntry)) {
          if (!routing.primary() && routing.initializing()) {
            shardsToFail.add(routing);
          }
        }
      }
    }
    for (ShardRouting shardToFail : shardsToFail) {
      changed |= applyFailedShard(allocation, shardToFail, false);
    }

    // now, go over and elect a new primary if possible, not, from this code block on, if one is
    // elected,
    // routingNodes.hasUnassignedPrimaries() will potentially be false
    for (MutableShardRouting shardEntry : routingNodes.unassigned()) {
      if (shardEntry.primary()) {
        MutableShardRouting candidate = allocation.routingNodes().activeReplica(shardEntry);
        if (candidate != null) {
          routingNodes.swapPrimaryFlag(shardEntry, candidate);
          if (candidate.relocatingNodeId() != null) {
            changed = true;
            // its also relocating, make sure to move the other routing to primary
            RoutingNode node = routingNodes.node(candidate.relocatingNodeId());
            if (node != null) {
              for (MutableShardRouting shardRouting : node) {
                if (shardRouting.shardId().equals(candidate.shardId()) && !shardRouting.primary()) {
                  routingNodes.swapPrimaryFlag(shardRouting);
                  break;
                }
              }
            }
          }
        }
      }
    }

    return changed;
  }
  private ObjectLongOpenHashMap<DiscoveryNode> buildShardStates(
      final DiscoveryNodes nodes, MutableShardRouting shard) {
    ObjectLongOpenHashMap<DiscoveryNode> shardStates = cachedShardsState.get(shard.shardId());
    ObjectOpenHashSet<String> nodeIds;
    if (shardStates == null) {
      shardStates = new ObjectLongOpenHashMap<>();
      cachedShardsState.put(shard.shardId(), shardStates);
      nodeIds = ObjectOpenHashSet.from(nodes.dataNodes().keys());
    } else {
      // clean nodes that have failed
      shardStates
          .keys()
          .removeAll(
              new ObjectPredicate<DiscoveryNode>() {
                @Override
                public boolean apply(DiscoveryNode node) {
                  return !nodes.nodeExists(node.id());
                }
              });
      nodeIds = ObjectOpenHashSet.newInstance();
      // we have stored cached from before, see if the nodes changed, if they have, go fetch again
      for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
        DiscoveryNode node = cursor.value;
        if (!shardStates.containsKey(node)) {
          nodeIds.add(node.id());
        }
      }
    }
    if (nodeIds.isEmpty()) {
      return shardStates;
    }

    String[] nodesIdsArray = nodeIds.toArray(String.class);
    TransportNodesListGatewayStartedShards.NodesGatewayStartedShards response =
        listGatewayStartedShards.list(shard.shardId(), nodesIdsArray, listTimeout).actionGet();
    if (logger.isDebugEnabled()) {
      if (response.failures().length > 0) {
        StringBuilder sb =
            new StringBuilder(shard + ": failures when trying to list shards on nodes:");
        for (int i = 0; i < response.failures().length; i++) {
          Throwable cause = ExceptionsHelper.unwrapCause(response.failures()[i]);
          if (cause instanceof ConnectTransportException) {
            continue;
          }
          sb.append("\n    -> ").append(response.failures()[i].getDetailedMessage());
        }
        logger.debug(sb.toString());
      }
    }

    for (TransportNodesListGatewayStartedShards.NodeGatewayStartedShards nodeShardState :
        response) {
      // -1 version means it does not exists, which is what the API returns, and what we expect to
      logger.trace(
          "[{}] on node [{}] has version [{}] of shard",
          shard,
          nodeShardState.getNode(),
          nodeShardState.version());
      shardStates.put(nodeShardState.getNode(), nodeShardState.version());
    }
    return shardStates;
  }
  public boolean allocateUnassigned(RoutingAllocation allocation) {
    boolean changed = false;
    DiscoveryNodes nodes = allocation.nodes();
    RoutingNodes routingNodes = allocation.routingNodes();

    // First, handle primaries, they must find a place to be allocated on here
    Iterator<MutableShardRouting> unassignedIterator = routingNodes.unassigned().iterator();
    while (unassignedIterator.hasNext()) {
      MutableShardRouting shard = unassignedIterator.next();

      if (!shard.primary()) {
        continue;
      }

      // this is an API allocation, ignore since we know there is no data...
      if (!routingNodes
          .routingTable()
          .index(shard.index())
          .shard(shard.id())
          .primaryAllocatedPostApi()) {
        continue;
      }

      ObjectLongOpenHashMap<DiscoveryNode> nodesState = buildShardStates(nodes, shard);

      int numberOfAllocationsFound = 0;
      long highestVersion = -1;
      Set<DiscoveryNode> nodesWithHighestVersion = Sets.newHashSet();
      final boolean[] states = nodesState.allocated;
      final Object[] keys = nodesState.keys;
      final long[] values = nodesState.values;
      for (int i = 0; i < states.length; i++) {
        if (!states[i]) {
          continue;
        }

        DiscoveryNode node = (DiscoveryNode) keys[i];
        long version = values[i];
        // since we don't check in NO allocation, we need to double check here
        if (allocation.shouldIgnoreShardForNode(shard.shardId(), node.id())) {
          continue;
        }
        if (version != -1) {
          numberOfAllocationsFound++;
          if (highestVersion == -1) {
            nodesWithHighestVersion.add(node);
            highestVersion = version;
          } else {
            if (version > highestVersion) {
              nodesWithHighestVersion.clear();
              nodesWithHighestVersion.add(node);
              highestVersion = version;
            } else if (version == highestVersion) {
              nodesWithHighestVersion.add(node);
            }
          }
        }
      }

      // check if the counts meets the minimum set
      int requiredAllocation = 1;
      // if we restore from a repository one copy is more then enough
      if (shard.restoreSource() == null) {
        try {
          IndexMetaData indexMetaData = routingNodes.metaData().index(shard.index());
          String initialShards =
              indexMetaData
                  .settings()
                  .get(
                      INDEX_RECOVERY_INITIAL_SHARDS,
                      settings.get(INDEX_RECOVERY_INITIAL_SHARDS, this.initialShards));
          if ("quorum".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 1) {
              requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2) + 1;
            }
          } else if ("quorum-1".equals(initialShards) || "half".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 2) {
              requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2);
            }
          } else if ("one".equals(initialShards)) {
            requiredAllocation = 1;
          } else if ("full".equals(initialShards) || "all".equals(initialShards)) {
            requiredAllocation = indexMetaData.numberOfReplicas() + 1;
          } else if ("full-1".equals(initialShards) || "all-1".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 1) {
              requiredAllocation = indexMetaData.numberOfReplicas();
            }
          } else {
            requiredAllocation = Integer.parseInt(initialShards);
          }
        } catch (Exception e) {
          logger.warn(
              "[{}][{}] failed to derived initial_shards from value {}, ignore allocation for {}",
              shard.index(),
              shard.id(),
              initialShards,
              shard);
        }
      }

      // not enough found for this shard, continue...
      if (numberOfAllocationsFound < requiredAllocation) {
        // if we are restoring this shard we still can allocate
        if (shard.restoreSource() == null) {
          // we can't really allocate, so ignore it and continue
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: not allocating, number_of_allocated_shards_found [{}], required_number [{}]",
                shard.index(),
                shard.id(),
                numberOfAllocationsFound,
                requiredAllocation);
          }
        } else if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: missing local data, will restore from [{}]",
              shard.index(),
              shard.id(),
              shard.restoreSource());
        }
        continue;
      }

      Set<DiscoveryNode> throttledNodes = Sets.newHashSet();
      Set<DiscoveryNode> noNodes = Sets.newHashSet();
      for (DiscoveryNode discoNode : nodesWithHighestVersion) {
        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          throttledNodes.add(discoNode);
        } else if (decision.type() == Decision.Type.NO) {
          noNodes.add(discoNode);
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] on primary allocation",
                shard.index(),
                shard.id(),
                shard,
                discoNode);
          }
          // we found a match
          changed = true;
          // make sure we create one with the version from the recovered state
          allocation
              .routingNodes()
              .assign(new MutableShardRouting(shard, highestVersion), node.nodeId());
          unassignedIterator.remove();

          // found a node, so no throttling, no "no", and break out of the loop
          throttledNodes.clear();
          noNodes.clear();
          break;
        }
      }
      if (throttledNodes.isEmpty()) {
        // if we have a node that we "can't" allocate to, force allocation, since this is our master
        // data!
        if (!noNodes.isEmpty()) {
          DiscoveryNode discoNode = noNodes.iterator().next();
          RoutingNode node = routingNodes.node(discoNode.id());
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: forcing allocating [{}] to [{}] on primary allocation",
                shard.index(),
                shard.id(),
                shard,
                discoNode);
          }
          // we found a match
          changed = true;
          // make sure we create one with the version from the recovered state
          allocation
              .routingNodes()
              .assign(new MutableShardRouting(shard, highestVersion), node.nodeId());
          unassignedIterator.remove();
        }
      } else {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: throttling allocation [{}] to [{}] on primary allocation",
              shard.index(),
              shard.id(),
              shard,
              throttledNodes);
        }
        // we are throttling this, but we have enough to allocate to this node, ignore it for now
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
      }
    }

    if (!routingNodes.hasUnassigned()) {
      return changed;
    }

    // Now, handle replicas, try to assign them to nodes that are similar to the one the primary was
    // allocated on
    unassignedIterator = routingNodes.unassigned().iterator();
    while (unassignedIterator.hasNext()) {
      MutableShardRouting shard = unassignedIterator.next();

      // pre-check if it can be allocated to any node that currently exists, so we won't list the
      // store for it for nothing
      boolean canBeAllocatedToAtLeastOneNode = false;
      for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
        RoutingNode node = routingNodes.node(cursor.value.id());
        if (node == null) {
          continue;
        }
        // if we can't allocate it on a node, ignore it, for example, this handles
        // cases for only allocating a replica after a primary
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.YES) {
          canBeAllocatedToAtLeastOneNode = true;
          break;
        }
      }

      if (!canBeAllocatedToAtLeastOneNode) {
        continue;
      }

      Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> shardStores =
          buildShardStores(nodes, shard);

      long lastSizeMatched = 0;
      DiscoveryNode lastDiscoNodeMatched = null;
      RoutingNode lastNodeMatched = null;

      for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData>
          nodeStoreEntry : shardStores.entrySet()) {
        DiscoveryNode discoNode = nodeStoreEntry.getKey();
        TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData =
            nodeStoreEntry.getValue();
        logger.trace("{}: checking node [{}]", shard, discoNode);

        if (storeFilesMetaData == null) {
          // already allocated on that node...
          continue;
        }

        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        // check if we can allocate on that node...
        // we only check for NO, since if this node is THROTTLING and it has enough "same data"
        // then we will try and assign it next time
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.NO) {
          continue;
        }

        // if it is already allocated, we can't assign to it...
        if (storeFilesMetaData.allocated()) {
          continue;
        }

        if (!shard.primary()) {
          MutableShardRouting primaryShard = routingNodes.activePrimary(shard);
          if (primaryShard != null) {
            assert primaryShard.active();
            DiscoveryNode primaryNode = nodes.get(primaryShard.currentNodeId());
            if (primaryNode != null) {
              TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore =
                  shardStores.get(primaryNode);
              if (primaryNodeStore != null && primaryNodeStore.allocated()) {
                long sizeMatched = 0;

                for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
                  if (primaryNodeStore.fileExists(storeFileMetaData.name())
                      && primaryNodeStore
                          .file(storeFileMetaData.name())
                          .isSame(storeFileMetaData)) {
                    sizeMatched += storeFileMetaData.length();
                  }
                }
                logger.trace(
                    "{}: node [{}] has [{}/{}] bytes of re-usable data",
                    shard,
                    discoNode.name(),
                    new ByteSizeValue(sizeMatched),
                    sizeMatched);
                if (sizeMatched > lastSizeMatched) {
                  lastSizeMatched = sizeMatched;
                  lastDiscoNodeMatched = discoNode;
                  lastNodeMatched = node;
                }
              }
            }
          }
        }
      }

      if (lastNodeMatched != null) {
        // we only check on THROTTLE since we checked before before on NO
        Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we are throttling this, but we have enough to allocate to this node, ignore it for now
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we found a match
          changed = true;
          allocation.routingNodes().assign(shard, lastNodeMatched.nodeId());
          unassignedIterator.remove();
        }
      }
    }
    return changed;
  }