/** Returns the changes between this set of nodes and the provided set of nodes. */
 public Delta delta(DiscoveryNodes other) {
   List<DiscoveryNode> removed = newArrayList();
   List<DiscoveryNode> added = newArrayList();
   for (DiscoveryNode node : other) {
     if (!this.nodeExists(node.id())) {
       removed.add(node);
     }
   }
   for (DiscoveryNode node : this) {
     if (!other.nodeExists(node.id())) {
       added.add(node);
     }
   }
   DiscoveryNode previousMasterNode = null;
   DiscoveryNode newMasterNode = null;
   if (masterNodeId != null) {
     if (other.masterNodeId == null || !other.masterNodeId.equals(masterNodeId)) {
       previousMasterNode = other.masterNode();
       newMasterNode = masterNode();
     }
   }
   return new Delta(
       previousMasterNode,
       newMasterNode,
       localNodeId,
       ImmutableList.copyOf(removed),
       ImmutableList.copyOf(added));
 }
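  // A minimal usage sketch for delta() (node instances hypothetical; the builder
  // calls mirror ones used elsewhere in this code). The receiver is the current
  // node set and the argument the previous one, so nodes present only in "other"
  // come back as removed and nodes present only in "this" as added:
  //
  //   DiscoveryNodes previous =
  //       DiscoveryNodes.builder().put(nodeA).put(nodeB).localNodeId(nodeA.id()).build();
  //   DiscoveryNodes current =
  //       DiscoveryNodes.builder().put(nodeA).put(nodeC).localNodeId(nodeA.id()).build();
  //   DiscoveryNodes.Delta delta = current.delta(previous);
  //   // delta.removedNodes() contains nodeB; delta.addedNodes() contains nodeC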
  private Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData>
      buildShardStores(DiscoveryNodes nodes, MutableShardRouting shard) {
    Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> shardStores =
        cachedStores.get(shard.shardId());
    ObjectOpenHashSet<String> nodesIds;
    if (shardStores == null) {
      shardStores = Maps.newHashMap();
      cachedStores.put(shard.shardId(), shardStores);
      nodesIds = ObjectOpenHashSet.from(nodes.dataNodes().keys());
    } else {
      nodesIds = ObjectOpenHashSet.newInstance();
      // clean nodes that have failed
      for (Iterator<DiscoveryNode> it = shardStores.keySet().iterator(); it.hasNext(); ) {
        DiscoveryNode node = it.next();
        if (!nodes.nodeExists(node.id())) {
          it.remove();
        }
      }

      for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
        DiscoveryNode node = cursor.value;
        if (!shardStores.containsKey(node)) {
          nodesIds.add(node.id());
        }
      }
    }

    if (!nodesIds.isEmpty()) {
      String[] nodesIdsArray = nodesIds.toArray(String.class);
      TransportNodesListShardStoreMetaData.NodesStoreFilesMetaData nodesStoreFilesMetaData =
          listShardStoreMetaData
              .list(shard.shardId(), false, nodesIdsArray, listTimeout)
              .actionGet();
      if (logger.isTraceEnabled()) {
        if (nodesStoreFilesMetaData.failures().length > 0) {
          StringBuilder sb =
              new StringBuilder(shard + ": failures when trying to list stores on nodes:");
          for (int i = 0; i < nodesStoreFilesMetaData.failures().length; i++) {
            Throwable cause = ExceptionsHelper.unwrapCause(nodesStoreFilesMetaData.failures()[i]);
            if (cause instanceof ConnectTransportException) {
              continue;
            }
            sb.append("\n    -> ")
                .append(nodesStoreFilesMetaData.failures()[i].getDetailedMessage());
          }
          logger.trace(sb.toString());
        }
      }

      for (TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData nodeStoreFilesMetaData :
          nodesStoreFilesMetaData) {
        if (nodeStoreFilesMetaData.storeFilesMetaData() != null) {
          shardStores.put(
              nodeStoreFilesMetaData.getNode(), nodeStoreFilesMetaData.storeFilesMetaData());
        }
      }
    }

    return shardStores;
  }
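   // Cache behavior in brief: the first call for a shard lists stores on every data
   // node; subsequent calls evict cached entries for nodes that have left the
   // cluster, query only the data nodes missing from the cached map, and merge the
   // fresh results back into it.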
 @Override
 public void run() {
    // the master node checks the liveness of all nodes with certain discovery
    // implementations, but we can't rely on that, so we check here as well
   for (DiscoveryNode node : clusterState.nodes()) {
     if (lifecycle.stoppedOrClosed()) {
       return;
     }
     if (!nodeRequiresConnection(node)) {
       continue;
     }
      // we double check the existence of the node since connectToNode might take time...
      if (clusterState.nodes().nodeExists(node.id())) {
       if (!transportService.nodeConnected(node)) {
         try {
           transportService.connectToNode(node);
         } catch (Exception e) {
           if (lifecycle.stoppedOrClosed()) {
             return;
           }
            // double check here as well, maybe it's gone?
            if (clusterState.nodes().nodeExists(node.id())) {
             logger.warn("failed to reconnect to node {}", e, node);
           }
         }
       }
     }
   }
 }
 @Override
 public int compare(DiscoveryNode o1, DiscoveryNode o2) {
   if (o1.masterNode() && !o2.masterNode()) {
     return -1;
   }
   if (!o1.masterNode() && o2.masterNode()) {
     return 1;
   }
   return o1.id().compareTo(o2.id());
 }
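  // Worked ordering under this comparator (ids hypothetical): given
  //   {id="b", master=true}, {id="a", master=false}, {id="a2", master=true}
  // sorting yields [id="a2", id="b", id="a"]: master-eligible nodes first, then
  // ascending node id within each group.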
 public String shortSummary() {
   StringBuilder sb = new StringBuilder();
   if (!removed() && masterNodeChanged()) {
     if (newMasterNode.id().equals(localNodeId)) {
        // we are the master and no nodes were removed, so we are actually the first master
       sb.append("new_master ").append(newMasterNode());
     } else {
        // we are not the master, so we just got this event. No nodes were removed, so it's not
        // a *new* master
       sb.append("detected_master ").append(newMasterNode());
     }
   } else {
     if (masterNodeChanged()) {
       sb.append("master {new ").append(newMasterNode());
       if (previousMasterNode() != null) {
         sb.append(", previous ").append(previousMasterNode());
       }
       sb.append("}");
     }
     if (removed()) {
       if (masterNodeChanged()) {
         sb.append(", ");
       }
       sb.append("removed {");
       for (DiscoveryNode node : removedNodes()) {
         sb.append(node).append(',');
       }
       sb.append("}");
     }
   }
   if (added()) {
      // don't print if the only node added is us
     if (!(addedNodes().size() == 1 && addedNodes().get(0).id().equals(localNodeId))) {
       if (removed() || masterNodeChanged()) {
         sb.append(", ");
       }
       sb.append("added {");
       for (DiscoveryNode node : addedNodes()) {
         if (!node.id().equals(localNodeId)) {
            // don't print ourselves
           sb.append(node).append(',');
         }
       }
       sb.append("}");
     }
   }
   return sb.toString();
 }
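  // Sample shortSummary() outputs (node rendering abbreviated, names hypothetical):
  //   "new_master [node1]"      - no nodes removed and the new master is us
  //   "detected_master [node2]" - no nodes removed and another node became master
  //   "master {new [node2], previous [node1]}, removed {[node1],}, added {[node3],}"
  // Note the trailing ',' inside removed{}/added{}: entries are appended with a
  // comma and no separator trimming.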
  private ClusterState rejoin(ClusterState clusterState, String reason) {
    logger.warn(reason + ", current nodes: {}", clusterState.nodes());
    nodesFD.stop();
    masterFD.stop(reason);
    master = false;

    ClusterBlocks clusterBlocks =
        ClusterBlocks.builder()
            .blocks(clusterState.blocks())
            .addGlobalBlock(NO_MASTER_BLOCK)
            .addGlobalBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK)
            .build();

    // clear the routing table, we have no master, so we need to recreate the routing when we reform
    // the cluster
    RoutingTable routingTable = RoutingTable.builder().build();
    // we also clean the metadata, since we are going to recover it if we become master
    MetaData metaData = MetaData.builder().build();

     // reset the node list; we are no longer connected to anybody, since we will try to reform the cluster
    latestDiscoNodes =
        new DiscoveryNodes.Builder().put(localNode).localNodeId(localNode.id()).build();

    asyncJoinCluster();

    return ClusterState.builder(clusterState)
        .blocks(clusterBlocks)
        .nodes(latestDiscoNodes)
        .routingTable(routingTable)
        .metaData(metaData)
        .build();
  }
 private void handleTransportDisconnect(DiscoveryNode node) {
   if (!latestNodes.nodeExists(node.id())) {
     return;
   }
   NodeFD nodeFD = nodesFD.remove(node);
   if (nodeFD == null) {
     return;
   }
   if (!running) {
     return;
   }
   nodeFD.running = false;
   if (connectOnNetworkDisconnect) {
     try {
       transportService.connectToNode(node);
       nodesFD.put(node, new NodeFD());
       threadPool.schedule(pingInterval, ThreadPool.Names.SAME, new SendPingRequest(node));
     } catch (Exception e) {
       logger.trace("[node  ] [{}] transport disconnected (with verified connect)", node);
       notifyNodeFailure(node, "transport disconnected (with verified connect)");
     }
   } else {
     logger.trace("[node  ] [{}] transport disconnected", node);
     notifyNodeFailure(node, "transport disconnected");
   }
 }
 public DiscoveryNodes removeDeadMembers(Set<String> newNodes, String masterNodeId) {
   Builder builder = new Builder().masterNodeId(masterNodeId).localNodeId(localNodeId);
   for (DiscoveryNode node : this) {
     if (newNodes.contains(node.id())) {
       builder.put(node);
     }
   }
   return builder.build();
 }
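  // A minimal usage sketch for removeDeadMembers() (ids hypothetical): only nodes
  // whose ids appear in the given set survive; the given master id and the current
  // local node id are carried into the rebuilt instance.
  //
  //   Set<String> alive = Sets.newHashSet("node1", "node3");
  //   DiscoveryNodes pruned = nodes.removeDeadMembers(alive, "node1");
  //   // pruned holds node1 and node3 only, with "node1" as the master node id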
 @Override
 public DiscoveryNodes nodes() {
   DiscoveryNodes latestNodes = this.latestDiscoNodes;
   if (latestNodes != null) {
     return latestNodes;
   }
   // have not decided yet, just send the local node
   return DiscoveryNodes.builder().put(localNode).localNodeId(localNode.id()).build();
 }
  public void verify(
      String repository, String verificationToken, final ActionListener<VerifyResponse> listener) {
    final DiscoveryNodes discoNodes = clusterService.state().nodes();
    final DiscoveryNode localNode = discoNodes.localNode();

    final ObjectContainer<DiscoveryNode> masterAndDataNodes =
        discoNodes.masterAndDataNodes().values();
    final List<DiscoveryNode> nodes = newArrayList();
    for (ObjectCursor<DiscoveryNode> cursor : masterAndDataNodes) {
      DiscoveryNode node = cursor.value;
      Version version = node.getVersion();
      // Verification wasn't supported before v1.4.0 - no reason to send verification request to
      // these nodes
      if (version != null && version.onOrAfter(Version.V_1_4_0)) {
        nodes.add(node);
      }
    }
    final CopyOnWriteArrayList<VerificationFailure> errors = new CopyOnWriteArrayList<>();
    final AtomicInteger counter = new AtomicInteger(nodes.size());
    for (final DiscoveryNode node : nodes) {
      if (node.equals(localNode)) {
        try {
          doVerify(repository, verificationToken);
        } catch (Throwable t) {
          logger.warn("[{}] failed to verify repository", t, repository);
          errors.add(new VerificationFailure(node.id(), ExceptionsHelper.detailedMessage(t)));
        }
        if (counter.decrementAndGet() == 0) {
          finishVerification(listener, nodes, errors);
        }
      } else {
        transportService.sendRequest(
            node,
            ACTION_NAME,
            new VerifyNodeRepositoryRequest(repository, verificationToken),
            new EmptyTransportResponseHandler(ThreadPool.Names.SAME) {
              @Override
              public void handleResponse(TransportResponse.Empty response) {
                if (counter.decrementAndGet() == 0) {
                  finishVerification(listener, nodes, errors);
                }
              }

              @Override
              public void handleException(TransportException exp) {
                errors.add(
                    new VerificationFailure(node.id(), ExceptionsHelper.detailedMessage(exp)));
                if (counter.decrementAndGet() == 0) {
                  finishVerification(listener, nodes, errors);
                }
              }
            });
      }
    }
  }
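   // The completion pattern in verify() in miniature: a counter starts at the
   // number of outstanding requests and is decremented once per response or
   // failure, and whichever handler brings it to zero runs the single completion
   // step. A self-contained sketch of the same idea (names illustrative, not
   // Elasticsearch API):
   static void fanOutFanIn(int requests, final Runnable onAllDone) {
     final AtomicInteger counter = new AtomicInteger(requests);
     for (int i = 0; i < requests; i++) {
       new Thread(
               new Runnable() {
                 @Override
                 public void run() {
                   // the per-node work (local verify or remote request) goes here
                   if (counter.decrementAndGet() == 0) {
                     onAllDone.run(); // exactly one handler observes zero
                   }
                 }
               })
           .start();
     }
   }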
 public static DiscoveryNodes readFrom(StreamInput in, @Nullable DiscoveryNode localNode)
     throws IOException {
   Builder builder = new Builder();
   if (in.readBoolean()) {
     builder.masterNodeId(in.readUTF());
   }
   if (localNode != null) {
     builder.localNodeId(localNode.id());
   }
   int size = in.readVInt();
   for (int i = 0; i < size; i++) {
     DiscoveryNode node = DiscoveryNode.readNode(in);
     if (localNode != null && node.id().equals(localNode.id())) {
       // reuse the same instance of our address and local node id for faster equality
       node = localNode;
     }
     builder.put(node);
   }
   return builder.build();
 }
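  // A sketch of the matching write side, mirroring the read order above (optional
  // master id as a boolean flag plus UTF string, then a vint count followed by each
  // node). The StreamOutput method names are assumed to be the usual counterparts
  // of the read calls:
  public static void writeTo(DiscoveryNodes nodes, StreamOutput out) throws IOException {
    if (nodes.masterNodeId() == null) {
      out.writeBoolean(false);
    } else {
      out.writeBoolean(true);
      out.writeUTF(nodes.masterNodeId());
    }
    out.writeVInt(nodes.size());
    for (DiscoveryNode node : nodes) {
      node.writeTo(out);
    }
  }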
  private void updateMappingOnMaster(final String index, final String type) {
    try {
      MapperService mapperService = indicesService.indexServiceSafe(index).mapperService();
      final DocumentMapper documentMapper = mapperService.documentMapper(type);
      if (documentMapper == null) { // should not happen
        return;
      }
      IndexMetaData metaData = clusterService.state().metaData().index(index);
      if (metaData == null) {
        return;
      }

      long orderId = mappingUpdatedAction.generateNextMappingUpdateOrder();
      documentMapper.refreshSource();

      DiscoveryNode node = clusterService.localNode();
      final MappingUpdatedAction.MappingUpdatedRequest request =
          new MappingUpdatedAction.MappingUpdatedRequest(
              index,
              metaData.uuid(),
              type,
              documentMapper.mappingSource(),
              orderId,
              node != null ? node.id() : null);
      mappingUpdatedAction.execute(
          request,
          new ActionListener<MappingUpdatedAction.MappingUpdatedResponse>() {
            @Override
            public void onResponse(
                MappingUpdatedAction.MappingUpdatedResponse mappingUpdatedResponse) {
              // all is well
            }

            @Override
            public void onFailure(Throwable e) {
              try {
                logger.warn(
                    "failed to update master on updated mapping for index [{}], type [{}] and source [{}]",
                    e,
                    index,
                    type,
                    documentMapper.mappingSource().string());
              } catch (IOException e1) {
                // ignore
              }
            }
          });
    } catch (Exception e) {
      logger.warn(
          "failed to update master on updated mapping for index [{}], type [{}]", e, index, type);
    }
  }
    protected void onNodeResponse(DiscoveryNode node, int nodeIndex, NodeResponse response) {
      logger.trace("received response for [{}] from node [{}]", actionName, node.id());

      // this is defensive to protect against the possibility of double invocation
      // the current implementation of TransportService#sendRequest guards against this
      // but concurrency is hard, safety is important, and the small performance loss here does not
      // matter
      if (responses.compareAndSet(nodeIndex, null, response)) {
        if (counter.incrementAndGet() == responses.length()) {
          onCompletion();
        }
      }
    }
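    // Why the compareAndSet above suffices as a double-invocation guard: only the
    // first transition of a slot away from null succeeds, so the counter moves at
    // most once per node and onCompletion() fires exactly once. A minimal sketch
    // of the same guard (nodeCount and onCompletion() are illustrative):
    //
    //   final AtomicReferenceArray<Object> slots = new AtomicReferenceArray<>(nodeCount);
    //   final AtomicInteger filled = new AtomicInteger();
    //
    //   void onArrival(int nodeIndex, Object result) {
    //     if (slots.compareAndSet(nodeIndex, null, result)) { // first delivery wins
    //       if (filled.incrementAndGet() == slots.length()) {
    //         onCompletion(); // runs once, after every slot has been filled
    //       }
    //     }
    //     // a duplicate delivery fails the CAS and is ignored
    //   }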
  public void testNodeVersionIsUpdated() {
    TransportClient client = (TransportClient) internalCluster().client();
    TransportClientNodesService nodeService = client.nodeService();
    Node node =
        new Node(
            Settings.builder()
                .put(internalCluster().getDefaultSettings())
                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
                .put("node.name", "testNodeVersionIsUpdated")
                .put("http.enabled", false)
                .put(Node.NODE_DATA_SETTING.getKey(), false)
                .put("cluster.name", "foobar")
                .put(
                    InternalSettingsPreparer.IGNORE_SYSTEM_PROPERTIES_SETTING.getKey(),
                    true) // make sure we get what we set :)
                .build());
    node.start();
    try {
      TransportAddress transportAddress =
          node.injector().getInstance(TransportService.class).boundAddress().publishAddress();
      client.addTransportAddress(transportAddress);
      assertThat(
          nodeService.connectedNodes().size(),
          greaterThanOrEqualTo(
              1)); // since we force transport clients there has to be one node started that we
                   // connect to.
      for (DiscoveryNode discoveryNode :
          nodeService.connectedNodes()) { // connected nodes have updated version
        assertThat(discoveryNode.getVersion(), equalTo(Version.CURRENT));
      }

      for (DiscoveryNode discoveryNode : nodeService.listedNodes()) {
        assertThat(discoveryNode.id(), startsWith("#transport#-"));
        assertThat(
            discoveryNode.getVersion(), equalTo(Version.CURRENT.minimumCompatibilityVersion()));
      }

      assertThat(nodeService.filteredNodes().size(), equalTo(1));
      for (DiscoveryNode discoveryNode : nodeService.filteredNodes()) {
        assertThat(
            discoveryNode.getVersion(), equalTo(Version.CURRENT.minimumCompatibilityVersion()));
      }
    } finally {
      node.close();
    }
  }
    protected void onNodeFailure(DiscoveryNode node, int nodeIndex, Throwable t) {
      String nodeId = node.id();
      if (logger.isDebugEnabled() && !(t instanceof NodeShouldNotConnectException)) {
        logger.debug("failed to execute [{}] on node [{}]", t, actionName, nodeId);
      }

      // this is defensive to protect against the possibility of double invocation
      // the current implementation of TransportService#sendRequest guards against this
      // but concurrency is hard, safety is important, and the small performance loss here does not
      // matter
      if (responses.compareAndSet(
          nodeIndex, null, new FailedNodeException(nodeId, "Failed node [" + nodeId + "]", t))) {
        if (counter.incrementAndGet() == responses.length()) {
          onCompletion();
        }
      }
    }
  @Override
  public void execute(RoutingAllocation allocation) throws ElasticSearchException {
    DiscoveryNode discoNode = allocation.nodes().resolveNode(node);

    MutableShardRouting shardRouting = null;
    for (MutableShardRouting routing : allocation.routingNodes().unassigned()) {
      if (routing.shardId().equals(shardId)) {
        // prefer to allocate primaries first
        if (shardRouting == null || routing.primary()) {
          shardRouting = routing;
        }
      }
    }

    if (shardRouting == null) {
      throw new ElasticSearchIllegalArgumentException(
          "[allocate] failed to find " + shardId + " on the list of unassigned shards");
    }

    if (shardRouting.primary() && !allowPrimary) {
      throw new ElasticSearchIllegalArgumentException(
          "[allocate] trying to allocate a primary shard " + shardId + "], which is disabled");
    }

    RoutingNode routingNode = allocation.routingNodes().node(discoNode.id());
    allocation.addIgnoreDisable(shardRouting.shardId(), routingNode.nodeId());
    if (!allocation.deciders().canAllocate(shardRouting, routingNode, allocation).allowed()) {
      throw new ElasticSearchIllegalArgumentException(
          "[allocate] allocation of " + shardId + " on node " + discoNode + " is not allowed");
    }
    // go over and remove it from the unassigned
    for (Iterator<MutableShardRouting> it = allocation.routingNodes().unassigned().iterator();
        it.hasNext(); ) {
      if (it.next() != shardRouting) {
        continue;
      }
      it.remove();
      routingNode.add(shardRouting);
      break;
    }
  }
  @Override
  protected void doStart() throws ElasticsearchException {
    Map<String, String> nodeAttributes = discoveryNodeService.buildAttributes();
     // note, we rely on the fact that it's a new id each time we start, see FD and "kill -9" handling
    final String nodeId = getNodeUUID(settings);
    localNode =
        new DiscoveryNode(
            settings.get("name"),
            nodeId,
            transportService.boundAddress().publishAddress(),
            nodeAttributes,
            version);
    latestDiscoNodes =
        new DiscoveryNodes.Builder().put(localNode).localNodeId(localNode.id()).build();
    nodesFD.updateNodes(latestDiscoNodes);
    pingService.start();

     // do the join on a different thread; the DiscoveryService waits up to 30s anyhow until we
     // are discovered
    asyncJoinCluster();
  }
 public void updateNodes(DiscoveryNodes nodes) {
   DiscoveryNodes prevNodes = latestNodes;
   this.latestNodes = nodes;
   if (!running) {
     return;
   }
   DiscoveryNodes.Delta delta = nodes.delta(prevNodes);
   for (DiscoveryNode newNode : delta.addedNodes()) {
     if (newNode.id().equals(nodes.localNodeId())) {
       // no need to monitor the local node
       continue;
     }
     if (!nodesFD.containsKey(newNode)) {
       nodesFD.put(newNode, new NodeFD());
       threadPool.schedule(pingInterval, ThreadPool.Names.SAME, new SendPingRequest(newNode));
     }
   }
   for (DiscoveryNode removedNode : delta.removedNodes()) {
     nodesFD.remove(removedNode);
   }
 }
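  // Worked through (node names hypothetical): if latestNodes was {local, n1} and
  // the new set is {local, n2}, then delta.addedNodes() == [n2] and
  // delta.removedNodes() == [n1]; n2 gets a fresh NodeFD plus a scheduled first
  // ping, n1's NodeFD is dropped, and the local node is never monitored.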
    @Override
    public void run() {
      if (!running) {
        return;
      }
      transportService.sendRequest(
          node,
          PingRequestHandler.ACTION,
          new PingRequest(node.id()),
          options().withHighType().withTimeout(pingRetryTimeout),
          new BaseTransportResponseHandler<PingResponse>() {
            @Override
            public PingResponse newInstance() {
              return new PingResponse();
            }

            @Override
            public void handleResponse(PingResponse response) {
              if (!running) {
                return;
              }
              NodeFD nodeFD = nodesFD.get(node);
              if (nodeFD != null) {
                if (!nodeFD.running) {
                  return;
                }
                nodeFD.retryCount = 0;
                threadPool.schedule(pingInterval, ThreadPool.Names.SAME, SendPingRequest.this);
              }
            }

            @Override
            public void handleException(TransportException exp) {
              // check if the master node did not get switched on us...
              if (!running) {
                return;
              }
              if (exp instanceof ConnectTransportException) {
                // ignore this one, we already handle it by registering a connection listener
                return;
              }
              NodeFD nodeFD = nodesFD.get(node);
              if (nodeFD != null) {
                if (!nodeFD.running) {
                  return;
                }
                int retryCount = ++nodeFD.retryCount;
                logger.trace(
                    "[node  ] failed to ping [{}], retry [{}] out of [{}]",
                    exp,
                    node,
                    retryCount,
                    pingRetryCount);
                if (retryCount >= pingRetryCount) {
                  logger.debug(
                      "[node  ] failed to ping [{}], tried [{}] times, each with  maximum [{}] timeout",
                      node,
                      pingRetryCount,
                      pingRetryTimeout);
                  // not good, failure
                  if (nodesFD.remove(node) != null) {
                    notifyNodeFailure(
                        node,
                        "failed to ping, tried ["
                            + pingRetryCount
                            + "] times, each with maximum ["
                            + pingRetryTimeout
                            + "] timeout");
                  }
                } else {
                  // resend the request, not reschedule, rely on send timeout
                  transportService.sendRequest(
                      node,
                      PingRequestHandler.ACTION,
                      new PingRequest(node.id()),
                      options().withHighType().withTimeout(pingRetryTimeout),
                      this);
                }
              }
            }

            @Override
            public String executor() {
              return ThreadPool.Names.SAME;
            }
          });
    }
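     // Retry arithmetic in the handler above (settings hypothetical): with
     // ping_retry_count = 3, the first two transport failures resend the ping
     // immediately (retryCount 1 and 2); the third reaches the limit, removes the
     // node from nodesFD, and fires notifyNodeFailure exactly once.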
  @Override
  public RerouteExplanation execute(RoutingAllocation allocation, boolean explain) {
    DiscoveryNode discoNode = allocation.nodes().resolveNode(node);
    boolean found = false;
    for (RoutingNodes.RoutingNodeIterator it =
            allocation.routingNodes().routingNodeIter(discoNode.id());
        it.hasNext(); ) {
      ShardRouting shardRouting = it.next();
      if (!shardRouting.shardId().equals(shardId)) {
        continue;
      }
      found = true;
      if (shardRouting.relocatingNodeId() != null) {
        if (shardRouting.initializing()) {
          // the shard is initializing and recovering from another node, simply cancel the recovery
          it.remove();
          // and cancel the relocating state on the shard it's being relocated from
          RoutingNode relocatingFromNode =
              allocation.routingNodes().node(shardRouting.relocatingNodeId());
          if (relocatingFromNode != null) {
            for (ShardRouting fromShardRouting : relocatingFromNode) {
              if (fromShardRouting.isSameShard(shardRouting)
                  && fromShardRouting.state() == RELOCATING) {
                allocation.routingNodes().cancelRelocation(fromShardRouting);
                break;
              }
            }
          }
        } else if (shardRouting.relocating()) {

          // the shard is relocating to another node, cancel the recovery on the other node, and
          // deallocate this one
          if (!allowPrimary && shardRouting.primary()) {
            // can't cancel a primary shard being initialized
            if (explain) {
              return new RerouteExplanation(
                  this,
                  allocation.decision(
                      Decision.NO,
                      "cancel_allocation_command",
                      "can't cancel "
                          + shardId
                          + " on node "
                          + discoNode
                          + ", shard is primary and initializing its state"));
            }
            throw new IllegalArgumentException(
                "[cancel_allocation] can't cancel "
                    + shardId
                    + " on node "
                    + discoNode
                    + ", shard is primary and initializing its state");
          }
          it.moveToUnassigned(new UnassignedInfo(UnassignedInfo.Reason.REROUTE_CANCELLED, null));
          // now, go and find the shard that is initializing on the target node, and cancel it as
          // well...
          RoutingNodes.RoutingNodeIterator initializingNode =
              allocation.routingNodes().routingNodeIter(shardRouting.relocatingNodeId());
          if (initializingNode != null) {
            while (initializingNode.hasNext()) {
              ShardRouting initializingShardRouting = initializingNode.next();
              if (initializingShardRouting.isRelocationTargetOf(shardRouting)) {
                initializingNode.remove();
              }
            }
          }
        }
      } else {
        // the shard is not relocating; it's either started or initializing, so just cancel it
        // and move on...
        if (!allowPrimary && shardRouting.primary()) {
          // can't cancel a primary shard being initialized
          if (explain) {
            return new RerouteExplanation(
                this,
                allocation.decision(
                    Decision.NO,
                    "cancel_allocation_command",
                    "can't cancel "
                        + shardId
                        + " on node "
                        + discoNode
                        + ", shard is primary and started"));
          }
          throw new IllegalArgumentException(
              "[cancel_allocation] can't cancel "
                  + shardId
                  + " on node "
                  + discoNode
                  + ", shard is primary and started");
        }
        it.moveToUnassigned(new UnassignedInfo(UnassignedInfo.Reason.REROUTE_CANCELLED, null));
      }
    }
    if (!found) {
      if (explain) {
        return new RerouteExplanation(
            this,
            allocation.decision(
                Decision.NO,
                "cancel_allocation_command",
                "can't cancel " + shardId + ", failed to find it on node " + discoNode));
      }
      throw new IllegalArgumentException(
          "[cancel_allocation] can't cancel "
              + shardId
              + ", failed to find it on node "
              + discoNode);
    }
    return new RerouteExplanation(
        this,
        allocation.decision(
            Decision.YES,
            "cancel_allocation_command",
            "shard " + shardId + " on node " + discoNode + " can be cancelled"));
  }
  public boolean allocateUnassigned(RoutingAllocation allocation) {
    boolean changed = false;
    final RoutingNodes routingNodes = allocation.routingNodes();
    final MetaData metaData = routingNodes.metaData();

    final RoutingNodes.UnassignedShards.UnassignedIterator unassignedIterator =
        routingNodes.unassigned().iterator();
    while (unassignedIterator.hasNext()) {
      ShardRouting shard = unassignedIterator.next();
      if (shard.primary()) {
        continue;
      }

      // pre-check if it can be allocated to any node that currently exists, so we won't list the
      // store for it for nothing
      boolean canBeAllocatedToAtLeastOneNode = false;
      for (ObjectCursor<DiscoveryNode> cursor : allocation.nodes().dataNodes().values()) {
        RoutingNode node = routingNodes.node(cursor.value.id());
        if (node == null) {
          continue;
        }
        // if we can't allocate it on a node, ignore it, for example, this handles
        // cases for only allocating a replica after a primary
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.YES) {
          canBeAllocatedToAtLeastOneNode = true;
          break;
        }
      }

      if (!canBeAllocatedToAtLeastOneNode) {
        logger.trace("{}: ignoring allocation, can't be allocated on any node", shard);
        unassignedIterator.removeAndIgnore();
        continue;
      }

      AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          shardStores = fetchData(shard, allocation);
      if (shardStores.hasData() == false) {
        logger.trace("{}: ignoring allocation, still fetching shard stores", shard);
        unassignedIterator.removeAndIgnore();
        continue; // still fetching
      }

      long lastSizeMatched = 0;
      DiscoveryNode lastDiscoNodeMatched = null;
      RoutingNode lastNodeMatched = null;
      boolean hasReplicaData = false;
      IndexMetaData indexMetaData = metaData.index(shard.getIndex());

      for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          nodeStoreEntry : shardStores.getData().entrySet()) {
        DiscoveryNode discoNode = nodeStoreEntry.getKey();
        TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData =
            nodeStoreEntry.getValue().storeFilesMetaData();
        logger.trace("{}: checking node [{}]", shard, discoNode);

        if (storeFilesMetaData == null) {
          // already allocated on that node...
          continue;
        }

        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        // check if we can allocate on that node...
        // we only check for NO, since if this node is THROTTLING and it has enough "same data"
        // then we will try and assign it next time
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.NO) {
          continue;
        }

        // if it is already allocated, we can't assign to it...
        if (storeFilesMetaData.allocated()) {
          continue;
        }

        if (!shard.primary()) {
          hasReplicaData |= storeFilesMetaData.iterator().hasNext();
          ShardRouting primaryShard = routingNodes.activePrimary(shard);
          if (primaryShard != null) {
            assert primaryShard.active();
            DiscoveryNode primaryNode = allocation.nodes().get(primaryShard.currentNodeId());
            if (primaryNode != null) {
              TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData primaryNodeFilesStore =
                  shardStores.getData().get(primaryNode);
              if (primaryNodeFilesStore != null) {
                TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore =
                    primaryNodeFilesStore.storeFilesMetaData();
                if (primaryNodeStore != null && primaryNodeStore.allocated()) {
                  long sizeMatched = 0;

                  String primarySyncId = primaryNodeStore.syncId();
                  String replicaSyncId = storeFilesMetaData.syncId();
                  // see if we have a sync id we can make use of
                  if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) {
                    logger.trace(
                        "{}: node [{}] has same sync id {} as primary",
                        shard,
                        discoNode.name(),
                        replicaSyncId);
                    lastNodeMatched = node;
                    lastSizeMatched = Long.MAX_VALUE;
                    lastDiscoNodeMatched = discoNode;
                  } else {
                    for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
                      String metaDataFileName = storeFileMetaData.name();
                      if (primaryNodeStore.fileExists(metaDataFileName)
                          && primaryNodeStore.file(metaDataFileName).isSame(storeFileMetaData)) {
                        sizeMatched += storeFileMetaData.length();
                      }
                    }
                    logger.trace(
                        "{}: node [{}] has [{}/{}] bytes of re-usable data",
                        shard,
                        discoNode.name(),
                        new ByteSizeValue(sizeMatched),
                        sizeMatched);
                    if (sizeMatched > lastSizeMatched) {
                      lastSizeMatched = sizeMatched;
                      lastDiscoNodeMatched = discoNode;
                      lastNodeMatched = node;
                    }
                  }
                }
              }
            }
          }
        }
      }

      if (lastNodeMatched != null) {
        // we only check on THROTTLE since we already checked on NO before
        Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we are throttling this, but we have enough to allocate to this node, ignore it for now
          unassignedIterator.removeAndIgnore();
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we found a match
          changed = true;
          unassignedIterator.initialize(lastNodeMatched.nodeId());
        }
      } else if (hasReplicaData == false) {
        // if we didn't manage to find *any* data (regardless of matching sizes), check if
        // the allocation of the replica shard needs to be delayed, and if so, add it to the
        // ignore-unassigned list
        // note: we only care about replicas in delayed allocation, since an unassigned primary
        //       will anyhow wait to find an existing copy of the shard to be allocated
        // note: the other side of the equation is scheduling a reroute in a timely manner,
        //       which happens in the RoutingService
        long delay =
            shard
                .unassignedInfo()
                .getDelayAllocationExpirationIn(settings, indexMetaData.getSettings());
        if (delay > 0) {
          logger.debug(
              "[{}][{}]: delaying allocation of [{}] for [{}]",
              shard.index(),
              shard.id(),
              shard,
              TimeValue.timeValueMillis(delay));
          /*
           * mark it as changed, since we want to kick a publishing to schedule future allocation;
           * see {@link
           * org.elasticsearch.cluster.routing.RoutingService#clusterChanged(ClusterChangedEvent)}.
           */
          changed = true;
          unassignedIterator.removeAndIgnore();
        }
      }
    }
    return changed;
  }
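  // Match selection above in brief (stores hypothetical): a replica store whose
  // sync id equals the primary's wins outright (its match size is treated as
  // Long.MAX_VALUE); otherwise the node reusing the most bytes from files identical
  // to the primary's (same name and metadata) wins. A THROTTLE decision on the
  // winner defers the shard instead of assigning it.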
  private Table buildTable(
      RestRequest req,
      ClusterStateResponse state,
      NodesInfoResponse nodesInfo,
      NodesStatsResponse nodesStats) {
    boolean fullId = req.paramAsBoolean("full_id", false);

    DiscoveryNodes nodes = state.getState().nodes();
    String masterId = nodes.masterNodeId();
    Table table = getTableWithHeader(req);

    for (DiscoveryNode node : nodes) {
      NodeInfo info = nodesInfo.getNodesMap().get(node.id());
      NodeStats stats = nodesStats.getNodesMap().get(node.id());

      JvmInfo jvmInfo = info == null ? null : info.getJvm();
      JvmStats jvmStats = stats == null ? null : stats.getJvm();
      FsInfo fsInfo = stats == null ? null : stats.getFs();
      OsStats osStats = stats == null ? null : stats.getOs();
      ProcessStats processStats = stats == null ? null : stats.getProcess();
      NodeIndicesStats indicesStats = stats == null ? null : stats.getIndices();

      table.startRow();

      table.addCell(fullId ? node.id() : Strings.substring(node.getId(), 0, 4));
      table.addCell(info == null ? null : info.getProcess().getId());
      table.addCell(node.getHostName());
      table.addCell(node.getHostAddress());
      if (node.address() instanceof InetSocketTransportAddress) {
        table.addCell(((InetSocketTransportAddress) node.address()).address().getPort());
      } else {
        table.addCell("-");
      }

      table.addCell(node.getVersion().number());
      table.addCell(info == null ? null : info.getBuild().shortHash());
      table.addCell(jvmInfo == null ? null : jvmInfo.version());
      table.addCell(fsInfo == null ? null : fsInfo.getTotal().getAvailable());
      table.addCell(jvmStats == null ? null : jvmStats.getMem().getHeapUsed());
      table.addCell(jvmStats == null ? null : jvmStats.getMem().getHeapUsedPercent());
      table.addCell(jvmInfo == null ? null : jvmInfo.getMem().getHeapMax());
      table.addCell(
          osStats == null ? null : osStats.getMem() == null ? null : osStats.getMem().getUsed());
      table.addCell(
          osStats == null
              ? null
              : osStats.getMem() == null ? null : osStats.getMem().getUsedPercent());
      table.addCell(
          osStats == null ? null : osStats.getMem() == null ? null : osStats.getMem().getTotal());
      table.addCell(processStats == null ? null : processStats.getOpenFileDescriptors());
      table.addCell(
          processStats == null
              ? null
              : calculatePercentage(
                  processStats.getOpenFileDescriptors(), processStats.getMaxFileDescriptors()));
      table.addCell(processStats == null ? null : processStats.getMaxFileDescriptors());

      table.addCell(
          osStats == null ? null : String.format(Locale.ROOT, "%.2f", osStats.getLoadAverage()));
      table.addCell(jvmStats == null ? null : jvmStats.getUptime());
      table.addCell(node.clientNode() ? "c" : node.dataNode() ? "d" : "-");
      table.addCell(
          masterId == null
              ? "x"
              : masterId.equals(node.id()) ? "*" : node.masterNode() ? "m" : "-");
      table.addCell(node.name());

      CompletionStats completionStats =
          indicesStats == null ? null : stats.getIndices().getCompletion();
      table.addCell(completionStats == null ? null : completionStats.getSize());

      FieldDataStats fdStats = indicesStats == null ? null : stats.getIndices().getFieldData();
      table.addCell(fdStats == null ? null : fdStats.getMemorySize());
      table.addCell(fdStats == null ? null : fdStats.getEvictions());

      QueryCacheStats fcStats = indicesStats == null ? null : indicesStats.getQueryCache();
      table.addCell(fcStats == null ? null : fcStats.getMemorySize());
      table.addCell(fcStats == null ? null : fcStats.getEvictions());

      RequestCacheStats qcStats = indicesStats == null ? null : indicesStats.getRequestCache();
      table.addCell(qcStats == null ? null : qcStats.getMemorySize());
      table.addCell(qcStats == null ? null : qcStats.getEvictions());
      table.addCell(qcStats == null ? null : qcStats.getHitCount());
      table.addCell(qcStats == null ? null : qcStats.getMissCount());

      FlushStats flushStats = indicesStats == null ? null : indicesStats.getFlush();
      table.addCell(flushStats == null ? null : flushStats.getTotal());
      table.addCell(flushStats == null ? null : flushStats.getTotalTime());

      GetStats getStats = indicesStats == null ? null : indicesStats.getGet();
      table.addCell(getStats == null ? null : getStats.current());
      table.addCell(getStats == null ? null : getStats.getTime());
      table.addCell(getStats == null ? null : getStats.getCount());
      table.addCell(getStats == null ? null : getStats.getExistsTime());
      table.addCell(getStats == null ? null : getStats.getExistsCount());
      table.addCell(getStats == null ? null : getStats.getMissingTime());
      table.addCell(getStats == null ? null : getStats.getMissingCount());

      IndexingStats indexingStats = indicesStats == null ? null : indicesStats.getIndexing();
      table.addCell(indexingStats == null ? null : indexingStats.getTotal().getDeleteCurrent());
      table.addCell(indexingStats == null ? null : indexingStats.getTotal().getDeleteTime());
      table.addCell(indexingStats == null ? null : indexingStats.getTotal().getDeleteCount());
      table.addCell(indexingStats == null ? null : indexingStats.getTotal().getIndexCurrent());
      table.addCell(indexingStats == null ? null : indexingStats.getTotal().getIndexTime());
      table.addCell(indexingStats == null ? null : indexingStats.getTotal().getIndexCount());
      table.addCell(indexingStats == null ? null : indexingStats.getTotal().getIndexFailedCount());

      MergeStats mergeStats = indicesStats == null ? null : indicesStats.getMerge();
      table.addCell(mergeStats == null ? null : mergeStats.getCurrent());
      table.addCell(mergeStats == null ? null : mergeStats.getCurrentNumDocs());
      table.addCell(mergeStats == null ? null : mergeStats.getCurrentSize());
      table.addCell(mergeStats == null ? null : mergeStats.getTotal());
      table.addCell(mergeStats == null ? null : mergeStats.getTotalNumDocs());
      table.addCell(mergeStats == null ? null : mergeStats.getTotalSize());
      table.addCell(mergeStats == null ? null : mergeStats.getTotalTime());

      PercolateStats percolateStats = indicesStats == null ? null : indicesStats.getPercolate();
      table.addCell(percolateStats == null ? null : percolateStats.getCurrent());
      table.addCell(percolateStats == null ? null : percolateStats.getMemorySize());
      table.addCell(percolateStats == null ? null : percolateStats.getNumQueries());
      table.addCell(percolateStats == null ? null : percolateStats.getTime());
      table.addCell(percolateStats == null ? null : percolateStats.getCount());

      RefreshStats refreshStats = indicesStats == null ? null : indicesStats.getRefresh();
      table.addCell(refreshStats == null ? null : refreshStats.getTotal());
      table.addCell(refreshStats == null ? null : refreshStats.getTotalTime());

      ScriptStats scriptStats = stats == null ? null : stats.getScriptStats();
      table.addCell(scriptStats == null ? null : scriptStats.getCompilations());
      table.addCell(scriptStats == null ? null : scriptStats.getCacheEvictions());

      SearchStats searchStats = indicesStats == null ? null : indicesStats.getSearch();
      table.addCell(searchStats == null ? null : searchStats.getTotal().getFetchCurrent());
      table.addCell(searchStats == null ? null : searchStats.getTotal().getFetchTime());
      table.addCell(searchStats == null ? null : searchStats.getTotal().getFetchCount());
      table.addCell(searchStats == null ? null : searchStats.getOpenContexts());
      table.addCell(searchStats == null ? null : searchStats.getTotal().getQueryCurrent());
      table.addCell(searchStats == null ? null : searchStats.getTotal().getQueryTime());
      table.addCell(searchStats == null ? null : searchStats.getTotal().getQueryCount());
      table.addCell(searchStats == null ? null : searchStats.getTotal().getScrollCurrent());
      table.addCell(searchStats == null ? null : searchStats.getTotal().getScrollTime());
      table.addCell(searchStats == null ? null : searchStats.getTotal().getScrollCount());

      SegmentsStats segmentsStats = indicesStats == null ? null : indicesStats.getSegments();
      table.addCell(segmentsStats == null ? null : segmentsStats.getCount());
      table.addCell(segmentsStats == null ? null : segmentsStats.getMemory());
      table.addCell(segmentsStats == null ? null : segmentsStats.getIndexWriterMemory());
      table.addCell(segmentsStats == null ? null : segmentsStats.getIndexWriterMaxMemory());
      table.addCell(segmentsStats == null ? null : segmentsStats.getVersionMapMemory());
      table.addCell(segmentsStats == null ? null : segmentsStats.getBitsetMemory());

      SuggestStats suggestStats = indicesStats == null ? null : indicesStats.getSuggest();
      table.addCell(suggestStats == null ? null : suggestStats.getCurrent());
      table.addCell(suggestStats == null ? null : suggestStats.getTime());
      table.addCell(suggestStats == null ? null : suggestStats.getCount());

      table.endRow();
    }

    return table;
  }
    @Override
    protected void moveToSecondPhase() {
      sortedShardList = searchPhaseController.sortDocs(queryResults.values());
      final Map<SearchShardTarget, ExtTIntArrayList> docIdsToLoad =
          searchPhaseController.docIdsToLoad(sortedShardList);

      if (docIdsToLoad.isEmpty()) {
        releaseIrrelevantSearchContexts(queryResults, docIdsToLoad);
        finishHim();
        return;
      }

      final AtomicInteger counter = new AtomicInteger(docIdsToLoad.size());

      int localOperations = 0;
      for (Map.Entry<SearchShardTarget, ExtTIntArrayList> entry : docIdsToLoad.entrySet()) {
        DiscoveryNode node = nodes.get(entry.getKey().nodeId());
        if (node.id().equals(nodes.localNodeId())) {
          localOperations++;
        } else {
          FetchSearchRequest fetchSearchRequest =
              new FetchSearchRequest(queryResults.get(entry.getKey()).id(), entry.getValue());
          executeFetch(counter, fetchSearchRequest, node);
        }
      }

      if (localOperations > 0) {
        if (request.operationThreading() == SearchOperationThreading.SINGLE_THREAD) {
          threadPool.execute(
              new Runnable() {
                @Override
                public void run() {
                  for (Map.Entry<SearchShardTarget, ExtTIntArrayList> entry :
                      docIdsToLoad.entrySet()) {
                    DiscoveryNode node = nodes.get(entry.getKey().nodeId());
                    if (node.id().equals(nodes.localNodeId())) {
                      FetchSearchRequest fetchSearchRequest =
                          new FetchSearchRequest(
                              queryResults.get(entry.getKey()).id(), entry.getValue());
                      executeFetch(counter, fetchSearchRequest, node);
                    }
                  }
                }
              });
        } else {
          boolean localAsync =
              request.operationThreading() == SearchOperationThreading.THREAD_PER_SHARD;
          for (Map.Entry<SearchShardTarget, ExtTIntArrayList> entry : docIdsToLoad.entrySet()) {
            final DiscoveryNode node = nodes.get(entry.getKey().nodeId());
            if (node.id().equals(nodes.localNodeId())) {
              final FetchSearchRequest fetchSearchRequest =
                  new FetchSearchRequest(queryResults.get(entry.getKey()).id(), entry.getValue());
              if (localAsync) {
                threadPool.execute(
                    new Runnable() {
                      @Override
                      public void run() {
                        executeFetch(counter, fetchSearchRequest, node);
                      }
                    });
              } else {
                executeFetch(counter, fetchSearchRequest, node);
              }
            }
          }
        }
      }

      releaseIrrelevantSearchContexts(queryResults, docIdsToLoad);
    }
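    // Threading modes for the local fetches above: SINGLE_THREAD runs them all in
    // one pooled task, THREAD_PER_SHARD submits one task per local fetch, and any
    // other mode executes them inline on the calling thread; remote fetches are
    // always dispatched immediately in the first pass.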
  public boolean allocateUnassigned(RoutingAllocation allocation) {
    boolean changed = false;
    DiscoveryNodes nodes = allocation.nodes();
    RoutingNodes routingNodes = allocation.routingNodes();

    // First, handle primaries, they must find a place to be allocated on here
    final MetaData metaData = routingNodes.metaData();
    RoutingNodes.UnassignedShards unassigned = routingNodes.unassigned();
    unassigned.sort(
        new PriorityComparator() {

          @Override
          protected Settings getIndexSettings(String index) {
            IndexMetaData indexMetaData = metaData.index(index);
            return indexMetaData.getSettings();
          }
        }); // sort for priority ordering
    Iterator<ShardRouting> unassignedIterator = unassigned.iterator();
    while (unassignedIterator.hasNext()) {
      ShardRouting shard = unassignedIterator.next();

      if (!shard.primary()) {
        continue;
      }

      // this is an API allocation, ignore since we know there is no data...
      if (!routingNodes
          .routingTable()
          .index(shard.index())
          .shard(shard.id())
          .primaryAllocatedPostApi()) {
        continue;
      }

      AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetch =
          asyncFetchStarted.get(shard.shardId());
      if (fetch == null) {
        fetch = new InternalAsyncFetch<>(logger, "shard_started", shard.shardId(), startedAction);
        asyncFetchStarted.put(shard.shardId(), fetch);
      }
      AsyncShardFetch.FetchResult<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards>
          shardState = fetch.fetchData(nodes, metaData, allocation.getIgnoreNodes(shard.shardId()));
      if (shardState.hasData() == false) {
        logger.trace("{}: ignoring allocation, still fetching shard started state", shard);
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
        continue;
      }
      shardState.processAllocation(allocation);

      IndexMetaData indexMetaData = metaData.index(shard.getIndex());

      /**
       * Build a map of DiscoveryNodes to shard state number for the given shard. A state of -1
       * means the shard does not exist on the node, where any shard state >= 0 is the state version
       * of the shard on that node's disk.
       *
       * <p>A shard on shared storage will return at least shard state 0 for all nodes, indicating
       * that the shard can be allocated to any node.
       */
      ObjectLongHashMap<DiscoveryNode> nodesState = new ObjectLongHashMap<>();
      for (TransportNodesListGatewayStartedShards.NodeGatewayStartedShards nodeShardState :
          shardState.getData().values()) {
        long version = nodeShardState.version();
        // a -1 version means the shard does not exist on the node, which is what the API
        // returns, and what we expect here
        logger.trace(
            "[{}] on node [{}] has version [{}] of shard",
            shard,
            nodeShardState.getNode(),
            version);
        nodesState.put(nodeShardState.getNode(), version);
      }

      int numberOfAllocationsFound = 0;
      long highestVersion = -1;
      final Map<DiscoveryNode, Long> nodesWithVersion = Maps.newHashMap();

      assert !nodesState.containsKey(null);
      final Object[] keys = nodesState.keys;
      final long[] values = nodesState.values;
      Settings idxSettings = indexMetaData.settings();
      for (int i = 0; i < keys.length; i++) {
        if (keys[i] == null) {
          continue;
        }

        DiscoveryNode node = (DiscoveryNode) keys[i];
        long version = values[i];
        // since we don't check in NO allocation, we need to double check here
        if (allocation.shouldIgnoreShardForNode(shard.shardId(), node.id())) {
          continue;
        }
        if (recoverOnAnyNode(idxSettings)) {
          numberOfAllocationsFound++;
          if (version > highestVersion) {
            highestVersion = version;
          }
          // We always put the node without clearing the map
          nodesWithVersion.put(node, version);
        } else if (version != -1) {
          numberOfAllocationsFound++;
          // If we've found a new "best" candidate, clear the
          // current candidates and add it
          if (version > highestVersion) {
            highestVersion = version;
            nodesWithVersion.clear();
            nodesWithVersion.put(node, version);
          } else if (version == highestVersion) {
            // If the candidate is the same, add it to the
            // list, but keep the current candidate
            nodesWithVersion.put(node, version);
          }
        }
      }
      // Now that we have a map of nodes to versions along with the
      // number of allocations found (and not ignored), we need to sort
      // it so the node with the highest version is at the beginning
      List<DiscoveryNode> nodesWithHighestVersion = Lists.newArrayList();
      nodesWithHighestVersion.addAll(nodesWithVersion.keySet());
      CollectionUtil.timSort(
          nodesWithHighestVersion,
          new Comparator<DiscoveryNode>() {
            @Override
            public int compare(DiscoveryNode o1, DiscoveryNode o2) {
              return Long.compare(nodesWithVersion.get(o2), nodesWithVersion.get(o1));
            }
          });

      if (logger.isDebugEnabled()) {
        logger.debug(
            "[{}][{}] found {} allocations of {}, highest version: [{}]",
            shard.index(),
            shard.id(),
            numberOfAllocationsFound,
            shard,
            highestVersion);
      }
      if (logger.isTraceEnabled()) {
        StringBuilder sb = new StringBuilder("[");
        for (DiscoveryNode n : nodesWithHighestVersion) {
          sb.append("[");
          sb.append(n.getName());
          sb.append("]");
          sb.append(" -> ");
          sb.append(nodesWithVersion.get(n));
          sb.append(", ");
        }
        sb.append("]");
        logger.trace("{} candidates for allocation: {}", shard, sb.toString());
      }

      // check if the count meets the minimum required
      int requiredAllocation = 1;
      // if we restore from a repository, one copy is more than enough
      if (shard.restoreSource() == null) {
       String initialShards =
           indexMetaData
               .settings()
               .get(
                   INDEX_RECOVERY_INITIAL_SHARDS,
                   settings.get(INDEX_RECOVERY_INITIAL_SHARDS, this.initialShards));
       try {
          if ("quorum".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 1) {
              requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2) + 1;
            }
          } else if ("quorum-1".equals(initialShards) || "half".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 2) {
              requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2);
            }
          } else if ("one".equals(initialShards)) {
            requiredAllocation = 1;
          } else if ("full".equals(initialShards) || "all".equals(initialShards)) {
            requiredAllocation = indexMetaData.numberOfReplicas() + 1;
          } else if ("full-1".equals(initialShards) || "all-1".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 1) {
              requiredAllocation = indexMetaData.numberOfReplicas();
            }
          } else {
            requiredAllocation = Integer.parseInt(initialShards);
          }
        } catch (Exception e) {
          logger.warn(
              "[{}][{}] failed to derived initial_shards from value {}, ignore allocation for {}",
              shard.index(),
              shard.id(),
              initialShards,
              shard);
        }
      }

      // not enough found for this shard, continue...
      if (numberOfAllocationsFound < requiredAllocation) {
        // if we are restoring this shard we still can allocate
        if (shard.restoreSource() == null) {
          // we can't really allocate, so ignore it and continue
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: not allocating, number_of_allocated_shards_found [{}], required_number [{}]",
                shard.index(),
                shard.id(),
                numberOfAllocationsFound,
                requiredAllocation);
          }
        } else if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: missing local data, will restore from [{}]",
              shard.index(),
              shard.id(),
              shard.restoreSource());
        }
        continue;
      }

      Set<DiscoveryNode> throttledNodes = Sets.newHashSet();
      Set<DiscoveryNode> noNodes = Sets.newHashSet();
      for (DiscoveryNode discoNode : nodesWithHighestVersion) {
        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          throttledNodes.add(discoNode);
        } else if (decision.type() == Decision.Type.NO) {
          noNodes.add(discoNode);
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] on primary allocation",
                shard.index(),
                shard.id(),
                shard,
                discoNode);
          }
          // we found a match
          changed = true;
          // make sure we create one with the version from the recovered state
          routingNodes.initialize(new ShardRouting(shard, highestVersion), node.nodeId());
          unassignedIterator.remove();

          // found a node, so no throttling, no "no", and break out of the loop
          throttledNodes.clear();
          noNodes.clear();
          break;
        }
      }
      if (throttledNodes.isEmpty()) {
        // if we have a node that we "can't" allocate to, force the allocation,
        // since this is our master data!
        if (!noNodes.isEmpty()) {
          DiscoveryNode discoNode = noNodes.iterator().next();
          RoutingNode node = routingNodes.node(discoNode.id());
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: forcing allocating [{}] to [{}] on primary allocation",
                shard.index(),
                shard.id(),
                shard,
                discoNode);
          }
          // we found a match
          changed = true;
          // make sure we create one with the version from the recovered state
          routingNodes.initialize(new ShardRouting(shard, highestVersion), node.nodeId());
          unassignedIterator.remove();
        }
      } else {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: throttling allocation [{}] to [{}] on primary allocation",
              shard.index(),
              shard.id(),
              shard,
              throttledNodes);
        }
        // we are throttling this, but we have enough to allocate to this node, ignore it for now
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
      }
    }

    if (!routingNodes.hasUnassigned()) {
      return changed;
    }

    // Now, handle replicas, try to assign them to nodes that are similar to the one the primary was
    // allocated on
    unassignedIterator = unassigned.iterator();
    while (unassignedIterator.hasNext()) {
      ShardRouting shard = unassignedIterator.next();
      if (shard.primary()) {
        continue;
      }

      // pre-check if it can be allocated to any node that currently exists, so we won't list the
      // store for it for nothing
      boolean canBeAllocatedToAtLeastOneNode = false;
      for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
        RoutingNode node = routingNodes.node(cursor.value.id());
        if (node == null) {
          continue;
        }
        // if we can't allocate it on a node, ignore it, for example, this handles
        // cases for only allocating a replica after a primary
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.YES) {
          canBeAllocatedToAtLeastOneNode = true;
          break;
        }
      }

      if (!canBeAllocatedToAtLeastOneNode) {
        logger.trace("{}: ignoring allocation, can't be allocated on any node", shard);
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
        continue;
      }

      AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> fetch =
          asyncFetchStore.get(shard.shardId());
      if (fetch == null) {
        fetch = new InternalAsyncFetch<>(logger, "shard_store", shard.shardId(), storeAction);
        asyncFetchStore.put(shard.shardId(), fetch);
      }
      AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          shardStores =
              fetch.fetchData(nodes, metaData, allocation.getIgnoreNodes(shard.shardId()));
      if (shardStores.hasData() == false) {
        logger.trace("{}: ignoring allocation, still fetching shard stores", shard);
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
        continue; // still fetching
      }
      shardStores.processAllocation(allocation);

      long lastSizeMatched = 0;
      DiscoveryNode lastDiscoNodeMatched = null;
      RoutingNode lastNodeMatched = null;
      boolean hasReplicaData = false;
      IndexMetaData indexMetaData = metaData.index(shard.getIndex());

      for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          nodeStoreEntry : shardStores.getData().entrySet()) {
        DiscoveryNode discoNode = nodeStoreEntry.getKey();
        TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData =
            nodeStoreEntry.getValue().storeFilesMetaData();
        logger.trace("{}: checking node [{}]", shard, discoNode);

        if (storeFilesMetaData == null) {
          // already allocated on that node...
          continue;
        }

        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        // check if we can allocate on that node...
        // we only check for NO, since if this node is THROTTLING and it has enough "same data"
        // then we will try and assign it next time
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.NO) {
          continue;
        }

        // if it is already allocated, we can't assign to it...
        if (storeFilesMetaData.allocated()) {
          continue;
        }

        if (!shard.primary()) {
          hasReplicaData |= storeFilesMetaData.iterator().hasNext();
          ShardRouting primaryShard = routingNodes.activePrimary(shard);
          if (primaryShard != null) {
            assert primaryShard.active();
            DiscoveryNode primaryNode = nodes.get(primaryShard.currentNodeId());
            if (primaryNode != null) {
              TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData primaryNodeFilesStore =
                  shardStores.getData().get(primaryNode);
              if (primaryNodeFilesStore != null) {
                TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore =
                    primaryNodeFilesStore.storeFilesMetaData();
                if (primaryNodeStore != null && primaryNodeStore.allocated()) {
                  long sizeMatched = 0;

                  String primarySyncId = primaryNodeStore.syncId();
                  String replicaSyncId = storeFilesMetaData.syncId();
                  // see if we have a sync id we can make use of
                  if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) {
                    logger.trace(
                        "{}: node [{}] has same sync id {} as primary",
                        shard,
                        discoNode.name(),
                        replicaSyncId);
                    lastNodeMatched = node;
                    lastSizeMatched = Long.MAX_VALUE;
                    lastDiscoNodeMatched = discoNode;
                  } else {
                    for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
                      String metaDataFileName = storeFileMetaData.name();
                      if (primaryNodeStore.fileExists(metaDataFileName)
                          && primaryNodeStore.file(metaDataFileName).isSame(storeFileMetaData)) {
                        sizeMatched += storeFileMetaData.length();
                      }
                    }
                    logger.trace(
                        "{}: node [{}] has [{}/{}] bytes of re-usable data",
                        shard,
                        discoNode.name(),
                        new ByteSizeValue(sizeMatched),
                        sizeMatched);
                    if (sizeMatched > lastSizeMatched) {
                      lastSizeMatched = sizeMatched;
                      lastDiscoNodeMatched = discoNode;
                      lastNodeMatched = node;
                    }
                  }
                }
              }
            }
          }
        }
      }

      if (lastNodeMatched != null) {
        // we only check on THROTTLE since we already checked on NO above
        Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we are throttling this, but we have enough to allocate to this node, ignore it for now
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we found a match
          changed = true;
          routingNodes.initialize(shard, lastNodeMatched.nodeId());
          unassignedIterator.remove();
        }
      } else if (hasReplicaData == false) {
        // if we didn't manage to find *any* data (regardless of matching sizes), check
        // whether the allocation of the replica shard needs to be delayed, and if so,
        // add it to the ignored unassigned list
        // note: we only care about replicas in delayed allocation, since an unassigned
        //       primary will wait to find an existing copy of the shard anyway
        // note: the other side of the equation is scheduling a reroute in a timely
        //       manner, which happens in the RoutingService
        long delay =
            shard
                .unassignedInfo()
                .getDelayAllocationExpirationIn(settings, indexMetaData.getSettings());
        if (delay > 0) {
          logger.debug(
              "[{}][{}]: delaying allocation of [{}] for [{}]",
              shard.index(),
              shard.id(),
              shard,
              TimeValue.timeValueMillis(delay));
          // mark it as changed, since we want to kick off a publish to schedule a
          // future allocation; see
          // org.elasticsearch.cluster.routing.RoutingService#clusterChanged(ClusterChangedEvent)
          changed = true;
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
        }
      }
    }
    return changed;
  }
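For reference, the initial_shards branching above reduces to a small pure function. The sketch below is not part of the original allocator; the helper name and the int-based signature are hypothetical, but the branches mirror the code above.

// Standalone sketch of the initial_shards -> requiredAllocation mapping.
static int requiredAllocations(String initialShards, int numberOfReplicas) {
  int copies = numberOfReplicas + 1; // primary plus replicas
  switch (initialShards) {
    case "one":
      return 1;
    case "quorum":
      return numberOfReplicas > 1 ? (copies / 2) + 1 : 1;
    case "quorum-1":
    case "half":
      return numberOfReplicas > 2 ? copies / 2 : 1;
    case "full":
    case "all":
      return copies;
    case "full-1":
    case "all-1":
      return numberOfReplicas > 1 ? numberOfReplicas : 1;
    default:
      return Integer.parseInt(initialShards); // an explicit count, e.g. "2"
  }
}

For example, with one primary and two replicas, "quorum" requires 2 found copies while "all" requires 3.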
Example #25
    @Override
    protected void configure() {
      functionBinder =
          MapBinder.newMapBinder(binder(), FunctionIdent.class, FunctionImplementation.class);
      functionBinder.addBinding(TestFunction.ident).toInstance(new TestFunction());
      bind(Functions.class).asEagerSingleton();
      bind(ReferenceInfos.class).toInstance(mock(ReferenceInfos.class));
      bind(ThreadPool.class).toInstance(testThreadPool);

      BulkRetryCoordinator bulkRetryCoordinator = mock(BulkRetryCoordinator.class);
      BulkRetryCoordinatorPool bulkRetryCoordinatorPool = mock(BulkRetryCoordinatorPool.class);
      when(bulkRetryCoordinatorPool.coordinator(any(ShardId.class)))
          .thenReturn(bulkRetryCoordinator);
      bind(BulkRetryCoordinatorPool.class).toInstance(bulkRetryCoordinatorPool);

      bind(TransportBulkCreateIndicesAction.class)
          .toInstance(mock(TransportBulkCreateIndicesAction.class));
      bind(CircuitBreakerService.class).toInstance(new NoneCircuitBreakerService());
      bind(ActionFilters.class).toInstance(mock(ActionFilters.class));
      bind(ScriptService.class).toInstance(mock(ScriptService.class));
      bind(SearchService.class).toInstance(mock(InternalSearchService.class));
      bind(AllocationService.class).toInstance(mock(AllocationService.class));
      bind(MetaDataCreateIndexService.class).toInstance(mock(MetaDataCreateIndexService.class));
      bind(DynamicSettings.class)
          .annotatedWith(ClusterDynamicSettings.class)
          .toInstance(mock(DynamicSettings.class));
      bind(MetaDataDeleteIndexService.class).toInstance(mock(MetaDataDeleteIndexService.class));
      bind(ClusterInfoService.class).toInstance(mock(ClusterInfoService.class));
      bind(TransportService.class).toInstance(mock(TransportService.class));
      bind(MapperService.class).toInstance(mock(MapperService.class));

      OsService osService = mock(OsService.class);
      OsStats osStats = mock(OsStats.class);
      when(osService.stats()).thenReturn(osStats);
      OsStats.Cpu osCpu = mock(OsStats.Cpu.class);
      when(osCpu.stolen()).thenReturn((short) 1);
      when(osStats.cpu()).thenReturn(osCpu);

      bind(OsService.class).toInstance(osService);
      bind(NodeService.class).toInstance(mock(NodeService.class));
      bind(Discovery.class).toInstance(mock(Discovery.class));
      bind(NetworkService.class).toInstance(mock(NetworkService.class));

      bind(TransportShardBulkAction.class).toInstance(mock(TransportShardBulkAction.class));
      bind(TransportCreateIndexAction.class).toInstance(mock(TransportCreateIndexAction.class));

      discoveryService = mock(DiscoveryService.class);
      DiscoveryNode discoveryNode = mock(DiscoveryNode.class);
      when(discoveryNode.id()).thenReturn(TEST_NODE_ID);
      when(discoveryService.localNode()).thenReturn(discoveryNode);

      ClusterService clusterService = mock(ClusterService.class);
      ClusterState state = mock(ClusterState.class);
      DiscoveryNodes discoveryNodes = mock(DiscoveryNodes.class);
      when(discoveryNodes.localNodeId()).thenReturn(TEST_NODE_ID);
      when(state.nodes()).thenReturn(discoveryNodes);
      when(clusterService.state()).thenReturn(state);
      when(clusterService.localNode()).thenReturn(discoveryNode);
      bind(ClusterService.class).toInstance(clusterService);

      IndicesService indicesService = mock(IndicesService.class);
      bind(IndicesService.class).toInstance(indicesService);
      bind(Settings.class).toInstance(ImmutableSettings.EMPTY);

      bind(MetaDataUpdateSettingsService.class)
          .toInstance(mock(MetaDataUpdateSettingsService.class));
      bind(Client.class).toInstance(mock(Client.class));

      Provider<TransportCreateIndexAction> transportCreateIndexActionProvider =
          mock(Provider.class);
      when(transportCreateIndexActionProvider.get())
          .thenReturn(mock(TransportCreateIndexAction.class));
      Provider<TransportDeleteIndexAction> transportDeleteActionProvider = mock(Provider.class);
      when(transportDeleteActionProvider.get()).thenReturn(mock(TransportDeleteIndexAction.class));
      Provider<TransportUpdateSettingsAction> transportUpdateSettingsActionProvider =
          mock(Provider.class);
      when(transportUpdateSettingsActionProvider.get())
          .thenReturn(mock(TransportUpdateSettingsAction.class));

      BlobIndices blobIndices =
          new BlobIndices(
              ImmutableSettings.EMPTY,
              transportCreateIndexActionProvider,
              transportDeleteActionProvider,
              transportUpdateSettingsActionProvider,
              indicesService,
              mock(IndicesLifecycle.class),
              mock(BlobEnvironment.class),
              clusterService);
      bind(BlobIndices.class).toInstance(blobIndices);

      bind(ReferenceResolver.class).to(GlobalReferenceResolver.class);

      TransportPutIndexTemplateAction transportPutIndexTemplateAction =
          mock(TransportPutIndexTemplateAction.class);
      bind(TransportPutIndexTemplateAction.class).toInstance(transportPutIndexTemplateAction);

      bind(IndexService.class).toInstance(indexService);
    }
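On the consumer side, Guice aggregates every addBinding() call on the MapBinder above into a single injectable Map<FunctionIdent, FunctionImplementation>. The class below is a hypothetical sketch of such a consumer, not part of the original test module.

// Hypothetical consumer of the MapBinder-built map.
public class FunctionResolver {
  private final Map<FunctionIdent, FunctionImplementation> implementations;

  @com.google.inject.Inject
  public FunctionResolver(Map<FunctionIdent, FunctionImplementation> implementations) {
    this.implementations = implementations;
  }

  public FunctionImplementation resolve(FunctionIdent ident) {
    // returns null when no module registered a binding for this ident
    return implementations.get(ident);
  }
}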
Example #26
  @Override
  public void execute(RoutingAllocation allocation) throws ElasticSearchException {
    DiscoveryNode discoNode = allocation.nodes().resolveNode(node);
    boolean found = false;
    for (RoutingNodes.RoutingNodeIterator it =
            allocation.routingNodes().routingNodeIter(discoNode.id());
        it.hasNext(); ) {
      MutableShardRouting shardRouting = it.next();
      if (!shardRouting.shardId().equals(shardId)) {
        continue;
      }
      found = true;
      if (shardRouting.relocatingNodeId() != null) {
        if (shardRouting.initializing()) {
          // the shard is initializing and recovering from another node, simply cancel the recovery
          it.remove();
          // and cancel the relocating state on the node it's being relocated from
          RoutingNode relocatingFromNode =
              allocation.routingNodes().node(shardRouting.relocatingNodeId());
          if (relocatingFromNode != null) {
            for (MutableShardRouting fromShardRouting : relocatingFromNode) {
              if (fromShardRouting.shardId().equals(shardRouting.shardId())
                  && fromShardRouting.state() == RELOCATING) {
                allocation.routingNodes().cancelRelocation(fromShardRouting);
                break;
              }
            }
          }
        } else if (shardRouting.relocating()) {
          // the shard is relocating to another node, cancel the recovery on the other node, and
          // deallocate this one
          if (!allowPrimary && shardRouting.primary()) {
            // can't cancel a primary shard being initialized
            throw new ElasticSearchIllegalArgumentException(
                "[cancel_allocation] can't cancel "
                    + shardId
                    + " on node "
                    + discoNode
                    + ", shard is primary and initializing its state");
          }
          it.moveToUnassigned();
          // now, go and find the shard that is initializing on the target node, and cancel it as
          // well...
          RoutingNodes.RoutingNodeIterator initializingNode =
              allocation.routingNodes().routingNodeIter(shardRouting.relocatingNodeId());
          if (initializingNode != null) {
            while (initializingNode.hasNext()) {
              MutableShardRouting initializingShardRouting = initializingNode.next();
              if (initializingShardRouting.shardId().equals(shardRouting.shardId())
                  && initializingShardRouting.state() == INITIALIZING) {
                initializingNode.remove();
              }
            }
          }
        }
      } else {
        // the shard is not relocating; it's either started or initializing, so just
        // cancel it and move on...
        if (!allowPrimary && shardRouting.primary()) {
          // can't cancel a primary shard being initialized
          throw new ElasticSearchIllegalArgumentException(
              "[cancel_allocation] can't cancel "
                  + shardId
                  + " on node "
                  + discoNode
                  + ", shard is primary and started");
        }
        it.remove();
        allocation
            .routingNodes()
            .unassigned()
            .add(
                new MutableShardRouting(
                    shardRouting.index(),
                    shardRouting.id(),
                    null,
                    shardRouting.primary(),
                    ShardRoutingState.UNASSIGNED,
                    shardRouting.version() + 1));
      }
    }
    if (!found) {
      throw new ElasticSearchIllegalArgumentException(
          "[cancel_allocation] can't cancel "
              + shardId
              + ", failed to find it on node "
              + discoNode);
    }
  }
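This command is typically driven through the cluster reroute API. The snippet below is a hedged usage sketch: the index name and node id are placeholders, and the builder methods are assumed from the same era of the codebase rather than taken from this example.

// Cancel the allocation of shard 0 of "my_index" on "node1" (allowPrimary = false).
client.admin().cluster().prepareReroute()
    .add(new CancelAllocationCommand(new ShardId("my_index", 0), "node1", false))
    .execute().actionGet();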
Example #27
 public void addNode(DiscoveryNode node) {
   ensureMutable();
   RoutingNode routingNode = new RoutingNode(node.id(), node);
   nodesToShards.put(routingNode.nodeId(), routingNode);
 }
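A hypothetical usage sketch: rebuilding the node registry after a cluster state change by registering every current data node. The method name is invented; the iteration style matches the ObjectCursor loops used elsewhere in these examples.

// Register a RoutingNode for each data node in the current cluster state.
void registerDataNodes(DiscoveryNodes nodes) {
  for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
    addNode(cursor.value);
  }
}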
Example #28
    private ClusterState applyUpdate(ClusterState currentState, ClusterChangedEvent task) {
      boolean clusterStateChanged = false;
      ClusterState tribeState = task.state();
      DiscoveryNodes.Builder nodes = DiscoveryNodes.builder(currentState.nodes());
      // -- merge nodes
      // go over existing nodes, and see if they need to be removed
      for (DiscoveryNode discoNode : currentState.nodes()) {
        String markedTribeName = discoNode.attributes().get(TRIBE_NAME);
        if (markedTribeName != null && markedTribeName.equals(tribeName)) {
          if (tribeState.nodes().get(discoNode.id()) == null) {
            clusterStateChanged = true;
            logger.info("[{}] removing node [{}]", tribeName, discoNode);
            nodes.remove(discoNode.id());
          }
        }
      }
      // go over tribe nodes, and see if they need to be added
      for (DiscoveryNode tribe : tribeState.nodes()) {
        if (currentState.nodes().get(tribe.id()) == null) {
          // a new node, add it, but also add the tribe name to the attributes
          Map<String, String> tribeAttr = new HashMap<>();
          for (ObjectObjectCursor<String, String> attr : tribe.attributes()) {
            tribeAttr.put(attr.key, attr.value);
          }
          tribeAttr.put(TRIBE_NAME, tribeName);
          DiscoveryNode discoNode =
              new DiscoveryNode(
                  tribe.name(),
                  tribe.id(),
                  tribe.getHostName(),
                  tribe.getHostAddress(),
                  tribe.address(),
                  unmodifiableMap(tribeAttr),
                  tribe.version());
          clusterStateChanged = true;
          logger.info("[{}] adding node [{}]", tribeName, discoNode);
          nodes.put(discoNode);
        }
      }

      // -- merge metadata
      ClusterBlocks.Builder blocks = ClusterBlocks.builder().blocks(currentState.blocks());
      MetaData.Builder metaData = MetaData.builder(currentState.metaData());
      RoutingTable.Builder routingTable = RoutingTable.builder(currentState.routingTable());
      // go over existing indices, and see if they need to be removed
      for (IndexMetaData index : currentState.metaData()) {
        String markedTribeName = index.getSettings().get(TRIBE_NAME);
        if (markedTribeName != null && markedTribeName.equals(tribeName)) {
          IndexMetaData tribeIndex = tribeState.metaData().index(index.getIndex());
          clusterStateChanged = true;
          if (tribeIndex == null || tribeIndex.getState() == IndexMetaData.State.CLOSE) {
            logger.info("[{}] removing index [{}]", tribeName, index.getIndex());
            removeIndex(blocks, metaData, routingTable, index);
          } else {
            // always make sure to update the metadata and routing table, in case
            // there are changes in them (new mapping, shards moving from initializing to started)
            routingTable.add(tribeState.routingTable().index(index.getIndex()));
            Settings tribeSettings =
                Settings.builder().put(tribeIndex.getSettings()).put(TRIBE_NAME, tribeName).build();
            metaData.put(IndexMetaData.builder(tribeIndex).settings(tribeSettings));
          }
        }
      }
      // go over the tribe indices, and see if they need to be added
      for (IndexMetaData tribeIndex : tribeState.metaData()) {
        // if there is no routing table yet, do nothing with it...
        IndexRoutingTable table = tribeState.routingTable().index(tribeIndex.getIndex());
        if (table == null) {
          continue;
        }
        final IndexMetaData indexMetaData = currentState.metaData().index(tribeIndex.getIndex());
        if (indexMetaData == null) {
          if (!droppedIndices.contains(tribeIndex.getIndex())) {
            // a new index, add it, and add the tribe name as a setting
            clusterStateChanged = true;
            logger.info("[{}] adding index [{}]", tribeName, tribeIndex.getIndex());
            addNewIndex(tribeState, blocks, metaData, routingTable, tribeIndex);
          }
        } else {
          String existingFromTribe = indexMetaData.getSettings().get(TRIBE_NAME);
          if (!tribeName.equals(existingFromTribe)) {
            // we have a potential conflict on index names, decide what to do...
            if (ON_CONFLICT_ANY.equals(onConflict)) {
              // we chose any tribe, carry on
            } else if (ON_CONFLICT_DROP.equals(onConflict)) {
              // drop the indices, there is a conflict
              clusterStateChanged = true;
              logger.info(
                  "[{}] dropping index [{}] due to conflict with [{}]",
                  tribeName,
                  tribeIndex.getIndex(),
                  existingFromTribe);
              removeIndex(blocks, metaData, routingTable, tribeIndex);
              droppedIndices.add(tribeIndex.getIndex());
            } else if (onConflict.startsWith(ON_CONFLICT_PREFER)) {
              // on conflict, prefer a tribe...
              String preferredTribeName = onConflict.substring(ON_CONFLICT_PREFER.length());
              if (tribeName.equals(preferredTribeName)) {
                // the new one is the preferred one, replace...
                clusterStateChanged = true;
                logger.info(
                    "[{}] adding index [{}], preferred over [{}]",
                    tribeName,
                    tribeIndex.getIndex(),
                    existingFromTribe);
                removeIndex(blocks, metaData, routingTable, tribeIndex);
                addNewIndex(tribeState, blocks, metaData, routingTable, tribeIndex);
            } // else: either the existing one is the preferred one,
              // or we haven't seen one yet, so carry on
            }
          }
        }
      }

      if (!clusterStateChanged) {
        return currentState;
      } else {
        return ClusterState.builder(currentState)
            .incrementVersion()
            .blocks(blocks)
            .nodes(nodes)
            .metaData(metaData)
            .routingTable(routingTable.build())
            .build();
      }
    }
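The on_conflict decision embedded above can be read as a small pure function. This is a sketch for readability only: the constant values ("any", "drop", "prefer_<name>") follow the ON_CONFLICT_* usage in the method, while the enum and helper names are hypothetical.

enum ConflictAction { KEEP_EXISTING, DROP_INDEX, REPLACE_WITH_NEW }

// Decide what to do when two tribes expose an index with the same name.
static ConflictAction resolveConflict(String onConflict, String tribeName) {
  if ("drop".equals(onConflict)) {
    return ConflictAction.DROP_INDEX;
  }
  if (onConflict.startsWith("prefer_")) {
    String preferred = onConflict.substring("prefer_".length());
    return tribeName.equals(preferred)
        ? ConflictAction.REPLACE_WITH_NEW
        : ConflictAction.KEEP_EXISTING;
  }
  // "any" (and anything unrecognized) keeps whichever index arrived first
  return ConflictAction.KEEP_EXISTING;
}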
Example #29
  private MatchingNodes findMatchingNodes(
      ShardRouting shard,
      RoutingAllocation allocation,
      TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryStore,
      AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          data) {
    ObjectLongMap<DiscoveryNode> nodesToSize = new ObjectLongHashMap<>();
    for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
        nodeStoreEntry : data.getData().entrySet()) {
      DiscoveryNode discoNode = nodeStoreEntry.getKey();
      TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData =
          nodeStoreEntry.getValue().storeFilesMetaData();
      if (storeFilesMetaData == null) {
        // already allocated on that node...
        continue;
      }

      RoutingNode node = allocation.routingNodes().node(discoNode.id());
      if (node == null) {
        continue;
      }

      // check if we can allocate on that node...
      // we only check for NO, since if this node is THROTTLING and it has enough "same data"
      // then we will try and assign it next time
      Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
      if (decision.type() == Decision.Type.NO) {
        continue;
      }

      // if it is already allocated, we can't assign to it... (and it might be primary as well)
      if (storeFilesMetaData.allocated()) {
        continue;
      }

      // we don't have any files at all, it is an empty index
      if (storeFilesMetaData.iterator().hasNext() == false) {
        continue;
      }

      String primarySyncId = primaryStore.syncId();
      String replicaSyncId = storeFilesMetaData.syncId();
      // see if we have a sync id we can make use of
      if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) {
        logger.trace(
            "{}: node [{}] has same sync id {} as primary", shard, discoNode.name(), replicaSyncId);
        nodesToSize.put(discoNode, Long.MAX_VALUE);
      } else {
        long sizeMatched = 0;
        for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
          String metaDataFileName = storeFileMetaData.name();
          if (primaryStore.fileExists(metaDataFileName)
              && primaryStore.file(metaDataFileName).isSame(storeFileMetaData)) {
            sizeMatched += storeFileMetaData.length();
          }
        }
        logger.trace(
            "{}: node [{}] has [{}/{}] bytes of re-usable data",
            shard,
            discoNode.name(),
            new ByteSizeValue(sizeMatched),
            sizeMatched);
        nodesToSize.put(discoNode, sizeMatched);
      }
    }

    return new MatchingNodes(nodesToSize);
  }
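A hypothetical sketch of the selection MatchingNodes can make from nodesToSize: the node with the most reusable bytes wins, and the Long.MAX_VALUE set on a sync-id match above trivially dominates any byte count. The helper name is assumed, not taken from the original class.

// Pick the node with the largest amount of reusable store data.
static DiscoveryNode nodeWithHighestMatch(ObjectLongMap<DiscoveryNode> nodesToSize) {
  DiscoveryNode best = null;
  long bestBytes = -1;
  for (ObjectLongCursor<DiscoveryNode> cursor : nodesToSize) {
    if (cursor.value > bestBytes) {
      bestBytes = cursor.value;
      best = cursor.key;
    }
  }
  return best; // null when no candidate node had any reusable data
}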
Example #30
 @Override
 public String nodeDescription() {
   return clusterName.value() + "/" + localNode.id();
 }