Example #1
  @Override
  public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
    if (getTaskFailures() != null && getTaskFailures().size() > 0) {
      builder.startArray("task_failures");
      for (TaskOperationFailure ex : getTaskFailures()) {
        builder.startObject();
        builder.value(ex);
        builder.endObject();
      }
      builder.endArray();
    }

    if (getNodeFailures() != null && getNodeFailures().size() > 0) {
      builder.startArray("node_failures");
      for (FailedNodeException ex : getNodeFailures()) {
        builder.startObject();
        ex.toXContent(builder, params);
        builder.endObject();
      }
      builder.endArray();
    }

    builder.startObject("nodes");
    for (Map.Entry<DiscoveryNode, List<TaskInfo>> entry : getPerNodeTasks().entrySet()) {
      DiscoveryNode node = entry.getKey();
      builder.startObject(node.getId(), XContentBuilder.FieldCaseConversion.NONE);
      builder.field("name", node.name());
      builder.field("transport_address", node.address().toString());
      builder.field("host", node.getHostName());
      builder.field("ip", node.getAddress());

      if (!node.attributes().isEmpty()) {
        builder.startObject("attributes");
        for (ObjectObjectCursor<String, String> attr : node.attributes()) {
          builder.field(attr.key, attr.value, XContentBuilder.FieldCaseConversion.NONE);
        }
        builder.endObject();
      }
      builder.startArray("tasks");
      for (TaskInfo task : entry.getValue()) {
        task.toXContent(builder, params);
      }
      builder.endArray();
      builder.endObject();
    }
    builder.endObject();
    return builder;
  }
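A note on how a builder like this is typically driven: the caller creates the XContentBuilder and owns the enclosing object, and toXContent fills in the fields. A minimal usage sketch, assuming the ES 2.x-era XContent API this example is written against (the `response` variable is hypothetical):

  // Minimal usage sketch; `response` is a hypothetical instance of the class above.
  XContentBuilder builder = XContentFactory.jsonBuilder();
  builder.startObject();
  response.toXContent(builder, ToXContent.EMPTY_PARAMS);
  builder.endObject();
  String json = builder.string(); // JSON with task_failures, node_failures, and nodes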
Example #2
  public boolean allocateUnassigned(RoutingAllocation allocation) {
    boolean changed = false;
    final RoutingNodes routingNodes = allocation.routingNodes();
    final MetaData metaData = routingNodes.metaData();

    final RoutingNodes.UnassignedShards.UnassignedIterator unassignedIterator =
        routingNodes.unassigned().iterator();
    while (unassignedIterator.hasNext()) {
      ShardRouting shard = unassignedIterator.next();
      if (shard.primary()) {
        continue;
      }

      // pre-check if it can be allocated to any node that currently exists, so we won't list the
      // store for it for nothing
      boolean canBeAllocatedToAtLeastOneNode = false;
      for (ObjectCursor<DiscoveryNode> cursor : allocation.nodes().dataNodes().values()) {
        RoutingNode node = routingNodes.node(cursor.value.id());
        if (node == null) {
          continue;
        }
        // if we can't allocate it on a node, ignore it, for example, this handles
        // cases for only allocating a replica after a primary
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.YES) {
          canBeAllocatedToAtLeastOneNode = true;
          break;
        }
      }

      if (!canBeAllocatedToAtLeastOneNode) {
        logger.trace("{}: ignoring allocation, can't be allocated on any node", shard);
        unassignedIterator.removeAndIgnore();
        continue;
      }

      AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          shardStores = fetchData(shard, allocation);
      if (shardStores.hasData() == false) {
        logger.trace("{}: ignoring allocation, still fetching shard stores", shard);
        unassignedIterator.removeAndIgnore();
        continue; // still fetching
      }

      long lastSizeMatched = 0;
      DiscoveryNode lastDiscoNodeMatched = null;
      RoutingNode lastNodeMatched = null;
      boolean hasReplicaData = false;
      IndexMetaData indexMetaData = metaData.index(shard.getIndex());

      for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          nodeStoreEntry : shardStores.getData().entrySet()) {
        DiscoveryNode discoNode = nodeStoreEntry.getKey();
        TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData =
            nodeStoreEntry.getValue().storeFilesMetaData();
        logger.trace("{}: checking node [{}]", shard, discoNode);

        if (storeFilesMetaData == null) {
          // already allocated on that node...
          continue;
        }

        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        // check if we can allocate on that node...
        // we only check for NO, since if this node is THROTTLING and it has enough "same data"
        // then we will try and assign it next time
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.NO) {
          continue;
        }

        // if it is already allocated, we can't assign to it...
        if (storeFilesMetaData.allocated()) {
          continue;
        }

        if (!shard.primary()) {
          hasReplicaData |= storeFilesMetaData.iterator().hasNext();
          ShardRouting primaryShard = routingNodes.activePrimary(shard);
          if (primaryShard != null) {
            assert primaryShard.active();
            DiscoveryNode primaryNode = allocation.nodes().get(primaryShard.currentNodeId());
            if (primaryNode != null) {
              TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData primaryNodeFilesStore =
                  shardStores.getData().get(primaryNode);
              if (primaryNodeFilesStore != null) {
                TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore =
                    primaryNodeFilesStore.storeFilesMetaData();
                if (primaryNodeStore != null && primaryNodeStore.allocated()) {
                  long sizeMatched = 0;

                  String primarySyncId = primaryNodeStore.syncId();
                  String replicaSyncId = storeFilesMetaData.syncId();
                  // see if we have a sync id we can make use of
                  if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) {
                    logger.trace(
                        "{}: node [{}] has same sync id {} as primary",
                        shard,
                        discoNode.name(),
                        replicaSyncId);
                    lastNodeMatched = node;
                    lastSizeMatched = Long.MAX_VALUE;
                    lastDiscoNodeMatched = discoNode;
                  } else {
                    for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
                      String metaDataFileName = storeFileMetaData.name();
                      if (primaryNodeStore.fileExists(metaDataFileName)
                          && primaryNodeStore.file(metaDataFileName).isSame(storeFileMetaData)) {
                        sizeMatched += storeFileMetaData.length();
                      }
                    }
                    logger.trace(
                        "{}: node [{}] has [{}/{}] bytes of re-usable data",
                        shard,
                        discoNode.name(),
                        new ByteSizeValue(sizeMatched),
                        sizeMatched);
                    if (sizeMatched > lastSizeMatched) {
                      lastSizeMatched = sizeMatched;
                      lastDiscoNodeMatched = discoNode;
                      lastNodeMatched = node;
                    }
                  }
                }
              }
            }
          }
        }
      }

      if (lastNodeMatched != null) {
        // we only check on THROTTLE since we already checked on NO before
        Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we are throttling this, but we have enough to allocate to this node, ignore it for now
          unassignedIterator.removeAndIgnore();
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we found a match
          changed = true;
          unassignedIterator.initialize(lastNodeMatched.nodeId());
        }
      } else if (hasReplicaData == false) {
        // if we didn't manage to find *any* data (regardless of matching sizes), check if the
        // allocation of the replica shard needs to be delayed, and if so, add it to the
        // ignore unassigned list
        // note: we only care about replicas in delayed allocation, since an unassigned primary
        // will anyhow wait to find an existing copy of the shard to be allocated
        // note: the other side of the equation is scheduling a reroute in a timely manner,
        // which happens in the RoutingService
        long delay =
            shard
                .unassignedInfo()
                .getDelayAllocationExpirationIn(settings, indexMetaData.getSettings());
        if (delay > 0) {
          logger.debug(
              "[{}][{}]: delaying allocation of [{}] for [{}]",
              shard.index(),
              shard.id(),
              shard,
              TimeValue.timeValueMillis(delay));
          /*
           * mark it as changed, since we want to kick a publishing to schedule future allocation,
           * see {@link
           * org.elasticsearch.cluster.routing.RoutingService#clusterChanged(ClusterChangedEvent)}.
           */
          changed = true;
          unassignedIterator.removeAndIgnore();
        }
      }
    }
    return changed;
  }
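The heart of the matching loop above is a two-tier score: a sync id match counts as Long.MAX_VALUE reusable bytes, and otherwise identical files are summed byte for byte. A self-contained restatement of that rule with simplified stand-in types (plain maps of file name to length instead of StoreFilesMetaData, and file identity reduced to equal lengths, where the real code compares checksums via isSame):

import java.util.Map;

// Illustrative restatement of the replica match score; not the Elasticsearch types.
final class MatchScore {
  static long score(String primarySyncId, String replicaSyncId,
                    Map<String, Long> primaryFiles, Map<String, Long> replicaFiles) {
    if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) {
      return Long.MAX_VALUE; // sync id match: the whole store is reusable
    }
    long sizeMatched = 0;
    for (Map.Entry<String, Long> file : replicaFiles.entrySet()) {
      Long primaryLength = primaryFiles.get(file.getKey());
      if (primaryLength != null && primaryLength.equals(file.getValue())) {
        sizeMatched += file.getValue(); // identical file: those bytes need no recovery
      }
    }
    return sizeMatched;
  }
}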
Example #3
  private Table buildTable(
      RestRequest req,
      ClusterStateResponse state,
      NodesInfoResponse nodesInfo,
      NodesStatsResponse nodesStats) {
    boolean fullId = req.paramAsBoolean("full_id", false);

    DiscoveryNodes nodes = state.getState().nodes();
    String masterId = nodes.masterNodeId();
    Table table = getTableWithHeader(req);

    for (DiscoveryNode node : nodes) {
      NodeInfo info = nodesInfo.getNodesMap().get(node.id());
      NodeStats stats = nodesStats.getNodesMap().get(node.id());

      JvmInfo jvmInfo = info == null ? null : info.getJvm();
      JvmStats jvmStats = stats == null ? null : stats.getJvm();
      FsInfo fsInfo = stats == null ? null : stats.getFs();
      OsStats osStats = stats == null ? null : stats.getOs();
      ProcessStats processStats = stats == null ? null : stats.getProcess();
      NodeIndicesStats indicesStats = stats == null ? null : stats.getIndices();

      table.startRow();

      table.addCell(fullId ? node.id() : Strings.substring(node.getId(), 0, 4));
      table.addCell(info == null ? null : info.getProcess().getId());
      table.addCell(node.getHostName());
      table.addCell(node.getHostAddress());
      if (node.address() instanceof InetSocketTransportAddress) {
        table.addCell(((InetSocketTransportAddress) node.address()).address().getPort());
      } else {
        table.addCell("-");
      }

      table.addCell(node.getVersion().number());
      table.addCell(info == null ? null : info.getBuild().shortHash());
      table.addCell(jvmInfo == null ? null : jvmInfo.version());
      table.addCell(fsInfo == null ? null : fsInfo.getTotal().getAvailable());
      table.addCell(jvmStats == null ? null : jvmStats.getMem().getHeapUsed());
      table.addCell(jvmStats == null ? null : jvmStats.getMem().getHeapUsedPercent());
      table.addCell(jvmInfo == null ? null : jvmInfo.getMem().getHeapMax());
      table.addCell(
          osStats == null ? null : osStats.getMem() == null ? null : osStats.getMem().getUsed());
      table.addCell(
          osStats == null
              ? null
              : osStats.getMem() == null ? null : osStats.getMem().getUsedPercent());
      table.addCell(
          osStats == null ? null : osStats.getMem() == null ? null : osStats.getMem().getTotal());
      table.addCell(processStats == null ? null : processStats.getOpenFileDescriptors());
      table.addCell(
          processStats == null
              ? null
              : calculatePercentage(
                  processStats.getOpenFileDescriptors(), processStats.getMaxFileDescriptors()));
      table.addCell(processStats == null ? null : processStats.getMaxFileDescriptors());

      table.addCell(
          osStats == null ? null : String.format(Locale.ROOT, "%.2f", osStats.getLoadAverage()));
      table.addCell(jvmStats == null ? null : jvmStats.getUptime());
      table.addCell(node.clientNode() ? "c" : node.dataNode() ? "d" : "-");
      table.addCell(
          masterId == null
              ? "x"
              : masterId.equals(node.id()) ? "*" : node.masterNode() ? "m" : "-");
      table.addCell(node.name());

      CompletionStats completionStats =
          indicesStats == null ? null : indicesStats.getCompletion();
      table.addCell(completionStats == null ? null : completionStats.getSize());

      FieldDataStats fdStats = indicesStats == null ? null : indicesStats.getFieldData();
      table.addCell(fdStats == null ? null : fdStats.getMemorySize());
      table.addCell(fdStats == null ? null : fdStats.getEvictions());

      QueryCacheStats fcStats = indicesStats == null ? null : indicesStats.getQueryCache();
      table.addCell(fcStats == null ? null : fcStats.getMemorySize());
      table.addCell(fcStats == null ? null : fcStats.getEvictions());

      RequestCacheStats qcStats = indicesStats == null ? null : indicesStats.getRequestCache();
      table.addCell(qcStats == null ? null : qcStats.getMemorySize());
      table.addCell(qcStats == null ? null : qcStats.getEvictions());
      table.addCell(qcStats == null ? null : qcStats.getHitCount());
      table.addCell(qcStats == null ? null : qcStats.getMissCount());

      FlushStats flushStats = indicesStats == null ? null : indicesStats.getFlush();
      table.addCell(flushStats == null ? null : flushStats.getTotal());
      table.addCell(flushStats == null ? null : flushStats.getTotalTime());

      GetStats getStats = indicesStats == null ? null : indicesStats.getGet();
      table.addCell(getStats == null ? null : getStats.current());
      table.addCell(getStats == null ? null : getStats.getTime());
      table.addCell(getStats == null ? null : getStats.getCount());
      table.addCell(getStats == null ? null : getStats.getExistsTime());
      table.addCell(getStats == null ? null : getStats.getExistsCount());
      table.addCell(getStats == null ? null : getStats.getMissingTime());
      table.addCell(getStats == null ? null : getStats.getMissingCount());

      IndexingStats indexingStats = indicesStats == null ? null : indicesStats.getIndexing();
      table.addCell(indexingStats == null ? null : indexingStats.getTotal().getDeleteCurrent());
      table.addCell(indexingStats == null ? null : indexingStats.getTotal().getDeleteTime());
      table.addCell(indexingStats == null ? null : indexingStats.getTotal().getDeleteCount());
      table.addCell(indexingStats == null ? null : indexingStats.getTotal().getIndexCurrent());
      table.addCell(indexingStats == null ? null : indexingStats.getTotal().getIndexTime());
      table.addCell(indexingStats == null ? null : indexingStats.getTotal().getIndexCount());
      table.addCell(indexingStats == null ? null : indexingStats.getTotal().getIndexFailedCount());

      MergeStats mergeStats = indicesStats == null ? null : indicesStats.getMerge();
      table.addCell(mergeStats == null ? null : mergeStats.getCurrent());
      table.addCell(mergeStats == null ? null : mergeStats.getCurrentNumDocs());
      table.addCell(mergeStats == null ? null : mergeStats.getCurrentSize());
      table.addCell(mergeStats == null ? null : mergeStats.getTotal());
      table.addCell(mergeStats == null ? null : mergeStats.getTotalNumDocs());
      table.addCell(mergeStats == null ? null : mergeStats.getTotalSize());
      table.addCell(mergeStats == null ? null : mergeStats.getTotalTime());

      PercolateStats percolateStats = indicesStats == null ? null : indicesStats.getPercolate();
      table.addCell(percolateStats == null ? null : percolateStats.getCurrent());
      table.addCell(percolateStats == null ? null : percolateStats.getMemorySize());
      table.addCell(percolateStats == null ? null : percolateStats.getNumQueries());
      table.addCell(percolateStats == null ? null : percolateStats.getTime());
      table.addCell(percolateStats == null ? null : percolateStats.getCount());

      RefreshStats refreshStats = indicesStats == null ? null : indicesStats.getRefresh();
      table.addCell(refreshStats == null ? null : refreshStats.getTotal());
      table.addCell(refreshStats == null ? null : refreshStats.getTotalTime());

      ScriptStats scriptStats = stats == null ? null : stats.getScriptStats();
      table.addCell(scriptStats == null ? null : scriptStats.getCompilations());
      table.addCell(scriptStats == null ? null : scriptStats.getCacheEvictions());

      SearchStats searchStats = indicesStats == null ? null : indicesStats.getSearch();
      table.addCell(searchStats == null ? null : searchStats.getTotal().getFetchCurrent());
      table.addCell(searchStats == null ? null : searchStats.getTotal().getFetchTime());
      table.addCell(searchStats == null ? null : searchStats.getTotal().getFetchCount());
      table.addCell(searchStats == null ? null : searchStats.getOpenContexts());
      table.addCell(searchStats == null ? null : searchStats.getTotal().getQueryCurrent());
      table.addCell(searchStats == null ? null : searchStats.getTotal().getQueryTime());
      table.addCell(searchStats == null ? null : searchStats.getTotal().getQueryCount());
      table.addCell(searchStats == null ? null : searchStats.getTotal().getScrollCurrent());
      table.addCell(searchStats == null ? null : searchStats.getTotal().getScrollTime());
      table.addCell(searchStats == null ? null : searchStats.getTotal().getScrollCount());

      SegmentsStats segmentsStats = indicesStats == null ? null : indicesStats.getSegments();
      table.addCell(segmentsStats == null ? null : segmentsStats.getCount());
      table.addCell(segmentsStats == null ? null : segmentsStats.getMemory());
      table.addCell(segmentsStats == null ? null : segmentsStats.getIndexWriterMemory());
      table.addCell(segmentsStats == null ? null : segmentsStats.getIndexWriterMaxMemory());
      table.addCell(segmentsStats == null ? null : segmentsStats.getVersionMapMemory());
      table.addCell(segmentsStats == null ? null : segmentsStats.getBitsetMemory());

      SuggestStats suggestStats = indicesStats == null ? null : indicesStats.getSuggest();
      table.addCell(suggestStats == null ? null : suggestStats.getCurrent());
      table.addCell(suggestStats == null ? null : suggestStats.getTime());
      table.addCell(suggestStats == null ? null : suggestStats.getCount());

      table.endRow();
    }

    return table;
  }
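Nearly every cell above repeats the `x == null ? null : x.getY()` guard. If a refactor were wanted, a generic null-safe accessor would collapse those ternaries; a sketch of the idea (illustrative only, not the upstream code, and it assumes Java 8's java.util.function.Function):

import java.util.function.Function;

// Null-safe accessor sketch: collapses the repeated `stats == null ? null : stats.getX()` guards.
final class Cells {
  static <T, R> R nullSafe(T value, Function<T, R> getter) {
    return value == null ? null : getter.apply(value);
  }
}

// usage: table.addCell(Cells.nullSafe(jvmStats, s -> s.getUptime()));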
Example #4
  public boolean allocateUnassigned(RoutingAllocation allocation) {
    boolean changed = false;
    DiscoveryNodes nodes = allocation.nodes();
    RoutingNodes routingNodes = allocation.routingNodes();

    // First, handle primaries, they must find a place to be allocated on here
    final MetaData metaData = routingNodes.metaData();
    RoutingNodes.UnassignedShards unassigned = routingNodes.unassigned();
    unassigned.sort(
        new PriorityComparator() {

          @Override
          protected Settings getIndexSettings(String index) {
            IndexMetaData indexMetaData = metaData.index(index);
            return indexMetaData.getSettings();
          }
        }); // sort for priority ordering
    Iterator<ShardRouting> unassignedIterator = unassigned.iterator();
    while (unassignedIterator.hasNext()) {
      ShardRouting shard = unassignedIterator.next();

      if (!shard.primary()) {
        continue;
      }

      // this is an API allocation, ignore since we know there is no data...
      if (!routingNodes
          .routingTable()
          .index(shard.index())
          .shard(shard.id())
          .primaryAllocatedPostApi()) {
        continue;
      }

      AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetch =
          asyncFetchStarted.get(shard.shardId());
      if (fetch == null) {
        fetch = new InternalAsyncFetch<>(logger, "shard_started", shard.shardId(), startedAction);
        asyncFetchStarted.put(shard.shardId(), fetch);
      }
      AsyncShardFetch.FetchResult<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards>
          shardState = fetch.fetchData(nodes, metaData, allocation.getIgnoreNodes(shard.shardId()));
      if (shardState.hasData() == false) {
        logger.trace("{}: ignoring allocation, still fetching shard started state", shard);
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
        continue;
      }
      shardState.processAllocation(allocation);

      IndexMetaData indexMetaData = metaData.index(shard.getIndex());

      /*
       * Build a map of DiscoveryNodes to shard state number for the given shard. A state of -1
       * means the shard does not exist on the node, where any shard state >= 0 is the state version
       * of the shard on that node's disk.
       *
       * <p>A shard on shared storage will return at least shard state 0 for all nodes, indicating
       * that the shard can be allocated to any node.
       */
      ObjectLongHashMap<DiscoveryNode> nodesState = new ObjectLongHashMap<>();
      for (TransportNodesListGatewayStartedShards.NodeGatewayStartedShards nodeShardState :
          shardState.getData().values()) {
        long version = nodeShardState.version();
        // a version of -1 means the shard does not exist on that node, which is what the API
        // returns and what we expect
        logger.trace(
            "[{}] on node [{}] has version [{}] of shard",
            shard,
            nodeShardState.getNode(),
            version);
        nodesState.put(nodeShardState.getNode(), version);
      }

      int numberOfAllocationsFound = 0;
      long highestVersion = -1;
      final Map<DiscoveryNode, Long> nodesWithVersion = Maps.newHashMap();

      assert !nodesState.containsKey(null);
      final Object[] keys = nodesState.keys;
      final long[] values = nodesState.values;
      Settings idxSettings = indexMetaData.settings();
      for (int i = 0; i < keys.length; i++) {
        if (keys[i] == null) {
          continue;
        }

        DiscoveryNode node = (DiscoveryNode) keys[i];
        long version = values[i];
        // since we don't check in NO allocation, we need to double check here
        if (allocation.shouldIgnoreShardForNode(shard.shardId(), node.id())) {
          continue;
        }
        if (recoverOnAnyNode(idxSettings)) {
          numberOfAllocationsFound++;
          if (version > highestVersion) {
            highestVersion = version;
          }
          // We always put the node without clearing the map
          nodesWithVersion.put(node, version);
        } else if (version != -1) {
          numberOfAllocationsFound++;
          // If we've found a new "best" candidate, clear the
          // current candidates and add it
          if (version > highestVersion) {
            highestVersion = version;
            nodesWithVersion.clear();
            nodesWithVersion.put(node, version);
          } else if (version == highestVersion) {
            // If the candidate is the same, add it to the
            // list, but keep the current candidate
            nodesWithVersion.put(node, version);
          }
        }
      }
      // Now that we have a map of nodes to versions along with the
      // number of allocations found (and not ignored), we need to sort
      // it so the node with the highest version is at the beginning
      List<DiscoveryNode> nodesWithHighestVersion = Lists.newArrayList();
      nodesWithHighestVersion.addAll(nodesWithVersion.keySet());
      CollectionUtil.timSort(
          nodesWithHighestVersion,
          new Comparator<DiscoveryNode>() {
            @Override
            public int compare(DiscoveryNode o1, DiscoveryNode o2) {
              return Long.compare(nodesWithVersion.get(o2), nodesWithVersion.get(o1));
            }
          });

      if (logger.isDebugEnabled()) {
        logger.debug(
            "[{}][{}] found {} allocations of {}, highest version: [{}]",
            shard.index(),
            shard.id(),
            numberOfAllocationsFound,
            shard,
            highestVersion);
      }
      if (logger.isTraceEnabled()) {
        StringBuilder sb = new StringBuilder("[");
        for (DiscoveryNode n : nodesWithHighestVersion) {
          sb.append("[");
          sb.append(n.getName());
          sb.append("]");
          sb.append(" -> ");
          sb.append(nodesWithVersion.get(n));
          sb.append(", ");
        }
        sb.append("]");
        logger.trace("{} candidates for allocation: {}", shard, sb.toString());
      }

      // check if the count meets the required minimum
      int requiredAllocation = 1;
      // if we restore from a repository, one copy is more than enough
      if (shard.restoreSource() == null) {
        try {
          String initialShards =
              indexMetaData
                  .settings()
                  .get(
                      INDEX_RECOVERY_INITIAL_SHARDS,
                      settings.get(INDEX_RECOVERY_INITIAL_SHARDS, this.initialShards));
          if ("quorum".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 1) {
              requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2) + 1;
            }
          } else if ("quorum-1".equals(initialShards) || "half".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 2) {
              requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2);
            }
          } else if ("one".equals(initialShards)) {
            requiredAllocation = 1;
          } else if ("full".equals(initialShards) || "all".equals(initialShards)) {
            requiredAllocation = indexMetaData.numberOfReplicas() + 1;
          } else if ("full-1".equals(initialShards) || "all-1".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 1) {
              requiredAllocation = indexMetaData.numberOfReplicas();
            }
          } else {
            requiredAllocation = Integer.parseInt(initialShards);
          }
        } catch (Exception e) {
          logger.warn(
              "[{}][{}] failed to derived initial_shards from value {}, ignore allocation for {}",
              shard.index(),
              shard.id(),
              initialShards,
              shard);
        }
      }

      // not enough found for this shard, continue...
      if (numberOfAllocationsFound < requiredAllocation) {
        // if we are restoring this shard we still can allocate
        if (shard.restoreSource() == null) {
          // we can't really allocate, so ignore it and continue
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: not allocating, number_of_allocated_shards_found [{}], required_number [{}]",
                shard.index(),
                shard.id(),
                numberOfAllocationsFound,
                requiredAllocation);
          }
        } else if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: missing local data, will restore from [{}]",
              shard.index(),
              shard.id(),
              shard.restoreSource());
        }
        continue;
      }

      Set<DiscoveryNode> throttledNodes = Sets.newHashSet();
      Set<DiscoveryNode> noNodes = Sets.newHashSet();
      for (DiscoveryNode discoNode : nodesWithHighestVersion) {
        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          throttledNodes.add(discoNode);
        } else if (decision.type() == Decision.Type.NO) {
          noNodes.add(discoNode);
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] on primary allocation",
                shard.index(),
                shard.id(),
                shard,
                discoNode);
          }
          // we found a match
          changed = true;
          // make sure we create one with the version from the recovered state
          routingNodes.initialize(new ShardRouting(shard, highestVersion), node.nodeId());
          unassignedIterator.remove();

          // found a node, so no throttling, no "no", and break out of the loop
          throttledNodes.clear();
          noNodes.clear();
          break;
        }
      }
      if (throttledNodes.isEmpty()) {
        // if we have a node that we "can't" allocate to, force allocation, since this is our master
        // data!
        if (!noNodes.isEmpty()) {
          DiscoveryNode discoNode = noNodes.iterator().next();
          RoutingNode node = routingNodes.node(discoNode.id());
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: forcing allocating [{}] to [{}] on primary allocation",
                shard.index(),
                shard.id(),
                shard,
                discoNode);
          }
          // we found a match
          changed = true;
          // make sure we create one with the version from the recovered state
          routingNodes.initialize(new ShardRouting(shard, highestVersion), node.nodeId());
          unassignedIterator.remove();
        }
      } else {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: throttling allocation [{}] to [{}] on primary allocation",
              shard.index(),
              shard.id(),
              shard,
              throttledNodes);
        }
        // we are throttling this, but we have enough to allocate to this node, ignore it for now
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
      }
    }

    if (!routingNodes.hasUnassigned()) {
      return changed;
    }

    // Now, handle replicas, try to assign them to nodes that are similar to the one the primary was
    // allocated on
    unassignedIterator = unassigned.iterator();
    while (unassignedIterator.hasNext()) {
      ShardRouting shard = unassignedIterator.next();
      if (shard.primary()) {
        continue;
      }

      // pre-check if it can be allocated to any node that currently exists, so we won't list the
      // store for it for nothing
      boolean canBeAllocatedToAtLeastOneNode = false;
      for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
        RoutingNode node = routingNodes.node(cursor.value.id());
        if (node == null) {
          continue;
        }
        // if we can't allocate it on a node, ignore it, for example, this handles
        // cases for only allocating a replica after a primary
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.YES) {
          canBeAllocatedToAtLeastOneNode = true;
          break;
        }
      }

      if (!canBeAllocatedToAtLeastOneNode) {
        logger.trace("{}: ignoring allocation, can't be allocated on any node", shard);
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
        continue;
      }

      AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> fetch =
          asyncFetchStore.get(shard.shardId());
      if (fetch == null) {
        fetch = new InternalAsyncFetch<>(logger, "shard_store", shard.shardId(), storeAction);
        asyncFetchStore.put(shard.shardId(), fetch);
      }
      AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          shardStores =
              fetch.fetchData(nodes, metaData, allocation.getIgnoreNodes(shard.shardId()));
      if (shardStores.hasData() == false) {
        logger.trace("{}: ignoring allocation, still fetching shard stores", shard);
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
        continue; // still fetching
      }
      shardStores.processAllocation(allocation);

      long lastSizeMatched = 0;
      DiscoveryNode lastDiscoNodeMatched = null;
      RoutingNode lastNodeMatched = null;
      boolean hasReplicaData = false;
      IndexMetaData indexMetaData = metaData.index(shard.getIndex());

      for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          nodeStoreEntry : shardStores.getData().entrySet()) {
        DiscoveryNode discoNode = nodeStoreEntry.getKey();
        TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData =
            nodeStoreEntry.getValue().storeFilesMetaData();
        logger.trace("{}: checking node [{}]", shard, discoNode);

        if (storeFilesMetaData == null) {
          // already allocated on that node...
          continue;
        }

        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        // check if we can allocate on that node...
        // we only check for NO, since if this node is THROTTLING and it has enough "same data"
        // then we will try and assign it next time
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.NO) {
          continue;
        }

        // if it is already allocated, we can't assign to it...
        if (storeFilesMetaData.allocated()) {
          continue;
        }

        if (!shard.primary()) {
          hasReplicaData |= storeFilesMetaData.iterator().hasNext();
          ShardRouting primaryShard = routingNodes.activePrimary(shard);
          if (primaryShard != null) {
            assert primaryShard.active();
            DiscoveryNode primaryNode = nodes.get(primaryShard.currentNodeId());
            if (primaryNode != null) {
              TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData primaryNodeFilesStore =
                  shardStores.getData().get(primaryNode);
              if (primaryNodeFilesStore != null) {
                TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore =
                    primaryNodeFilesStore.storeFilesMetaData();
                if (primaryNodeStore != null && primaryNodeStore.allocated()) {
                  long sizeMatched = 0;

                  String primarySyncId = primaryNodeStore.syncId();
                  String replicaSyncId = storeFilesMetaData.syncId();
                  // see if we have a sync id we can make use of
                  if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) {
                    logger.trace(
                        "{}: node [{}] has same sync id {} as primary",
                        shard,
                        discoNode.name(),
                        replicaSyncId);
                    lastNodeMatched = node;
                    lastSizeMatched = Long.MAX_VALUE;
                    lastDiscoNodeMatched = discoNode;
                  } else {
                    for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
                      String metaDataFileName = storeFileMetaData.name();
                      if (primaryNodeStore.fileExists(metaDataFileName)
                          && primaryNodeStore.file(metaDataFileName).isSame(storeFileMetaData)) {
                        sizeMatched += storeFileMetaData.length();
                      }
                    }
                    logger.trace(
                        "{}: node [{}] has [{}/{}] bytes of re-usable data",
                        shard,
                        discoNode.name(),
                        new ByteSizeValue(sizeMatched),
                        sizeMatched);
                    if (sizeMatched > lastSizeMatched) {
                      lastSizeMatched = sizeMatched;
                      lastDiscoNodeMatched = discoNode;
                      lastNodeMatched = node;
                    }
                  }
                }
              }
            }
          }
        }
      }

      if (lastNodeMatched != null) {
        // we only check on THROTTLE since we already checked on NO before
        Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we are throttling this, but we have enough to allocate to this node, ignore it for now
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we found a match
          changed = true;
          routingNodes.initialize(shard, lastNodeMatched.nodeId());
          unassignedIterator.remove();
        }
      } else if (hasReplicaData == false) {
        // if we didn't manage to find *any* data (regardless of matching sizes), check if the
        // allocation of the replica shard needs to be delayed, and if so, add it to the
        // ignore unassigned list
        // note: we only care about replicas in delayed allocation, since an unassigned primary
        // will anyhow wait to find an existing copy of the shard to be allocated
        // note: the other side of the equation is scheduling a reroute in a timely manner,
        // which happens in the RoutingService
        long delay =
            shard
                .unassignedInfo()
                .getDelayAllocationExpirationIn(settings, indexMetaData.getSettings());
        if (delay > 0) {
          logger.debug(
              "[{}][{}]: delaying allocation of [{}] for [{}]",
              shard.index(),
              shard.id(),
              shard,
              TimeValue.timeValueMillis(delay));
          /*
           * mark it as changed, since we want to kick a publishing to schedule future allocation,
           * see {@link
           * org.elasticsearch.cluster.routing.RoutingService#clusterChanged(ClusterChangedEvent)}.
           */
          changed = true;
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
        }
      }
    }
    return changed;
  }
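The initial_shards block above maps a setting string to the number of shard copies that must be found before a primary is allocated. The same mapping as a stand-alone sketch, taking numberOfReplicas as a plain int (illustrative; the real code reads both values from the index settings):

// Stand-alone restatement of the initial_shards -> requiredAllocation mapping above.
final class InitialShards {
  static int requiredAllocation(String initialShards, int numberOfReplicas) {
    int copies = 1 + numberOfReplicas; // primary plus replicas
    if ("quorum".equals(initialShards)) {
      return numberOfReplicas > 1 ? copies / 2 + 1 : 1;
    } else if ("quorum-1".equals(initialShards) || "half".equals(initialShards)) {
      return numberOfReplicas > 2 ? copies / 2 : 1;
    } else if ("one".equals(initialShards)) {
      return 1;
    } else if ("full".equals(initialShards) || "all".equals(initialShards)) {
      return copies;
    } else if ("full-1".equals(initialShards) || "all-1".equals(initialShards)) {
      return numberOfReplicas > 1 ? numberOfReplicas : 1;
    }
    return Integer.parseInt(initialShards); // numeric values are taken verbatim
  }
}
// e.g. requiredAllocation("quorum", 2) == 2: two of the three copies must be found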
Example #5
    private ClusterState applyUpdate(ClusterState currentState, ClusterChangedEvent task) {
      boolean clusterStateChanged = false;
      ClusterState tribeState = task.state();
      DiscoveryNodes.Builder nodes = DiscoveryNodes.builder(currentState.nodes());
      // -- merge nodes
      // go over existing nodes, and see if they need to be removed
      for (DiscoveryNode discoNode : currentState.nodes()) {
        String markedTribeName = discoNode.attributes().get(TRIBE_NAME);
        if (markedTribeName != null && markedTribeName.equals(tribeName)) {
          if (tribeState.nodes().get(discoNode.id()) == null) {
            clusterStateChanged = true;
            logger.info("[{}] removing node [{}]", tribeName, discoNode);
            nodes.remove(discoNode.id());
          }
        }
      }
      // go over tribe nodes, and see if they need to be added
      for (DiscoveryNode tribe : tribeState.nodes()) {
        if (currentState.nodes().get(tribe.id()) == null) {
          // a new node, add it, but also add the tribe name to the attributes
          Map<String, String> tribeAttr = new HashMap<>();
          for (ObjectObjectCursor<String, String> attr : tribe.attributes()) {
            tribeAttr.put(attr.key, attr.value);
          }
          tribeAttr.put(TRIBE_NAME, tribeName);
          DiscoveryNode discoNode =
              new DiscoveryNode(
                  tribe.name(),
                  tribe.id(),
                  tribe.getHostName(),
                  tribe.getHostAddress(),
                  tribe.address(),
                  unmodifiableMap(tribeAttr),
                  tribe.version());
          clusterStateChanged = true;
          logger.info("[{}] adding node [{}]", tribeName, discoNode);
          nodes.put(discoNode);
        }
      }

      // -- merge metadata
      ClusterBlocks.Builder blocks = ClusterBlocks.builder().blocks(currentState.blocks());
      MetaData.Builder metaData = MetaData.builder(currentState.metaData());
      RoutingTable.Builder routingTable = RoutingTable.builder(currentState.routingTable());
      // go over existing indices, and see if they need to be removed
      for (IndexMetaData index : currentState.metaData()) {
        String markedTribeName = index.getSettings().get(TRIBE_NAME);
        if (markedTribeName != null && markedTribeName.equals(tribeName)) {
          IndexMetaData tribeIndex = tribeState.metaData().index(index.getIndex());
          clusterStateChanged = true;
          if (tribeIndex == null || tribeIndex.getState() == IndexMetaData.State.CLOSE) {
            logger.info("[{}] removing index [{}]", tribeName, index.getIndex());
            removeIndex(blocks, metaData, routingTable, index);
          } else {
            // always make sure to update the metadata and routing table, in case
            // there are changes in them (new mapping, shards moving from initializing to started)
            routingTable.add(tribeState.routingTable().index(index.getIndex()));
            Settings tribeSettings =
                Settings.builder().put(tribeIndex.getSettings()).put(TRIBE_NAME, tribeName).build();
            metaData.put(IndexMetaData.builder(tribeIndex).settings(tribeSettings));
          }
        }
      }
      // go over the tribe indices, and see if they need to be added
      for (IndexMetaData tribeIndex : tribeState.metaData()) {
        // if there is no routing table yet, do nothing with it...
        IndexRoutingTable table = tribeState.routingTable().index(tribeIndex.getIndex());
        if (table == null) {
          continue;
        }
        final IndexMetaData indexMetaData = currentState.metaData().index(tribeIndex.getIndex());
        if (indexMetaData == null) {
          if (!droppedIndices.contains(tribeIndex.getIndex())) {
            // a new index, add it, and add the tribe name as a setting
            clusterStateChanged = true;
            logger.info("[{}] adding index [{}]", tribeName, tribeIndex.getIndex());
            addNewIndex(tribeState, blocks, metaData, routingTable, tribeIndex);
          }
        } else {
          String existingFromTribe = indexMetaData.getSettings().get(TRIBE_NAME);
          if (!tribeName.equals(existingFromTribe)) {
            // we have a potential conflict on index names, decide what to do...
            if (ON_CONFLICT_ANY.equals(onConflict)) {
              // we chose any tribe, carry on
            } else if (ON_CONFLICT_DROP.equals(onConflict)) {
              // drop the indices, there is a conflict
              clusterStateChanged = true;
              logger.info(
                  "[{}] dropping index [{}] due to conflict with [{}]",
                  tribeName,
                  tribeIndex.getIndex(),
                  existingFromTribe);
              removeIndex(blocks, metaData, routingTable, tribeIndex);
              droppedIndices.add(tribeIndex.getIndex());
            } else if (onConflict.startsWith(ON_CONFLICT_PREFER)) {
              // on conflict, prefer a tribe...
              String preferredTribeName = onConflict.substring(ON_CONFLICT_PREFER.length());
              if (tribeName.equals(preferredTribeName)) {
                // the new one is the preferred one, replace...
                clusterStateChanged = true;
                logger.info(
                    "[{}] adding index [{}], preferred over [{}]",
                    tribeName,
                    tribeIndex.getIndex(),
                    existingFromTribe);
                removeIndex(blocks, metaData, routingTable, tribeIndex);
                addNewIndex(tribeState, blocks, metaData, routingTable, tribeIndex);
              } // else: either the existing one is the preferred one, or we haven't seen one; carry on
            }
          }
        }
      }

      if (!clusterStateChanged) {
        return currentState;
      } else {
        return ClusterState.builder(currentState)
            .incrementVersion()
            .blocks(blocks)
            .nodes(nodes)
            .metaData(metaData)
            .routingTable(routingTable.build())
            .build();
      }
    }
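The on_conflict branches above encode three policies for an index name that appears in more than one tribe: keep whichever tribe registered it first ("any"), drop the index ("drop"), or keep the copy from a named tribe ("prefer_<name>"). A compact restatement; the literal constant values are an assumption here, taken from the tribe node documentation of this era:

// Compact restatement of the on_conflict policies above; constant values are assumed.
final class TribeConflicts {
  static String resolveConflict(String onConflict, String newTribe, String existingTribe) {
    if ("any".equals(onConflict)) {
      return existingTribe; // first tribe to register the index wins
    } else if ("drop".equals(onConflict)) {
      return null; // conflicting index is dropped entirely
    } else if (onConflict.startsWith("prefer_")) {
      String preferred = onConflict.substring("prefer_".length());
      return newTribe.equals(preferred) ? newTribe : existingTribe;
    }
    return existingTribe;
  }
}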
  private MatchingNodes findMatchingNodes(
      ShardRouting shard,
      RoutingAllocation allocation,
      TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryStore,
      AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          data) {
    ObjectLongMap<DiscoveryNode> nodesToSize = new ObjectLongHashMap<>();
    for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
        nodeStoreEntry : data.getData().entrySet()) {
      DiscoveryNode discoNode = nodeStoreEntry.getKey();
      TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData =
          nodeStoreEntry.getValue().storeFilesMetaData();
      if (storeFilesMetaData == null) {
        // already allocated on that node...
        continue;
      }

      RoutingNode node = allocation.routingNodes().node(discoNode.id());
      if (node == null) {
        continue;
      }

      // check if we can allocate on that node...
      // we only check for NO, since if this node is THROTTLING and it has enough "same data"
      // then we will try and assign it next time
      Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
      if (decision.type() == Decision.Type.NO) {
        continue;
      }

      // if it is already allocated, we can't assign to it... (and it might be primary as well)
      if (storeFilesMetaData.allocated()) {
        continue;
      }

      // we don't have any files at all, it is an empty index
      if (storeFilesMetaData.iterator().hasNext() == false) {
        continue;
      }

      String primarySyncId = primaryStore.syncId();
      String replicaSyncId = storeFilesMetaData.syncId();
      // see if we have a sync id we can make use of
      if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) {
        logger.trace(
            "{}: node [{}] has same sync id {} as primary", shard, discoNode.name(), replicaSyncId);
        nodesToSize.put(discoNode, Long.MAX_VALUE);
      } else {
        long sizeMatched = 0;
        for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
          String metaDataFileName = storeFileMetaData.name();
          if (primaryStore.fileExists(metaDataFileName)
              && primaryStore.file(metaDataFileName).isSame(storeFileMetaData)) {
            sizeMatched += storeFileMetaData.length();
          }
        }
        logger.trace(
            "{}: node [{}] has [{}/{}] bytes of re-usable data",
            shard,
            discoNode.name(),
            new ByteSizeValue(sizeMatched),
            sizeMatched);
        nodesToSize.put(discoNode, sizeMatched);
      }
    }

    return new MatchingNodes(nodesToSize);
  }
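findMatchingNodes hands the node-to-matched-bytes map to MatchingNodes, and the caller picks the best candidate from it. A minimal sketch of such a holder, reusing the DiscoveryNode type from the surrounding code and a plain java.util.Map in place of the HPPC ObjectLongMap (the real class has more behavior; this shows only the ranking idea):

import java.util.Map;

// Minimal illustrative holder for the node -> matched-bytes map built above.
final class MatchingNodesSketch {
  private final Map<DiscoveryNode, Long> nodesToSize;
  private DiscoveryNode nodeWithHighestMatch;

  MatchingNodesSketch(Map<DiscoveryNode, Long> nodesToSize) {
    this.nodesToSize = nodesToSize;
    long best = -1;
    for (Map.Entry<DiscoveryNode, Long> entry : nodesToSize.entrySet()) {
      if (entry.getValue() > best) { // Long.MAX_VALUE (sync id match) always wins
        best = entry.getValue();
        nodeWithHighestMatch = entry.getKey();
      }
    }
  }

  DiscoveryNode getNodeWithHighestMatch() { return nodeWithHighestMatch; }
  boolean hasAnyData() { return !nodesToSize.isEmpty(); }
}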
  public boolean allocateUnassigned(RoutingAllocation allocation) {
    boolean changed = false;
    DiscoveryNodes nodes = allocation.nodes();
    RoutingNodes routingNodes = allocation.routingNodes();

    // First, handle primaries, they must find a place to be allocated on here
    Iterator<MutableShardRouting> unassignedIterator = routingNodes.unassigned().iterator();
    while (unassignedIterator.hasNext()) {
      MutableShardRouting shard = unassignedIterator.next();

      if (!shard.primary()) {
        continue;
      }

      // this is an API allocation, ignore since we know there is no data...
      if (!routingNodes
          .routingTable()
          .index(shard.index())
          .shard(shard.id())
          .primaryAllocatedPostApi()) {
        continue;
      }

      ObjectLongOpenHashMap<DiscoveryNode> nodesState = buildShardStates(nodes, shard);

      int numberOfAllocationsFound = 0;
      long highestVersion = -1;
      Set<DiscoveryNode> nodesWithHighestVersion = Sets.newHashSet();
      final boolean[] states = nodesState.allocated;
      final Object[] keys = nodesState.keys;
      final long[] values = nodesState.values;
      for (int i = 0; i < states.length; i++) {
        if (!states[i]) {
          continue;
        }

        DiscoveryNode node = (DiscoveryNode) keys[i];
        long version = values[i];
        // since we don't check in NO allocation, we need to double check here
        if (allocation.shouldIgnoreShardForNode(shard.shardId(), node.id())) {
          continue;
        }
        if (version != -1) {
          numberOfAllocationsFound++;
          if (highestVersion == -1) {
            nodesWithHighestVersion.add(node);
            highestVersion = version;
          } else {
            if (version > highestVersion) {
              nodesWithHighestVersion.clear();
              nodesWithHighestVersion.add(node);
              highestVersion = version;
            } else if (version == highestVersion) {
              nodesWithHighestVersion.add(node);
            }
          }
        }
      }

      // check if the count meets the required minimum
      int requiredAllocation = 1;
      // if we restore from a repository, one copy is more than enough
      if (shard.restoreSource() == null) {
        try {
          IndexMetaData indexMetaData = routingNodes.metaData().index(shard.index());
          String initialShards =
              indexMetaData
                  .settings()
                  .get(
                      INDEX_RECOVERY_INITIAL_SHARDS,
                      settings.get(INDEX_RECOVERY_INITIAL_SHARDS, this.initialShards));
          if ("quorum".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 1) {
              requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2) + 1;
            }
          } else if ("quorum-1".equals(initialShards) || "half".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 2) {
              requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2);
            }
          } else if ("one".equals(initialShards)) {
            requiredAllocation = 1;
          } else if ("full".equals(initialShards) || "all".equals(initialShards)) {
            requiredAllocation = indexMetaData.numberOfReplicas() + 1;
          } else if ("full-1".equals(initialShards) || "all-1".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 1) {
              requiredAllocation = indexMetaData.numberOfReplicas();
            }
          } else {
            requiredAllocation = Integer.parseInt(initialShards);
          }
        } catch (Exception e) {
          logger.warn(
              "[{}][{}] failed to derived initial_shards from value {}, ignore allocation for {}",
              shard.index(),
              shard.id(),
              initialShards,
              shard);
        }
      }

      // not enough found for this shard, continue...
      if (numberOfAllocationsFound < requiredAllocation) {
        // if we are restoring this shard we still can allocate
        if (shard.restoreSource() == null) {
          // we can't really allocate, so ignore it and continue
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: not allocating, number_of_allocated_shards_found [{}], required_number [{}]",
                shard.index(),
                shard.id(),
                numberOfAllocationsFound,
                requiredAllocation);
          }
        } else if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: missing local data, will restore from [{}]",
              shard.index(),
              shard.id(),
              shard.restoreSource());
        }
        continue;
      }

      Set<DiscoveryNode> throttledNodes = Sets.newHashSet();
      Set<DiscoveryNode> noNodes = Sets.newHashSet();
      for (DiscoveryNode discoNode : nodesWithHighestVersion) {
        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          throttledNodes.add(discoNode);
        } else if (decision.type() == Decision.Type.NO) {
          noNodes.add(discoNode);
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] on primary allocation",
                shard.index(),
                shard.id(),
                shard,
                discoNode);
          }
          // we found a match
          changed = true;
          // make sure we create one with the version from the recovered state
          allocation
              .routingNodes()
              .assign(new MutableShardRouting(shard, highestVersion), node.nodeId());
          unassignedIterator.remove();

          // found a node, so no throttling, no "no", and break out of the loop
          throttledNodes.clear();
          noNodes.clear();
          break;
        }
      }
      if (throttledNodes.isEmpty()) {
        // if we have a node that we "can't" allocate to, force allocation, since this is our master
        // data!
        if (!noNodes.isEmpty()) {
          DiscoveryNode discoNode = noNodes.iterator().next();
          RoutingNode node = routingNodes.node(discoNode.id());
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: forcing allocating [{}] to [{}] on primary allocation",
                shard.index(),
                shard.id(),
                shard,
                discoNode);
          }
          // we found a match
          changed = true;
          // make sure we create one with the version from the recovered state
          allocation
              .routingNodes()
              .assign(new MutableShardRouting(shard, highestVersion), node.nodeId());
          unassignedIterator.remove();
        }
      } else {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: throttling allocation [{}] to [{}] on primary allocation",
              shard.index(),
              shard.id(),
              shard,
              throttledNodes);
        }
        // we are throttling this, but we have enough to allocate to this node, ignore it for now
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
      }
    }

    if (!routingNodes.hasUnassigned()) {
      return changed;
    }

    // now handle replicas: try to assign each one to a node whose on-disk data
    // most closely matches the node its primary was allocated on
    unassignedIterator = routingNodes.unassigned().iterator();
    while (unassignedIterator.hasNext()) {
      MutableShardRouting shard = unassignedIterator.next();

      // pre-check if it can be allocated to any node that currently exists, so we won't list the
      // store for it for nothing
      boolean canBeAllocatedToAtLeastOneNode = false;
      for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
        RoutingNode node = routingNodes.node(cursor.value.id());
        if (node == null) {
          continue;
        }
        // if we can't allocate it on a node, ignore it, for example, this handles
        // cases for only allocating a replica after a primary
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.YES) {
          canBeAllocatedToAtLeastOneNode = true;
          break;
        }
      }

      if (!canBeAllocatedToAtLeastOneNode) {
        continue;
      }

      Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> shardStores =
          buildShardStores(nodes, shard);

      long lastSizeMatched = 0;
      DiscoveryNode lastDiscoNodeMatched = null;
      RoutingNode lastNodeMatched = null;
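      // scan every node's store below and remember the one sharing the most
      // bytes with the primary's copy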

      for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData>
          nodeStoreEntry : shardStores.entrySet()) {
        DiscoveryNode discoNode = nodeStoreEntry.getKey();
        TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData =
            nodeStoreEntry.getValue();
        logger.trace("{}: checking node [{}]", shard, discoNode);

        if (storeFilesMetaData == null) {
          // already allocated on that node...
          continue;
        }

        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        // check if we can allocate on that node...
        // we only check for NO, since if this node is THROTTLING and it has enough "same data"
        // then we will try and assign it next time
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.NO) {
          continue;
        }

        // if it is already allocated, we can't assign to it...
        if (storeFilesMetaData.allocated()) {
          continue;
        }

        if (!shard.primary()) {
          MutableShardRouting primaryShard = routingNodes.activePrimary(shard);
          if (primaryShard != null) {
            assert primaryShard.active();
            DiscoveryNode primaryNode = nodes.get(primaryShard.currentNodeId());
            if (primaryNode != null) {
              TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore =
                  shardStores.get(primaryNode);
              if (primaryNodeStore != null && primaryNodeStore.allocated()) {
                long sizeMatched = 0;

                for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
                  if (primaryNodeStore.fileExists(storeFileMetaData.name())
                      && primaryNodeStore
                          .file(storeFileMetaData.name())
                          .isSame(storeFileMetaData)) {
                    sizeMatched += storeFileMetaData.length();
                  }
                }
                logger.trace(
                    "{}: node [{}] has [{}/{}] bytes of re-usable data",
                    shard,
                    discoNode.name(),
                    new ByteSizeValue(sizeMatched),
                    sizeMatched);
                if (sizeMatched > lastSizeMatched) {
                  lastSizeMatched = sizeMatched;
                  lastDiscoNodeMatched = discoNode;
                  lastNodeMatched = node;
                }
              }
            }
          }
        }
      }

      if (lastNodeMatched != null) {
        // we only check for THROTTLE here, since NO was already ruled out above
        Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // allocation is throttled, but this node has enough matching data; skip the shard for now and retry later
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we found a match
          changed = true;
          allocation.routingNodes().assign(shard, lastNodeMatched.nodeId());
          unassignedIterator.remove();
        }
      }
    }
    return changed;
  }
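// --- Illustrative sketch (not part of the example above) ----------------------
// The replica loop above prefers the node whose on-disk store shares the most
// bytes with the primary's store. A minimal, self-contained rendering of that
// selection follows; FileMeta and the maps are hypothetical stand-ins for
// StoreFileMetaData and the per-node shard store listings.
import java.util.List;
import java.util.Map;

class BestStoreMatchSketch {
  static class FileMeta {
    final String name;
    final long length;
    final String checksum;
    FileMeta(String name, long length, String checksum) {
      this.name = name;
      this.length = length;
      this.checksum = checksum;
    }
    boolean isSame(FileMeta other) { // mirrors the role of StoreFileMetaData#isSame
      return length == other.length && checksum.equals(other.checksum);
    }
  }

  // Sum of lengths of files present on both the primary and the candidate with
  // identical metadata -- the bytes a recovery would not need to copy.
  static long matchedBytes(Map<String, FileMeta> primaryStore, List<FileMeta> candidateStore) {
    long matched = 0;
    for (FileMeta file : candidateStore) {
      FileMeta onPrimary = primaryStore.get(file.name);
      if (onPrimary != null && onPrimary.isSame(file)) {
        matched += file.length;
      }
    }
    return matched;
  }

  // Keep the candidate with the largest match, exactly the role played by
  // lastSizeMatched / lastDiscoNodeMatched in the allocator above.
  static String pickNodeWithBestMatch(Map<String, FileMeta> primaryStore,
                                      Map<String, List<FileMeta>> storesByNode) {
    String bestNode = null;
    long bestBytes = 0;
    for (Map.Entry<String, List<FileMeta>> entry : storesByNode.entrySet()) {
      long bytes = matchedBytes(primaryStore, entry.getValue());
      if (bytes > bestBytes) {
        bestBytes = bytes;
        bestNode = entry.getKey();
      }
    }
    return bestNode; // null when no candidate shares any data
  }
}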
  public String[] resolveNodesIds(String... nodesIds) {
    if (isAllNodes(nodesIds)) {
      int index = 0;
      nodesIds = new String[nodes.size()];
      for (DiscoveryNode node : this) {
        nodesIds[index++] = node.id();
      }
      return nodesIds;
    } else {
      Set<String> resolvedNodesIds = new HashSet<String>(nodesIds.length);
      for (String nodeId : nodesIds) {
        if (nodeId.equals("_local")) {
          String localNodeId = localNodeId();
          if (localNodeId != null) {
            resolvedNodesIds.add(localNodeId);
          }
        } else if (nodeId.equals("_master")) {
          String masterNodeId = masterNodeId();
          if (masterNodeId != null) {
            resolvedNodesIds.add(masterNodeId);
          }
        } else if (nodeExists(nodeId)) {
          resolvedNodesIds.add(nodeId);
        } else {
          // not a known node id; try to match by name, by address, and by attribute
          for (DiscoveryNode node : this) {
            if (Regex.simpleMatch(nodeId, node.name())) {
              resolvedNodesIds.add(node.id());
            }
          }
          for (DiscoveryNode node : this) {
            if (node.address().match(nodeId)) {
              resolvedNodesIds.add(node.id());
            }
          }
          int index = nodeId.indexOf(':');
          if (index != -1) {
            String matchAttrName = nodeId.substring(0, index);
            String matchAttrValue = nodeId.substring(index + 1);
            if ("data".equals(matchAttrName)) {
              if (Booleans.parseBoolean(matchAttrValue, true)) {
                resolvedNodesIds.addAll(dataNodes.keySet());
              } else {
                resolvedNodesIds.removeAll(dataNodes.keySet());
              }
            } else if ("master".equals(matchAttrName)) {
              if (Booleans.parseBoolean(matchAttrValue, true)) {
                resolvedNodesIds.addAll(masterNodes.keySet());
              } else {
                resolvedNodesIds.removeAll(masterNodes.keySet());
              }
            } else {
              for (DiscoveryNode node : this) {
                for (Map.Entry<String, String> entry : node.attributes().entrySet()) {
                  String attrName = entry.getKey();
                  String attrValue = entry.getValue();
                  if (Regex.simpleMatch(matchAttrName, attrName)
                      && Regex.simpleMatch(matchAttrValue, attrValue)) {
                    resolvedNodesIds.add(node.id());
                  }
                }
              }
            }
          }
        }
      }
      return resolvedNodesIds.toArray(new String[resolvedNodesIds.size()]);
    }
  }
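// --- Illustrative sketch (not part of the example above) ----------------------
// The "attrName:attrValue" branch at the end of resolveNodesIds splits the
// selector on the first ':' and wildcard-matches both halves against each
// node's attributes. A minimal stand-alone version, with simpleMatch re-done
// locally as the '*'-only glob that Regex.simpleMatch provides:
import java.util.Map;

class NodeSelectorSketch {
  static boolean matchesAttributeSelector(String selector, Map<String, String> nodeAttributes) {
    int sep = selector.indexOf(':');
    if (sep == -1) {
      return false; // not an attribute selector
    }
    String namePattern = selector.substring(0, sep);
    String valuePattern = selector.substring(sep + 1);
    for (Map.Entry<String, String> attr : nodeAttributes.entrySet()) {
      if (simpleMatch(namePattern, attr.getKey()) && simpleMatch(valuePattern, attr.getValue())) {
        return true;
      }
    }
    return false;
  }

  // '*'-glob match: quote the literal parts and let each '*' become '.*'.
  static boolean simpleMatch(String pattern, String value) {
    String regex = "\\Q" + pattern.replace("*", "\\E.*\\Q") + "\\E";
    return value.matches(regex);
  }
}

// e.g. matchesAttributeSelector("zone:us-east*", Map.of("zone", "us-east-1")) -> true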
  @Test
  public void multipleNodesShutdownNonMasterNodes() throws Exception {
    logger.info("--> cleaning nodes");
    buildNode("node1", settingsBuilder().put("gateway.type", "local"));
    buildNode("node2", settingsBuilder().put("gateway.type", "local"));
    buildNode("node3", settingsBuilder().put("gateway.type", "local"));
    buildNode("node4", settingsBuilder().put("gateway.type", "local"));
    cleanAndCloseNodes();

    Settings settings =
        settingsBuilder()
            .put("discovery.zen.minimum_master_nodes", 3)
            .put("discovery.zen.ping_timeout", "200ms")
            .put("discovery.initial_state_timeout", "500ms")
            .put("gateway.type", "local")
            .build();

    logger.info("--> start first 2 nodes");
    startNode("node1", settings);
    startNode("node2", settings);

    Thread.sleep(500);

    ClusterState state =
        client("node1")
            .admin()
            .cluster()
            .prepareState()
            .setLocal(true)
            .execute()
            .actionGet()
            .state();
    assertThat(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK), equalTo(true));
    state =
        client("node2")
            .admin()
            .cluster()
            .prepareState()
            .setLocal(true)
            .execute()
            .actionGet()
            .state();
    assertThat(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK), equalTo(true));

    logger.info("--> start two more nodes");
    startNode("node3", settings);
    startNode("node4", settings);

    ClusterHealthResponse clusterHealthResponse =
        client("node1")
            .admin()
            .cluster()
            .prepareHealth()
            .setWaitForNodes("4")
            .execute()
            .actionGet();
    assertThat(clusterHealthResponse.timedOut(), equalTo(false));

    state = client("node1").admin().cluster().prepareState().execute().actionGet().state();
    assertThat(state.nodes().size(), equalTo(4));
    String masterNode = state.nodes().masterNode().name();
    LinkedList<String> nonMasterNodes = new LinkedList<String>();
    for (DiscoveryNode node : state.nodes()) {
      if (!node.name().equals(masterNode)) {
        nonMasterNodes.add(node.name());
      }
    }

    logger.info("--> indexing some data");
    for (int i = 0; i < 100; i++) {
      client("node1")
          .prepareIndex("test", "type1", Integer.toString(i))
          .setSource("field", "value")
          .execute()
          .actionGet();
    }
    // flush for simpler debugging
    client("node1").admin().indices().prepareFlush().execute().actionGet();

    client("node1").admin().indices().prepareRefresh().execute().actionGet();
    logger.info("--> verify we the data back");
    for (int i = 0; i < 10; i++) {
      assertThat(
          client("node1")
              .prepareCount()
              .setQuery(QueryBuilders.matchAllQuery())
              .execute()
              .actionGet()
              .count(),
          equalTo(100L));
    }

    Set<String> nodesToShutdown = Sets.newHashSet();
    nodesToShutdown.add(nonMasterNodes.removeLast());
    nodesToShutdown.add(nonMasterNodes.removeLast());
    logger.info("--> shutting down two master nodes {}", nodesToShutdown);
    for (String nodeToShutdown : nodesToShutdown) {
      closeNode(nodeToShutdown);
    }

    Thread.sleep(1000);

    String lastNonMasterNodeUp = nonMasterNodes.removeLast();
    logger.info("--> verify that there is no master anymore on remaining nodes");
    state =
        client(masterNode)
            .admin()
            .cluster()
            .prepareState()
            .setLocal(true)
            .execute()
            .actionGet()
            .state();
    assertThat(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK), equalTo(true));
    state =
        client(lastNonMasterNodeUp)
            .admin()
            .cluster()
            .prepareState()
            .setLocal(true)
            .execute()
            .actionGet()
            .state();
    assertThat(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK), equalTo(true));

    logger.info("--> start back the nodes {}", nodesToShutdown);
    for (String nodeToShutdown : nodesToShutdown) {
      startNode(nodeToShutdown, settings);
    }

    clusterHealthResponse =
        client("node1")
            .admin()
            .cluster()
            .prepareHealth()
            .setWaitForNodes("4")
            .execute()
            .actionGet();
    assertThat(clusterHealthResponse.timedOut(), equalTo(false));

    logger.info("Running Cluster Health");
    ClusterHealthResponse clusterHealth =
        client("node1")
            .admin()
            .cluster()
            .health(clusterHealthRequest().waitForGreenStatus())
            .actionGet();
    logger.info("Done Cluster Health, status " + clusterHealth.status());
    assertThat(clusterHealth.timedOut(), equalTo(false));
    assertThat(clusterHealth.status(), equalTo(ClusterHealthStatus.GREEN));

    state = client("node1").admin().cluster().prepareState().execute().actionGet().state();
    assertThat(state.nodes().size(), equalTo(4));

    logger.info("--> verify we the data back");
    for (int i = 0; i < 10; i++) {
      assertThat(
          client("node1")
              .prepareCount()
              .setQuery(QueryBuilders.matchAllQuery())
              .execute()
              .actionGet()
              .count(),
          equalTo(100L));
    }
  }
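// --- Illustrative note (not part of the test above) ---------------------------
// The test's behavior follows from the quorum rule behind
// discovery.zen.minimum_master_nodes: a master can only be elected while at
// least that many master-eligible nodes are reachable. The usual safe value is
// floor(N / 2) + 1; with the 4 nodes above that is 3, which is exactly what the
// test configures, so 2 running nodes show NO_MASTER_BLOCK while 4 elect a master.
class QuorumSketch {
  static int minimumMasterNodes(int masterEligibleNodes) {
    return masterEligibleNodes / 2 + 1; // 4 nodes -> 3
  }
}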
    @Override
    protected void doSample() {
      HashSet<DiscoveryNode> newNodes = new HashSet<>();
      HashSet<DiscoveryNode> newFilteredNodes = new HashSet<>();
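      // newNodes collects nodes we can route requests to; newFilteredNodes
      // collects reachable nodes that belong to a different cluster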
      for (DiscoveryNode listedNode : listedNodes) {
        if (!transportService.nodeConnected(listedNode)) {
          try {
            // it's a listed node, open a light connection to it...
            logger.trace("connecting to listed node (light) [{}]", listedNode);
            transportService.connectToNodeLight(listedNode);
          } catch (Throwable e) {
            logger.debug("failed to connect to node [{}], removed from nodes list", e, listedNode);
            continue;
          }
        }
        try {
          LivenessResponse livenessResponse =
              transportService
                  .submitRequest(
                      listedNode,
                      TransportLivenessAction.NAME,
                      headers.applyTo(new LivenessRequest()),
                      TransportRequestOptions.options()
                          .withType(TransportRequestOptions.Type.STATE)
                          .withTimeout(pingTimeout),
                      new FutureTransportResponseHandler<LivenessResponse>() {
                        @Override
                        public LivenessResponse newInstance() {
                          return new LivenessResponse();
                        }
                      })
                  .txGet();
          if (!ignoreClusterName && !clusterName.equals(livenessResponse.getClusterName())) {
            logger.warn("node {} not part of the cluster {}, ignoring...", listedNode, clusterName);
            newFilteredNodes.add(listedNode);
          } else if (livenessResponse.getDiscoveryNode() != null) {
            // use the discovered node information, but keep the original transport
            // address so users can control exactly which address is used
            DiscoveryNode nodeWithInfo = livenessResponse.getDiscoveryNode();
            newNodes.add(
                new DiscoveryNode(
                    nodeWithInfo.name(),
                    nodeWithInfo.id(),
                    nodeWithInfo.getHostName(),
                    nodeWithInfo.getHostAddress(),
                    listedNode.address(),
                    nodeWithInfo.attributes(),
                    nodeWithInfo.version()));
          } else {
            // although we asked the node for its info, it may not have completed
            // initialization yet and so has no cluster-node details to report
            logger.debug(
                "node {} didn't return any discovery info, temporarily using transport discovery node",
                listedNode);
            newNodes.add(listedNode);
          }
        } catch (Throwable e) {
          logger.info("failed to get node info for {}, disconnecting...", e, listedNode);
          transportService.disconnectFromNode(listedNode);
        }
      }

      nodes = validateNewNodes(newNodes);
      filteredNodes = ImmutableList.copyOf(newFilteredNodes);
    }
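// --- Illustrative sketch (not part of the example above) ----------------------
// The sampling pattern above, reduced to its core with hypothetical types:
// ping every configured node, adopt the identity the node reports, but keep
// the transport address the user configured so requests keep going where
// expected. Node and Pinger here are stand-ins, not Elasticsearch types.
import java.util.ArrayList;
import java.util.List;

class SamplerSketch {
  static class Node {
    final String id;
    final String name;
    final String address;
    Node(String id, String name, String address) {
      this.id = id;
      this.name = name;
      this.address = address;
    }
  }

  interface Pinger { // hypothetical stand-in for the liveness request
    Node ping(String address); // returns the node's self-description, or null if unreachable
  }

  static List<Node> sample(List<Node> listedNodes, Pinger pinger) {
    List<Node> alive = new ArrayList<Node>();
    for (Node configured : listedNodes) {
      Node discovered = pinger.ping(configured.address);
      if (discovered == null) {
        continue; // unreachable: drop it, like disconnectFromNode above
      }
      // discovered identity + configured address, mirroring the DiscoveryNode
      // reconstruction in doSample
      alive.add(new Node(discovered.id, discovered.name, configured.address));
    }
    return alive;
  }
}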