/** Returns the changes comparing these nodes to the provided nodes. */
public Delta delta(DiscoveryNodes other) {
  List<DiscoveryNode> removed = newArrayList();
  List<DiscoveryNode> added = newArrayList();
  for (DiscoveryNode node : other) {
    if (!this.nodeExists(node.id())) {
      removed.add(node);
    }
  }
  for (DiscoveryNode node : this) {
    if (!other.nodeExists(node.id())) {
      added.add(node);
    }
  }
  DiscoveryNode previousMasterNode = null;
  DiscoveryNode newMasterNode = null;
  if (masterNodeId != null) {
    if (other.masterNodeId == null || !other.masterNodeId.equals(masterNodeId)) {
      previousMasterNode = other.masterNode();
      newMasterNode = masterNode();
    }
  }
  return new Delta(
      previousMasterNode,
      newMasterNode,
      localNodeId,
      ImmutableList.copyOf(removed),
      ImmutableList.copyOf(added));
}
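// Illustrative only, not part of the original source: a minimal sketch of consuming delta(),
// assuming the DiscoveryNodes.Builder API used elsewhere in this section; the three
// DiscoveryNode arguments are hypothetical.
static String describeChange(DiscoveryNode nodeA, DiscoveryNode nodeB, DiscoveryNode nodeC) {
  DiscoveryNodes previous =
      DiscoveryNodes.builder().put(nodeA).put(nodeB).localNodeId(nodeA.id()).build();
  DiscoveryNodes current =
      DiscoveryNodes.builder().put(nodeA).put(nodeC).localNodeId(nodeA.id()).build();
  DiscoveryNodes.Delta delta = current.delta(previous);
  // nodeC shows up in delta.addedNodes(), nodeB in delta.removedNodes()
  return delta.shortSummary();
}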
private Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> buildShardStores(DiscoveryNodes nodes, MutableShardRouting shard) { Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> shardStores = cachedStores.get(shard.shardId()); ObjectOpenHashSet<String> nodesIds; if (shardStores == null) { shardStores = Maps.newHashMap(); cachedStores.put(shard.shardId(), shardStores); nodesIds = ObjectOpenHashSet.from(nodes.dataNodes().keys()); } else { nodesIds = ObjectOpenHashSet.newInstance(); // clean nodes that have failed for (Iterator<DiscoveryNode> it = shardStores.keySet().iterator(); it.hasNext(); ) { DiscoveryNode node = it.next(); if (!nodes.nodeExists(node.id())) { it.remove(); } } for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) { DiscoveryNode node = cursor.value; if (!shardStores.containsKey(node)) { nodesIds.add(node.id()); } } } if (!nodesIds.isEmpty()) { String[] nodesIdsArray = nodesIds.toArray(String.class); TransportNodesListShardStoreMetaData.NodesStoreFilesMetaData nodesStoreFilesMetaData = listShardStoreMetaData .list(shard.shardId(), false, nodesIdsArray, listTimeout) .actionGet(); if (logger.isTraceEnabled()) { if (nodesStoreFilesMetaData.failures().length > 0) { StringBuilder sb = new StringBuilder(shard + ": failures when trying to list stores on nodes:"); for (int i = 0; i < nodesStoreFilesMetaData.failures().length; i++) { Throwable cause = ExceptionsHelper.unwrapCause(nodesStoreFilesMetaData.failures()[i]); if (cause instanceof ConnectTransportException) { continue; } sb.append("\n -> ") .append(nodesStoreFilesMetaData.failures()[i].getDetailedMessage()); } logger.trace(sb.toString()); } } for (TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData nodeStoreFilesMetaData : nodesStoreFilesMetaData) { if (nodeStoreFilesMetaData.storeFilesMetaData() != null) { shardStores.put( nodeStoreFilesMetaData.getNode(), nodeStoreFilesMetaData.storeFilesMetaData()); } } } return shardStores; }
@Override
public void run() {
  // the master node will check against all nodes whether they are alive with certain discovery
  // implementations, but we can't rely on that, so we check on it as well
  for (DiscoveryNode node : clusterState.nodes()) {
    if (lifecycle.stoppedOrClosed()) {
      return;
    }
    if (!nodeRequiresConnection(node)) {
      continue;
    }
    // we double check the existence of the node since connectToNode might take time...
    if (clusterState.nodes().nodeExists(node.id())) {
      if (!transportService.nodeConnected(node)) {
        try {
          transportService.connectToNode(node);
        } catch (Exception e) {
          if (lifecycle.stoppedOrClosed()) {
            return;
          }
          // double check here as well, maybe it's gone?
          if (clusterState.nodes().nodeExists(node.id())) {
            logger.warn("failed to reconnect to node {}", e, node);
          }
        }
      }
    }
  }
}
@Override public int compare(DiscoveryNode o1, DiscoveryNode o2) { if (o1.masterNode() && !o2.masterNode()) { return -1; } if (!o1.masterNode() && o2.masterNode()) { return 1; } return o1.id().compareTo(o2.id()); }
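// A hedged sketch of the ordering the comparator above produces: master-eligible nodes first,
// then ascending node id. The method and parameter names here are hypothetical.
static List<DiscoveryNode> masterFirst(
    List<DiscoveryNode> allNodes, Comparator<DiscoveryNode> byMasterThenId) {
  List<DiscoveryNode> sorted = new ArrayList<>(allNodes);
  Collections.sort(sorted, byMasterThenId);
  return sorted;
}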
public String shortSummary() {
  StringBuilder sb = new StringBuilder();
  if (!removed() && masterNodeChanged()) {
    if (newMasterNode.id().equals(localNodeId)) {
      // we are the master, no nodes were removed, so we are actually the first master
      sb.append("new_master ").append(newMasterNode());
    } else {
      // we are not the master, so we just got this event. No nodes were removed, so it's not a
      // *new* master
      sb.append("detected_master ").append(newMasterNode());
    }
  } else {
    if (masterNodeChanged()) {
      sb.append("master {new ").append(newMasterNode());
      if (previousMasterNode() != null) {
        sb.append(", previous ").append(previousMasterNode());
      }
      sb.append("}");
    }
    if (removed()) {
      if (masterNodeChanged()) {
        sb.append(", ");
      }
      sb.append("removed {");
      for (DiscoveryNode node : removedNodes()) {
        sb.append(node).append(',');
      }
      sb.append("}");
    }
  }
  if (added()) {
    // don't print if there is only one added node and it is us
    if (!(addedNodes().size() == 1 && addedNodes().get(0).id().equals(localNodeId))) {
      if (removed() || masterNodeChanged()) {
        sb.append(", ");
      }
      sb.append("added {");
      for (DiscoveryNode node : addedNodes()) {
        if (!node.id().equals(localNodeId)) { // don't print ourselves
          sb.append(node).append(',');
        }
      }
      sb.append("}");
    }
  }
  return sb.toString();
}
private ClusterState rejoin(ClusterState clusterState, String reason) { logger.warn(reason + ", current nodes: {}", clusterState.nodes()); nodesFD.stop(); masterFD.stop(reason); master = false; ClusterBlocks clusterBlocks = ClusterBlocks.builder() .blocks(clusterState.blocks()) .addGlobalBlock(NO_MASTER_BLOCK) .addGlobalBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK) .build(); // clear the routing table, we have no master, so we need to recreate the routing when we reform // the cluster RoutingTable routingTable = RoutingTable.builder().build(); // we also clean the metadata, since we are going to recover it if we become master MetaData metaData = MetaData.builder().build(); // clean the nodes, we are now not connected to anybody, since we try and reform the cluster latestDiscoNodes = new DiscoveryNodes.Builder().put(localNode).localNodeId(localNode.id()).build(); asyncJoinCluster(); return ClusterState.builder(clusterState) .blocks(clusterBlocks) .nodes(latestDiscoNodes) .routingTable(routingTable) .metaData(metaData) .build(); }
private void handleTransportDisconnect(DiscoveryNode node) { if (!latestNodes.nodeExists(node.id())) { return; } NodeFD nodeFD = nodesFD.remove(node); if (nodeFD == null) { return; } if (!running) { return; } nodeFD.running = false; if (connectOnNetworkDisconnect) { try { transportService.connectToNode(node); nodesFD.put(node, new NodeFD()); threadPool.schedule(pingInterval, ThreadPool.Names.SAME, new SendPingRequest(node)); } catch (Exception e) { logger.trace("[node ] [{}] transport disconnected (with verified connect)", node); notifyNodeFailure(node, "transport disconnected (with verified connect)"); } } else { logger.trace("[node ] [{}] transport disconnected", node); notifyNodeFailure(node, "transport disconnected"); } }
public DiscoveryNodes removeDeadMembers(Set<String> newNodes, String masterNodeId) { Builder builder = new Builder().masterNodeId(masterNodeId).localNodeId(localNodeId); for (DiscoveryNode node : this) { if (newNodes.contains(node.id())) { builder.put(node); } } return builder.build(); }
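// Illustrative only: driving removeDeadMembers from the set of node ids that answered the
// last ping round. The helper name and aliveIds contents are hypothetical; masterNodeId()
// is the accessor used elsewhere in this section.
static DiscoveryNodes pruneDead(DiscoveryNodes discoveryNodes, Set<String> aliveIds) {
  // nodes whose id is absent from aliveIds are dropped; master and local node ids carry over
  return discoveryNodes.removeDeadMembers(aliveIds, discoveryNodes.masterNodeId());
}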
@Override public DiscoveryNodes nodes() { DiscoveryNodes latestNodes = this.latestDiscoNodes; if (latestNodes != null) { return latestNodes; } // have not decided yet, just send the local node return DiscoveryNodes.builder().put(localNode).localNodeId(localNode.id()).build(); }
public void verify( String repository, String verificationToken, final ActionListener<VerifyResponse> listener) { final DiscoveryNodes discoNodes = clusterService.state().nodes(); final DiscoveryNode localNode = discoNodes.localNode(); final ObjectContainer<DiscoveryNode> masterAndDataNodes = discoNodes.masterAndDataNodes().values(); final List<DiscoveryNode> nodes = newArrayList(); for (ObjectCursor<DiscoveryNode> cursor : masterAndDataNodes) { DiscoveryNode node = cursor.value; Version version = node.getVersion(); // Verification wasn't supported before v1.4.0 - no reason to send verification request to // these nodes if (version != null && version.onOrAfter(Version.V_1_4_0)) { nodes.add(node); } } final CopyOnWriteArrayList<VerificationFailure> errors = new CopyOnWriteArrayList<>(); final AtomicInteger counter = new AtomicInteger(nodes.size()); for (final DiscoveryNode node : nodes) { if (node.equals(localNode)) { try { doVerify(repository, verificationToken); } catch (Throwable t) { logger.warn("[{}] failed to verify repository", t, repository); errors.add(new VerificationFailure(node.id(), ExceptionsHelper.detailedMessage(t))); } if (counter.decrementAndGet() == 0) { finishVerification(listener, nodes, errors); } } else { transportService.sendRequest( node, ACTION_NAME, new VerifyNodeRepositoryRequest(repository, verificationToken), new EmptyTransportResponseHandler(ThreadPool.Names.SAME) { @Override public void handleResponse(TransportResponse.Empty response) { if (counter.decrementAndGet() == 0) { finishVerification(listener, nodes, errors); } } @Override public void handleException(TransportException exp) { errors.add( new VerificationFailure(node.id(), ExceptionsHelper.detailedMessage(exp))); if (counter.decrementAndGet() == 0) { finishVerification(listener, nodes, errors); } } }); } } }
public static DiscoveryNodes readFrom(StreamInput in, @Nullable DiscoveryNode localNode) throws IOException { Builder builder = new Builder(); if (in.readBoolean()) { builder.masterNodeId(in.readUTF()); } if (localNode != null) { builder.localNodeId(localNode.id()); } int size = in.readVInt(); for (int i = 0; i < size; i++) { DiscoveryNode node = DiscoveryNode.readNode(in); if (localNode != null && node.id().equals(localNode.id())) { // reuse the same instance of our address and local node id for faster equality node = localNode; } builder.put(node); } return builder.build(); }
private void updateMappingOnMaster(final String index, final String type) { try { MapperService mapperService = indicesService.indexServiceSafe(index).mapperService(); final DocumentMapper documentMapper = mapperService.documentMapper(type); if (documentMapper == null) { // should not happen return; } IndexMetaData metaData = clusterService.state().metaData().index(index); if (metaData == null) { return; } long orderId = mappingUpdatedAction.generateNextMappingUpdateOrder(); documentMapper.refreshSource(); DiscoveryNode node = clusterService.localNode(); final MappingUpdatedAction.MappingUpdatedRequest request = new MappingUpdatedAction.MappingUpdatedRequest( index, metaData.uuid(), type, documentMapper.mappingSource(), orderId, node != null ? node.id() : null); mappingUpdatedAction.execute( request, new ActionListener<MappingUpdatedAction.MappingUpdatedResponse>() { @Override public void onResponse( MappingUpdatedAction.MappingUpdatedResponse mappingUpdatedResponse) { // all is well } @Override public void onFailure(Throwable e) { try { logger.warn( "failed to update master on updated mapping for index [{}], type [{}] and source [{}]", e, index, type, documentMapper.mappingSource().string()); } catch (IOException e1) { // ignore } } }); } catch (Exception e) { logger.warn( "failed to update master on updated mapping for index [{}], type [{}]", e, index, type); } }
protected void onNodeResponse(DiscoveryNode node, int nodeIndex, NodeResponse response) { logger.trace("received response for [{}] from node [{}]", actionName, node.id()); // this is defensive to protect against the possibility of double invocation // the current implementation of TransportService#sendRequest guards against this // but concurrency is hard, safety is important, and the small performance loss here does not // matter if (responses.compareAndSet(nodeIndex, null, response)) { if (counter.incrementAndGet() == responses.length()) { onCompletion(); } } }
public void testNodeVersionIsUpdated() { TransportClient client = (TransportClient) internalCluster().client(); TransportClientNodesService nodeService = client.nodeService(); Node node = new Node( Settings.builder() .put(internalCluster().getDefaultSettings()) .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()) .put("node.name", "testNodeVersionIsUpdated") .put("http.enabled", false) .put(Node.NODE_DATA_SETTING.getKey(), false) .put("cluster.name", "foobar") .put( InternalSettingsPreparer.IGNORE_SYSTEM_PROPERTIES_SETTING.getKey(), true) // make sure we get what we set :) .build()); node.start(); try { TransportAddress transportAddress = node.injector().getInstance(TransportService.class).boundAddress().publishAddress(); client.addTransportAddress(transportAddress); assertThat( nodeService.connectedNodes().size(), greaterThanOrEqualTo( 1)); // since we force transport clients there has to be one node started that we // connect to. for (DiscoveryNode discoveryNode : nodeService.connectedNodes()) { // connected nodes have updated version assertThat(discoveryNode.getVersion(), equalTo(Version.CURRENT)); } for (DiscoveryNode discoveryNode : nodeService.listedNodes()) { assertThat(discoveryNode.id(), startsWith("#transport#-")); assertThat( discoveryNode.getVersion(), equalTo(Version.CURRENT.minimumCompatibilityVersion())); } assertThat(nodeService.filteredNodes().size(), equalTo(1)); for (DiscoveryNode discoveryNode : nodeService.filteredNodes()) { assertThat( discoveryNode.getVersion(), equalTo(Version.CURRENT.minimumCompatibilityVersion())); } } finally { node.close(); } }
protected void onNodeFailure(DiscoveryNode node, int nodeIndex, Throwable t) { String nodeId = node.id(); if (logger.isDebugEnabled() && !(t instanceof NodeShouldNotConnectException)) { logger.debug("failed to execute [{}] on node [{}]", t, actionName, nodeId); } // this is defensive to protect against the possibility of double invocation // the current implementation of TransportService#sendRequest guards against this // but concurrency is hard, safety is important, and the small performance loss here does not // matter if (responses.compareAndSet( nodeIndex, null, new FailedNodeException(nodeId, "Failed node [" + nodeId + "]", t))) { if (counter.incrementAndGet() == responses.length()) { onCompletion(); } } }
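// The compareAndSet-plus-counter guard used in onNodeResponse/onNodeFailure above is a generic
// idiom worth seeing in isolation: each slot is written at most once, and completion fires
// exactly when every slot has been written. A self-contained sketch in plain JDK types (the
// class name is hypothetical):
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReferenceArray;

final class PerNodeAccumulator<T> {
  private final AtomicReferenceArray<T> responses;
  private final AtomicInteger counter = new AtomicInteger();
  private final Runnable onCompletion;

  PerNodeAccumulator(int nodeCount, Runnable onCompletion) {
    this.responses = new AtomicReferenceArray<>(nodeCount);
    this.onCompletion = onCompletion;
  }

  // safe to call twice for the same index; only the first write counts
  void set(int nodeIndex, T response) {
    if (responses.compareAndSet(nodeIndex, null, response)) {
      if (counter.incrementAndGet() == responses.length()) {
        onCompletion.run();
      }
    }
  }
}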
@Override
public void execute(RoutingAllocation allocation) throws ElasticSearchException {
  DiscoveryNode discoNode = allocation.nodes().resolveNode(node);

  MutableShardRouting shardRouting = null;
  for (MutableShardRouting routing : allocation.routingNodes().unassigned()) {
    if (routing.shardId().equals(shardId)) {
      // prefer primaries first to allocate
      if (shardRouting == null || routing.primary()) {
        shardRouting = routing;
      }
    }
  }

  if (shardRouting == null) {
    throw new ElasticSearchIllegalArgumentException(
        "[allocate] failed to find " + shardId + " on the list of unassigned shards");
  }

  if (shardRouting.primary() && !allowPrimary) {
    throw new ElasticSearchIllegalArgumentException(
        "[allocate] trying to allocate a primary shard [" + shardId + "], which is disabled");
  }

  RoutingNode routingNode = allocation.routingNodes().node(discoNode.id());
  allocation.addIgnoreDisable(shardRouting.shardId(), routingNode.nodeId());
  if (!allocation.deciders().canAllocate(shardRouting, routingNode, allocation).allowed()) {
    throw new ElasticSearchIllegalArgumentException(
        "[allocate] allocation of " + shardId + " on node " + discoNode + " is not allowed");
  }
  // go over and remove it from the unassigned
  for (Iterator<MutableShardRouting> it = allocation.routingNodes().unassigned().iterator();
      it.hasNext(); ) {
    if (it.next() != shardRouting) {
      continue;
    }
    it.remove();
    routingNode.add(shardRouting);
    break;
  }
}
@Override
protected void doStart() throws ElasticsearchException {
  Map<String, String> nodeAttributes = discoveryNodeService.buildAttributes();
  // note, we rely on the fact that it's a new id each time we start, see FD and "kill -9" handling
  final String nodeId = getNodeUUID(settings);
  localNode =
      new DiscoveryNode(
          settings.get("name"),
          nodeId,
          transportService.boundAddress().publishAddress(),
          nodeAttributes,
          version);
  latestDiscoNodes =
      new DiscoveryNodes.Builder().put(localNode).localNodeId(localNode.id()).build();
  nodesFD.updateNodes(latestDiscoNodes);
  pingService.start();

  // do the join on a different thread, the DiscoveryService waits for 30s anyhow till it is
  // discovered
  asyncJoinCluster();
}
public void updateNodes(DiscoveryNodes nodes) { DiscoveryNodes prevNodes = latestNodes; this.latestNodes = nodes; if (!running) { return; } DiscoveryNodes.Delta delta = nodes.delta(prevNodes); for (DiscoveryNode newNode : delta.addedNodes()) { if (newNode.id().equals(nodes.localNodeId())) { // no need to monitor the local node continue; } if (!nodesFD.containsKey(newNode)) { nodesFD.put(newNode, new NodeFD()); threadPool.schedule(pingInterval, ThreadPool.Names.SAME, new SendPingRequest(newNode)); } } for (DiscoveryNode removedNode : delta.removedNodes()) { nodesFD.remove(removedNode); } }
@Override public void run() { if (!running) { return; } transportService.sendRequest( node, PingRequestHandler.ACTION, new PingRequest(node.id()), options().withHighType().withTimeout(pingRetryTimeout), new BaseTransportResponseHandler<PingResponse>() { @Override public PingResponse newInstance() { return new PingResponse(); } @Override public void handleResponse(PingResponse response) { if (!running) { return; } NodeFD nodeFD = nodesFD.get(node); if (nodeFD != null) { if (!nodeFD.running) { return; } nodeFD.retryCount = 0; threadPool.schedule(pingInterval, ThreadPool.Names.SAME, SendPingRequest.this); } } @Override public void handleException(TransportException exp) { // check if the master node did not get switched on us... if (!running) { return; } if (exp instanceof ConnectTransportException) { // ignore this one, we already handle it by registering a connection listener return; } NodeFD nodeFD = nodesFD.get(node); if (nodeFD != null) { if (!nodeFD.running) { return; } int retryCount = ++nodeFD.retryCount; logger.trace( "[node ] failed to ping [{}], retry [{}] out of [{}]", exp, node, retryCount, pingRetryCount); if (retryCount >= pingRetryCount) { logger.debug( "[node ] failed to ping [{}], tried [{}] times, each with maximum [{}] timeout", node, pingRetryCount, pingRetryTimeout); // not good, failure if (nodesFD.remove(node) != null) { notifyNodeFailure( node, "failed to ping, tried [" + pingRetryCount + "] times, each with maximum [" + pingRetryTimeout + "] timeout"); } } else { // resend the request, not reschedule, rely on send timeout transportService.sendRequest( node, PingRequestHandler.ACTION, new PingRequest(node.id()), options().withHighType().withTimeout(pingRetryTimeout), this); } } } @Override public String executor() { return ThreadPool.Names.SAME; } }); }
@Override
public RerouteExplanation execute(RoutingAllocation allocation, boolean explain) {
  DiscoveryNode discoNode = allocation.nodes().resolveNode(node);
  boolean found = false;
  for (RoutingNodes.RoutingNodeIterator it =
          allocation.routingNodes().routingNodeIter(discoNode.id());
      it.hasNext(); ) {
    ShardRouting shardRouting = it.next();
    if (!shardRouting.shardId().equals(shardId)) {
      continue;
    }
    found = true;
    if (shardRouting.relocatingNodeId() != null) {
      if (shardRouting.initializing()) {
        // the shard is initializing and recovering from another node, simply cancel the recovery
        it.remove();
        // and cancel the relocating state from the shard it's being relocated from
        RoutingNode relocatingFromNode =
            allocation.routingNodes().node(shardRouting.relocatingNodeId());
        if (relocatingFromNode != null) {
          for (ShardRouting fromShardRouting : relocatingFromNode) {
            if (fromShardRouting.isSameShard(shardRouting)
                && fromShardRouting.state() == RELOCATING) {
              allocation.routingNodes().cancelRelocation(fromShardRouting);
              break;
            }
          }
        }
      } else if (shardRouting.relocating()) {
        // the shard is relocating to another node, cancel the recovery on the other node, and
        // deallocate this one
        if (!allowPrimary && shardRouting.primary()) {
          // can't cancel a primary shard being initialized
          if (explain) {
            return new RerouteExplanation(
                this,
                allocation.decision(
                    Decision.NO,
                    "cancel_allocation_command",
                    "can't cancel " + shardId + " on node " + discoNode
                        + ", shard is primary and initializing its state"));
          }
          throw new IllegalArgumentException(
              "[cancel_allocation] can't cancel " + shardId + " on node " + discoNode
                  + ", shard is primary and initializing its state");
        }
        it.moveToUnassigned(new UnassignedInfo(UnassignedInfo.Reason.REROUTE_CANCELLED, null));
        // now, go and find the shard that is initializing on the target node, and cancel it as
        // well...
        RoutingNodes.RoutingNodeIterator initializingNode =
            allocation.routingNodes().routingNodeIter(shardRouting.relocatingNodeId());
        if (initializingNode != null) {
          while (initializingNode.hasNext()) {
            ShardRouting initializingShardRouting = initializingNode.next();
            if (initializingShardRouting.isRelocationTargetOf(shardRouting)) {
              initializingNode.remove();
            }
          }
        }
      }
    } else {
      // the shard is not relocating, it's either started or initializing, just cancel it and
      // move on...
      if (!allowPrimary && shardRouting.primary()) {
        // can't cancel a primary shard being initialized
        if (explain) {
          return new RerouteExplanation(
              this,
              allocation.decision(
                  Decision.NO,
                  "cancel_allocation_command",
                  "can't cancel " + shardId + " on node " + discoNode
                      + ", shard is primary and started"));
        }
        throw new IllegalArgumentException(
            "[cancel_allocation] can't cancel " + shardId + " on node " + discoNode
                + ", shard is primary and started");
      }
      it.moveToUnassigned(new UnassignedInfo(UnassignedInfo.Reason.REROUTE_CANCELLED, null));
    }
  }
  if (!found) {
    if (explain) {
      return new RerouteExplanation(
          this,
          allocation.decision(
              Decision.NO,
              "cancel_allocation_command",
              "can't cancel " + shardId + ", failed to find it on node " + discoNode));
    }
    throw new IllegalArgumentException(
        "[cancel_allocation] can't cancel " + shardId + ", failed to find it on node "
            + discoNode);
  }
  return new RerouteExplanation(
      this,
      allocation.decision(
          Decision.YES,
          "cancel_allocation_command",
          "shard " + shardId + " on node " + discoNode + " can be cancelled"));
}
public boolean allocateUnassigned(RoutingAllocation allocation) {
  boolean changed = false;
  final RoutingNodes routingNodes = allocation.routingNodes();
  final MetaData metaData = routingNodes.metaData();

  final RoutingNodes.UnassignedShards.UnassignedIterator unassignedIterator =
      routingNodes.unassigned().iterator();
  while (unassignedIterator.hasNext()) {
    ShardRouting shard = unassignedIterator.next();
    if (shard.primary()) {
      continue;
    }

    // pre-check if it can be allocated to any node that currently exists, so we won't list the
    // store for it for nothing
    boolean canBeAllocatedToAtLeastOneNode = false;
    for (ObjectCursor<DiscoveryNode> cursor : allocation.nodes().dataNodes().values()) {
      RoutingNode node = routingNodes.node(cursor.value.id());
      if (node == null) {
        continue;
      }
      // if we can't allocate it on a node, ignore it, for example, this handles
      // cases for only allocating a replica after a primary
      Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
      if (decision.type() == Decision.Type.YES) {
        canBeAllocatedToAtLeastOneNode = true;
        break;
      }
    }

    if (!canBeAllocatedToAtLeastOneNode) {
      logger.trace("{}: ignoring allocation, can't be allocated on any node", shard);
      unassignedIterator.removeAndIgnore();
      continue;
    }

    AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
        shardStores = fetchData(shard, allocation);
    if (shardStores.hasData() == false) {
      logger.trace("{}: ignoring allocation, still fetching shard stores", shard);
      unassignedIterator.removeAndIgnore();
      continue; // still fetching
    }

    long lastSizeMatched = 0;
    DiscoveryNode lastDiscoNodeMatched = null;
    RoutingNode lastNodeMatched = null;
    boolean hasReplicaData = false;
    IndexMetaData indexMetaData = metaData.index(shard.getIndex());

    for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
        nodeStoreEntry : shardStores.getData().entrySet()) {
      DiscoveryNode discoNode = nodeStoreEntry.getKey();
      TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData =
          nodeStoreEntry.getValue().storeFilesMetaData();
      logger.trace("{}: checking node [{}]", shard, discoNode);

      if (storeFilesMetaData == null) {
        // already allocated on that node...
        continue;
      }

      RoutingNode node = routingNodes.node(discoNode.id());
      if (node == null) {
        continue;
      }

      // check if we can allocate on that node...
      // we only check for NO, since if this node is THROTTLING and it has enough "same data"
      // then we will try and assign it next time
      Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
      if (decision.type() == Decision.Type.NO) {
        continue;
      }

      // if it is already allocated, we can't assign to it...
      if (storeFilesMetaData.allocated()) {
        continue;
      }

      if (!shard.primary()) {
        hasReplicaData |= storeFilesMetaData.iterator().hasNext();
        ShardRouting primaryShard = routingNodes.activePrimary(shard);
        if (primaryShard != null) {
          assert primaryShard.active();
          DiscoveryNode primaryNode = allocation.nodes().get(primaryShard.currentNodeId());
          if (primaryNode != null) {
            TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData primaryNodeFilesStore =
                shardStores.getData().get(primaryNode);
            if (primaryNodeFilesStore != null) {
              TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore =
                  primaryNodeFilesStore.storeFilesMetaData();
              if (primaryNodeStore != null && primaryNodeStore.allocated()) {
                long sizeMatched = 0;

                String primarySyncId = primaryNodeStore.syncId();
                String replicaSyncId = storeFilesMetaData.syncId();
                // see if we have a sync id we can make use of
                if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) {
                  logger.trace(
                      "{}: node [{}] has same sync id {} as primary",
                      shard, discoNode.name(), replicaSyncId);
                  lastNodeMatched = node;
                  lastSizeMatched = Long.MAX_VALUE;
                  lastDiscoNodeMatched = discoNode;
                } else {
                  for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
                    String metaDataFileName = storeFileMetaData.name();
                    if (primaryNodeStore.fileExists(metaDataFileName)
                        && primaryNodeStore.file(metaDataFileName).isSame(storeFileMetaData)) {
                      sizeMatched += storeFileMetaData.length();
                    }
                  }
                  logger.trace(
                      "{}: node [{}] has [{}/{}] bytes of re-usable data",
                      shard, discoNode.name(), new ByteSizeValue(sizeMatched), sizeMatched);
                  if (sizeMatched > lastSizeMatched) {
                    lastSizeMatched = sizeMatched;
                    lastDiscoNodeMatched = discoNode;
                    lastNodeMatched = node;
                  }
                }
              }
            }
          }
        }
      }
    }

    if (lastNodeMatched != null) {
      // we only check on THROTTLE since we checked before on NO
      Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation);
      if (decision.type() == Decision.Type.THROTTLE) {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
              shard.index(), shard.id(), shard, lastDiscoNodeMatched,
              new ByteSizeValue(lastSizeMatched));
        }
        // we are throttling this, but we have enough to allocate to this node, ignore it for now
        unassignedIterator.removeAndIgnore();
      } else {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
              shard.index(), shard.id(), shard, lastDiscoNodeMatched,
              new ByteSizeValue(lastSizeMatched));
        }
        // we found a match
        changed = true;
        unassignedIterator.initialize(lastNodeMatched.nodeId());
      }
    } else if (hasReplicaData == false) {
      // if we didn't manage to find *any* data (regardless of matching sizes), check if the
      // allocation of the replica shard needs to be delayed, and if so, add it to the ignore
      // unassigned list.
      // note: we only care about replicas in delayed allocation, since if we have an unassigned
      // primary it will anyhow wait to find an existing copy of the shard to be allocated.
      // note: the other side of the equation is scheduling a reroute in a timely manner, which
      // happens in the RoutingService
      long delay =
          shard
              .unassignedInfo()
              .getDelayAllocationExpirationIn(settings, indexMetaData.getSettings());
      if (delay > 0) {
        logger.debug(
            "[{}][{}]: delaying allocation of [{}] for [{}]",
            shard.index(), shard.id(), shard, TimeValue.timeValueMillis(delay));
        /**
         * mark it as changed, since we want to kick a publishing to schedule future allocation,
         * see {@link
         * org.elasticsearch.cluster.routing.RoutingService#clusterChanged(ClusterChangedEvent)}).
         */
        changed = true;
        unassignedIterator.removeAndIgnore();
      }
    }
  }
  return changed;
}
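// The size-matching loop above reduces to a simple score: sum the lengths of files whose
// metadata is identical on the primary and on the candidate node. A self-contained
// restatement of that idea, with a hypothetical FileMeta value type standing in for
// StoreFileMetaData (illustrative only):
import java.util.Map;
import java.util.Objects;

final class StoreMatch {
  // hypothetical stand-in for StoreFileMetaData
  static final class FileMeta {
    final long length;
    final String checksum;

    FileMeta(long length, String checksum) {
      this.length = length;
      this.checksum = checksum;
    }

    @Override
    public boolean equals(Object o) {
      return o instanceof FileMeta
          && ((FileMeta) o).length == length
          && Objects.equals(((FileMeta) o).checksum, checksum);
    }

    @Override
    public int hashCode() {
      return Objects.hash(length, checksum);
    }
  }

  // bytes the candidate node could reuse from an existing copy, given name -> metadata maps
  static long sizeMatched(Map<String, FileMeta> primary, Map<String, FileMeta> candidate) {
    long matched = 0;
    for (Map.Entry<String, FileMeta> e : candidate.entrySet()) {
      FileMeta onPrimary = primary.get(e.getKey());
      if (onPrimary != null && onPrimary.equals(e.getValue())) {
        matched += e.getValue().length;
      }
    }
    return matched;
  }
}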
private Table buildTable(
    RestRequest req,
    ClusterStateResponse state,
    NodesInfoResponse nodesInfo,
    NodesStatsResponse nodesStats) {
  boolean fullId = req.paramAsBoolean("full_id", false);

  DiscoveryNodes nodes = state.getState().nodes();
  String masterId = nodes.masterNodeId();
  Table table = getTableWithHeader(req);

  for (DiscoveryNode node : nodes) {
    NodeInfo info = nodesInfo.getNodesMap().get(node.id());
    NodeStats stats = nodesStats.getNodesMap().get(node.id());

    JvmInfo jvmInfo = info == null ? null : info.getJvm();
    JvmStats jvmStats = stats == null ? null : stats.getJvm();
    FsInfo fsInfo = stats == null ? null : stats.getFs();
    OsStats osStats = stats == null ? null : stats.getOs();
    ProcessStats processStats = stats == null ? null : stats.getProcess();
    NodeIndicesStats indicesStats = stats == null ? null : stats.getIndices();

    table.startRow();

    table.addCell(fullId ? node.id() : Strings.substring(node.getId(), 0, 4));
    table.addCell(info == null ? null : info.getProcess().getId());
    table.addCell(node.getHostName());
    table.addCell(node.getHostAddress());
    if (node.address() instanceof InetSocketTransportAddress) {
      table.addCell(((InetSocketTransportAddress) node.address()).address().getPort());
    } else {
      table.addCell("-");
    }
    table.addCell(node.getVersion().number());
    table.addCell(info == null ? null : info.getBuild().shortHash());
    table.addCell(jvmInfo == null ? null : jvmInfo.version());
    table.addCell(fsInfo == null ? null : fsInfo.getTotal().getAvailable());
    table.addCell(jvmStats == null ? null : jvmStats.getMem().getHeapUsed());
    table.addCell(jvmStats == null ? null : jvmStats.getMem().getHeapUsedPercent());
    table.addCell(jvmInfo == null ? null : jvmInfo.getMem().getHeapMax());
    table.addCell(osStats == null ? null : osStats.getMem() == null ? null : osStats.getMem().getUsed());
    table.addCell(osStats == null ? null : osStats.getMem() == null ? null : osStats.getMem().getUsedPercent());
    table.addCell(osStats == null ? null : osStats.getMem() == null ? null : osStats.getMem().getTotal());
    table.addCell(processStats == null ? null : processStats.getOpenFileDescriptors());
    table.addCell(
        processStats == null
            ? null
            : calculatePercentage(
                processStats.getOpenFileDescriptors(), processStats.getMaxFileDescriptors()));
    table.addCell(processStats == null ? null : processStats.getMaxFileDescriptors());

    table.addCell(osStats == null ? null : String.format(Locale.ROOT, "%.2f", osStats.getLoadAverage()));
    table.addCell(jvmStats == null ? null : jvmStats.getUptime());
    table.addCell(node.clientNode() ? "c" : node.dataNode() ? "d" : "-");
    table.addCell(masterId == null ? "x" : masterId.equals(node.id()) ? "*" : node.masterNode() ? "m" : "-");
    table.addCell(node.name());

    CompletionStats completionStats = indicesStats == null ? null : stats.getIndices().getCompletion();
    table.addCell(completionStats == null ? null : completionStats.getSize());

    FieldDataStats fdStats = indicesStats == null ? null : stats.getIndices().getFieldData();
    table.addCell(fdStats == null ? null : fdStats.getMemorySize());
    table.addCell(fdStats == null ? null : fdStats.getEvictions());

    QueryCacheStats fcStats = indicesStats == null ? null : indicesStats.getQueryCache();
    table.addCell(fcStats == null ? null : fcStats.getMemorySize());
    table.addCell(fcStats == null ? null : fcStats.getEvictions());

    RequestCacheStats qcStats = indicesStats == null ? null : indicesStats.getRequestCache();
    table.addCell(qcStats == null ? null : qcStats.getMemorySize());
    table.addCell(qcStats == null ? null : qcStats.getEvictions());
    table.addCell(qcStats == null ? null : qcStats.getHitCount());
    table.addCell(qcStats == null ? null : qcStats.getMissCount());

    FlushStats flushStats = indicesStats == null ? null : indicesStats.getFlush();
    table.addCell(flushStats == null ? null : flushStats.getTotal());
    table.addCell(flushStats == null ? null : flushStats.getTotalTime());

    GetStats getStats = indicesStats == null ? null : indicesStats.getGet();
    table.addCell(getStats == null ? null : getStats.current());
    table.addCell(getStats == null ? null : getStats.getTime());
    table.addCell(getStats == null ? null : getStats.getCount());
    table.addCell(getStats == null ? null : getStats.getExistsTime());
    table.addCell(getStats == null ? null : getStats.getExistsCount());
    table.addCell(getStats == null ? null : getStats.getMissingTime());
    table.addCell(getStats == null ? null : getStats.getMissingCount());

    IndexingStats indexingStats = indicesStats == null ? null : indicesStats.getIndexing();
    table.addCell(indexingStats == null ? null : indexingStats.getTotal().getDeleteCurrent());
    table.addCell(indexingStats == null ? null : indexingStats.getTotal().getDeleteTime());
    table.addCell(indexingStats == null ? null : indexingStats.getTotal().getDeleteCount());
    table.addCell(indexingStats == null ? null : indexingStats.getTotal().getIndexCurrent());
    table.addCell(indexingStats == null ? null : indexingStats.getTotal().getIndexTime());
    table.addCell(indexingStats == null ? null : indexingStats.getTotal().getIndexCount());
    table.addCell(indexingStats == null ? null : indexingStats.getTotal().getIndexFailedCount());

    MergeStats mergeStats = indicesStats == null ? null : indicesStats.getMerge();
    table.addCell(mergeStats == null ? null : mergeStats.getCurrent());
    table.addCell(mergeStats == null ? null : mergeStats.getCurrentNumDocs());
    table.addCell(mergeStats == null ? null : mergeStats.getCurrentSize());
    table.addCell(mergeStats == null ? null : mergeStats.getTotal());
    table.addCell(mergeStats == null ? null : mergeStats.getTotalNumDocs());
    table.addCell(mergeStats == null ? null : mergeStats.getTotalSize());
    table.addCell(mergeStats == null ? null : mergeStats.getTotalTime());

    PercolateStats percolateStats = indicesStats == null ? null : indicesStats.getPercolate();
    table.addCell(percolateStats == null ? null : percolateStats.getCurrent());
    table.addCell(percolateStats == null ? null : percolateStats.getMemorySize());
    table.addCell(percolateStats == null ? null : percolateStats.getNumQueries());
    table.addCell(percolateStats == null ? null : percolateStats.getTime());
    table.addCell(percolateStats == null ? null : percolateStats.getCount());

    RefreshStats refreshStats = indicesStats == null ? null : indicesStats.getRefresh();
    table.addCell(refreshStats == null ? null : refreshStats.getTotal());
    table.addCell(refreshStats == null ? null : refreshStats.getTotalTime());

    ScriptStats scriptStats = stats == null ? null : stats.getScriptStats();
    table.addCell(scriptStats == null ? null : scriptStats.getCompilations());
    table.addCell(scriptStats == null ? null : scriptStats.getCacheEvictions());

    SearchStats searchStats = indicesStats == null ? null : indicesStats.getSearch();
    table.addCell(searchStats == null ? null : searchStats.getTotal().getFetchCurrent());
    table.addCell(searchStats == null ? null : searchStats.getTotal().getFetchTime());
    table.addCell(searchStats == null ? null : searchStats.getTotal().getFetchCount());
    table.addCell(searchStats == null ? null : searchStats.getOpenContexts());
    table.addCell(searchStats == null ? null : searchStats.getTotal().getQueryCurrent());
    table.addCell(searchStats == null ? null : searchStats.getTotal().getQueryTime());
    table.addCell(searchStats == null ? null : searchStats.getTotal().getQueryCount());
    table.addCell(searchStats == null ? null : searchStats.getTotal().getScrollCurrent());
    table.addCell(searchStats == null ? null : searchStats.getTotal().getScrollTime());
    table.addCell(searchStats == null ? null : searchStats.getTotal().getScrollCount());

    SegmentsStats segmentsStats = indicesStats == null ? null : indicesStats.getSegments();
    table.addCell(segmentsStats == null ? null : segmentsStats.getCount());
    table.addCell(segmentsStats == null ? null : segmentsStats.getMemory());
    table.addCell(segmentsStats == null ? null : segmentsStats.getIndexWriterMemory());
    table.addCell(segmentsStats == null ? null : segmentsStats.getIndexWriterMaxMemory());
    table.addCell(segmentsStats == null ? null : segmentsStats.getVersionMapMemory());
    table.addCell(segmentsStats == null ? null : segmentsStats.getBitsetMemory());

    SuggestStats suggestStats = indicesStats == null ? null : indicesStats.getSuggest();
    table.addCell(suggestStats == null ? null : suggestStats.getCurrent());
    table.addCell(suggestStats == null ? null : suggestStats.getTime());
    table.addCell(suggestStats == null ? null : suggestStats.getCount());

    table.endRow();
  }

  return table;
}
@Override
protected void moveToSecondPhase() {
  sortedShardList = searchPhaseController.sortDocs(queryResults.values());
  final Map<SearchShardTarget, ExtTIntArrayList> docIdsToLoad =
      searchPhaseController.docIdsToLoad(sortedShardList);

  if (docIdsToLoad.isEmpty()) {
    releaseIrrelevantSearchContexts(queryResults, docIdsToLoad);
    finishHim();
    return; // nothing to fetch, don't fall through to the fetch phase below
  }

  final AtomicInteger counter = new AtomicInteger(docIdsToLoad.size());

  int localOperations = 0;
  for (Map.Entry<SearchShardTarget, ExtTIntArrayList> entry : docIdsToLoad.entrySet()) {
    DiscoveryNode node = nodes.get(entry.getKey().nodeId());
    if (node.id().equals(nodes.localNodeId())) {
      localOperations++;
    } else {
      FetchSearchRequest fetchSearchRequest =
          new FetchSearchRequest(queryResults.get(entry.getKey()).id(), entry.getValue());
      executeFetch(counter, fetchSearchRequest, node);
    }
  }

  if (localOperations > 0) {
    if (request.operationThreading() == SearchOperationThreading.SINGLE_THREAD) {
      threadPool.execute(
          new Runnable() {
            @Override
            public void run() {
              for (Map.Entry<SearchShardTarget, ExtTIntArrayList> entry : docIdsToLoad.entrySet()) {
                DiscoveryNode node = nodes.get(entry.getKey().nodeId());
                if (node.id().equals(nodes.localNodeId())) {
                  FetchSearchRequest fetchSearchRequest =
                      new FetchSearchRequest(queryResults.get(entry.getKey()).id(), entry.getValue());
                  executeFetch(counter, fetchSearchRequest, node);
                }
              }
            }
          });
    } else {
      boolean localAsync = request.operationThreading() == SearchOperationThreading.THREAD_PER_SHARD;
      for (Map.Entry<SearchShardTarget, ExtTIntArrayList> entry : docIdsToLoad.entrySet()) {
        final DiscoveryNode node = nodes.get(entry.getKey().nodeId());
        if (node.id().equals(nodes.localNodeId())) {
          final FetchSearchRequest fetchSearchRequest =
              new FetchSearchRequest(queryResults.get(entry.getKey()).id(), entry.getValue());
          if (localAsync) {
            threadPool.execute(
                new Runnable() {
                  @Override
                  public void run() {
                    executeFetch(counter, fetchSearchRequest, node);
                  }
                });
          } else {
            executeFetch(counter, fetchSearchRequest, node);
          }
        }
      }
    }
  }

  releaseIrrelevantSearchContexts(queryResults, docIdsToLoad);
}
public boolean allocateUnassigned(RoutingAllocation allocation) {
  boolean changed = false;
  DiscoveryNodes nodes = allocation.nodes();
  RoutingNodes routingNodes = allocation.routingNodes();

  // First, handle primaries, they must find a place to be allocated on here
  final MetaData metaData = routingNodes.metaData();
  RoutingNodes.UnassignedShards unassigned = routingNodes.unassigned();
  unassigned.sort(
      new PriorityComparator() {
        @Override
        protected Settings getIndexSettings(String index) {
          IndexMetaData indexMetaData = metaData.index(index);
          return indexMetaData.getSettings();
        }
      }); // sort for priority ordering

  Iterator<ShardRouting> unassignedIterator = unassigned.iterator();
  while (unassignedIterator.hasNext()) {
    ShardRouting shard = unassignedIterator.next();

    if (!shard.primary()) {
      continue;
    }

    // this is an API allocation, ignore since we know there is no data...
    if (!routingNodes.routingTable().index(shard.index()).shard(shard.id()).primaryAllocatedPostApi()) {
      continue;
    }

    AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetch =
        asyncFetchStarted.get(shard.shardId());
    if (fetch == null) {
      fetch = new InternalAsyncFetch<>(logger, "shard_started", shard.shardId(), startedAction);
      asyncFetchStarted.put(shard.shardId(), fetch);
    }
    AsyncShardFetch.FetchResult<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards>
        shardState = fetch.fetchData(nodes, metaData, allocation.getIgnoreNodes(shard.shardId()));
    if (shardState.hasData() == false) {
      logger.trace("{}: ignoring allocation, still fetching shard started state", shard);
      unassignedIterator.remove();
      routingNodes.ignoredUnassigned().add(shard);
      continue;
    }
    shardState.processAllocation(allocation);

    IndexMetaData indexMetaData = metaData.index(shard.getIndex());

    /**
     * Build a map of DiscoveryNodes to shard state number for the given shard. A state of -1
     * means the shard does not exist on the node, where any shard state >= 0 is the state version
     * of the shard on that node's disk.
     *
     * <p>A shard on shared storage will return at least shard state 0 for all nodes, indicating
     * that the shard can be allocated to any node.
     */
    ObjectLongHashMap<DiscoveryNode> nodesState = new ObjectLongHashMap<>();
    for (TransportNodesListGatewayStartedShards.NodeGatewayStartedShards nodeShardState :
        shardState.getData().values()) {
      long version = nodeShardState.version();
      // -1 version means it does not exist, which is what the API returns, and what we expect
      logger.trace(
          "[{}] on node [{}] has version [{}] of shard", shard, nodeShardState.getNode(), version);
      nodesState.put(nodeShardState.getNode(), version);
    }

    int numberOfAllocationsFound = 0;
    long highestVersion = -1;
    final Map<DiscoveryNode, Long> nodesWithVersion = Maps.newHashMap();

    assert !nodesState.containsKey(null);
    final Object[] keys = nodesState.keys;
    final long[] values = nodesState.values;
    Settings idxSettings = indexMetaData.settings();
    for (int i = 0; i < keys.length; i++) {
      if (keys[i] == null) {
        continue;
      }
      DiscoveryNode node = (DiscoveryNode) keys[i];
      long version = values[i];
      // since we don't check in NO allocation, we need to double check here
      if (allocation.shouldIgnoreShardForNode(shard.shardId(), node.id())) {
        continue;
      }
      if (recoverOnAnyNode(idxSettings)) {
        numberOfAllocationsFound++;
        if (version > highestVersion) {
          highestVersion = version;
        }
        // We always put the node without clearing the map
        nodesWithVersion.put(node, version);
      } else if (version != -1) {
        numberOfAllocationsFound++;
        // If we've found a new "best" candidate, clear the
        // current candidates and add it
        if (version > highestVersion) {
          highestVersion = version;
          nodesWithVersion.clear();
          nodesWithVersion.put(node, version);
        } else if (version == highestVersion) {
          // If the candidate is the same, add it to the
          // list, but keep the current candidate
          nodesWithVersion.put(node, version);
        }
      }
    }
    // Now that we have a map of nodes to versions along with the
    // number of allocations found (and not ignored), we need to sort
    // it so the node with the highest version is at the beginning
    List<DiscoveryNode> nodesWithHighestVersion = Lists.newArrayList();
    nodesWithHighestVersion.addAll(nodesWithVersion.keySet());
    CollectionUtil.timSort(
        nodesWithHighestVersion,
        new Comparator<DiscoveryNode>() {
          @Override
          public int compare(DiscoveryNode o1, DiscoveryNode o2) {
            return Long.compare(nodesWithVersion.get(o2), nodesWithVersion.get(o1));
          }
        });

    if (logger.isDebugEnabled()) {
      logger.debug(
          "[{}][{}] found {} allocations of {}, highest version: [{}]",
          shard.index(), shard.id(), numberOfAllocationsFound, shard, highestVersion);
    }
    if (logger.isTraceEnabled()) {
      StringBuilder sb = new StringBuilder("[");
      for (DiscoveryNode n : nodesWithHighestVersion) {
        sb.append("[");
        sb.append(n.getName());
        sb.append("]");
        sb.append(" -> ");
        sb.append(nodesWithVersion.get(n));
        sb.append(", ");
      }
      sb.append("]");
      logger.trace("{} candidates for allocation: {}", shard, sb.toString());
    }

    // check if the count meets the minimum set
    int requiredAllocation = 1;
    // if we restore from a repository one copy is more than enough
    if (shard.restoreSource() == null) {
      String initialShards = null; // declared outside the try so the catch block can log it
      try {
        initialShards =
            indexMetaData
                .settings()
                .get(
                    INDEX_RECOVERY_INITIAL_SHARDS,
                    settings.get(INDEX_RECOVERY_INITIAL_SHARDS, this.initialShards));
        if ("quorum".equals(initialShards)) {
          if (indexMetaData.numberOfReplicas() > 1) {
            requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2) + 1;
          }
        } else if ("quorum-1".equals(initialShards) || "half".equals(initialShards)) {
          if (indexMetaData.numberOfReplicas() > 2) {
            requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2);
          }
        } else if ("one".equals(initialShards)) {
          requiredAllocation = 1;
        } else if ("full".equals(initialShards) || "all".equals(initialShards)) {
          requiredAllocation = indexMetaData.numberOfReplicas() + 1;
        } else if ("full-1".equals(initialShards) || "all-1".equals(initialShards)) {
          if (indexMetaData.numberOfReplicas() > 1) {
            requiredAllocation = indexMetaData.numberOfReplicas();
          }
        } else {
          requiredAllocation = Integer.parseInt(initialShards);
        }
      } catch (Exception e) {
        logger.warn(
            "[{}][{}] failed to derive initial_shards from value {}, ignore allocation for {}",
            shard.index(), shard.id(), initialShards, shard);
      }
    }

    // not enough found for this shard, continue...
    if (numberOfAllocationsFound < requiredAllocation) {
      // if we are restoring this shard we still can allocate
      if (shard.restoreSource() == null) {
        // we can't really allocate, so ignore it and continue
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
        if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: not allocating, number_of_allocated_shards_found [{}], required_number [{}]",
              shard.index(), shard.id(), numberOfAllocationsFound, requiredAllocation);
        }
      } else if (logger.isDebugEnabled()) {
        logger.debug(
            "[{}][{}]: missing local data, will restore from [{}]",
            shard.index(), shard.id(), shard.restoreSource());
      }
      continue;
    }

    Set<DiscoveryNode> throttledNodes = Sets.newHashSet();
    Set<DiscoveryNode> noNodes = Sets.newHashSet();
    for (DiscoveryNode discoNode : nodesWithHighestVersion) {
      RoutingNode node = routingNodes.node(discoNode.id());
      if (node == null) {
        continue;
      }

      Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
      if (decision.type() == Decision.Type.THROTTLE) {
        throttledNodes.add(discoNode);
      } else if (decision.type() == Decision.Type.NO) {
        noNodes.add(discoNode);
      } else {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: allocating [{}] to [{}] on primary allocation",
              shard.index(), shard.id(), shard, discoNode);
        }
        // we found a match
        changed = true;
        // make sure we create one with the version from the recovered state
        routingNodes.initialize(new ShardRouting(shard, highestVersion), node.nodeId());
        unassignedIterator.remove();

        // found a node, so no throttling, no "no", and break out of the loop
        throttledNodes.clear();
        noNodes.clear();
        break;
      }
    }
    if (throttledNodes.isEmpty()) {
      // if we have a node that we "can't" allocate to, force allocation, since this is our
      // master data!
      if (!noNodes.isEmpty()) {
        DiscoveryNode discoNode = noNodes.iterator().next();
        RoutingNode node = routingNodes.node(discoNode.id());
        if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: forcing allocating [{}] to [{}] on primary allocation",
              shard.index(), shard.id(), shard, discoNode);
        }
        // we found a match
        changed = true;
        // make sure we create one with the version from the recovered state
        routingNodes.initialize(new ShardRouting(shard, highestVersion), node.nodeId());
        unassignedIterator.remove();
      }
    } else {
      if (logger.isDebugEnabled()) {
        logger.debug(
            "[{}][{}]: throttling allocation [{}] to [{}] on primary allocation",
            shard.index(), shard.id(), shard, throttledNodes);
      }
      // we are throttling this, but we have enough to allocate to this node, ignore it for now
      unassignedIterator.remove();
      routingNodes.ignoredUnassigned().add(shard);
    }
  }

  if (!routingNodes.hasUnassigned()) {
    return changed;
  }

  // Now, handle replicas, try to assign them to nodes that are similar to the one the primary
  // was allocated on
  unassignedIterator = unassigned.iterator();
  while (unassignedIterator.hasNext()) {
    ShardRouting shard = unassignedIterator.next();
    if (shard.primary()) {
      continue;
    }

    // pre-check if it can be allocated to any node that currently exists, so we won't list the
    // store for it for nothing
    boolean canBeAllocatedToAtLeastOneNode = false;
    for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
      RoutingNode node = routingNodes.node(cursor.value.id());
      if (node == null) {
        continue;
      }
      // if we can't allocate it on a node, ignore it, for example, this handles
      // cases for only allocating a replica after a primary
      Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
      if (decision.type() == Decision.Type.YES) {
        canBeAllocatedToAtLeastOneNode = true;
        break;
      }
    }

    if (!canBeAllocatedToAtLeastOneNode) {
      logger.trace("{}: ignoring allocation, can't be allocated on any node", shard);
      unassignedIterator.remove();
      routingNodes.ignoredUnassigned().add(shard);
      continue;
    }

    AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> fetch =
        asyncFetchStore.get(shard.shardId());
    if (fetch == null) {
      fetch = new InternalAsyncFetch<>(logger, "shard_store", shard.shardId(), storeAction);
      asyncFetchStore.put(shard.shardId(), fetch);
    }
    AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
        shardStores = fetch.fetchData(nodes, metaData, allocation.getIgnoreNodes(shard.shardId()));
    if (shardStores.hasData() == false) {
      logger.trace("{}: ignoring allocation, still fetching shard stores", shard);
      unassignedIterator.remove();
      routingNodes.ignoredUnassigned().add(shard);
      continue; // still fetching
    }
    shardStores.processAllocation(allocation);

    long lastSizeMatched = 0;
    DiscoveryNode lastDiscoNodeMatched = null;
    RoutingNode lastNodeMatched = null;
    boolean hasReplicaData = false;
    IndexMetaData indexMetaData = metaData.index(shard.getIndex());

    for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
        nodeStoreEntry : shardStores.getData().entrySet()) {
      DiscoveryNode discoNode = nodeStoreEntry.getKey();
      TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData =
          nodeStoreEntry.getValue().storeFilesMetaData();
      logger.trace("{}: checking node [{}]", shard, discoNode);

      if (storeFilesMetaData == null) {
        // already allocated on that node...
        continue;
      }

      RoutingNode node = routingNodes.node(discoNode.id());
      if (node == null) {
        continue;
      }

      // check if we can allocate on that node...
      // we only check for NO, since if this node is THROTTLING and it has enough "same data"
      // then we will try and assign it next time
      Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
      if (decision.type() == Decision.Type.NO) {
        continue;
      }

      // if it is already allocated, we can't assign to it...
      if (storeFilesMetaData.allocated()) {
        continue;
      }

      if (!shard.primary()) {
        hasReplicaData |= storeFilesMetaData.iterator().hasNext();
        ShardRouting primaryShard = routingNodes.activePrimary(shard);
        if (primaryShard != null) {
          assert primaryShard.active();
          DiscoveryNode primaryNode = nodes.get(primaryShard.currentNodeId());
          if (primaryNode != null) {
            TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData primaryNodeFilesStore =
                shardStores.getData().get(primaryNode);
            if (primaryNodeFilesStore != null) {
              TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore =
                  primaryNodeFilesStore.storeFilesMetaData();
              if (primaryNodeStore != null && primaryNodeStore.allocated()) {
                long sizeMatched = 0;

                String primarySyncId = primaryNodeStore.syncId();
                String replicaSyncId = storeFilesMetaData.syncId();
                // see if we have a sync id we can make use of
                if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) {
                  logger.trace(
                      "{}: node [{}] has same sync id {} as primary",
                      shard, discoNode.name(), replicaSyncId);
                  lastNodeMatched = node;
                  lastSizeMatched = Long.MAX_VALUE;
                  lastDiscoNodeMatched = discoNode;
                } else {
                  for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
                    String metaDataFileName = storeFileMetaData.name();
                    if (primaryNodeStore.fileExists(metaDataFileName)
                        && primaryNodeStore.file(metaDataFileName).isSame(storeFileMetaData)) {
                      sizeMatched += storeFileMetaData.length();
                    }
                  }
                  logger.trace(
                      "{}: node [{}] has [{}/{}] bytes of re-usable data",
                      shard, discoNode.name(), new ByteSizeValue(sizeMatched), sizeMatched);
                  if (sizeMatched > lastSizeMatched) {
                    lastSizeMatched = sizeMatched;
                    lastDiscoNodeMatched = discoNode;
                    lastNodeMatched = node;
                  }
                }
              }
            }
          }
        }
      }
    }

    if (lastNodeMatched != null) {
      // we only check on THROTTLE since we checked before on NO
      Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation);
      if (decision.type() == Decision.Type.THROTTLE) {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
              shard.index(), shard.id(), shard, lastDiscoNodeMatched,
              new ByteSizeValue(lastSizeMatched));
        }
        // we are throttling this, but we have enough to allocate to this node, ignore it for now
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
      } else {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
              shard.index(), shard.id(), shard, lastDiscoNodeMatched,
              new ByteSizeValue(lastSizeMatched));
        }
        // we found a match
        changed = true;
        routingNodes.initialize(shard, lastNodeMatched.nodeId());
        unassignedIterator.remove();
      }
    } else if (hasReplicaData == false) {
      // if we didn't manage to find *any* data (regardless of matching sizes), check if the
      // allocation of the replica shard needs to be delayed, and if so, add it to the ignore
      // unassigned list.
      // note: we only care about replicas in delayed allocation, since if we have an unassigned
      // primary it will anyhow wait to find an existing copy of the shard to be allocated.
      // note: the other side of the equation is scheduling a reroute in a timely manner, which
      // happens in the RoutingService
      long delay =
          shard
              .unassignedInfo()
              .getDelayAllocationExpirationIn(settings, indexMetaData.getSettings());
      if (delay > 0) {
        logger.debug(
            "[{}][{}]: delaying allocation of [{}] for [{}]",
            shard.index(), shard.id(), shard, TimeValue.timeValueMillis(delay));
        /**
         * mark it as changed, since we want to kick a publishing to schedule future allocation,
         * see {@link
         * org.elasticsearch.cluster.routing.RoutingService#clusterChanged(ClusterChangedEvent)}).
         */
        changed = true;
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
      }
    }
  }
  return changed;
}
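// The initial_shards branches above amount to a small piece of arithmetic over the copy
// count. An illustrative standalone restatement (the real code also consults index settings
// and the restore source; this helper name is hypothetical):
static int requiredAllocation(String initialShards, int numberOfReplicas) {
  int copies = 1 + numberOfReplicas; // primary plus replicas
  switch (initialShards) {
    case "quorum":
      return numberOfReplicas > 1 ? (copies / 2) + 1 : 1;
    case "quorum-1":
    case "half":
      return numberOfReplicas > 2 ? copies / 2 : 1;
    case "one":
      return 1;
    case "full":
    case "all":
      return copies;
    case "full-1":
    case "all-1":
      return numberOfReplicas > 1 ? numberOfReplicas : 1;
    default:
      return Integer.parseInt(initialShards);
  }
}
// e.g. requiredAllocation("quorum", 2) == 2, while requiredAllocation("all", 2) == 3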
@Override protected void configure() { functionBinder = MapBinder.newMapBinder(binder(), FunctionIdent.class, FunctionImplementation.class); functionBinder.addBinding(TestFunction.ident).toInstance(new TestFunction()); bind(Functions.class).asEagerSingleton(); bind(ReferenceInfos.class).toInstance(mock(ReferenceInfos.class)); bind(ThreadPool.class).toInstance(testThreadPool); BulkRetryCoordinator bulkRetryCoordinator = mock(BulkRetryCoordinator.class); BulkRetryCoordinatorPool bulkRetryCoordinatorPool = mock(BulkRetryCoordinatorPool.class); when(bulkRetryCoordinatorPool.coordinator(any(ShardId.class))) .thenReturn(bulkRetryCoordinator); bind(BulkRetryCoordinatorPool.class).toInstance(bulkRetryCoordinatorPool); bind(TransportBulkCreateIndicesAction.class) .toInstance(mock(TransportBulkCreateIndicesAction.class)); bind(CircuitBreakerService.class).toInstance(new NoneCircuitBreakerService()); bind(ActionFilters.class).toInstance(mock(ActionFilters.class)); bind(ScriptService.class).toInstance(mock(ScriptService.class)); bind(SearchService.class).toInstance(mock(InternalSearchService.class)); bind(AllocationService.class).toInstance(mock(AllocationService.class)); bind(MetaDataCreateIndexService.class).toInstance(mock(MetaDataCreateIndexService.class)); bind(DynamicSettings.class) .annotatedWith(ClusterDynamicSettings.class) .toInstance(mock(DynamicSettings.class)); bind(MetaDataDeleteIndexService.class).toInstance(mock(MetaDataDeleteIndexService.class)); bind(ClusterInfoService.class).toInstance(mock(ClusterInfoService.class)); bind(TransportService.class).toInstance(mock(TransportService.class)); bind(MapperService.class).toInstance(mock(MapperService.class)); OsService osService = mock(OsService.class); OsStats osStats = mock(OsStats.class); when(osService.stats()).thenReturn(osStats); OsStats.Cpu osCpu = mock(OsStats.Cpu.class); when(osCpu.stolen()).thenReturn((short) 1); when(osStats.cpu()).thenReturn(osCpu); bind(OsService.class).toInstance(osService); bind(NodeService.class).toInstance(mock(NodeService.class)); bind(Discovery.class).toInstance(mock(Discovery.class)); bind(NetworkService.class).toInstance(mock(NetworkService.class)); bind(TransportShardBulkAction.class).toInstance(mock(TransportShardBulkAction.class)); bind(TransportCreateIndexAction.class).toInstance(mock(TransportCreateIndexAction.class)); discoveryService = mock(DiscoveryService.class); DiscoveryNode discoveryNode = mock(DiscoveryNode.class); when(discoveryNode.id()).thenReturn(TEST_NODE_ID); when(discoveryService.localNode()).thenReturn(discoveryNode); ClusterService clusterService = mock(ClusterService.class); ClusterState state = mock(ClusterState.class); DiscoveryNodes discoveryNodes = mock(DiscoveryNodes.class); when(discoveryNodes.localNodeId()).thenReturn(TEST_NODE_ID); when(state.nodes()).thenReturn(discoveryNodes); when(clusterService.state()).thenReturn(state); when(clusterService.localNode()).thenReturn(discoveryNode); bind(ClusterService.class).toInstance(clusterService); IndicesService indicesService = mock(IndicesService.class); bind(IndicesService.class).toInstance(indicesService); bind(Settings.class).toInstance(ImmutableSettings.EMPTY); bind(MetaDataUpdateSettingsService.class) .toInstance(mock(MetaDataUpdateSettingsService.class)); bind(Client.class).toInstance(mock(Client.class)); Provider<TransportCreateIndexAction> transportCreateIndexActionProvider = mock(Provider.class); when(transportCreateIndexActionProvider.get()) .thenReturn(mock(TransportCreateIndexAction.class)); 
Provider<TransportDeleteIndexAction> transportDeleteActionProvider = mock(Provider.class); when(transportDeleteActionProvider.get()).thenReturn(mock(TransportDeleteIndexAction.class)); Provider<TransportUpdateSettingsAction> transportUpdateSettingsActionProvider = mock(Provider.class); when(transportUpdateSettingsActionProvider.get()) .thenReturn(mock(TransportUpdateSettingsAction.class)); BlobIndices blobIndices = new BlobIndices( ImmutableSettings.EMPTY, transportCreateIndexActionProvider, transportDeleteActionProvider, transportUpdateSettingsActionProvider, indicesService, mock(IndicesLifecycle.class), mock(BlobEnvironment.class), clusterService); bind(BlobIndices.class).toInstance(blobIndices); bind(ReferenceResolver.class).to(GlobalReferenceResolver.class); TransportPutIndexTemplateAction transportPutIndexTemplateAction = mock(TransportPutIndexTemplateAction.class); bind(TransportPutIndexTemplateAction.class).toInstance(transportPutIndexTemplateAction); bind(IndexService.class).toInstance(indexService); }
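// --- Hedged usage sketch (not part of the original source) ---
// Bootstrapping the module defined in configure() above from a test. Plain
// com.google.inject.Guice is shown for illustration; the project actually wires
// modules through the shaded org.elasticsearch.common.inject machinery, and
// "TestModule" is an assumed name for the enclosing AbstractModule subclass.
@Test
public void testModuleProvidesBindings() {
  Injector injector = Guice.createInjector(new TestModule());
  assertNotNull(injector.getInstance(Functions.class));
  assertNotNull(injector.getInstance(BlobIndices.class));
}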
@Override
public void execute(RoutingAllocation allocation) throws ElasticSearchException {
  DiscoveryNode discoNode = allocation.nodes().resolveNode(node);
  boolean found = false;
  for (RoutingNodes.RoutingNodeIterator it =
          allocation.routingNodes().routingNodeIter(discoNode.id());
      it.hasNext(); ) {
    MutableShardRouting shardRouting = it.next();
    if (!shardRouting.shardId().equals(shardId)) {
      continue;
    }
    found = true;
    if (shardRouting.relocatingNodeId() != null) {
      if (shardRouting.initializing()) {
        // the shard is initializing and recovering from another node, simply cancel the recovery
        it.remove();
        // and cancel the relocating state from the shard it's being relocated from
        RoutingNode relocatingFromNode =
            allocation.routingNodes().node(shardRouting.relocatingNodeId());
        if (relocatingFromNode != null) {
          for (MutableShardRouting fromShardRouting : relocatingFromNode) {
            if (fromShardRouting.shardId().equals(shardRouting.shardId())
                && fromShardRouting.state() == RELOCATING) {
              allocation.routingNodes().cancelRelocation(fromShardRouting);
              break;
            }
          }
        }
      } else if (shardRouting.relocating()) {
        // the shard is relocating to another node, cancel the recovery on the other node, and
        // deallocate this one
        if (!allowPrimary && shardRouting.primary()) {
          // can't cancel a primary shard that is being relocated
          throw new ElasticSearchIllegalArgumentException(
              "[cancel_allocation] can't cancel "
                  + shardId
                  + " on node "
                  + discoNode
                  + ", shard is primary and relocating");
        }
        it.moveToUnassigned();
        // now, go and find the shard that is initializing on the target node, and cancel it as
        // well...
        RoutingNodes.RoutingNodeIterator initializingNode =
            allocation.routingNodes().routingNodeIter(shardRouting.relocatingNodeId());
        if (initializingNode != null) {
          while (initializingNode.hasNext()) {
            MutableShardRouting initializingShardRouting = initializingNode.next();
            if (initializingShardRouting.shardId().equals(shardRouting.shardId())
                && initializingShardRouting.state() == INITIALIZING) {
              initializingNode.remove();
            }
          }
        }
      }
    } else {
      // the shard is not relocating, it's either started or initializing, just cancel it and
      // move on...
      if (!allowPrimary && shardRouting.primary()) {
        // can't cancel a primary shard that is started or initializing
        throw new ElasticSearchIllegalArgumentException(
            "[cancel_allocation] can't cancel "
                + shardId
                + " on node "
                + discoNode
                + ", shard is primary and started or initializing");
      }
      it.remove();
      allocation
          .routingNodes()
          .unassigned()
          .add(
              new MutableShardRouting(
                  shardRouting.index(),
                  shardRouting.id(),
                  null,
                  shardRouting.primary(),
                  ShardRoutingState.UNASSIGNED,
                  shardRouting.version() + 1));
    }
  }
  if (!found) {
    throw new ElasticSearchIllegalArgumentException(
        "[cancel_allocation] can't cancel "
            + shardId
            + ", failed to find it on node "
            + discoNode);
  }
}
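// --- Hedged usage sketch (not part of the original source) ---
// How the command implemented by execute() above is typically issued: through the
// cluster reroute API. The argument order (shardId, node, allowPrimary) mirrors the
// fields the method reads; the client variable, "node1", and the exact constructor
// signature are assumptions based on the same-era API.
client.admin().cluster()
    .prepareReroute()
    .add(new CancelAllocationCommand(shardId, "node1", false))
    .execute()
    .actionGet();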
public void addNode(DiscoveryNode node) { ensureMutable(); RoutingNode routingNode = new RoutingNode(node.id(), node); nodesToShards.put(routingNode.nodeId(), routingNode); }
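// --- Self-contained sketch (not part of the original source) ---
// addNode() above only works while the owning structure is still mutable: the
// ensureMutable() guard presumably rejects changes once the routing nodes have been
// handed out read-only. The same guard pattern in miniature, with all names invented
// for illustration:
import java.util.HashMap;
import java.util.Map;

final class MutableRegistry {
  private final Map<String, String> entries = new HashMap<>();
  private boolean frozen = false;

  // after freeze() the registry is read-only
  void freeze() {
    frozen = true;
  }

  private void ensureMutable() {
    if (frozen) {
      throw new IllegalStateException("registry can no longer be modified");
    }
  }

  void add(String id, String value) {
    ensureMutable();
    entries.put(id, value);
  }
}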
private ClusterState applyUpdate(ClusterState currentState, ClusterChangedEvent task) {
  boolean clusterStateChanged = false;
  ClusterState tribeState = task.state();
  DiscoveryNodes.Builder nodes = DiscoveryNodes.builder(currentState.nodes());
  // -- merge nodes
  // go over existing nodes, and see if they need to be removed
  for (DiscoveryNode discoNode : currentState.nodes()) {
    String markedTribeName = discoNode.attributes().get(TRIBE_NAME);
    if (markedTribeName != null && markedTribeName.equals(tribeName)) {
      if (tribeState.nodes().get(discoNode.id()) == null) {
        clusterStateChanged = true;
        logger.info("[{}] removing node [{}]", tribeName, discoNode);
        nodes.remove(discoNode.id());
      }
    }
  }
  // go over tribe nodes, and see if they need to be added
  for (DiscoveryNode tribe : tribeState.nodes()) {
    if (currentState.nodes().get(tribe.id()) == null) {
      // a new node, add it, but also add the tribe name to the attributes
      Map<String, String> tribeAttr = new HashMap<>();
      for (ObjectObjectCursor<String, String> attr : tribe.attributes()) {
        tribeAttr.put(attr.key, attr.value);
      }
      tribeAttr.put(TRIBE_NAME, tribeName);
      DiscoveryNode discoNode =
          new DiscoveryNode(
              tribe.name(),
              tribe.id(),
              tribe.getHostName(),
              tribe.getHostAddress(),
              tribe.address(),
              unmodifiableMap(tribeAttr),
              tribe.version());
      clusterStateChanged = true;
      logger.info("[{}] adding node [{}]", tribeName, discoNode);
      nodes.put(discoNode);
    }
  }
  // -- merge metadata
  ClusterBlocks.Builder blocks = ClusterBlocks.builder().blocks(currentState.blocks());
  MetaData.Builder metaData = MetaData.builder(currentState.metaData());
  RoutingTable.Builder routingTable = RoutingTable.builder(currentState.routingTable());
  // go over existing indices, and see if they need to be removed
  for (IndexMetaData index : currentState.metaData()) {
    String markedTribeName = index.getSettings().get(TRIBE_NAME);
    if (markedTribeName != null && markedTribeName.equals(tribeName)) {
      IndexMetaData tribeIndex = tribeState.metaData().index(index.getIndex());
      clusterStateChanged = true;
      if (tribeIndex == null || tribeIndex.getState() == IndexMetaData.State.CLOSE) {
        logger.info("[{}] removing index [{}]", tribeName, index.getIndex());
        removeIndex(blocks, metaData, routingTable, index);
      } else {
        // always make sure to update the metadata and routing table, in case
        // there are changes in them (new mapping, shards moving from initializing to started)
        routingTable.add(tribeState.routingTable().index(index.getIndex()));
        Settings tribeSettings =
            Settings.builder().put(tribeIndex.getSettings()).put(TRIBE_NAME, tribeName).build();
        metaData.put(IndexMetaData.builder(tribeIndex).settings(tribeSettings));
      }
    }
  }
  // go over tribe indices, and see if they need to be added
  for (IndexMetaData tribeIndex : tribeState.metaData()) {
    // if there is no routing table yet, do nothing with it...
    IndexRoutingTable table = tribeState.routingTable().index(tribeIndex.getIndex());
    if (table == null) {
      continue;
    }
    final IndexMetaData indexMetaData = currentState.metaData().index(tribeIndex.getIndex());
    if (indexMetaData == null) {
      if (!droppedIndices.contains(tribeIndex.getIndex())) {
        // a new index, add it, and add the tribe name as a setting
        clusterStateChanged = true;
        logger.info("[{}] adding index [{}]", tribeName, tribeIndex.getIndex());
        addNewIndex(tribeState, blocks, metaData, routingTable, tribeIndex);
      }
    } else {
      String existingFromTribe = indexMetaData.getSettings().get(TRIBE_NAME);
      if (!tribeName.equals(existingFromTribe)) {
        // we have a potential conflict on index names, decide what to do...
        if (ON_CONFLICT_ANY.equals(onConflict)) {
          // we chose any tribe, carry on
        } else if (ON_CONFLICT_DROP.equals(onConflict)) {
          // drop the indices, there is a conflict
          clusterStateChanged = true;
          logger.info(
              "[{}] dropping index [{}] due to conflict with [{}]",
              tribeName,
              tribeIndex.getIndex(),
              existingFromTribe);
          removeIndex(blocks, metaData, routingTable, tribeIndex);
          droppedIndices.add(tribeIndex.getIndex());
        } else if (onConflict.startsWith(ON_CONFLICT_PREFER)) {
          // on conflict, prefer a tribe...
          String preferredTribeName = onConflict.substring(ON_CONFLICT_PREFER.length());
          if (tribeName.equals(preferredTribeName)) {
            // the new one is the preferred one, replace...
            clusterStateChanged = true;
            logger.info(
                "[{}] adding index [{}], preferred over [{}]",
                tribeName,
                tribeIndex.getIndex(),
                existingFromTribe);
            removeIndex(blocks, metaData, routingTable, tribeIndex);
            addNewIndex(tribeState, blocks, metaData, routingTable, tribeIndex);
          }
          // else: either the existing one is the preferred one, or we haven't seen one, carry on
        }
      }
    }
  }
  if (!clusterStateChanged) {
    return currentState;
  } else {
    return ClusterState.builder(currentState)
        .incrementVersion()
        .blocks(blocks)
        .nodes(nodes)
        .metaData(metaData)
        .routingTable(routingTable.build())
        .build();
  }
}
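// --- Self-contained sketch (not part of the original source) ---
// The on_conflict branches above, reduced to a single decision function. The literal
// values "any", "drop" and the "prefer_" prefix are assumptions standing in for the
// ON_CONFLICT_* constants.
enum ConflictAction { KEEP_EXISTING, DROP_INDEX, REPLACE_WITH_CANDIDATE }

static ConflictAction resolveConflict(String onConflict, String candidateTribeName) {
  if ("drop".equals(onConflict)) {
    return ConflictAction.DROP_INDEX; // the conflicting index is removed and remembered
  }
  if (onConflict.startsWith("prefer_")) {
    String preferredTribeName = onConflict.substring("prefer_".length());
    return candidateTribeName.equals(preferredTribeName)
        ? ConflictAction.REPLACE_WITH_CANDIDATE // the new index wins, replace the old one
        : ConflictAction.KEEP_EXISTING; // the existing (or an unseen) tribe is preferred
  }
  return ConflictAction.KEEP_EXISTING; // "any": whichever tribe claimed the name first wins
}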
private MatchingNodes findMatchingNodes( ShardRouting shard, RoutingAllocation allocation, TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryStore, AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> data) { ObjectLongMap<DiscoveryNode> nodesToSize = new ObjectLongHashMap<>(); for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> nodeStoreEntry : data.getData().entrySet()) { DiscoveryNode discoNode = nodeStoreEntry.getKey(); TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData = nodeStoreEntry.getValue().storeFilesMetaData(); if (storeFilesMetaData == null) { // already allocated on that node... continue; } RoutingNode node = allocation.routingNodes().node(discoNode.id()); if (node == null) { continue; } // check if we can allocate on that node... // we only check for NO, since if this node is THROTTLING and it has enough "same data" // then we will try and assign it next time Decision decision = allocation.deciders().canAllocate(shard, node, allocation); if (decision.type() == Decision.Type.NO) { continue; } // if it is already allocated, we can't assign to it... (and it might be primary as well) if (storeFilesMetaData.allocated()) { continue; } // we don't have any files at all, it is an empty index if (storeFilesMetaData.iterator().hasNext() == false) { continue; } String primarySyncId = primaryStore.syncId(); String replicaSyncId = storeFilesMetaData.syncId(); // see if we have a sync id we can make use of if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) { logger.trace( "{}: node [{}] has same sync id {} as primary", shard, discoNode.name(), replicaSyncId); nodesToSize.put(discoNode, Long.MAX_VALUE); } else { long sizeMatched = 0; for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) { String metaDataFileName = storeFileMetaData.name(); if (primaryStore.fileExists(metaDataFileName) && primaryStore.file(metaDataFileName).isSame(storeFileMetaData)) { sizeMatched += storeFileMetaData.length(); } } logger.trace( "{}: node [{}] has [{}/{}] bytes of re-usable data", shard, discoNode.name(), new ByteSizeValue(sizeMatched), sizeMatched); nodesToSize.put(discoNode, sizeMatched); } } return new MatchingNodes(nodesToSize); }
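// --- Self-contained sketch (not part of the original source) ---
// The matching heuristic above in miniature: a replica whose sync id equals the
// primary's counts as a perfect match, otherwise the bytes of identical files are
// summed. The Map<String, Long> shape (file name -> length) is a simplification; the
// real code compares full StoreFileMetaData entries (length plus checksum) via isSame().
static long matchScore(
    String primarySyncId,
    String replicaSyncId,
    java.util.Map<String, Long> primaryFiles,
    java.util.Map<String, Long> replicaFiles) {
  if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) {
    return Long.MAX_VALUE; // same sync id: the copies are known to be in sync
  }
  long sizeMatched = 0;
  for (java.util.Map.Entry<String, Long> file : replicaFiles.entrySet()) {
    Long primaryLength = primaryFiles.get(file.getKey());
    if (primaryLength != null && primaryLength.equals(file.getValue())) {
      sizeMatched += file.getValue(); // identical file, its bytes can be re-used
    }
  }
  return sizeMatched;
}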
@Override public String nodeDescription() { return clusterName.value() + "/" + localNode.id(); }