@Override
public void onNodeAck(DiscoveryNode node, @Nullable Exception e) {
  if (!ackedTaskListener.mustAck(node)) {
    // we always wait for the master ack anyway
    if (!node.equals(nodes.getMasterNode())) {
      return;
    }
  }
  if (e == null) {
    logger.trace(
        "ack received from node [{}], cluster_state update (version: {})",
        node,
        clusterStateVersion);
  } else {
    this.lastFailure = e;
    logger.debug(
        (Supplier<?>) () ->
            new ParameterizedMessage(
                "ack received from node [{}], cluster_state update (version: {})",
                node,
                clusterStateVersion),
        e);
  }
  if (countDown.countDown()) {
    logger.trace(
        "all expected nodes acknowledged cluster_state update (version: {})",
        clusterStateVersion);
    FutureUtils.cancel(ackTimeoutCallback);
    ackedTaskListener.onAllNodesAcked(lastFailure);
  }
}
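// Fans out a repository verification to every master/data node that supports it (1.4.0+),
// runs the check locally for the local node, and collects per-node failures before
// calling finishVerification once all responses have arrived.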
public void verify(
    String repository, String verificationToken, final ActionListener<VerifyResponse> listener) {
  final DiscoveryNodes discoNodes = clusterService.state().nodes();
  final DiscoveryNode localNode = discoNodes.localNode();

  final ObjectContainer<DiscoveryNode> masterAndDataNodes =
      discoNodes.masterAndDataNodes().values();
  final List<DiscoveryNode> nodes = newArrayList();
  for (ObjectCursor<DiscoveryNode> cursor : masterAndDataNodes) {
    DiscoveryNode node = cursor.value;
    Version version = node.getVersion();
    // Verification wasn't supported before v1.4.0 - no reason to send verification request to
    // these nodes
    if (version != null && version.onOrAfter(Version.V_1_4_0)) {
      nodes.add(node);
    }
  }
  final CopyOnWriteArrayList<VerificationFailure> errors = new CopyOnWriteArrayList<>();
  final AtomicInteger counter = new AtomicInteger(nodes.size());
  for (final DiscoveryNode node : nodes) {
    if (node.equals(localNode)) {
      try {
        doVerify(repository, verificationToken);
      } catch (Throwable t) {
        logger.warn("[{}] failed to verify repository", t, repository);
        errors.add(new VerificationFailure(node.id(), ExceptionsHelper.detailedMessage(t)));
      }
      if (counter.decrementAndGet() == 0) {
        finishVerification(listener, nodes, errors);
      }
    } else {
      transportService.sendRequest(
          node,
          ACTION_NAME,
          new VerifyNodeRepositoryRequest(repository, verificationToken),
          new EmptyTransportResponseHandler(ThreadPool.Names.SAME) {
            @Override
            public void handleResponse(TransportResponse.Empty response) {
              if (counter.decrementAndGet() == 0) {
                finishVerification(listener, nodes, errors);
              }
            }

            @Override
            public void handleException(TransportException exp) {
              errors.add(
                  new VerificationFailure(node.id(), ExceptionsHelper.detailedMessage(exp)));
              if (counter.decrementAndGet() == 0) {
                finishVerification(listener, nodes, errors);
              }
            }
          });
    }
  }
}
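// Master fault-detection callback: reacts to a transport-level disconnect of the current
// master, either retrying the connection (when connectOnNetworkDisconnect is set) or
// immediately reporting a master failure.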
private void handleTransportDisconnect(DiscoveryNode node) {
  synchronized (masterNodeMutex) {
    if (!node.equals(this.masterNode)) {
      return;
    }
    if (connectOnNetworkDisconnect) {
      try {
        transportService.connectToNode(node);
      } catch (Exception e) {
        logger.trace("[master] [{}] transport disconnected (with verified connect)", masterNode);
        notifyMasterFailure(masterNode, "transport disconnected (with verified connect)");
      }
    } else {
      logger.trace("[master] [{}] transport disconnected", node);
      notifyMasterFailure(node, "transport disconnected");
    }
  }
}
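// Stops zen discovery: halts pinging and fault detection, optionally notifies the master
// (or the next possible masters) that this node is leaving, then interrupts any in-flight
// join attempt.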
@Override
protected void doStop() throws ElasticsearchException {
  pingService.stop();
  masterFD.stop("zen disco stop");
  nodesFD.stop();
  initialStateSent.set(false);
  if (sendLeaveRequest) {
    if (!master && latestDiscoNodes.masterNode() != null) {
      try {
        membership.sendLeaveRequestBlocking(
            latestDiscoNodes.masterNode(), localNode, TimeValue.timeValueSeconds(1));
      } catch (Exception e) {
        logger.debug(
            "failed to send leave request to master [{}]", e, latestDiscoNodes.masterNode());
      }
    } else {
      DiscoveryNode[] possibleMasters =
          electMaster.nextPossibleMasters(latestDiscoNodes.nodes().values(), 5);
      for (DiscoveryNode possibleMaster : possibleMasters) {
        if (localNode.equals(possibleMaster)) {
          continue;
        }
        try {
          membership.sendLeaveRequest(latestDiscoNodes.masterNode(), possibleMaster);
        } catch (Exception e) {
          logger.debug(
              "failed to send leave request from master [{}] to possible master [{}]",
              e,
              latestDiscoNodes.masterNode(),
              possibleMaster);
        }
      }
    }
  }
  master = false;
  if (currentJoinThread != null) {
    try {
      currentJoinThread.interrupt();
    } catch (Exception e) {
      // ignore
    }
  }
}
@Test
// Without the 'include temporalResponses responses to nodesToConnect' improvement in
// UnicastZenPing#sendPings this test fails, because 2 nodes elect themselves as master
// and the health request times out b/c waiting_for_nodes=N can't be satisfied.
public void testMinimumMasterNodes() throws Exception {
  int currentNumNodes = randomIntBetween(3, 5);
  int currentNumOfUnicastHosts = randomIntBetween(1, currentNumNodes);
  final Settings settings =
      ImmutableSettings.settingsBuilder()
          .put("discovery.zen.minimum_master_nodes", currentNumNodes / 2 + 1)
          .build();
  discoveryConfig =
      new ClusterDiscoveryConfiguration.UnicastZen(
          currentNumNodes, currentNumOfUnicastHosts, settings);
  List<String> nodes = internalCluster().startNodesAsync(currentNumNodes).get();
  ensureGreen();

  DiscoveryNode masterDiscoNode = null;
  for (String node : nodes) {
    ClusterState state =
        internalCluster()
            .client(node)
            .admin()
            .cluster()
            .prepareState()
            .setLocal(true)
            .execute()
            .actionGet()
            .getState();
    assertThat(state.nodes().size(), equalTo(currentNumNodes));
    if (masterDiscoNode == null) {
      masterDiscoNode = state.nodes().masterNode();
    } else {
      assertThat(masterDiscoNode.equals(state.nodes().masterNode()), equalTo(true));
    }
  }
}
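// Variant of the master-disconnect handler above: after a successful reconnect it also
// restarts the master pinger so fault detection resumes against the re-established connection.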
private void handleTransportDisconnect(DiscoveryNode node) {
  synchronized (masterNodeMutex) {
    if (!node.equals(this.masterNode)) {
      return;
    }
    if (connectOnNetworkDisconnect) {
      try {
        transportService.connectToNode(node);
        // if all is well, make sure we restart the pinger
        if (masterPinger != null) {
          masterPinger.stop();
        }
        this.masterPinger = new MasterPinger();
        threadPool.schedule(pingInterval, ThreadPool.Names.SAME, masterPinger);
      } catch (Exception e) {
        logger.trace("[master] [{}] transport disconnected (with verified connect)", masterNode);
        notifyMasterFailure(masterNode, "transport disconnected (with verified connect)");
      }
    } else {
      logger.trace("[master] [{}] transport disconnected", node);
      notifyMasterFailure(node, "transport disconnected");
    }
  }
}
@Test
@TestLogging("discovery.zen:TRACE")
// The zen unicast ping override bug may rarely manifest itself; it is very timing dependent.
// Without the fix in UnicastZenPing, this test fails roughly 1 out of 10 runs from the command
// line.
public void testMasterElectionNotMissed() throws Exception {
  final Settings settings =
      settingsBuilder()
          // Failure only manifests if multicast ping is disabled!
          .put("discovery.zen.ping.multicast.ping.enabled", false)
          .put("discovery.zen.minimum_master_nodes", 2)
          // Can't use this, b/c at the moment all nodes will only ping localhost:9300
          // .put("discovery.zen.ping.unicast.hosts", "localhost")
          .put(
              "discovery.zen.ping.unicast.hosts",
              "localhost:15300,localhost:15301,localhost:15302")
          .put("transport.tcp.port", "15300-15400")
          .build();

  final CountDownLatch latch = new CountDownLatch(3);
  final AtomicArray<String> nodes = new AtomicArray<String>(3);
  Runnable r1 =
      new Runnable() {
        @Override
        public void run() {
          logger.info("--> start first node");
          nodes.set(0, cluster().startNode(settings));
          latch.countDown();
        }
      };
  new Thread(r1).start();
  sleep(between(500, 3000));
  Runnable r2 =
      new Runnable() {
        @Override
        public void run() {
          logger.info("--> start second node");
          nodes.set(1, cluster().startNode(settings));
          latch.countDown();
        }
      };
  new Thread(r2).start();
  sleep(between(500, 3000));
  Runnable r3 =
      new Runnable() {
        @Override
        public void run() {
          logger.info("--> start third node");
          nodes.set(2, cluster().startNode(settings));
          latch.countDown();
        }
      };
  new Thread(r3).start();
  latch.await();

  ClusterHealthResponse clusterHealthResponse =
      client()
          .admin()
          .cluster()
          .prepareHealth()
          .setWaitForEvents(Priority.LANGUID)
          .setWaitForNodes("3")
          .execute()
          .actionGet();
  assertThat(clusterHealthResponse.isTimedOut(), equalTo(false));

  DiscoveryNode masterDiscoNode = null;
  for (String node : nodes.toArray(new String[3])) {
    ClusterState state =
        cluster()
            .client(node)
            .admin()
            .cluster()
            .prepareState()
            .setLocal(true)
            .execute()
            .actionGet()
            .getState();
    assertThat(state.nodes().size(), equalTo(3));
    if (masterDiscoNode == null) {
      masterDiscoNode = state.nodes().masterNode();
    } else {
      assertThat(masterDiscoNode.equals(state.nodes().masterNode()), equalTo(true));
    }
  }
}
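// Pings all known nodes, filters the responses according to the client/data filter settings,
// and returns the elected master (possibly the local node). Returns null when not enough
// master-eligible nodes responded or no decision can be made yet.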
private DiscoveryNode findMaster() {
  ZenPing.PingResponse[] fullPingResponses = pingService.pingAndWait(pingTimeout);
  if (fullPingResponses == null) {
    logger.trace("No full ping responses");
    return null;
  }
  if (logger.isTraceEnabled()) {
    StringBuilder sb = new StringBuilder("full ping responses:");
    if (fullPingResponses.length == 0) {
      sb.append(" {none}");
    } else {
      for (ZenPing.PingResponse pingResponse : fullPingResponses) {
        sb.append("\n\t--> ")
            .append("target [")
            .append(pingResponse.target())
            .append("], master [")
            .append(pingResponse.master())
            .append("]");
      }
    }
    logger.trace(sb.toString());
  }

  // filter responses
  List<ZenPing.PingResponse> pingResponses = Lists.newArrayList();
  for (ZenPing.PingResponse pingResponse : fullPingResponses) {
    DiscoveryNode node = pingResponse.target();
    if (masterElectionFilterClientNodes
        && (node.clientNode() || (!node.masterNode() && !node.dataNode()))) {
      // filter out client nodes, as well as nodes that are neither data nor master
      // (effectively, clients)
    } else if (masterElectionFilterDataNodes && (!node.masterNode() && node.dataNode())) {
      // filter out data node that is not also master
    } else {
      pingResponses.add(pingResponse);
    }
  }

  if (logger.isDebugEnabled()) {
    StringBuilder sb =
        new StringBuilder("filtered ping responses: (filter_client[")
            .append(masterElectionFilterClientNodes)
            .append("], filter_data[")
            .append(masterElectionFilterDataNodes)
            .append("])");
    if (pingResponses.isEmpty()) {
      sb.append(" {none}");
    } else {
      for (ZenPing.PingResponse pingResponse : pingResponses) {
        sb.append("\n\t--> ")
            .append("target [")
            .append(pingResponse.target())
            .append("], master [")
            .append(pingResponse.master())
            .append("]");
      }
    }
    logger.debug(sb.toString());
  }

  List<DiscoveryNode> pingMasters = newArrayList();
  for (ZenPing.PingResponse pingResponse : pingResponses) {
    if (pingResponse.master() != null) {
      pingMasters.add(pingResponse.master());
    }
  }

  Set<DiscoveryNode> possibleMasterNodes = Sets.newHashSet();
  possibleMasterNodes.add(localNode);
  for (ZenPing.PingResponse pingResponse : pingResponses) {
    possibleMasterNodes.add(pingResponse.target());
  }
  // if we don't have enough master nodes, we bail; even if we get a response indicating
  // there is a master from another node, we don't see enough ourselves...
  if (!electMaster.hasEnoughMasterNodes(possibleMasterNodes)) {
    return null;
  }

  if (pingMasters.isEmpty()) {
    // let's tie break between discovered nodes
    DiscoveryNode electedMaster = electMaster.electMaster(possibleMasterNodes);
    if (localNode.equals(electedMaster)) {
      return localNode;
    }
  } else {
    DiscoveryNode electedMaster = electMaster.electMaster(pingMasters);
    if (electedMaster != null) {
      return electedMaster;
    }
  }
  return null;
}
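// Join loop: keeps looking for a master; if this node is elected it publishes itself as master
// via a cluster state update, otherwise it connects to the elected master, sends a join request,
// and starts master fault detection, retrying on any failure.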
private void innerJoinCluster() {
  boolean retry = true;
  while (retry) {
    if (lifecycle.stoppedOrClosed()) {
      return;
    }
    retry = false;
    DiscoveryNode masterNode = findMaster();
    if (masterNode == null) {
      logger.trace("no masterNode returned");
      retry = true;
      continue;
    }
    if (localNode.equals(masterNode)) {
      this.master = true;
      nodesFD.start(); // start the nodes FD
      clusterService.submitStateUpdateTask(
          "zen-disco-join (elected_as_master)",
          Priority.URGENT,
          new ProcessedClusterStateUpdateTask() {
            @Override
            public ClusterState execute(ClusterState currentState) {
              DiscoveryNodes.Builder builder =
                  new DiscoveryNodes.Builder()
                      .localNodeId(localNode.id())
                      .masterNodeId(localNode.id())
                      // put our local node
                      .put(localNode);
              // update the fact that we are the master...
              latestDiscoNodes = builder.build();
              ClusterBlocks clusterBlocks =
                  ClusterBlocks.builder()
                      .blocks(currentState.blocks())
                      .removeGlobalBlock(NO_MASTER_BLOCK)
                      .build();
              return ClusterState.builder(currentState)
                  .nodes(latestDiscoNodes)
                  .blocks(clusterBlocks)
                  .build();
            }

            @Override
            public void onFailure(String source, Throwable t) {
              logger.error("unexpected failure during [{}]", t, source);
            }

            @Override
            public void clusterStateProcessed(
                String source, ClusterState oldState, ClusterState newState) {
              sendInitialStateEventIfNeeded();
            }
          });
    } else {
      this.master = false;
      try {
        // first, make sure we can connect to the master
        transportService.connectToNode(masterNode);
      } catch (Exception e) {
        logger.warn("failed to connect to master [{}], retrying...", e, masterNode);
        retry = true;
        continue;
      }
      // send join request
      try {
        membership.sendJoinRequestBlocking(masterNode, localNode, pingTimeout);
      } catch (Exception e) {
        if (e instanceof ElasticsearchException) {
          logger.info(
              "failed to send join request to master [{}], reason [{}]",
              masterNode,
              ((ElasticsearchException) e).getDetailedMessage());
        } else {
          logger.info(
              "failed to send join request to master [{}], reason [{}]",
              masterNode,
              e.getMessage());
        }
        if (logger.isTraceEnabled()) {
          logger.trace("detailed failed reason", e);
        }
        // failed to send the join request, retry
        retry = true;
        continue;
      }
      masterFD.start(masterNode, "initial_join");
      // no need to submit the received cluster state, we will get it from the master when it
      // publishes the fact that we joined
    }
  }
}
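// Test: triggers replica recoveries whose file chunks are interfered with in flight (via the
// RecoveryCorruption transport delegate), then disables allocation so the recoveries are
// cancelled, and finally asserts that no temporary recovery.* files are left in any shard
// directory.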
@Test
public void testCancellationCleansTempFiles() throws Exception {
  final String indexName = "test";

  final String p_node = internalCluster().startNode();

  client()
      .admin()
      .indices()
      .prepareCreate(indexName)
      .setSettings(
          Settings.builder()
              .put(
                  IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1,
                  IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0))
      .get();

  internalCluster().startNodesAsync(2).get();

  List<IndexRequestBuilder> requests = new ArrayList<>();
  int numDocs = scaledRandomIntBetween(25, 250);
  for (int i = 0; i < numDocs; i++) {
    requests.add(client().prepareIndex(indexName, "type").setCreate(true).setSource("{}"));
  }
  indexRandom(true, requests);
  assertFalse(
      client()
          .admin()
          .cluster()
          .prepareHealth()
          .setWaitForNodes("3")
          .setWaitForGreenStatus()
          .get()
          .isTimedOut());
  flush();

  int allowedFailures = randomIntBetween(3, 10);
  logger.info("--> blocking recoveries from primary (allowed failures: [{}])", allowedFailures);
  CountDownLatch corruptionCount = new CountDownLatch(allowedFailures);
  ClusterService clusterService = internalCluster().getInstance(ClusterService.class, p_node);
  MockTransportService mockTransportService =
      (MockTransportService) internalCluster().getInstance(TransportService.class, p_node);
  for (DiscoveryNode node : clusterService.state().nodes()) {
    if (!node.equals(clusterService.localNode())) {
      mockTransportService.addDelegate(
          node, new RecoveryCorruption(mockTransportService.original(), corruptionCount));
    }
  }

  client()
      .admin()
      .indices()
      .prepareUpdateSettings(indexName)
      .setSettings(Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 1))
      .get();

  corruptionCount.await();

  logger.info("--> stopping replica assignment");
  assertAcked(
      client()
          .admin()
          .cluster()
          .prepareUpdateSettings()
          .setTransientSettings(
              Settings.builder()
                  .put(EnableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ENABLE, "none")));

  logger.info("--> wait for all replica shards to be removed, on all nodes");
  assertBusy(
      new Runnable() {
        @Override
        public void run() {
          for (String node : internalCluster().getNodeNames()) {
            if (node.equals(p_node)) {
              continue;
            }
            ClusterState state =
                client(node).admin().cluster().prepareState().setLocal(true).get().getState();
            assertThat(
                node + " indicates assigned replicas",
                state
                    .getRoutingTable()
                    .index(indexName)
                    .shardsWithState(ShardRoutingState.UNASSIGNED)
                    .size(),
                equalTo(1));
          }
        }
      });

  logger.info("--> verifying no temporary recoveries are left");
  for (String node : internalCluster().getNodeNames()) {
    NodeEnvironment nodeEnvironment = internalCluster().getInstance(NodeEnvironment.class, node);
    for (final Path shardLoc : nodeEnvironment.availableShardPaths(new ShardId(indexName, 0))) {
      if (Files.exists(shardLoc)) {
        assertBusy(
            new Runnable() {
              @Override
              public void run() {
                try {
                  Files.walkFileTree(
                      shardLoc,
                      new SimpleFileVisitor<Path>() {
                        @Override
                        public FileVisitResult visitFile(Path file, BasicFileAttributes attrs)
                            throws IOException {
                          assertThat(
                              "found a temporary recovery file: " + file,
                              file.getFileName().toString(),
                              not(startsWith("recovery.")));
                          return FileVisitResult.CONTINUE;
                        }
                      });
                } catch (IOException e) {
                  throw new AssertionError(
                      "failed to walk file tree starting at [" + shardLoc + "]", e);
                }
              }
            });
      }
    }
  }
}
/**
 * Process existing recoveries of replicas and see if we need to cancel them if we find a better
 * match. Today, a better match is one that has a full sync id match compared to not having one in
 * the previous recovery.
 */
public boolean processExistingRecoveries(RoutingAllocation allocation) {
  boolean changed = false;
  MetaData metaData = allocation.metaData();
  for (RoutingNodes.RoutingNodesIterator nodes = allocation.routingNodes().nodes();
      nodes.hasNext(); ) {
    nodes.next();
    for (RoutingNodes.RoutingNodeIterator it = nodes.nodeShards(); it.hasNext(); ) {
      ShardRouting shard = it.next();
      if (shard.primary() == true) {
        continue;
      }
      if (shard.initializing() == false) {
        continue;
      }
      if (shard.relocatingNodeId() != null) {
        continue;
      }
      // if we are allocating a replica because of index creation, no need to go and find a copy,
      // there isn't one...
      IndexMetaData indexMetaData = metaData.index(shard.getIndexName());
      if (shard.allocatedPostIndexCreate(indexMetaData) == false) {
        continue;
      }

      AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          shardStores = fetchData(shard, allocation);
      if (shardStores.hasData() == false) {
        logger.trace("{}: fetching new stores for initializing shard", shard);
        continue; // still fetching
      }

      ShardRouting primaryShard = allocation.routingNodes().activePrimary(shard);
      assert primaryShard != null
          : "the replica shard can be allocated on at least one node, so there must be an active primary";
      TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryStore =
          findStore(primaryShard, allocation, shardStores);
      if (primaryStore == null || primaryStore.allocated() == false) {
        // if we can't find the primary data, it is probably because the primary shard is
        // corrupted (and listing failed)
        // just let the recovery find it out, no need to do anything about it for the initializing
        // shard
        logger.trace(
            "{}: no primary shard store found or allocated, letting actual allocation figure it out",
            shard);
        continue;
      }

      MatchingNodes matchingNodes = findMatchingNodes(shard, allocation, primaryStore, shardStores);
      if (matchingNodes.getNodeWithHighestMatch() != null) {
        DiscoveryNode currentNode = allocation.nodes().get(shard.currentNodeId());
        DiscoveryNode nodeWithHighestMatch = matchingNodes.getNodeWithHighestMatch();
        if (currentNode.equals(nodeWithHighestMatch) == false
            && matchingNodes.isNodeMatchBySyncID(currentNode) == false
            && matchingNodes.isNodeMatchBySyncID(nodeWithHighestMatch) == true) {
          // we found a better match that has a full sync id match, the existing allocation is not
          // fully synced, so we found a better one, cancel this one
          it.moveToUnassigned(
              new UnassignedInfo(
                  UnassignedInfo.Reason.REALLOCATED_REPLICA,
                  "existing allocation of replica to ["
                      + currentNode
                      + "] cancelled, sync id match found on node ["
                      + nodeWithHighestMatch
                      + "]",
                  null,
                  allocation.getCurrentNanoTime(),
                  System.currentTimeMillis()));
          changed = true;
        }
      }
    }
  }
  return changed;
}