    @Override
    public void onNodeAck(DiscoveryNode node, @Nullable Exception e) {
      if (!ackedTaskListener.mustAck(node)) {
        // we always wait for the master ack anyway
        if (!node.equals(nodes.getMasterNode())) {
          return;
        }
      }
      if (e == null) {
        logger.trace(
            "ack received from node [{}], cluster_state update (version: {})",
            node,
            clusterStateVersion);
      } else {
        this.lastFailure = e;
        logger.debug(
            (Supplier<?>)
                () ->
                    new ParameterizedMessage(
                        "ack received from node [{}], cluster_state update (version: {})",
                        node,
                        clusterStateVersion),
            e);
      }

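      // once every node we are waiting on has responded (successfully or not), cancel the ack
      // timeout and notify the listener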
      if (countDown.countDown()) {
        logger.trace(
            "all expected nodes acknowledged cluster_state update (version: {})",
            clusterStateVersion);
        FutureUtils.cancel(ackTimeoutCallback);
        ackedTaskListener.onAllNodesAcked(lastFailure);
      }
    }
  public void verify(
      String repository, String verificationToken, final ActionListener<VerifyResponse> listener) {
    final DiscoveryNodes discoNodes = clusterService.state().nodes();
    final DiscoveryNode localNode = discoNodes.localNode();

    final ObjectContainer<DiscoveryNode> masterAndDataNodes =
        discoNodes.masterAndDataNodes().values();
    final List<DiscoveryNode> nodes = newArrayList();
    for (ObjectCursor<DiscoveryNode> cursor : masterAndDataNodes) {
      DiscoveryNode node = cursor.value;
      Version version = node.getVersion();
      // Verification wasn't supported before v1.4.0 - no reason to send a verification request
      // to these nodes
      if (version != null && version.onOrAfter(Version.V_1_4_0)) {
        nodes.add(node);
      }
    }
    final CopyOnWriteArrayList<VerificationFailure> errors = new CopyOnWriteArrayList<>();
    final AtomicInteger counter = new AtomicInteger(nodes.size());
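    // verify on every eligible node: run the check locally for this node, fan out a transport
    // request to the others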
    for (final DiscoveryNode node : nodes) {
      if (node.equals(localNode)) {
        try {
          doVerify(repository, verificationToken);
        } catch (Throwable t) {
          logger.warn("[{}] failed to verify repository", t, repository);
          errors.add(new VerificationFailure(node.id(), ExceptionsHelper.detailedMessage(t)));
        }
        if (counter.decrementAndGet() == 0) {
          finishVerification(listener, nodes, errors);
        }
      } else {
        transportService.sendRequest(
            node,
            ACTION_NAME,
            new VerifyNodeRepositoryRequest(repository, verificationToken),
            new EmptyTransportResponseHandler(ThreadPool.Names.SAME) {
              @Override
              public void handleResponse(TransportResponse.Empty response) {
                if (counter.decrementAndGet() == 0) {
                  finishVerification(listener, nodes, errors);
                }
              }

              @Override
              public void handleException(TransportException exp) {
                errors.add(
                    new VerificationFailure(node.id(), ExceptionsHelper.detailedMessage(exp)));
                if (counter.decrementAndGet() == 0) {
                  finishVerification(listener, nodes, errors);
                }
              }
            });
      }
    }
  }
 private void handleTransportDisconnect(DiscoveryNode node) {
   synchronized (masterNodeMutex) {
     if (!node.equals(this.masterNode)) {
       return;
     }
     if (connectOnNetworkDisconnect) {
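       // try to re-establish the connection first; only report the master as failed if that fails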
       try {
         transportService.connectToNode(node);
       } catch (Exception e) {
         logger.trace("[master] [{}] transport disconnected (with verified connect)", masterNode);
         notifyMasterFailure(masterNode, "transport disconnected (with verified connect)");
       }
     } else {
       logger.trace("[master] [{}] transport disconnected", node);
       notifyMasterFailure(node, "transport disconnected");
     }
   }
 }
 @Override
 protected void doStop() throws ElasticsearchException {
   pingService.stop();
   masterFD.stop("zen disco stop");
   nodesFD.stop();
   initialStateSent.set(false);
   if (sendLeaveRequest) {
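     // not the master: send a leave request to the current master; otherwise notify the possible
     // next masters that we are leaving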
     if (!master && latestDiscoNodes.masterNode() != null) {
       try {
         membership.sendLeaveRequestBlocking(
             latestDiscoNodes.masterNode(), localNode, TimeValue.timeValueSeconds(1));
       } catch (Exception e) {
         logger.debug(
             "failed to send leave request to master [{}]", e, latestDiscoNodes.masterNode());
       }
     } else {
       DiscoveryNode[] possibleMasters =
           electMaster.nextPossibleMasters(latestDiscoNodes.nodes().values(), 5);
       for (DiscoveryNode possibleMaster : possibleMasters) {
         if (localNode.equals(possibleMaster)) {
           continue;
         }
         try {
           membership.sendLeaveRequest(latestDiscoNodes.masterNode(), possibleMaster);
         } catch (Exception e) {
           logger.debug(
               "failed to send leave request from master [{}] to possible master [{}]",
               e,
               latestDiscoNodes.masterNode(),
               possibleMaster);
         }
       }
     }
   }
   master = false;
   if (currentJoinThread != null) {
     try {
       currentJoinThread.interrupt();
     } catch (Exception e) {
       // ignore
     }
   }
 }
  @Test
  // Without the 'include temporalResponses responses to nodesToConnect' improvement in
  // UnicastZenPing#sendPings this test fails, because 2 nodes elect themselves as master and the
  // health request times out b/c waiting_for_nodes=N can't be satisfied.
  public void testMinimumMasterNodes() throws Exception {
    int currentNumNodes = randomIntBetween(3, 5);
    int currentNumOfUnicastHosts = randomIntBetween(1, currentNumNodes);
    final Settings settings =
        ImmutableSettings.settingsBuilder()
            .put("discovery.zen.minimum_master_nodes", currentNumNodes / 2 + 1)
            .build();
    discoveryConfig =
        new ClusterDiscoveryConfiguration.UnicastZen(
            currentNumNodes, currentNumOfUnicastHosts, settings);

    List<String> nodes = internalCluster().startNodesAsync(currentNumNodes).get();

    ensureGreen();

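    // every node should see all currentNumNodes nodes and agree on the same elected master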
    DiscoveryNode masterDiscoNode = null;
    for (String node : nodes) {
      ClusterState state =
          internalCluster()
              .client(node)
              .admin()
              .cluster()
              .prepareState()
              .setLocal(true)
              .execute()
              .actionGet()
              .getState();
      assertThat(state.nodes().size(), equalTo(currentNumNodes));
      if (masterDiscoNode == null) {
        masterDiscoNode = state.nodes().masterNode();
      } else {
        assertThat(masterDiscoNode.equals(state.nodes().masterNode()), equalTo(true));
      }
    }
  }
 private void handleTransportDisconnect(DiscoveryNode node) {
   synchronized (masterNodeMutex) {
     if (!node.equals(this.masterNode)) {
       return;
     }
     if (connectOnNetworkDisconnect) {
       try {
         transportService.connectToNode(node);
         // if all is well, make sure we restart the pinger
         if (masterPinger != null) {
           masterPinger.stop();
         }
         this.masterPinger = new MasterPinger();
         threadPool.schedule(pingInterval, ThreadPool.Names.SAME, masterPinger);
       } catch (Exception e) {
         logger.trace("[master] [{}] transport disconnected (with verified connect)", masterNode);
         notifyMasterFailure(masterNode, "transport disconnected (with verified connect)");
       }
     } else {
       logger.trace("[master] [{}] transport disconnected", node);
       notifyMasterFailure(node, "transport disconnected");
     }
   }
 }
  @Test
  @TestLogging("discovery.zen:TRACE")
  // The zen unicast ping override bug may rarely manifest itself; it is very timing dependent.
  // Without the fix in UnicastZenPing, this test fails roughly 1 out of 10 runs from the command
  // line.
  public void testMasterElectionNotMissed() throws Exception {
    final Settings settings =
        settingsBuilder()
            // Failure only manifests if multicast ping is disabled!
            .put("discovery.zen.ping.multicast.ping.enabled", false)
            .put("discovery.zen.minimum_master_nodes", 2)
            // Can't use this, b/c at the moment all nodes will only ping localhost:9300
            //                .put("discovery.zen.ping.unicast.hosts", "localhost")
            .put(
                "discovery.zen.ping.unicast.hosts",
                "localhost:15300,localhost:15301,localhost:15302")
            .put("transport.tcp.port", "15300-15400")
            .build();

    final CountDownLatch latch = new CountDownLatch(3);
    final AtomicArray<String> nodes = new AtomicArray<String>(3);
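    // start the three nodes from separate threads, with randomized delays between the starts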
    Runnable r1 =
        new Runnable() {

          @Override
          public void run() {
            logger.info("--> start first node");
            nodes.set(0, cluster().startNode(settings));
            latch.countDown();
          }
        };
    new Thread(r1).start();

    sleep(between(500, 3000));
    Runnable r2 =
        new Runnable() {

          @Override
          public void run() {
            logger.info("--> start second node");
            nodes.set(1, cluster().startNode(settings));
            latch.countDown();
          }
        };
    new Thread(r2).start();

    sleep(between(500, 3000));
    Runnable r3 =
        new Runnable() {

          @Override
          public void run() {
            logger.info("--> start third node");
            nodes.set(2, cluster().startNode(settings));
            latch.countDown();
          }
        };
    new Thread(r3).start();
    latch.await();

    ClusterHealthResponse clusterHealthResponse =
        client()
            .admin()
            .cluster()
            .prepareHealth()
            .setWaitForEvents(Priority.LANGUID)
            .setWaitForNodes("3")
            .execute()
            .actionGet();
    assertThat(clusterHealthResponse.isTimedOut(), equalTo(false));

    DiscoveryNode masterDiscoNode = null;
    for (String node : nodes.toArray(new String[3])) {
      ClusterState state =
          cluster()
              .client(node)
              .admin()
              .cluster()
              .prepareState()
              .setLocal(true)
              .execute()
              .actionGet()
              .getState();
      assertThat(state.nodes().size(), equalTo(3));
      if (masterDiscoNode == null) {
        masterDiscoNode = state.nodes().masterNode();
      } else {
        assertThat(masterDiscoNode.equals(state.nodes().masterNode()), equalTo(true));
      }
    }
  }
  private DiscoveryNode findMaster() {
    ZenPing.PingResponse[] fullPingResponses = pingService.pingAndWait(pingTimeout);
    if (fullPingResponses == null) {
      logger.trace("No full ping responses");
      return null;
    }
    if (logger.isTraceEnabled()) {
      StringBuilder sb = new StringBuilder("full ping responses:");
      if (fullPingResponses.length == 0) {
        sb.append(" {none}");
      } else {
        for (ZenPing.PingResponse pingResponse : fullPingResponses) {
          sb.append("\n\t--> ")
              .append("target [")
              .append(pingResponse.target())
              .append("], master [")
              .append(pingResponse.master())
              .append("]");
        }
      }
      logger.trace(sb.toString());
    }

    // filter responses
    List<ZenPing.PingResponse> pingResponses = Lists.newArrayList();
    for (ZenPing.PingResponse pingResponse : fullPingResponses) {
      DiscoveryNode node = pingResponse.target();
      if (masterElectionFilterClientNodes
          && (node.clientNode() || (!node.masterNode() && !node.dataNode()))) {
        // filter out client nodes, and nodes that are neither data nor master (effectively,
        // clients)
      } else if (masterElectionFilterDataNodes && (!node.masterNode() && node.dataNode())) {
        // filter out data nodes that are not also master nodes
      } else {
        pingResponses.add(pingResponse);
      }
    }

    if (logger.isDebugEnabled()) {
      StringBuilder sb =
          new StringBuilder("filtered ping responses: (filter_client[")
              .append(masterElectionFilterClientNodes)
              .append("], filter_data[")
              .append(masterElectionFilterDataNodes)
              .append("])");
      if (pingResponses.isEmpty()) {
        sb.append(" {none}");
      } else {
        for (ZenPing.PingResponse pingResponse : pingResponses) {
          sb.append("\n\t--> ")
              .append("target [")
              .append(pingResponse.target())
              .append("], master [")
              .append(pingResponse.master())
              .append("]");
        }
      }
      logger.debug(sb.toString());
    }
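    // collect the masters that the responding nodes already believe in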
    List<DiscoveryNode> pingMasters = newArrayList();
    for (ZenPing.PingResponse pingResponse : pingResponses) {
      if (pingResponse.master() != null) {
        pingMasters.add(pingResponse.master());
      }
    }

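    // the election candidates are every node we heard back from, plus ourselves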
    Set<DiscoveryNode> possibleMasterNodes = Sets.newHashSet();
    possibleMasterNodes.add(localNode);
    for (ZenPing.PingResponse pingResponse : pingResponses) {
      possibleMasterNodes.add(pingResponse.target());
    }
    // if we don't have enough master nodes, we bail; even if some response indicates that another
    // node sees a master, we ourselves don't see enough nodes
    if (!electMaster.hasEnoughMasterNodes(possibleMasterNodes)) {
      return null;
    }

    if (pingMasters.isEmpty()) {
      // let's tie-break between the discovered nodes
      DiscoveryNode electedMaster = electMaster.electMaster(possibleMasterNodes);
      if (localNode.equals(electedMaster)) {
        return localNode;
      }
    } else {
      DiscoveryNode electedMaster = electMaster.electMaster(pingMasters);
      if (electedMaster != null) {
        return electedMaster;
      }
    }
    return null;
  }
  private void innerJoinCluster() {
    boolean retry = true;
    while (retry) {
      if (lifecycle.stoppedOrClosed()) {
        return;
      }
      retry = false;
      DiscoveryNode masterNode = findMaster();
      if (masterNode == null) {
        logger.trace("no masterNode returned");
        retry = true;
        continue;
      }
      if (localNode.equals(masterNode)) {
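        // we were elected master: start the nodes fault detection and publish a cluster state
        // that names us master and removes the no-master block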
        this.master = true;
        nodesFD.start(); // start the nodes FD
        clusterService.submitStateUpdateTask(
            "zen-disco-join (elected_as_master)",
            Priority.URGENT,
            new ProcessedClusterStateUpdateTask() {
              @Override
              public ClusterState execute(ClusterState currentState) {
                DiscoveryNodes.Builder builder =
                    new DiscoveryNodes.Builder()
                        .localNodeId(localNode.id())
                        .masterNodeId(localNode.id())
                        // put our local node
                        .put(localNode);
                // update the fact that we are the master...
                latestDiscoNodes = builder.build();
                ClusterBlocks clusterBlocks =
                    ClusterBlocks.builder()
                        .blocks(currentState.blocks())
                        .removeGlobalBlock(NO_MASTER_BLOCK)
                        .build();
                return ClusterState.builder(currentState)
                    .nodes(latestDiscoNodes)
                    .blocks(clusterBlocks)
                    .build();
              }

              @Override
              public void onFailure(String source, Throwable t) {
                logger.error("unexpected failure during [{}]", t, source);
              }

              @Override
              public void clusterStateProcessed(
                  String source, ClusterState oldState, ClusterState newState) {
                sendInitialStateEventIfNeeded();
              }
            });
      } else {
        this.master = false;
        try {
          // first, make sure we can connect to the master
          transportService.connectToNode(masterNode);
        } catch (Exception e) {
          logger.warn("failed to connect to master [{}], retrying...", e, masterNode);
          retry = true;
          continue;
        }
        // send join request
        try {
          membership.sendJoinRequestBlocking(masterNode, localNode, pingTimeout);
        } catch (Exception e) {
          if (e instanceof ElasticsearchException) {
            logger.info(
                "failed to send join request to master [{}], reason [{}]",
                masterNode,
                ((ElasticsearchException) e).getDetailedMessage());
          } else {
            logger.info(
                "failed to send join request to master [{}], reason [{}]",
                masterNode,
                e.getMessage());
          }
          if (logger.isTraceEnabled()) {
            logger.trace("detailed failed reason", e);
          }
          // failed to send the join request, retry
          retry = true;
          continue;
        }
        masterFD.start(masterNode, "initial_join");
        // no need to submit the received cluster state, we will get it from the master when it
        // publishes the fact that we joined
      }
    }
  }
  @Test
  public void testCancellationCleansTempFiles() throws Exception {
    final String indexName = "test";

    final String p_node = internalCluster().startNode();

    client()
        .admin()
        .indices()
        .prepareCreate(indexName)
        .setSettings(
            Settings.builder()
                .put(
                    IndexMetaData.SETTING_NUMBER_OF_SHARDS,
                    1,
                    IndexMetaData.SETTING_NUMBER_OF_REPLICAS,
                    0))
        .get();

    internalCluster().startNodesAsync(2).get();

    List<IndexRequestBuilder> requests = new ArrayList<>();
    int numDocs = scaledRandomIntBetween(25, 250);
    for (int i = 0; i < numDocs; i++) {
      requests.add(client().prepareIndex(indexName, "type").setCreate(true).setSource("{}"));
    }
    indexRandom(true, requests);
    assertFalse(
        client()
            .admin()
            .cluster()
            .prepareHealth()
            .setWaitForNodes("3")
            .setWaitForGreenStatus()
            .get()
            .isTimedOut());
    flush();

    int allowedFailures = randomIntBetween(3, 10);
    logger.info("--> blocking recoveries from primary (allowed failures: [{}])", allowedFailures);
    CountDownLatch corruptionCount = new CountDownLatch(allowedFailures);
    ClusterService clusterService = internalCluster().getInstance(ClusterService.class, p_node);
    MockTransportService mockTransportService =
        (MockTransportService) internalCluster().getInstance(TransportService.class, p_node);
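    // wrap outgoing transport to every other node so recovery traffic from the primary can be
    // intercepted and corrupted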
    for (DiscoveryNode node : clusterService.state().nodes()) {
      if (!node.equals(clusterService.localNode())) {
        mockTransportService.addDelegate(
            node, new RecoveryCorruption(mockTransportService.original(), corruptionCount));
      }
    }

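    // raise the replica count to trigger peer recoveries from the primary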
    client()
        .admin()
        .indices()
        .prepareUpdateSettings(indexName)
        .setSettings(Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 1))
        .get();

    corruptionCount.await();

    logger.info("--> stopping replica assignment");
    assertAcked(
        client()
            .admin()
            .cluster()
            .prepareUpdateSettings()
            .setTransientSettings(
                Settings.builder()
                    .put(EnableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ENABLE, "none")));

    logger.info("--> wait for all replica shards to be removed, on all nodes");
    assertBusy(
        new Runnable() {
          @Override
          public void run() {
            for (String node : internalCluster().getNodeNames()) {
              if (node.equals(p_node)) {
                continue;
              }
              ClusterState state =
                  client(node).admin().cluster().prepareState().setLocal(true).get().getState();
              assertThat(
                  node + " indicates assigned replicas",
                  state
                      .getRoutingTable()
                      .index(indexName)
                      .shardsWithState(ShardRoutingState.UNASSIGNED)
                      .size(),
                  equalTo(1));
            }
          }
        });

    logger.info("--> verifying no temporary recoveries are left");
    for (String node : internalCluster().getNodeNames()) {
      NodeEnvironment nodeEnvironment = internalCluster().getInstance(NodeEnvironment.class, node);
      for (final Path shardLoc : nodeEnvironment.availableShardPaths(new ShardId(indexName, 0))) {
        if (Files.exists(shardLoc)) {
          assertBusy(
              new Runnable() {
                @Override
                public void run() {
                  try {
                    Files.walkFileTree(
                        shardLoc,
                        new SimpleFileVisitor<Path>() {
                          @Override
                          public FileVisitResult visitFile(Path file, BasicFileAttributes attrs)
                              throws IOException {
                            assertThat(
                                "found a temporary recovery file: " + file,
                                file.getFileName().toString(),
                                not(startsWith("recovery.")));
                            return FileVisitResult.CONTINUE;
                          }
                        });
                  } catch (IOException e) {
                    throw new AssertionError(
                        "failed to walk file tree starting at [" + shardLoc + "]", e);
                  }
                }
              });
        }
      }
    }
  }
  /**
   * Process existing recoveries of replicas and see if we need to cancel them if we find a better
   * match. Today, a better match is one that has a full sync id match, compared to not having one
   * in the previous recovery.
   */
  public boolean processExistingRecoveries(RoutingAllocation allocation) {
    boolean changed = false;
    MetaData metaData = allocation.metaData();
    for (RoutingNodes.RoutingNodesIterator nodes = allocation.routingNodes().nodes();
        nodes.hasNext(); ) {
      nodes.next();
      for (RoutingNodes.RoutingNodeIterator it = nodes.nodeShards(); it.hasNext(); ) {
        ShardRouting shard = it.next();
        if (shard.primary() == true) {
          continue;
        }
        if (shard.initializing() == false) {
          continue;
        }
        if (shard.relocatingNodeId() != null) {
          continue;
        }

        // if we are allocating a replica because of index creation, no need to go and find a copy,
        // there isn't one...
        IndexMetaData indexMetaData = metaData.index(shard.getIndexName());
        if (shard.allocatedPostIndexCreate(indexMetaData) == false) {
          continue;
        }

        AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
            shardStores = fetchData(shard, allocation);
        if (shardStores.hasData() == false) {
          logger.trace("{}: fetching new stores for initializing shard", shard);
          continue; // still fetching
        }

        ShardRouting primaryShard = allocation.routingNodes().activePrimary(shard);
        assert primaryShard != null
            : "the replica shard can be allocated on at least one node, so there must be an active primary";
        TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryStore =
            findStore(primaryShard, allocation, shardStores);
        if (primaryStore == null || primaryStore.allocated() == false) {
          // if we can't find the primary data, it is probably because the primary shard is
          // corrupted (and listing failed); just let the recovery find it out, no need to do
          // anything about it for the initializing shard
          logger.trace(
              "{}: no primary shard store found or allocated, letting actual allocation figure it out",
              shard);
          continue;
        }

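        // find the node whose existing copy best matches the primary store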
        MatchingNodes matchingNodes =
            findMatchingNodes(shard, allocation, primaryStore, shardStores);
        if (matchingNodes.getNodeWithHighestMatch() != null) {
          DiscoveryNode currentNode = allocation.nodes().get(shard.currentNodeId());
          DiscoveryNode nodeWithHighestMatch = matchingNodes.getNodeWithHighestMatch();
          if (currentNode.equals(nodeWithHighestMatch) == false
              && matchingNodes.isNodeMatchBySyncID(currentNode) == false
              && matchingNodes.isNodeMatchBySyncID(nodeWithHighestMatch) == true) {
            // we found a better match that has a full sync id match, while the existing allocation
            // is not fully synced, so cancel the existing recovery in favor of the better match
            it.moveToUnassigned(
                new UnassignedInfo(
                    UnassignedInfo.Reason.REALLOCATED_REPLICA,
                    "existing allocation of replica to ["
                        + currentNode
                        + "] cancelled, sync id match found on node ["
                        + nodeWithHighestMatch
                        + "]",
                    null,
                    allocation.getCurrentNanoTime(),
                    System.currentTimeMillis()));
            changed = true;
          }
        }
      }
    }
    return changed;
  }