예제 #1
0
  /**
   * Starts a three-node cluster (master + "blue" + "red" data nodes), indexes documents on the
   * blue node only, then injects a network fault (randomly either dropping requests or breaking
   * the connection) on one randomly chosen peer-recovery transport action while a replica is
   * recovered onto the red node. Once the fault has triggered at least once, the disruption rules
   * are cleared and the test verifies that the recovery retries succeed and that all documents
   * are searchable locally on the red node.
   */
  public void testDisconnectsWhileRecovering() throws Exception {
    final String indexName = "test";
    final Settings nodeSettings =
        Settings.builder()
            .put(RecoverySettings.INDICES_RECOVERY_RETRY_DELAY_NETWORK_SETTING.getKey(), "100ms")
            .put(RecoverySettings.INDICES_RECOVERY_INTERNAL_ACTION_TIMEOUT_SETTING.getKey(), "1s")
            .put(
                MockFSDirectoryService.RANDOM_PREVENT_DOUBLE_WRITE_SETTING.getKey(),
                false) // restarted recoveries will delete temp files and write them again
            .build();
    // start a master node
    internalCluster().startNode(nodeSettings);

    final String blueNodeName =
        internalCluster()
            .startNode(Settings.builder().put("node.attr.color", "blue").put(nodeSettings).build());
    final String redNodeName =
        internalCluster()
            .startNode(Settings.builder().put("node.attr.color", "red").put(nodeSettings).build());

    ClusterHealthResponse response =
        client().admin().cluster().prepareHealth().setWaitForNodes(">=3").get();
    assertThat(response.isTimedOut(), is(false));

    // create the index with its single shard pinned to the blue node (no replicas yet)
    client()
        .admin()
        .indices()
        .prepareCreate(indexName)
        .setSettings(
            Settings.builder()
                .put(IndexMetaData.INDEX_ROUTING_INCLUDE_GROUP_SETTING.getKey() + "color", "blue")
                .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
                .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0))
        .get();

    List<IndexRequestBuilder> requests = new ArrayList<>();
    int numDocs = scaledRandomIntBetween(25, 250);
    for (int i = 0; i < numDocs; i++) {
      requests.add(client().prepareIndex(indexName, "type").setSource("{}"));
    }
    indexRandom(true, requests);
    ensureSearchable(indexName);

    ClusterStateResponse stateResponse = client().admin().cluster().prepareState().get();
    final String blueNodeId =
        internalCluster().getInstance(ClusterService.class, blueNodeName).localNode().getId();

    // sanity check: the shard must actually be allocated on the blue node
    assertFalse(stateResponse.getState().getRoutingNodes().node(blueNodeId).isEmpty());

    SearchResponse searchResponse = client().prepareSearch(indexName).get();
    assertHitCount(searchResponse, numDocs);

    // candidate recovery transport actions to disrupt; one is picked at random below
    String[] recoveryActions =
        new String[] {
          PeerRecoverySourceService.Actions.START_RECOVERY,
          PeerRecoveryTargetService.Actions.FILES_INFO,
          PeerRecoveryTargetService.Actions.FILE_CHUNK,
          PeerRecoveryTargetService.Actions.CLEAN_FILES,
          // RecoveryTarget.Actions.TRANSLOG_OPS, <-- may not be sent if already flushed
          PeerRecoveryTargetService.Actions.PREPARE_TRANSLOG,
          PeerRecoveryTargetService.Actions.FINALIZE
        };
    final String recoveryActionToBlock = randomFrom(recoveryActions);
    final boolean dropRequests = randomBoolean();
    logger.info(
        "--> will {} between blue & red on [{}]",
        dropRequests ? "drop requests" : "break connection",
        recoveryActionToBlock);

    MockTransportService blueMockTransportService =
        (MockTransportService) internalCluster().getInstance(TransportService.class, blueNodeName);
    MockTransportService redMockTransportService =
        (MockTransportService) internalCluster().getInstance(TransportService.class, redNodeName);
    TransportService redTransportService =
        internalCluster().getInstance(TransportService.class, redNodeName);
    TransportService blueTransportService =
        internalCluster().getInstance(TransportService.class, blueNodeName);
    final CountDownLatch requestBlocked = new CountDownLatch(1);

    // install the fault in both directions so it triggers regardless of which side sends the
    // chosen action (recovery traffic flows blue->red and red->blue)
    blueMockTransportService.addDelegate(
        redTransportService,
        new RecoveryActionBlocker(
            dropRequests,
            recoveryActionToBlock,
            blueMockTransportService.original(),
            requestBlocked));
    redMockTransportService.addDelegate(
        blueTransportService,
        new RecoveryActionBlocker(
            dropRequests,
            recoveryActionToBlock,
            redMockTransportService.original(),
            requestBlocked));

    logger.info("--> starting recovery from blue to red");
    // allow allocation on red and add a replica, which kicks off a peer recovery
    client()
        .admin()
        .indices()
        .prepareUpdateSettings(indexName)
        .setSettings(
            Settings.builder()
                .put(
                    IndexMetaData.INDEX_ROUTING_INCLUDE_GROUP_SETTING.getKey() + "color",
                    "red,blue")
                .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 1))
        .get();

    // block until the fault has actually intercepted the chosen action at least once
    requestBlocked.await();

    logger.info("--> stopping to block recovery");
    blueMockTransportService.clearAllRules();
    redMockTransportService.clearAllRules();

    // recovery should now retry and complete; all docs must be present locally on the red node
    ensureGreen();
    searchResponse = client(redNodeName).prepareSearch(indexName).setPreference("_local").get();
    assertHitCount(searchResponse, numDocs);
  }
  /**
   * Tests corruption that happens on the network layer and that the primary does not get affected
   * by corruption that happens on the way to the replica. The file on disk stays uncorrupted
   */
  public void testCorruptionOnNetworkLayer() throws ExecutionException, InterruptedException {
    int numDocs = scaledRandomIntBetween(100, 1000);
    internalCluster().ensureAtLeastNumDataNodes(2);
    if (cluster().numDataNodes() < 3) {
      // make sure a third data-only node exists so the replica has somewhere healthy to go
      internalCluster()
          .startNode(
              Settings.builder()
                  .put(Node.NODE_DATA_SETTING.getKey(), true)
                  .put(Node.NODE_MASTER_SETTING.getKey(), false));
    }
    NodesStatsResponse nodeStats = client().admin().cluster().prepareNodesStats().get();
    List<NodeStats> dataNodeStats = new ArrayList<>();
    for (NodeStats stat : nodeStats.getNodes()) {
      if (stat.getNode().isDataNode()) {
        dataNodeStats.add(stat);
      }
    }

    assertThat(dataNodeStats.size(), greaterThanOrEqualTo(2));
    Collections.shuffle(dataNodeStats, random());
    // one node hosts all primaries; the "unlucky" node receives corrupted file chunks
    NodeStats primariesNode = dataNodeStats.get(0);
    NodeStats unluckyNode = dataNodeStats.get(1);

    assertAcked(
        prepareCreate("test")
            .setSettings(
                Settings.builder()
                    .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "0")
                    .put(
                        IndexMetaData.SETTING_NUMBER_OF_SHARDS,
                        between(1, 4)) // don't go crazy here it must recovery fast
                    // This does corrupt files on the replica, so we can't check:
                    .put(MockFSIndexStore.INDEX_CHECK_INDEX_ON_CLOSE_SETTING.getKey(), false)
                    .put(
                        "index.routing.allocation.include._name", primariesNode.getNode().getName())
                    .put(
                        EnableAllocationDecider.INDEX_ROUTING_REBALANCE_ENABLE_SETTING.getKey(),
                        EnableAllocationDecider.Rebalance.NONE)));
    ensureGreen();
    IndexRequestBuilder[] builders = new IndexRequestBuilder[numDocs];
    for (int i = 0; i < builders.length; i++) {
      builders[i] = client().prepareIndex("test", "type").setSource("field", "value");
    }
    indexRandom(true, builders);
    ensureGreen();
    assertAllSuccessful(
        client()
            .admin()
            .indices()
            .prepareFlush()
            .setForce(true)
            .setWaitIfOngoing(true)
            .execute()
            .actionGet());
    // we have to flush at least once here since we don't corrupt the translog
    SearchResponse countResponse = client().prepareSearch().setSize(0).get();
    assertHitCount(countResponse, numDocs);
    // randomly either truncate each file chunk or flip a single byte in it
    final boolean truncate = randomBoolean();
    for (NodeStats dataNode : dataNodeStats) {
      // corrupt FILE_CHUNK requests sent from any data node towards the unlucky node
      MockTransportService mockTransportService =
          ((MockTransportService)
              internalCluster().getInstance(TransportService.class, dataNode.getNode().getName()));
      mockTransportService.addDelegate(
          internalCluster().getInstance(TransportService.class, unluckyNode.getNode().getName()),
          new MockTransportService.DelegateTransport(mockTransportService.original()) {

            @Override
            public void sendRequest(
                DiscoveryNode node,
                long requestId,
                String action,
                TransportRequest request,
                TransportRequestOptions options)
                throws IOException, TransportException {
              if (action.equals(RecoveryTargetService.Actions.FILE_CHUNK)) {
                RecoveryFileChunkRequest req = (RecoveryFileChunkRequest) request;
                if (truncate && req.length() > 1) {
                  // drop the last byte of the chunk, rebuilding the request around the
                  // shortened content
                  BytesRef bytesRef = req.content().toBytesRef();
                  BytesArray array =
                      new BytesArray(bytesRef.bytes, bytesRef.offset, (int) req.length() - 1);
                  request =
                      new RecoveryFileChunkRequest(
                          req.recoveryId(),
                          req.shardId(),
                          req.metadata(),
                          req.position(),
                          array,
                          req.lastChunk(),
                          req.totalTranslogOps(),
                          req.sourceThrottleTimeInNanos());
                } else {
                  assert req.content().toBytesRef().bytes == req.content().toBytesRef().bytes
                      : "no internal reference!!";
                  final byte[] array = req.content().toBytesRef().bytes;
                  int i = randomIntBetween(0, req.content().length() - 1);
                  array[i] = (byte) ~array[i]; // flip one byte in the content
                }
              }
              super.sendRequest(node, requestId, action, request, options);
            }
          });
    }

    // add a replica and open allocation to every node; recovery towards the unlucky node
    // will keep failing because of the corrupted chunks
    Settings build =
        Settings.builder()
            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "1")
            .put("index.routing.allocation.include._name", "*")
            .build();
    client().admin().indices().prepareUpdateSettings("test").setSettings(build).get();
    client().admin().cluster().prepareReroute().get();
    ClusterHealthResponse actionGet =
        client()
            .admin()
            .cluster()
            .health(Requests.clusterHealthRequest("test").waitForGreenStatus())
            .actionGet();
    if (actionGet.isTimedOut()) {
      logger.info(
          "ensureGreen timed out, cluster state:\n{}\n{}",
          client().admin().cluster().prepareState().get().getState().prettyPrint(),
          client().admin().cluster().preparePendingClusterTasks().get().prettyPrint());
      assertThat("timed out waiting for green state", actionGet.isTimedOut(), equalTo(false));
    }
    // we are green so primaries got not corrupted.
    // ensure that no shard is actually allocated on the unlucky node
    ClusterStateResponse clusterStateResponse = client().admin().cluster().prepareState().get();
    for (IndexShardRoutingTable table :
        clusterStateResponse.getState().getRoutingTable().index("test")) {
      for (ShardRouting routing : table) {
        if (unluckyNode.getNode().getId().equals(routing.currentNodeId())) {
          assertThat(routing.state(), not(equalTo(ShardRoutingState.STARTED)));
          assertThat(routing.state(), not(equalTo(ShardRoutingState.RELOCATING)));
        }
      }
    }
    // search repeatedly so that different shard copies are hit; every copy must be intact
    final int numIterations = scaledRandomIntBetween(5, 20);
    for (int i = 0; i < numIterations; i++) {
      SearchResponse response = client().prepareSearch().setSize(numDocs).get();
      assertHitCount(response, numDocs);
    }
  }
  /**
   * This test triggers a corrupt index exception during finalization size if an empty commit point
   * is transferred during recovery we don't know the version of the segments_N file because it has
   * no segments we can take it from. This simulates recoveries from old indices or even without
   * checksums and makes sure if we fail during finalization we also check if the primary is ok.
   * Without the relevant checks this test fails with a RED cluster
   */
  public void testCorruptionOnNetworkLayerFinalizingRecovery()
      throws ExecutionException, InterruptedException, IOException {
    internalCluster().ensureAtLeastNumDataNodes(2);
    NodesStatsResponse nodeStats = client().admin().cluster().prepareNodesStats().get();
    List<NodeStats> dataNodeStats = new ArrayList<>();
    for (NodeStats stat : nodeStats.getNodes()) {
      if (stat.getNode().isDataNode()) {
        dataNodeStats.add(stat);
      }
    }

    assertThat(dataNodeStats.size(), greaterThanOrEqualTo(2));
    Collections.shuffle(dataNodeStats, random());
    // primary stays on one node; chunks sent to the unlucky node are corrupted below
    NodeStats primariesNode = dataNodeStats.get(0);
    NodeStats unluckyNode = dataNodeStats.get(1);
    assertAcked(
        prepareCreate("test")
            .setSettings(
                Settings.builder()
                    .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "0")
                    .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
                    .put(
                        "index.routing.allocation.include._name", primariesNode.getNode().getName())
                    .put(
                        EnableAllocationDecider.INDEX_ROUTING_REBALANCE_ENABLE_SETTING.getKey(),
                        EnableAllocationDecider.Rebalance.NONE)
                    .put("index.allocation.max_retries", Integer.MAX_VALUE) // keep on retrying
                ));
    ensureGreen(); // allocated with empty commit
    // corrupt only until the first chunk has been tampered with, then let recovery succeed
    final AtomicBoolean corrupt = new AtomicBoolean(true);
    final CountDownLatch hasCorrupted = new CountDownLatch(1);
    for (NodeStats dataNode : dataNodeStats) {
      MockTransportService mockTransportService =
          ((MockTransportService)
              internalCluster().getInstance(TransportService.class, dataNode.getNode().getName()));
      mockTransportService.addDelegate(
          internalCluster().getInstance(TransportService.class, unluckyNode.getNode().getName()),
          new MockTransportService.DelegateTransport(mockTransportService.original()) {

            @Override
            public void sendRequest(
                DiscoveryNode node,
                long requestId,
                String action,
                TransportRequest request,
                TransportRequestOptions options)
                throws IOException, TransportException {
              if (corrupt.get() && action.equals(RecoveryTargetService.Actions.FILE_CHUNK)) {
                RecoveryFileChunkRequest req = (RecoveryFileChunkRequest) request;
                byte[] array = BytesRef.deepCopyOf(req.content().toBytesRef()).bytes;
                int i = randomIntBetween(0, req.content().length() - 1);
                array[i] = (byte) ~array[i]; // flip one byte in the content
                hasCorrupted.countDown();
              }
              super.sendRequest(node, requestId, action, request, options);
            }
          });
    }

    // add a replica restricted to the primary node and the unlucky node, forcing recovery
    // onto the node whose incoming chunks get corrupted
    Settings build =
        Settings.builder()
            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "1")
            .put(
                "index.routing.allocation.include._name",
                primariesNode.getNode().getName() + "," + unluckyNode.getNode().getName())
            .build();
    client().admin().indices().prepareUpdateSettings("test").setSettings(build).get();
    client().admin().cluster().prepareReroute().get();
    hasCorrupted.await();
    // stop corrupting; thanks to max_retries the allocation keeps retrying and must succeed
    corrupt.set(false);
    ensureGreen();
  }
  /*
   * Tests that a shard is still deleted when the ShardActiveRequest sent after relocation fails
   * and the next incoming cluster state is an index delete.
   */
  public void testShardCleanupIfShardDeletionAfterRelocationFailedAndIndexDeleted()
      throws Exception {
    final String node_1 = internalCluster().startNode();
    logger.info("--> creating index [test] with one shard and on replica");
    assertAcked(
        prepareCreate("test")
            .setSettings(
                Settings.builder()
                    .put(indexSettings())
                    .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
                    .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)));
    ensureGreen("test");
    ClusterState state = client().admin().cluster().prepareState().get().getState();
    Index index = state.metaData().index("test").getIndex();
    // the shard data must exist on node_1 only at this point
    assertThat(Files.exists(shardDirectory(node_1, index, 0)), equalTo(true));
    assertThat(Files.exists(indexDirectory(node_1, index)), equalTo(true));

    final String node_2 = internalCluster().startDataOnlyNode(Settings.builder().build());
    assertFalse(client().admin().cluster().prepareHealth().setWaitForNodes("2").get().isTimedOut());

    assertThat(Files.exists(shardDirectory(node_1, index, 0)), equalTo(true));
    assertThat(Files.exists(indexDirectory(node_1, index)), equalTo(true));
    assertThat(Files.exists(shardDirectory(node_2, index, 0)), equalTo(false));
    assertThat(Files.exists(indexDirectory(node_2, index)), equalTo(false));

    // add a transport delegate that will prevent the shard active request to succeed the first time
    // after relocation has finished.
    // node_1 will then wait for the next cluster state change before it tries a next attempt to
    // delete the shard.
    MockTransportService transportServiceNode_1 =
        (MockTransportService) internalCluster().getInstance(TransportService.class, node_1);
    TransportService transportServiceNode_2 =
        internalCluster().getInstance(TransportService.class, node_2);
    final CountDownLatch shardActiveRequestSent = new CountDownLatch(1);
    transportServiceNode_1.addDelegate(
        transportServiceNode_2,
        new MockTransportService.DelegateTransport(transportServiceNode_1.original()) {
          @Override
          public void sendRequest(
              DiscoveryNode node,
              long requestId,
              String action,
              TransportRequest request,
              TransportRequestOptions options)
              throws IOException, TransportException {
            // fail only the first shard-active request; subsequent ones pass through
            if (action.equals("internal:index/shard/exists")
                && shardActiveRequestSent.getCount() > 0) {
              shardActiveRequestSent.countDown();
              logger.info("prevent shard active request from being sent");
              throw new ConnectTransportException(node, "DISCONNECT: simulated");
            }
            super.sendRequest(node, requestId, action, request, options);
          }
        });

    logger.info("--> move shard from {} to {}, and wait for relocation to finish", node_1, node_2);
    internalCluster()
        .client()
        .admin()
        .cluster()
        .prepareReroute()
        .add(new MoveAllocationCommand("test", 0, node_1, node_2))
        .get();
    shardActiveRequestSent.await();
    ClusterHealthResponse clusterHealth =
        client().admin().cluster().prepareHealth().setWaitForNoRelocatingShards(true).get();
    assertThat(clusterHealth.isTimedOut(), equalTo(false));
    logClusterState();
    // delete the index. node_1 that still waits for the next cluster state update will then get the
    // delete index next.
    // it must still delete the shard, even if it cannot find it anymore in indicesservice
    client().admin().indices().prepareDelete("test").get();

    // both nodes must end up with neither shard nor index data on disk
    assertThat(waitForShardDeletion(node_1, index, 0), equalTo(false));
    assertThat(waitForIndexDeletion(node_1, index), equalTo(false));
    assertThat(Files.exists(shardDirectory(node_1, index, 0)), equalTo(false));
    assertThat(Files.exists(indexDirectory(node_1, index)), equalTo(false));
    assertThat(waitForShardDeletion(node_2, index, 0), equalTo(false));
    assertThat(waitForIndexDeletion(node_2, index), equalTo(false));
    assertThat(Files.exists(shardDirectory(node_2, index, 0)), equalTo(false));
    assertThat(Files.exists(indexDirectory(node_2, index)), equalTo(false));
  }
예제 #5
0
  /**
   * Repeatedly fails replica recoveries from the primary node by corrupting file chunks, then
   * disables replica allocation and verifies that once all replica shards are removed no temporary
   * recovery files ("recovery.*") are left behind in any shard directory on any node.
   */
  @Test
  public void testCancellationCleansTempFiles() throws Exception {
    final String indexName = "test";

    final String p_node = internalCluster().startNode();

    client()
        .admin()
        .indices()
        .prepareCreate(indexName)
        .setSettings(
            Settings.builder()
                .put(
                    IndexMetaData.SETTING_NUMBER_OF_SHARDS,
                    1,
                    IndexMetaData.SETTING_NUMBER_OF_REPLICAS,
                    0))
        .get();

    internalCluster().startNodesAsync(2).get();

    List<IndexRequestBuilder> requests = new ArrayList<>();
    int numDocs = scaledRandomIntBetween(25, 250);
    for (int i = 0; i < numDocs; i++) {
      requests.add(client().prepareIndex(indexName, "type").setCreate(true).setSource("{}"));
    }
    indexRandom(true, requests);
    assertFalse(
        client()
            .admin()
            .cluster()
            .prepareHealth()
            .setWaitForNodes("3")
            .setWaitForGreenStatus()
            .get()
            .isTimedOut());
    flush();

    int allowedFailures = randomIntBetween(3, 10);
    logger.info("--> blocking recoveries from primary (allowed failures: [{}])", allowedFailures);
    CountDownLatch corruptionCount = new CountDownLatch(allowedFailures);
    ClusterService clusterService = internalCluster().getInstance(ClusterService.class, p_node);
    MockTransportService mockTransportService =
        (MockTransportService) internalCluster().getInstance(TransportService.class, p_node);
    // corrupt outgoing recovery traffic from the primary node to every other node
    for (DiscoveryNode node : clusterService.state().nodes()) {
      if (!node.equals(clusterService.localNode())) {
        mockTransportService.addDelegate(
            node, new RecoveryCorruption(mockTransportService.original(), corruptionCount));
      }
    }

    // adding a replica triggers the (failing) recoveries
    client()
        .admin()
        .indices()
        .prepareUpdateSettings(indexName)
        .setSettings(Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 1))
        .get();

    // wait until the allowed number of recovery failures has actually happened
    corruptionCount.await();

    logger.info("--> stopping replica assignment");
    assertAcked(
        client()
            .admin()
            .cluster()
            .prepareUpdateSettings()
            .setTransientSettings(
                Settings.builder()
                    .put(EnableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ENABLE, "none")));

    logger.info("--> wait for all replica shards to be removed, on all nodes");
    assertBusy(
        new Runnable() {
          @Override
          public void run() {
            for (String node : internalCluster().getNodeNames()) {
              if (node.equals(p_node)) {
                continue;
              }
              ClusterState state =
                  client(node).admin().cluster().prepareState().setLocal(true).get().getState();
              assertThat(
                  node + " indicates assigned replicas",
                  state
                      .getRoutingTable()
                      .index(indexName)
                      .shardsWithState(ShardRoutingState.UNASSIGNED)
                      .size(),
                  equalTo(1));
            }
          }
        });

    logger.info("--> verifying no temporary recoveries are left");
    for (String node : internalCluster().getNodeNames()) {
      NodeEnvironment nodeEnvironment = internalCluster().getInstance(NodeEnvironment.class, node);
      for (final Path shardLoc : nodeEnvironment.availableShardPaths(new ShardId(indexName, 0))) {
        if (Files.exists(shardLoc)) {
          assertBusy(
              new Runnable() {
                @Override
                public void run() {
                  try {
                    // walk the whole shard directory; any file named "recovery.*" is a leak
                    Files.walkFileTree(
                        shardLoc,
                        new SimpleFileVisitor<Path>() {
                          @Override
                          public FileVisitResult visitFile(Path file, BasicFileAttributes attrs)
                              throws IOException {
                            assertThat(
                                "found a temporary recovery file: " + file,
                                file.getFileName().toString(),
                                not(startsWith("recovery.")));
                            return FileVisitResult.CONTINUE;
                          }
                        });
                  } catch (IOException e) {
                    throw new AssertionError(
                        "failed to walk file tree starting at [" + shardLoc + "]", e);
                  }
                }
              });
        }
      }
    }
  }
  /**
   * Relocates a shadow-replica primary from node1 to node3 while an indexing thread is running
   * and TRANSLOG_OPS requests from node1 to node3 keep failing on purpose. Once the failure is
   * switched off, the relocation must complete, the cluster must go green, and a match-all
   * search must return every document the indexing thread wrote.
   */
  public void testPrimaryRelocationWhereRecoveryFails() throws Exception {
    Path dataPath = createTempDir();
    Settings nodeSettings =
        Settings.builder()
            .put("node.add_lock_id_to_custom_path", false)
            .put(Environment.PATH_SHARED_DATA_SETTING.getKey(), dataPath)
            .build();

    String node1 = internalCluster().startNode(nodeSettings);
    final String IDX = "test";

    // shadow replicas on a shared filesystem so all nodes see the same index files
    Settings idxSettings =
        Settings.builder()
            .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 1)
            .put(IndexMetaData.SETTING_DATA_PATH, dataPath.toAbsolutePath().toString())
            .put(IndexMetaData.SETTING_SHADOW_REPLICAS, true)
            .put(IndexMetaData.SETTING_SHARED_FILESYSTEM, true)
            .build();

    prepareCreate(IDX).setSettings(idxSettings).addMapping("doc", "foo", "type=text").get();
    // Node1 has the primary, now node2 has the replica
    String node2 = internalCluster().startNode(nodeSettings);
    ensureGreen(IDX);
    flushAndRefresh(IDX);
    String node3 = internalCluster().startNode(nodeSettings);
    final AtomicInteger counter = new AtomicInteger(0);
    final CountDownLatch started = new CountDownLatch(1);

    // the indexing thread signals these latches as it crosses each document-count threshold
    final int numPhase1Docs = scaledRandomIntBetween(25, 200);
    final int numPhase2Docs = scaledRandomIntBetween(25, 200);
    final int numPhase3Docs = scaledRandomIntBetween(25, 200);
    final CountDownLatch phase1finished = new CountDownLatch(1);
    final CountDownLatch phase2finished = new CountDownLatch(1);
    final CountDownLatch phase3finished = new CountDownLatch(1);

    final AtomicBoolean keepFailing = new AtomicBoolean(true);

    // fail every TRANSLOG_OPS request from node1 to node3 for as long as keepFailing is set
    MockTransportService mockTransportService =
        ((MockTransportService) internalCluster().getInstance(TransportService.class, node1));
    mockTransportService.addDelegate(
        internalCluster().getInstance(TransportService.class, node3),
        new MockTransportService.DelegateTransport(mockTransportService.original()) {

          @Override
          public void sendRequest(
              DiscoveryNode node,
              long requestId,
              String action,
              TransportRequest request,
              TransportRequestOptions options)
              throws IOException, TransportException {
            if (keepFailing.get()
                && action.equals(PeerRecoveryTargetService.Actions.TRANSLOG_OPS)) {
              logger.info("--> failing translog ops");
              throw new ElasticsearchException("failing on purpose");
            }
            super.sendRequest(node, requestId, action, request, options);
          }
        });

    // background thread that keeps indexing documents throughout the relocation
    Thread thread =
        new Thread() {
          @Override
          public void run() {
            started.countDown();
            while (counter.get() < (numPhase1Docs + numPhase2Docs + numPhase3Docs)) {
              final IndexResponse indexResponse =
                  client()
                      .prepareIndex(IDX, "doc", Integer.toString(counter.incrementAndGet()))
                      .setSource("foo", "bar")
                      .get();
              assertEquals(DocWriteResponse.Result.CREATED, indexResponse.getResult());
              final int docCount = counter.get();
              if (docCount == numPhase1Docs) {
                phase1finished.countDown();
              } else if (docCount == (numPhase1Docs + numPhase2Docs)) {
                phase2finished.countDown();
              }
            }
            logger.info("--> stopping indexing thread");
            phase3finished.countDown();
          }
        };
    thread.start();
    started.await();
    phase1finished.await(); // wait for a certain number of documents to be indexed
    logger.info("--> excluding {} from allocation", node1);
    // now prevent primary from being allocated on node 1 move to node_3
    Settings build =
        Settings.builder().put("index.routing.allocation.exclude._name", node1).build();
    client().admin().indices().prepareUpdateSettings(IDX).setSettings(build).execute().actionGet();
    // wait for more documents to be indexed while the relocation keeps failing
    phase2finished.await();
    // stop failing
    keepFailing.set(false);
    // wait for the final batch of docs; the indexing thread exits right after this phase
    phase3finished.await();
    ensureGreen(IDX);
    thread.join();
    logger.info("--> performing query");
    flushAndRefresh();

    SearchResponse resp = client().prepareSearch(IDX).setQuery(matchAllQuery()).get();
    assertHitCount(resp, counter.get());
  }