/**
   * This test verifies that if we corrupt a replica, we can still get to green, even though listing
   * its store fails. Note that replicas must be allocated on all data nodes, so that a replica
   * cannot sneak onto a node that doesn't hold a corrupted copy.
   */
  public void testReplicaCorruption() throws Exception {
    final int docCount = scaledRandomIntBetween(100, 1000);
    internalCluster().ensureAtLeastNumDataNodes(2);

    // Replica count is (data nodes - 1), so each shard has as many copies as there are data nodes.
    final Settings.Builder indexSettings =
        Settings.builder()
            .put(PrimaryShardAllocator.INDEX_RECOVERY_INITIAL_SHARDS_SETTING.getKey(), "one")
            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, cluster().numDataNodes() - 1)
            .put(MergePolicyConfig.INDEX_MERGE_ENABLED, false)
            // no checkindex - we corrupt shards on purpose
            .put(MockFSIndexStore.INDEX_CHECK_INDEX_ON_CLOSE_SETTING.getKey(), false)
            // no translog based flush - it might change the .liv / segments.N files
            .put(
                IndexSettings.INDEX_TRANSLOG_FLUSH_THRESHOLD_SIZE_SETTING.getKey(),
                new ByteSizeValue(1, ByteSizeUnit.PB));
    assertAcked(prepareCreate("test").setSettings(indexSettings));
    ensureGreen();

    // Index docCount identical documents.
    final IndexRequestBuilder[] indexRequests = new IndexRequestBuilder[docCount];
    int slot = 0;
    while (slot < indexRequests.length) {
      indexRequests[slot++] = client().prepareIndex("test", "type").setSource("field", "value");
    }
    indexRandom(true, indexRequests);
    ensureGreen();

    // we have to flush at least once here since we don't corrupt the translog
    assertAllSuccessful(
        client().admin().indices().prepareFlush().setForce(true).setWaitIfOngoing(true).get());
    final SearchResponse hits = client().prepareSearch().setSize(0).execute().actionGet();
    assertHitCount(hits, docCount);

    // Disable allocation of replicas post restart. The restart will change replicas to primaries,
    // so the replicas have to be captured after the restart.
    final String allocationEnableKey =
        EnableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ENABLE_SETTING.getKey();
    assertAcked(
        client()
            .admin()
            .cluster()
            .prepareUpdateSettings()
            .setPersistentSettings(Settings.builder().put(allocationEnableKey, "primaries")));

    internalCluster().fullRestart();
    ensureYellow();

    final Index index = resolveIndex("test");
    final String indexName = index.getName();
    final IndicesShardStoresResponse storesResponse =
        client().admin().indices().prepareShardStores(indexName).get();

    // Corrupt every shard copy whose store is reported as UNUSED.
    for (IntObjectCursor<List<IndicesShardStoresResponse.StoreStatus>> shardCursor :
        storesResponse.getStoreStatuses().get(indexName)) {
      for (IndicesShardStoresResponse.StoreStatus copy : shardCursor.value) {
        if (!copy.getAllocationStatus()
            .equals(IndicesShardStoresResponse.StoreStatus.AllocationStatus.UNUSED)) {
          continue;
        }
        final ShardId shardId = new ShardId(index, shardCursor.key);
        final String nodeName = copy.getNode().getName();
        for (Path target : findFilesToCorruptOnNode(nodeName, shardId)) {
          // Opening the file for writing truncates it; its content becomes a single zero byte.
          try (OutputStream sink = Files.newOutputStream(target)) {
            sink.write(0);
          }
          logger.info("corrupting file {} on node {}", target, nodeName);
        }
      }
    }

    // Re-enable allocation by removing the persistent override; the cluster must still go green.
    assertAcked(
        client()
            .admin()
            .cluster()
            .prepareUpdateSettings()
            .setPersistentSettings(Settings.builder().putNull(allocationEnableKey)));

    ensureGreen();
  }
  /**
   * Tests corruption that happens on a single shard when no replicas are present. We make sure that
   * the primary stays unassigned and that replica allocation for all the other, healthy shards
   * still happens.
   */
  public void testCorruptPrimaryNoReplica()
      throws ExecutionException, InterruptedException, IOException {
    final int docCount = scaledRandomIntBetween(100, 1000);
    internalCluster().ensureAtLeastNumDataNodes(2);

    // Start without replicas so the corrupted primary is the only copy of its shard.
    final Settings.Builder indexSettings =
        Settings.builder()
            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "0")
            .put(MergePolicyConfig.INDEX_MERGE_ENABLED, false)
            // no checkindex - we corrupt shards on purpose
            .put(MockFSIndexStore.INDEX_CHECK_INDEX_ON_CLOSE_SETTING.getKey(), false)
            // no translog based flush - it might change the .liv / segments.N files
            .put(
                IndexSettings.INDEX_TRANSLOG_FLUSH_THRESHOLD_SIZE_SETTING.getKey(),
                new ByteSizeValue(1, ByteSizeUnit.PB));
    assertAcked(prepareCreate("test").setSettings(indexSettings));
    ensureGreen();

    final IndexRequestBuilder[] indexRequests = new IndexRequestBuilder[docCount];
    int slot = 0;
    while (slot < indexRequests.length) {
      indexRequests[slot++] = client().prepareIndex("test", "type").setSource("field", "value");
    }
    indexRandom(true, indexRequests);
    ensureGreen();

    // we have to flush at least once here since we don't corrupt the translog
    assertAllSuccessful(
        client().admin().indices().prepareFlush().setForce(true).setWaitIfOngoing(true).get());
    final SearchResponse hits = client().prepareSearch().setSize(0).execute().actionGet();
    assertHitCount(hits, docCount);

    final ShardRouting corruptedPrimary = corruptRandomPrimaryFile();

    // We corrupted the primary shard - now request a replica and make sure we never recover from
    // the corrupted copy successfully.
    client()
        .admin()
        .indices()
        .prepareUpdateSettings("test")
        .setSettings(Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "1").build())
        .get();
    client().admin().cluster().prepareReroute().get();

    // sometimes on slow nodes the replication / recovery is just dead slow, hence the long wait
    final boolean sawRedWhileWaiting =
        awaitBusy(
            () ->
                client()
                        .admin()
                        .cluster()
                        .health(Requests.clusterHealthRequest("test"))
                        .actionGet()
                        .getStatus()
                    == ClusterHealthStatus.RED,
            5,
            TimeUnit.MINUTES);

    final ClusterHealthResponse finalHealth =
        client().admin().cluster().health(Requests.clusterHealthRequest("test")).get();
    final boolean isRed = finalHealth.getStatus() == ClusterHealthStatus.RED;
    if (!isRed) {
      // Dump diagnostics before the assertion below fails.
      logger.info("Cluster turned red in busy loop: {}", sawRedWhileWaiting);
      logger.info(
          "cluster state:\n{}\n{}",
          client().admin().cluster().prepareState().get().getState().prettyPrint(),
          client().admin().cluster().preparePendingClusterTasks().get().prettyPrint());
    }
    assertThat(finalHealth.getStatus(), is(ClusterHealthStatus.RED));

    // The corrupted shard must be unassigned; every other returned primary must be serving.
    final ClusterState clusterState = client().admin().cluster().prepareState().get().getState();
    final GroupShardsIterator primaries =
        clusterState.getRoutingTable().activePrimaryShardsGrouped(new String[] {"test"}, false);
    for (ShardIterator shardCopies : primaries) {
      for (ShardRouting routing = shardCopies.nextOrNull();
          routing != null;
          routing = shardCopies.nextOrNull()) {
        if (routing.getId() != corruptedPrimary.getId()) {
          assertThat(
              routing.state(),
              anyOf(equalTo(ShardRoutingState.RELOCATING), equalTo(ShardRoutingState.STARTED)));
        } else {
          assertThat(routing.state(), equalTo(ShardRoutingState.UNASSIGNED));
        }
      }
    }

    // A file named "corrupted_*" must exist among the files of the corrupted shard.
    final Path corruptionMarker =
        listShardFiles(corruptedPrimary).stream()
            .filter(file -> file.getFileName().toString().startsWith("corrupted_"))
            .findFirst()
            .orElse(null);
    assertThat(corruptionMarker, notNullValue());
  }
  /**
   * Tests that restoring of a corrupted shard fails and we get a partial snapshot. TODO once
   * checksum verification on snapshotting is implemented this test needs to be fixed or split into
   * several parts... We should also corrupt files on the actual snapshot and check that we don't
   * restore the corrupted shard.
   */
  @TestLogging("monitor.fs:DEBUG")
  public void testCorruptFileThenSnapshotAndRestore()
      throws ExecutionException, InterruptedException, IOException {
    final int docCount = scaledRandomIntBetween(100, 1000);
    internalCluster().ensureAtLeastNumDataNodes(2);

    final Settings.Builder indexSettings =
        Settings.builder()
            // no replicas for this test
            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "0")
            .put(MergePolicyConfig.INDEX_MERGE_ENABLED, false)
            // no checkindex - we corrupt shards on purpose
            .put(MockFSIndexStore.INDEX_CHECK_INDEX_ON_CLOSE_SETTING.getKey(), false)
            // no translog based flush - it might change the .liv / segments.N files
            .put(
                IndexSettings.INDEX_TRANSLOG_FLUSH_THRESHOLD_SIZE_SETTING.getKey(),
                new ByteSizeValue(1, ByteSizeUnit.PB));
    assertAcked(prepareCreate("test").setSettings(indexSettings));
    ensureGreen();

    final IndexRequestBuilder[] indexRequests = new IndexRequestBuilder[docCount];
    int slot = 0;
    while (slot < indexRequests.length) {
      indexRequests[slot++] = client().prepareIndex("test", "type").setSource("field", "value");
    }
    indexRandom(true, indexRequests);
    ensureGreen();

    // we have to flush at least once here since we don't corrupt the translog
    assertAllSuccessful(
        client().admin().indices().prepareFlush().setForce(true).setWaitIfOngoing(true).get());
    final SearchResponse hits = client().prepareSearch().setSize(0).execute().actionGet();
    assertHitCount(hits, docCount);

    // We don't corrupt segments.gen since S/R doesn't snapshot this file. The other reason we
    // can't corrupt segments.X files is that the snapshot flushes again before it snapshots, and
    // that will write a new segments.X+1 file.
    final ShardRouting corruptedPrimary = corruptRandomPrimaryFile(false);

    logger.info("-->  creating repository");
    assertAcked(
        client()
            .admin()
            .cluster()
            .preparePutRepository("test-repo")
            .setType("fs")
            .setSettings(
                Settings.builder()
                    .put("location", randomRepoPath().toAbsolutePath())
                    .put("compress", randomBoolean())
                    .put("chunk_size", randomIntBetween(100, 1000), ByteSizeUnit.BYTES)));

    logger.info("--> snapshot");
    final CreateSnapshotResponse snapshotResponse =
        client()
            .admin()
            .cluster()
            .prepareCreateSnapshot("test-repo", "test-snap")
            .setWaitForCompletion(true)
            .setIndices("test")
            .get();
    // The snapshot is expected to be only partially successful.
    final SnapshotState snapshotState = snapshotResponse.getSnapshotInfo().state();
    assertThat(snapshotState, equalTo(SnapshotState.PARTIAL));
    logger.info("failed during snapshot -- maybe SI file got corrupted");

    // A file named "corrupted_*" must exist among the files of the corrupted shard.
    Path corruptionMarker = null;
    for (Path candidate : listShardFiles(corruptedPrimary)) {
      final String fileName = candidate.getFileName().toString();
      if (fileName.startsWith("corrupted_")) {
        corruptionMarker = candidate;
        break;
      }
    }
    assertThat(corruptionMarker, notNullValue());
  }
  /**
   * Tests that we can actually recover from a corruption on the primary given that we have replica
   * shards around.
   */
  public void testCorruptFileAndRecover()
      throws ExecutionException, InterruptedException, IOException {
    int numDocs = scaledRandomIntBetween(100, 1000);
    // have enough space for 3 copies
    internalCluster().ensureAtLeastNumDataNodes(3);
    if (cluster().numDataNodes() == 3) {
      logger.info("--> cluster has [3] data nodes, corrupted primary will be overwritten");
    }

    assertThat(cluster().numDataNodes(), greaterThanOrEqualTo(3));

    // Single shard with one replica, so a second copy exists when the primary gets corrupted.
    assertAcked(
        prepareCreate("test")
            .setSettings(
                Settings.builder()
                    .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, "1")
                    .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "1")
                    .put(MergePolicyConfig.INDEX_MERGE_ENABLED, false)
                    // no checkindex - we corrupt shards on purpose
                    .put(MockFSIndexStore.INDEX_CHECK_INDEX_ON_CLOSE_SETTING.getKey(), false)
                    // no translog based flush - it might change the .liv / segments.N files
                    .put(
                        IndexSettings.INDEX_TRANSLOG_FLUSH_THRESHOLD_SIZE_SETTING.getKey(),
                        new ByteSizeValue(1, ByteSizeUnit.PB))));
    ensureGreen();
    disableAllocation("test");
    IndexRequestBuilder[] builders = new IndexRequestBuilder[numDocs];
    for (int i = 0; i < builders.length; i++) {
      builders[i] = client().prepareIndex("test", "type").setSource("field", "value");
    }
    indexRandom(true, builders);
    ensureGreen();
    // we have to flush at least once here since we don't corrupt the translog
    assertAllSuccessful(
        client()
            .admin()
            .indices()
            .prepareFlush()
            .setForce(true)
            .setWaitIfOngoing(true)
            .execute()
            .actionGet());
    SearchResponse countResponse = client().prepareSearch().setSize(0).get();
    assertHitCount(countResponse, numDocs);

    final int numShards = numShards("test");
    ShardRouting corruptedShardRouting = corruptRandomPrimaryFile();
    logger.info("--> {} corrupted", corruptedShardRouting);
    enableAllocation("test");
    /*
     * we corrupted the primary shard - now ask for a second replica and make sure the index
     * still gets back to green
     */
    Settings build = Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "2").build();
    client().admin().indices().prepareUpdateSettings("test").setSettings(build).get();
    ClusterHealthResponse health =
        client()
            .admin()
            .cluster()
            .health(
                Requests.clusterHealthRequest("test")
                    .waitForGreenStatus()
                    // sometimes due to cluster rebalancing and random settings the default
                    // timeout is just not enough.
                    .timeout("5m")
                    .waitForRelocatingShards(0))
            .actionGet();
    if (health.isTimedOut()) {
      // Dump diagnostics, then fail with a descriptive message.
      logger.info(
          "cluster state:\n{}\n{}",
          client().admin().cluster().prepareState().get().getState().prettyPrint(),
          client().admin().cluster().preparePendingClusterTasks().get().prettyPrint());
      assertThat("timed out waiting for green state", health.isTimedOut(), equalTo(false));
    }
    assertThat(health.getStatus(), equalTo(ClusterHealthStatus.GREEN));
    // Search repeatedly; every search must see all documents.
    final int numIterations = scaledRandomIntBetween(5, 20);
    for (int i = 0; i < numIterations; i++) {
      SearchResponse response = client().prepareSearch().setSize(numDocs).get();
      assertHitCount(response, numDocs);
    }

    /*
     * now hook into the IndicesService and register a close listener to
     * run the checkindex. if the corruption is still there we will catch it.
     */
    final CountDownLatch latch = new CountDownLatch(numShards * 3); // primary + 2 replicas
    final CopyOnWriteArrayList<Exception> exception = new CopyOnWriteArrayList<>();
    final IndexEventListener listener =
        new IndexEventListener() {
          @Override
          public void afterIndexShardClosed(
              ShardId sid, @Nullable IndexShard indexShard, Settings indexSettings) {
            if (indexShard != null) {
              Store store = indexShard.store();
              // Hold a reference on the store for the duration of the check.
              store.incRef();
              try {
                if (!Lucene.indexExists(store.directory())
                    && indexShard.state() == IndexShardState.STARTED) {
                  // Nothing to check; the finally block still counts the latch down.
                  return;
                }
                try (CheckIndex checkIndex = new CheckIndex(store.directory())) {
                  BytesStreamOutput os = new BytesStreamOutput();
                  PrintStream out = new PrintStream(os, false, StandardCharsets.UTF_8.name());
                  checkIndex.setInfoStream(out);
                  CheckIndex.Status status = checkIndex.checkIndex();
                  // Flush after the check so everything CheckIndex printed has reached `os`
                  // before it is read below.
                  out.flush();
                  if (!status.clean) {
                    logger.warn("check index [failure]\n{}", os.bytes().utf8ToString());
                    throw new IOException("index check failure");
                  }
                }
              } catch (Exception e) {
                // Collected here and asserted on by the test thread.
                exception.add(e);
              } finally {
                store.decRef();
                latch.countDown();
              }
            }
          }
        };

    for (MockIndexEventListener.TestEventListener eventListener :
        internalCluster().getDataNodeInstances(MockIndexEventListener.TestEventListener.class)) {
      eventListener.setNewDelegate(listener);
    }
    try {
      // Deleting the index is what is expected to trigger the close listener above.
      client().admin().indices().prepareDelete("test").get();
      // Bounded wait: the latch is only counted down for non-null shards, so an unbounded await
      // would hang the test forever if a callback is missing.
      assertThat(
          "timed out waiting for all shard copies to be closed and checked",
          latch.await(5, TimeUnit.MINUTES),
          equalTo(true));
      assertThat(exception, empty());
    } finally {
      // Always unregister the listener from every data node.
      for (MockIndexEventListener.TestEventListener eventListener :
          internalCluster().getDataNodeInstances(MockIndexEventListener.TestEventListener.class)) {
        eventListener.setNewDelegate(null);
      }
    }
  }