/** * This test verifies that if we corrupt a replica, we can still get to green, even though listing * its store fails. Note, we need to make sure that replicas are allocated on all data nodes, so * that replica won't be sneaky and allocated on a node that doesn't have a corrupted replica. */ public void testReplicaCorruption() throws Exception { int numDocs = scaledRandomIntBetween(100, 1000); internalCluster().ensureAtLeastNumDataNodes(2); assertAcked( prepareCreate("test") .setSettings( Settings.builder() .put( PrimaryShardAllocator.INDEX_RECOVERY_INITIAL_SHARDS_SETTING.getKey(), "one") .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, cluster().numDataNodes() - 1) .put(MergePolicyConfig.INDEX_MERGE_ENABLED, false) .put( MockFSIndexStore.INDEX_CHECK_INDEX_ON_CLOSE_SETTING.getKey(), false) // no checkindex - we corrupt shards on purpose .put( IndexSettings.INDEX_TRANSLOG_FLUSH_THRESHOLD_SIZE_SETTING.getKey(), new ByteSizeValue( 1, ByteSizeUnit .PB)) // no translog based flush - it might change the .liv / // segments.N files )); ensureGreen(); IndexRequestBuilder[] builders = new IndexRequestBuilder[numDocs]; for (int i = 0; i < builders.length; i++) { builders[i] = client().prepareIndex("test", "type").setSource("field", "value"); } indexRandom(true, builders); ensureGreen(); assertAllSuccessful( client() .admin() .indices() .prepareFlush() .setForce(true) .setWaitIfOngoing(true) .execute() .actionGet()); // we have to flush at least once here since we don't corrupt the translog SearchResponse countResponse = client().prepareSearch().setSize(0).get(); assertHitCount(countResponse, numDocs); // disable allocations of replicas post restart (the restart will change replicas to primaries, // so we have // to capture replicas post restart) assertAcked( client() .admin() .cluster() .prepareUpdateSettings() .setPersistentSettings( Settings.builder() .put( EnableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ENABLE_SETTING.getKey(), "primaries"))); internalCluster().fullRestart(); ensureYellow(); final Index index = resolveIndex("test"); final IndicesShardStoresResponse stores = client().admin().indices().prepareShardStores(index.getName()).get(); for (IntObjectCursor<List<IndicesShardStoresResponse.StoreStatus>> shards : stores.getStoreStatuses().get(index.getName())) { for (IndicesShardStoresResponse.StoreStatus store : shards.value) { final ShardId shardId = new ShardId(index, shards.key); if (store .getAllocationStatus() .equals(IndicesShardStoresResponse.StoreStatus.AllocationStatus.UNUSED)) { for (Path path : findFilesToCorruptOnNode(store.getNode().getName(), shardId)) { try (OutputStream os = Files.newOutputStream(path)) { os.write(0); } logger.info("corrupting file {} on node {}", path, store.getNode().getName()); } } } } // enable allocation assertAcked( client() .admin() .cluster() .prepareUpdateSettings() .setPersistentSettings( Settings.builder() .putNull( EnableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ENABLE_SETTING .getKey()))); ensureGreen(); }
/** * Tests that restoring of a corrupted shard fails and we get a partial snapshot. TODO once * checksum verification on snapshotting is implemented this test needs to be fixed or split into * several parts... We should also corrupt files on the actual snapshot and check that we don't * restore the corrupted shard. */ @TestLogging("monitor.fs:DEBUG") public void testCorruptFileThenSnapshotAndRestore() throws ExecutionException, InterruptedException, IOException { int numDocs = scaledRandomIntBetween(100, 1000); internalCluster().ensureAtLeastNumDataNodes(2); assertAcked( prepareCreate("test") .setSettings( Settings.builder() .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "0") // no replicas for this test .put(MergePolicyConfig.INDEX_MERGE_ENABLED, false) .put( MockFSIndexStore.INDEX_CHECK_INDEX_ON_CLOSE_SETTING.getKey(), false) // no checkindex - we corrupt shards on purpose .put( IndexSettings.INDEX_TRANSLOG_FLUSH_THRESHOLD_SIZE_SETTING.getKey(), new ByteSizeValue( 1, ByteSizeUnit .PB)) // no translog based flush - it might change the .liv / // segments.N files )); ensureGreen(); IndexRequestBuilder[] builders = new IndexRequestBuilder[numDocs]; for (int i = 0; i < builders.length; i++) { builders[i] = client().prepareIndex("test", "type").setSource("field", "value"); } indexRandom(true, builders); ensureGreen(); assertAllSuccessful( client() .admin() .indices() .prepareFlush() .setForce(true) .setWaitIfOngoing(true) .execute() .actionGet()); // we have to flush at least once here since we don't corrupt the translog SearchResponse countResponse = client().prepareSearch().setSize(0).get(); assertHitCount(countResponse, numDocs); ShardRouting shardRouting = corruptRandomPrimaryFile(false); // we don't corrupt segments.gen since S/R doesn't snapshot this file // the other problem here why we can't corrupt segments.X files is that the snapshot flushes // again before // it snapshots and that will write a new segments.X+1 file logger.info("--> creating repository"); assertAcked( client() .admin() .cluster() .preparePutRepository("test-repo") .setType("fs") .setSettings( Settings.builder() .put("location", randomRepoPath().toAbsolutePath()) .put("compress", randomBoolean()) .put("chunk_size", randomIntBetween(100, 1000), ByteSizeUnit.BYTES))); logger.info("--> snapshot"); CreateSnapshotResponse createSnapshotResponse = client() .admin() .cluster() .prepareCreateSnapshot("test-repo", "test-snap") .setWaitForCompletion(true) .setIndices("test") .get(); assertThat(createSnapshotResponse.getSnapshotInfo().state(), equalTo(SnapshotState.PARTIAL)); logger.info("failed during snapshot -- maybe SI file got corrupted"); final List<Path> files = listShardFiles(shardRouting); Path corruptedFile = null; for (Path file : files) { if (file.getFileName().toString().startsWith("corrupted_")) { corruptedFile = file; break; } } assertThat(corruptedFile, notNullValue()); }
/** * Tests corruption that happens on a single shard when no replicas are present. We make sure that * the primary stays unassigned and all other replicas for the healthy shards happens */ public void testCorruptPrimaryNoReplica() throws ExecutionException, InterruptedException, IOException { int numDocs = scaledRandomIntBetween(100, 1000); internalCluster().ensureAtLeastNumDataNodes(2); assertAcked( prepareCreate("test") .setSettings( Settings.builder() .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "0") .put(MergePolicyConfig.INDEX_MERGE_ENABLED, false) .put( MockFSIndexStore.INDEX_CHECK_INDEX_ON_CLOSE_SETTING.getKey(), false) // no checkindex - we corrupt shards on purpose .put( IndexSettings.INDEX_TRANSLOG_FLUSH_THRESHOLD_SIZE_SETTING.getKey(), new ByteSizeValue( 1, ByteSizeUnit .PB)) // no translog based flush - it might change the .liv / // segments.N files )); ensureGreen(); IndexRequestBuilder[] builders = new IndexRequestBuilder[numDocs]; for (int i = 0; i < builders.length; i++) { builders[i] = client().prepareIndex("test", "type").setSource("field", "value"); } indexRandom(true, builders); ensureGreen(); assertAllSuccessful( client() .admin() .indices() .prepareFlush() .setForce(true) .setWaitIfOngoing(true) .execute() .actionGet()); // we have to flush at least once here since we don't corrupt the translog SearchResponse countResponse = client().prepareSearch().setSize(0).get(); assertHitCount(countResponse, numDocs); ShardRouting shardRouting = corruptRandomPrimaryFile(); /* * we corrupted the primary shard - now lets make sure we never recover from it successfully */ Settings build = Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "1").build(); client().admin().indices().prepareUpdateSettings("test").setSettings(build).get(); client().admin().cluster().prepareReroute().get(); boolean didClusterTurnRed = awaitBusy( () -> { ClusterHealthStatus test = client() .admin() .cluster() .health(Requests.clusterHealthRequest("test")) .actionGet() .getStatus(); return test == ClusterHealthStatus.RED; }, 5, TimeUnit .MINUTES); // sometimes on slow nodes the replication / recovery is just dead slow final ClusterHealthResponse response = client().admin().cluster().health(Requests.clusterHealthRequest("test")).get(); if (response.getStatus() != ClusterHealthStatus.RED) { logger.info("Cluster turned red in busy loop: {}", didClusterTurnRed); logger.info( "cluster state:\n{}\n{}", client().admin().cluster().prepareState().get().getState().prettyPrint(), client().admin().cluster().preparePendingClusterTasks().get().prettyPrint()); } assertThat(response.getStatus(), is(ClusterHealthStatus.RED)); ClusterState state = client().admin().cluster().prepareState().get().getState(); GroupShardsIterator shardIterators = state.getRoutingTable().activePrimaryShardsGrouped(new String[] {"test"}, false); for (ShardIterator iterator : shardIterators) { ShardRouting routing; while ((routing = iterator.nextOrNull()) != null) { if (routing.getId() == shardRouting.getId()) { assertThat(routing.state(), equalTo(ShardRoutingState.UNASSIGNED)); } else { assertThat( routing.state(), anyOf(equalTo(ShardRoutingState.RELOCATING), equalTo(ShardRoutingState.STARTED))); } } } final List<Path> files = listShardFiles(shardRouting); Path corruptedFile = null; for (Path file : files) { if (file.getFileName().toString().startsWith("corrupted_")) { corruptedFile = file; break; } } assertThat(corruptedFile, notNullValue()); }
/** * Tests corruption that happens on the network layer and that the primary does not get affected * by corruption that happens on the way to the replica. The file on disk stays uncorrupted */ public void testCorruptionOnNetworkLayer() throws ExecutionException, InterruptedException { int numDocs = scaledRandomIntBetween(100, 1000); internalCluster().ensureAtLeastNumDataNodes(2); if (cluster().numDataNodes() < 3) { internalCluster() .startNode( Settings.builder() .put(Node.NODE_DATA_SETTING.getKey(), true) .put(Node.NODE_MASTER_SETTING.getKey(), false)); } NodesStatsResponse nodeStats = client().admin().cluster().prepareNodesStats().get(); List<NodeStats> dataNodeStats = new ArrayList<>(); for (NodeStats stat : nodeStats.getNodes()) { if (stat.getNode().isDataNode()) { dataNodeStats.add(stat); } } assertThat(dataNodeStats.size(), greaterThanOrEqualTo(2)); Collections.shuffle(dataNodeStats, random()); NodeStats primariesNode = dataNodeStats.get(0); NodeStats unluckyNode = dataNodeStats.get(1); assertAcked( prepareCreate("test") .setSettings( Settings.builder() .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "0") .put( IndexMetaData.SETTING_NUMBER_OF_SHARDS, between(1, 4)) // don't go crazy here it must recovery fast // This does corrupt files on the replica, so we can't check: .put(MockFSIndexStore.INDEX_CHECK_INDEX_ON_CLOSE_SETTING.getKey(), false) .put( "index.routing.allocation.include._name", primariesNode.getNode().getName()) .put( EnableAllocationDecider.INDEX_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), EnableAllocationDecider.Rebalance.NONE))); ensureGreen(); IndexRequestBuilder[] builders = new IndexRequestBuilder[numDocs]; for (int i = 0; i < builders.length; i++) { builders[i] = client().prepareIndex("test", "type").setSource("field", "value"); } indexRandom(true, builders); ensureGreen(); assertAllSuccessful( client() .admin() .indices() .prepareFlush() .setForce(true) .setWaitIfOngoing(true) .execute() .actionGet()); // we have to flush at least once here since we don't corrupt the translog SearchResponse countResponse = client().prepareSearch().setSize(0).get(); assertHitCount(countResponse, numDocs); final boolean truncate = randomBoolean(); for (NodeStats dataNode : dataNodeStats) { MockTransportService mockTransportService = ((MockTransportService) internalCluster().getInstance(TransportService.class, dataNode.getNode().getName())); mockTransportService.addDelegate( internalCluster().getInstance(TransportService.class, unluckyNode.getNode().getName()), new MockTransportService.DelegateTransport(mockTransportService.original()) { @Override public void sendRequest( DiscoveryNode node, long requestId, String action, TransportRequest request, TransportRequestOptions options) throws IOException, TransportException { if (action.equals(RecoveryTargetService.Actions.FILE_CHUNK)) { RecoveryFileChunkRequest req = (RecoveryFileChunkRequest) request; if (truncate && req.length() > 1) { BytesRef bytesRef = req.content().toBytesRef(); BytesArray array = new BytesArray(bytesRef.bytes, bytesRef.offset, (int) req.length() - 1); request = new RecoveryFileChunkRequest( req.recoveryId(), req.shardId(), req.metadata(), req.position(), array, req.lastChunk(), req.totalTranslogOps(), req.sourceThrottleTimeInNanos()); } else { assert req.content().toBytesRef().bytes == req.content().toBytesRef().bytes : "no internal reference!!"; final byte[] array = req.content().toBytesRef().bytes; int i = randomIntBetween(0, req.content().length() - 1); array[i] = (byte) ~array[i]; // flip one byte in the content } } super.sendRequest(node, requestId, action, request, options); } }); } Settings build = Settings.builder() .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "1") .put("index.routing.allocation.include._name", "*") .build(); client().admin().indices().prepareUpdateSettings("test").setSettings(build).get(); client().admin().cluster().prepareReroute().get(); ClusterHealthResponse actionGet = client() .admin() .cluster() .health(Requests.clusterHealthRequest("test").waitForGreenStatus()) .actionGet(); if (actionGet.isTimedOut()) { logger.info( "ensureGreen timed out, cluster state:\n{}\n{}", client().admin().cluster().prepareState().get().getState().prettyPrint(), client().admin().cluster().preparePendingClusterTasks().get().prettyPrint()); assertThat("timed out waiting for green state", actionGet.isTimedOut(), equalTo(false)); } // we are green so primaries got not corrupted. // ensure that no shard is actually allocated on the unlucky node ClusterStateResponse clusterStateResponse = client().admin().cluster().prepareState().get(); for (IndexShardRoutingTable table : clusterStateResponse.getState().getRoutingTable().index("test")) { for (ShardRouting routing : table) { if (unluckyNode.getNode().getId().equals(routing.currentNodeId())) { assertThat(routing.state(), not(equalTo(ShardRoutingState.STARTED))); assertThat(routing.state(), not(equalTo(ShardRoutingState.RELOCATING))); } } } final int numIterations = scaledRandomIntBetween(5, 20); for (int i = 0; i < numIterations; i++) { SearchResponse response = client().prepareSearch().setSize(numDocs).get(); assertHitCount(response, numDocs); } }
/** * Tests that we can actually recover from a corruption on the primary given that we have replica * shards around. */ public void testCorruptFileAndRecover() throws ExecutionException, InterruptedException, IOException { int numDocs = scaledRandomIntBetween(100, 1000); // have enough space for 3 copies internalCluster().ensureAtLeastNumDataNodes(3); if (cluster().numDataNodes() == 3) { logger.info("--> cluster has [3] data nodes, corrupted primary will be overwritten"); } assertThat(cluster().numDataNodes(), greaterThanOrEqualTo(3)); assertAcked( prepareCreate("test") .setSettings( Settings.builder() .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, "1") .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "1") .put(MergePolicyConfig.INDEX_MERGE_ENABLED, false) .put( MockFSIndexStore.INDEX_CHECK_INDEX_ON_CLOSE_SETTING.getKey(), false) // no checkindex - we corrupt shards on purpose .put( IndexSettings.INDEX_TRANSLOG_FLUSH_THRESHOLD_SIZE_SETTING.getKey(), new ByteSizeValue( 1, ByteSizeUnit .PB)) // no translog based flush - it might change the .liv / // segments.N files )); ensureGreen(); disableAllocation("test"); IndexRequestBuilder[] builders = new IndexRequestBuilder[numDocs]; for (int i = 0; i < builders.length; i++) { builders[i] = client().prepareIndex("test", "type").setSource("field", "value"); } indexRandom(true, builders); ensureGreen(); assertAllSuccessful( client() .admin() .indices() .prepareFlush() .setForce(true) .setWaitIfOngoing(true) .execute() .actionGet()); // we have to flush at least once here since we don't corrupt the translog SearchResponse countResponse = client().prepareSearch().setSize(0).get(); assertHitCount(countResponse, numDocs); final int numShards = numShards("test"); ShardRouting corruptedShardRouting = corruptRandomPrimaryFile(); logger.info("--> {} corrupted", corruptedShardRouting); enableAllocation("test"); /* * we corrupted the primary shard - now lets make sure we never recover from it successfully */ Settings build = Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "2").build(); client().admin().indices().prepareUpdateSettings("test").setSettings(build).get(); ClusterHealthResponse health = client() .admin() .cluster() .health( Requests.clusterHealthRequest("test") .waitForGreenStatus() .timeout( "5m") // sometimes due to cluster rebalacing and random settings default // timeout is just not enough. .waitForRelocatingShards(0)) .actionGet(); if (health.isTimedOut()) { logger.info( "cluster state:\n{}\n{}", client().admin().cluster().prepareState().get().getState().prettyPrint(), client().admin().cluster().preparePendingClusterTasks().get().prettyPrint()); assertThat("timed out waiting for green state", health.isTimedOut(), equalTo(false)); } assertThat(health.getStatus(), equalTo(ClusterHealthStatus.GREEN)); final int numIterations = scaledRandomIntBetween(5, 20); for (int i = 0; i < numIterations; i++) { SearchResponse response = client().prepareSearch().setSize(numDocs).get(); assertHitCount(response, numDocs); } /* * now hook into the IndicesService and register a close listener to * run the checkindex. if the corruption is still there we will catch it. */ final CountDownLatch latch = new CountDownLatch(numShards * 3); // primary + 2 replicas final CopyOnWriteArrayList<Exception> exception = new CopyOnWriteArrayList<>(); final IndexEventListener listener = new IndexEventListener() { @Override public void afterIndexShardClosed( ShardId sid, @Nullable IndexShard indexShard, Settings indexSettings) { if (indexShard != null) { Store store = indexShard.store(); store.incRef(); try { if (!Lucene.indexExists(store.directory()) && indexShard.state() == IndexShardState.STARTED) { return; } try (CheckIndex checkIndex = new CheckIndex(store.directory())) { BytesStreamOutput os = new BytesStreamOutput(); PrintStream out = new PrintStream(os, false, StandardCharsets.UTF_8.name()); checkIndex.setInfoStream(out); out.flush(); CheckIndex.Status status = checkIndex.checkIndex(); if (!status.clean) { logger.warn("check index [failure]\n{}", os.bytes().utf8ToString()); throw new IOException("index check failure"); } } } catch (Exception e) { exception.add(e); } finally { store.decRef(); latch.countDown(); } } } }; for (MockIndexEventListener.TestEventListener eventListener : internalCluster().getDataNodeInstances(MockIndexEventListener.TestEventListener.class)) { eventListener.setNewDelegate(listener); } try { client().admin().indices().prepareDelete("test").get(); latch.await(); assertThat(exception, empty()); } finally { for (MockIndexEventListener.TestEventListener eventListener : internalCluster().getDataNodeInstances(MockIndexEventListener.TestEventListener.class)) { eventListener.setNewDelegate(null); } } }