/**
 * Verifies that a peer recovery survives network disruption between the source ("blue") and
 * target ("red") node: one randomly-chosen recovery transport action is either dropped or the
 * connection is broken, and after the rules are cleared the recovery must still complete and the
 * replica must serve all documents locally.
 */
public void testDisconnectsWhileRecovering() throws Exception {
    final String indexName = "test";
    // Fast retries and a short internal-action timeout so the blocked/broken recovery
    // is retried quickly instead of stalling the test.
    final Settings nodeSettings = Settings.builder()
        .put(RecoverySettings.INDICES_RECOVERY_RETRY_DELAY_NETWORK_SETTING.getKey(), "100ms")
        .put(RecoverySettings.INDICES_RECOVERY_INTERNAL_ACTION_TIMEOUT_SETTING.getKey(), "1s")
        .put(MockFSDirectoryService.RANDOM_PREVENT_DOUBLE_WRITE_SETTING.getKey(),
            false) // restarted recoveries will delete temp files and write them again
        .build();
    // start a master node
    internalCluster().startNode(nodeSettings);
    // Two data nodes tagged by an attribute so index allocation can be steered via
    // index.routing.allocation.include.color below.
    final String blueNodeName = internalCluster()
        .startNode(Settings.builder().put("node.attr.color", "blue").put(nodeSettings).build());
    final String redNodeName = internalCluster()
        .startNode(Settings.builder().put("node.attr.color", "red").put(nodeSettings).build());

    ClusterHealthResponse response =
        client().admin().cluster().prepareHealth().setWaitForNodes(">=3").get();
    assertThat(response.isTimedOut(), is(false));

    // Single-shard, zero-replica index pinned to the blue node.
    client()
        .admin()
        .indices()
        .prepareCreate(indexName)
        .setSettings(
            Settings.builder()
                .put(IndexMetaData.INDEX_ROUTING_INCLUDE_GROUP_SETTING.getKey() + "color", "blue")
                .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
                .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0))
        .get();

    // Seed the shard with a random number of empty documents.
    List<IndexRequestBuilder> requests = new ArrayList<>();
    int numDocs = scaledRandomIntBetween(25, 250);
    for (int i = 0; i < numDocs; i++) {
        requests.add(client().prepareIndex(indexName, "type").setSource("{}"));
    }
    indexRandom(true, requests);
    ensureSearchable(indexName);

    // Sanity: the blue node actually holds a shard and the docs are searchable.
    ClusterStateResponse stateResponse = client().admin().cluster().prepareState().get();
    final String blueNodeId =
        internalCluster().getInstance(ClusterService.class, blueNodeName).localNode().getId();
    assertFalse(stateResponse.getState().getRoutingNodes().node(blueNodeId).isEmpty());
    SearchResponse searchResponse = client().prepareSearch(indexName).get();
    assertHitCount(searchResponse, numDocs);

    // Pool of recovery actions to disrupt; one is chosen at random per run.
    String[] recoveryActions = new String[] {
        PeerRecoverySourceService.Actions.START_RECOVERY,
        PeerRecoveryTargetService.Actions.FILES_INFO,
        PeerRecoveryTargetService.Actions.FILE_CHUNK,
        PeerRecoveryTargetService.Actions.CLEAN_FILES,
        // RecoveryTarget.Actions.TRANSLOG_OPS, <-- may not be sent if already flushed
        PeerRecoveryTargetService.Actions.PREPARE_TRANSLOG,
        PeerRecoveryTargetService.Actions.FINALIZE
    };
    final String recoveryActionToBlock = randomFrom(recoveryActions);
    final boolean dropRequests = randomBoolean();
    logger.info(
        "--> will {} between blue & red on [{}]",
        dropRequests ? "drop requests" : "break connection",
        recoveryActionToBlock);

    MockTransportService blueMockTransportService =
        (MockTransportService) internalCluster().getInstance(TransportService.class, blueNodeName);
    MockTransportService redMockTransportService =
        (MockTransportService) internalCluster().getInstance(TransportService.class, redNodeName);
    TransportService redTransportService =
        internalCluster().getInstance(TransportService.class, redNodeName);
    TransportService blueTransportService =
        internalCluster().getInstance(TransportService.class, blueNodeName);
    final CountDownLatch requestBlocked = new CountDownLatch(1);

    // Disrupt the chosen action in BOTH directions; RecoveryActionBlocker (a sibling class in
    // this file) counts down the latch when it intercepts the action.
    blueMockTransportService.addDelegate(
        redTransportService,
        new RecoveryActionBlocker(
            dropRequests, recoveryActionToBlock, blueMockTransportService.original(), requestBlocked));
    redMockTransportService.addDelegate(
        blueTransportService,
        new RecoveryActionBlocker(
            dropRequests, recoveryActionToBlock, redMockTransportService.original(), requestBlocked));

    logger.info("--> starting recovery from blue to red");
    // Adding a replica and allowing "red" triggers a peer recovery from blue to red.
    client()
        .admin()
        .indices()
        .prepareUpdateSettings(indexName)
        .setSettings(
            Settings.builder()
                .put(IndexMetaData.INDEX_ROUTING_INCLUDE_GROUP_SETTING.getKey() + "color", "red,blue")
                .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 1))
        .get();

    // Wait until the disruption has actually intercepted the recovery action once.
    requestBlocked.await();
    logger.info("--> stopping to block recovery");
    blueMockTransportService.clearAllRules();
    redMockTransportService.clearAllRules();

    // With the rules cleared, the (retried) recovery must finish and the replica on the red
    // node must hold all documents.
    ensureGreen();
    searchResponse = client(redNodeName).prepareSearch(indexName).setPreference("_local").get();
    assertHitCount(searchResponse, numDocs);
}
/**
 * Tests corruption that happens on the network layer and that the primary does not get affected
 * by corruption that happens on the way to the replica. The file on disk stays uncorrupted
 */
public void testCorruptionOnNetworkLayer() throws ExecutionException, InterruptedException {
    int numDocs = scaledRandomIntBetween(100, 1000);
    internalCluster().ensureAtLeastNumDataNodes(2);
    // Need a third node as a potential master/relocation target; start a data-only node if
    // the cluster has fewer than three data nodes.
    if (cluster().numDataNodes() < 3) {
        internalCluster()
            .startNode(
                Settings.builder()
                    .put(Node.NODE_DATA_SETTING.getKey(), true)
                    .put(Node.NODE_MASTER_SETTING.getKey(), false));
    }
    NodesStatsResponse nodeStats = client().admin().cluster().prepareNodesStats().get();
    List<NodeStats> dataNodeStats = new ArrayList<>();
    for (NodeStats stat : nodeStats.getNodes()) {
        if (stat.getNode().isDataNode()) {
            dataNodeStats.add(stat);
        }
    }
    assertThat(dataNodeStats.size(), greaterThanOrEqualTo(2));
    // Randomly pick the node that hosts the primaries and the node whose inbound recovery
    // chunks will be corrupted.
    Collections.shuffle(dataNodeStats, random());
    NodeStats primariesNode = dataNodeStats.get(0);
    NodeStats unluckyNode = dataNodeStats.get(1);
    assertAcked(
        prepareCreate("test")
            .setSettings(
                Settings.builder()
                    .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "0")
                    .put(
                        IndexMetaData.SETTING_NUMBER_OF_SHARDS,
                        between(1, 4)) // don't go crazy here it must recovery fast
                    // This does corrupt files on the replica, so we can't check:
                    .put(MockFSIndexStore.INDEX_CHECK_INDEX_ON_CLOSE_SETTING.getKey(), false)
                    .put(
                        "index.routing.allocation.include._name",
                        primariesNode.getNode().getName())
                    .put(
                        EnableAllocationDecider.INDEX_ROUTING_REBALANCE_ENABLE_SETTING.getKey(),
                        EnableAllocationDecider.Rebalance.NONE)));
    ensureGreen();
    IndexRequestBuilder[] builders = new IndexRequestBuilder[numDocs];
    for (int i = 0; i < builders.length; i++) {
        builders[i] = client().prepareIndex("test", "type").setSource("field", "value");
    }
    indexRandom(true, builders);
    ensureGreen();
    assertAllSuccessful(
        client()
            .admin()
            .indices()
            .prepareFlush()
            .setForce(true)
            .setWaitIfOngoing(true)
            .execute()
            .actionGet()); // we have to flush at least once here since we don't corrupt the translog
    SearchResponse countResponse = client().prepareSearch().setSize(0).get();
    assertHitCount(countResponse, numDocs);

    // Either truncate each file chunk by one byte or flip a single byte in it.
    final boolean truncate = randomBoolean();
    // Install a corrupting delegate on every data node for traffic sent TO the unlucky node
    // (this includes the unlucky node itself, which is harmless — it only intercepts
    // requests addressed to that node).
    for (NodeStats dataNode : dataNodeStats) {
        MockTransportService mockTransportService =
            ((MockTransportService)
                internalCluster().getInstance(TransportService.class, dataNode.getNode().getName()));
        mockTransportService.addDelegate(
            internalCluster().getInstance(TransportService.class, unluckyNode.getNode().getName()),
            new MockTransportService.DelegateTransport(mockTransportService.original()) {
                @Override
                public void sendRequest(
                    DiscoveryNode node,
                    long requestId,
                    String action,
                    TransportRequest request,
                    TransportRequestOptions options)
                    throws IOException, TransportException {
                    if (action.equals(RecoveryTargetService.Actions.FILE_CHUNK)) {
                        RecoveryFileChunkRequest req = (RecoveryFileChunkRequest) request;
                        if (truncate && req.length() > 1) {
                            // Rebuild the request with the last byte of the chunk cut off.
                            BytesRef bytesRef = req.content().toBytesRef();
                            BytesArray array =
                                new BytesArray(bytesRef.bytes, bytesRef.offset, (int) req.length() - 1);
                            request =
                                new RecoveryFileChunkRequest(
                                    req.recoveryId(),
                                    req.shardId(),
                                    req.metadata(),
                                    req.position(),
                                    array,
                                    req.lastChunk(),
                                    req.totalTranslogOps(),
                                    req.sourceThrottleTimeInNanos());
                        } else {
                            // Mutate the request's backing array in place. The assert guards the
                            // assumption that toBytesRef() exposes the internal array (no copy) —
                            // otherwise the flip would be a no-op.
                            assert req.content().toBytesRef().bytes == req.content().toBytesRef().bytes
                                : "no internal reference!!";
                            final byte[] array = req.content().toBytesRef().bytes;
                            int i = randomIntBetween(0, req.content().length() - 1);
                            array[i] = (byte) ~array[i]; // flip one byte in the content
                        }
                    }
                    super.sendRequest(node, requestId, action, request, options);
                }
            });
    }

    // Add a replica and open allocation to all nodes: the replica recovery towards the
    // unlucky node will receive corrupted chunks and must fail there, not on the primary.
    Settings build =
        Settings.builder()
            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "1")
            .put("index.routing.allocation.include._name", "*")
            .build();
    client().admin().indices().prepareUpdateSettings("test").setSettings(build).get();
    client().admin().cluster().prepareReroute().get();
    ClusterHealthResponse actionGet =
        client()
            .admin()
            .cluster()
            .health(Requests.clusterHealthRequest("test").waitForGreenStatus())
            .actionGet();
    if (actionGet.isTimedOut()) {
        logger.info(
            "ensureGreen timed out, cluster state:\n{}\n{}",
            client().admin().cluster().prepareState().get().getState().prettyPrint(),
            client().admin().cluster().preparePendingClusterTasks().get().prettyPrint());
        assertThat("timed out waiting for green state", actionGet.isTimedOut(), equalTo(false));
    }
    // we are green so primaries got not corrupted.
    // ensure that no shard is actually allocated on the unlucky node
    ClusterStateResponse clusterStateResponse = client().admin().cluster().prepareState().get();
    for (IndexShardRoutingTable table :
        clusterStateResponse.getState().getRoutingTable().index("test")) {
        for (ShardRouting routing : table) {
            if (unluckyNode.getNode().getId().equals(routing.currentNodeId())) {
                assertThat(routing.state(), not(equalTo(ShardRoutingState.STARTED)));
                assertThat(routing.state(), not(equalTo(ShardRoutingState.RELOCATING)));
            }
        }
    }
    // Repeated searches must keep returning the full document count — the primary's data
    // on disk stayed intact.
    final int numIterations = scaledRandomIntBetween(5, 20);
    for (int i = 0; i < numIterations; i++) {
        SearchResponse response = client().prepareSearch().setSize(numDocs).get();
        assertHitCount(response, numDocs);
    }
}
/** * This test triggers a corrupt index exception during finalization size if an empty commit point * is transferred during recovery we don't know the version of the segments_N file because it has * no segments we can take it from. This simulates recoveries from old indices or even without * checksums and makes sure if we fail during finalization we also check if the primary is ok. * Without the relevant checks this test fails with a RED cluster */ public void testCorruptionOnNetworkLayerFinalizingRecovery() throws ExecutionException, InterruptedException, IOException { internalCluster().ensureAtLeastNumDataNodes(2); NodesStatsResponse nodeStats = client().admin().cluster().prepareNodesStats().get(); List<NodeStats> dataNodeStats = new ArrayList<>(); for (NodeStats stat : nodeStats.getNodes()) { if (stat.getNode().isDataNode()) { dataNodeStats.add(stat); } } assertThat(dataNodeStats.size(), greaterThanOrEqualTo(2)); Collections.shuffle(dataNodeStats, random()); NodeStats primariesNode = dataNodeStats.get(0); NodeStats unluckyNode = dataNodeStats.get(1); assertAcked( prepareCreate("test") .setSettings( Settings.builder() .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "0") .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1) .put( "index.routing.allocation.include._name", primariesNode.getNode().getName()) .put( EnableAllocationDecider.INDEX_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), EnableAllocationDecider.Rebalance.NONE) .put("index.allocation.max_retries", Integer.MAX_VALUE) // keep on retrying )); ensureGreen(); // allocated with empty commit final AtomicBoolean corrupt = new AtomicBoolean(true); final CountDownLatch hasCorrupted = new CountDownLatch(1); for (NodeStats dataNode : dataNodeStats) { MockTransportService mockTransportService = ((MockTransportService) internalCluster().getInstance(TransportService.class, dataNode.getNode().getName())); mockTransportService.addDelegate( internalCluster().getInstance(TransportService.class, unluckyNode.getNode().getName()), 
new MockTransportService.DelegateTransport(mockTransportService.original()) { @Override public void sendRequest( DiscoveryNode node, long requestId, String action, TransportRequest request, TransportRequestOptions options) throws IOException, TransportException { if (corrupt.get() && action.equals(RecoveryTargetService.Actions.FILE_CHUNK)) { RecoveryFileChunkRequest req = (RecoveryFileChunkRequest) request; byte[] array = BytesRef.deepCopyOf(req.content().toBytesRef()).bytes; int i = randomIntBetween(0, req.content().length() - 1); array[i] = (byte) ~array[i]; // flip one byte in the content hasCorrupted.countDown(); } super.sendRequest(node, requestId, action, request, options); } }); } Settings build = Settings.builder() .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "1") .put( "index.routing.allocation.include._name", primariesNode.getNode().getName() + "," + unluckyNode.getNode().getName()) .build(); client().admin().indices().prepareUpdateSettings("test").setSettings(build).get(); client().admin().cluster().prepareReroute().get(); hasCorrupted.await(); corrupt.set(false); ensureGreen(); }
/*
 * Test that a shard is deleted when the ShardActiveRequest sent after relocation fails and the
 * next incoming cluster state is an index delete: node_1 must still remove the stale shard and
 * index directories even though the index is gone from the cluster state.
 */
public void testShardCleanupIfShardDeletionAfterRelocationFailedAndIndexDeleted()
    throws Exception {
    final String node_1 = internalCluster().startNode();
    logger.info("--> creating index [test] with one shard and on replica");
    assertAcked(
        prepareCreate("test")
            .setSettings(
                Settings.builder()
                    .put(indexSettings())
                    .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
                    .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)));
    ensureGreen("test");
    ClusterState state = client().admin().cluster().prepareState().get().getState();
    Index index = state.metaData().index("test").getIndex();
    // The shard lives on node_1 only; node_2 (started below) has nothing yet.
    assertThat(Files.exists(shardDirectory(node_1, index, 0)), equalTo(true));
    assertThat(Files.exists(indexDirectory(node_1, index)), equalTo(true));
    final String node_2 = internalCluster().startDataOnlyNode(Settings.builder().build());
    assertFalse(client().admin().cluster().prepareHealth().setWaitForNodes("2").get().isTimedOut());
    assertThat(Files.exists(shardDirectory(node_1, index, 0)), equalTo(true));
    assertThat(Files.exists(indexDirectory(node_1, index)), equalTo(true));
    assertThat(Files.exists(shardDirectory(node_2, index, 0)), equalTo(false));
    assertThat(Files.exists(indexDirectory(node_2, index)), equalTo(false));
    // add a transport delegate that will prevent the shard active request to succeed the first time
    // after relocation has finished.
    // node_1 will then wait for the next cluster state change before it tries a next attempt to
    // delete the shard.
    MockTransportService transportServiceNode_1 =
        (MockTransportService) internalCluster().getInstance(TransportService.class, node_1);
    TransportService transportServiceNode_2 =
        internalCluster().getInstance(TransportService.class, node_2);
    final CountDownLatch shardActiveRequestSent = new CountDownLatch(1);
    transportServiceNode_1.addDelegate(
        transportServiceNode_2,
        new MockTransportService.DelegateTransport(transportServiceNode_1.original()) {
            @Override
            public void sendRequest(
                DiscoveryNode node,
                long requestId,
                String action,
                TransportRequest request,
                TransportRequestOptions options)
                throws IOException, TransportException {
                // Fail only the FIRST shard-active request; the latch count doubles as the
                // "already intercepted" flag.
                if (action.equals("internal:index/shard/exists")
                    && shardActiveRequestSent.getCount() > 0) {
                    shardActiveRequestSent.countDown();
                    logger.info("prevent shard active request from being sent");
                    throw new ConnectTransportException(node, "DISCONNECT: simulated");
                }
                super.sendRequest(node, requestId, action, request, options);
            }
        });
    logger.info("--> move shard from {} to {}, and wait for relocation to finish", node_1, node_2);
    internalCluster()
        .client()
        .admin()
        .cluster()
        .prepareReroute()
        .add(new MoveAllocationCommand("test", 0, node_1, node_2))
        .get();
    shardActiveRequestSent.await();
    ClusterHealthResponse clusterHealth =
        client().admin().cluster().prepareHealth().setWaitForNoRelocatingShards(true).get();
    assertThat(clusterHealth.isTimedOut(), equalTo(false));
    logClusterState();
    // delete the index. node_1 that still waits for the next cluster state update will then get the
    // delete index next.
    // it must still delete the shard, even if it cannot find it anymore in indicesservice
    client().admin().indices().prepareDelete("test").get();
    // After the delete, both shard and index directories must disappear from BOTH nodes.
    assertThat(waitForShardDeletion(node_1, index, 0), equalTo(false));
    assertThat(waitForIndexDeletion(node_1, index), equalTo(false));
    assertThat(Files.exists(shardDirectory(node_1, index, 0)), equalTo(false));
    assertThat(Files.exists(indexDirectory(node_1, index)), equalTo(false));
    assertThat(waitForShardDeletion(node_2, index, 0), equalTo(false));
    assertThat(waitForIndexDeletion(node_2, index), equalTo(false));
    assertThat(Files.exists(shardDirectory(node_2, index, 0)), equalTo(false));
    assertThat(Files.exists(indexDirectory(node_2, index)), equalTo(false));
}
/**
 * Verifies that when replica recoveries are repeatedly failed (via corrupted chunks) and replica
 * assignment is then disabled, cancelled recoveries leave no temporary "recovery." files behind
 * in any shard directory.
 */
@Test
public void testCancellationCleansTempFiles() throws Exception {
    final String indexName = "test";
    final String p_node = internalCluster().startNode();
    client()
        .admin()
        .indices()
        .prepareCreate(indexName)
        .setSettings(
            Settings.builder()
                .put(
                    IndexMetaData.SETTING_NUMBER_OF_SHARDS,
                    1,
                    IndexMetaData.SETTING_NUMBER_OF_REPLICAS,
                    0))
        .get();
    internalCluster().startNodesAsync(2).get();
    List<IndexRequestBuilder> requests = new ArrayList<>();
    int numDocs = scaledRandomIntBetween(25, 250);
    for (int i = 0; i < numDocs; i++) {
        requests.add(client().prepareIndex(indexName, "type").setCreate(true).setSource("{}"));
    }
    indexRandom(true, requests);
    assertFalse(
        client()
            .admin()
            .cluster()
            .prepareHealth()
            .setWaitForNodes("3")
            .setWaitForGreenStatus()
            .get()
            .isTimedOut());
    flush();
    // Let the recovery fail a bounded random number of times before we stop interfering.
    int allowedFailures = randomIntBetween(3, 10);
    logger.info("--> blocking recoveries from primary (allowed failures: [{}])", allowedFailures);
    CountDownLatch corruptionCount = new CountDownLatch(allowedFailures);
    ClusterService clusterService = internalCluster().getInstance(ClusterService.class, p_node);
    MockTransportService mockTransportService =
        (MockTransportService) internalCluster().getInstance(TransportService.class, p_node);
    // Corrupt outbound recovery traffic from the primary node to every other node;
    // RecoveryCorruption is a sibling class in this file that counts the latch down per failure.
    for (DiscoveryNode node : clusterService.state().nodes()) {
        if (!node.equals(clusterService.localNode())) {
            mockTransportService.addDelegate(
                node, new RecoveryCorruption(mockTransportService.original(), corruptionCount));
        }
    }
    // Adding a replica kicks off the (sabotaged) recoveries.
    client()
        .admin()
        .indices()
        .prepareUpdateSettings(indexName)
        .setSettings(Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 1))
        .get();
    corruptionCount.await();
    logger.info("--> stopping replica assignment");
    // Disable allocation so in-flight replica recoveries get cancelled instead of retried.
    assertAcked(
        client()
            .admin()
            .cluster()
            .prepareUpdateSettings()
            .setTransientSettings(
                Settings.builder()
                    .put(EnableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ENABLE, "none")));
    logger.info("--> wait for all replica shards to be removed, on all nodes");
    assertBusy(
        new Runnable() {
            @Override
            public void run() {
                // Every non-primary node must report the replica shard as unassigned.
                for (String node : internalCluster().getNodeNames()) {
                    if (node.equals(p_node)) {
                        continue;
                    }
                    ClusterState state =
                        client(node).admin().cluster().prepareState().setLocal(true).get().getState();
                    assertThat(
                        node + " indicates assigned replicas",
                        state
                            .getRoutingTable()
                            .index(indexName)
                            .shardsWithState(ShardRoutingState.UNASSIGNED)
                            .size(),
                        equalTo(1));
                }
            }
        });
    logger.info("--> verifying no temporary recoveries are left");
    // Walk every existing shard path on every node and assert no file whose name starts with
    // the temporary-recovery prefix "recovery." survived the cancellation.
    for (String node : internalCluster().getNodeNames()) {
        NodeEnvironment nodeEnvironment = internalCluster().getInstance(NodeEnvironment.class, node);
        for (final Path shardLoc : nodeEnvironment.availableShardPaths(new ShardId(indexName, 0))) {
            if (Files.exists(shardLoc)) {
                assertBusy(
                    new Runnable() {
                        @Override
                        public void run() {
                            try {
                                Files.walkFileTree(
                                    shardLoc,
                                    new SimpleFileVisitor<Path>() {
                                        @Override
                                        public FileVisitResult visitFile(Path file, BasicFileAttributes attrs)
                                            throws IOException {
                                            assertThat(
                                                "found a temporary recovery file: " + file,
                                                file.getFileName().toString(),
                                                not(startsWith("recovery.")));
                                            return FileVisitResult.CONTINUE;
                                        }
                                    });
                            } catch (IOException e) {
                                throw new AssertionError(
                                    "failed to walk file tree starting at [" + shardLoc + "]", e);
                            }
                        }
                    });
            }
        }
    }
}
/**
 * Relocates a shadow-replica primary (node1 -> node3) while a background thread keeps indexing
 * and the TRANSLOG_OPS recovery step from node1 to node3 is forcibly failed. Once the failure
 * injection stops, the relocation must complete and a final search must see every indexed doc.
 * Indexing is split into three latch-delimited phases: before relocation, during failure
 * injection, and after failures stop.
 */
public void testPrimaryRelocationWhereRecoveryFails() throws Exception {
    Path dataPath = createTempDir();
    Settings nodeSettings =
        Settings.builder()
            .put("node.add_lock_id_to_custom_path", false)
            .put(Environment.PATH_SHARED_DATA_SETTING.getKey(), dataPath)
            .build();
    String node1 = internalCluster().startNode(nodeSettings);
    final String IDX = "test";
    // Shadow replicas on a shared filesystem: all nodes read the same on-disk index.
    Settings idxSettings =
        Settings.builder()
            .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 1)
            .put(IndexMetaData.SETTING_DATA_PATH, dataPath.toAbsolutePath().toString())
            .put(IndexMetaData.SETTING_SHADOW_REPLICAS, true)
            .put(IndexMetaData.SETTING_SHARED_FILESYSTEM, true)
            .build();
    prepareCreate(IDX).setSettings(idxSettings).addMapping("doc", "foo", "type=text").get();
    // Node1 has the primary, now node2 has the replica
    String node2 = internalCluster().startNode(nodeSettings);
    ensureGreen(IDX);
    flushAndRefresh(IDX);
    String node3 = internalCluster().startNode(nodeSettings);
    final AtomicInteger counter = new AtomicInteger(0);
    final CountDownLatch started = new CountDownLatch(1);
    final int numPhase1Docs = scaledRandomIntBetween(25, 200);
    final int numPhase2Docs = scaledRandomIntBetween(25, 200);
    final int numPhase3Docs = scaledRandomIntBetween(25, 200);
    final CountDownLatch phase1finished = new CountDownLatch(1);
    final CountDownLatch phase2finished = new CountDownLatch(1);
    final CountDownLatch phase3finished = new CountDownLatch(1);
    final AtomicBoolean keepFailing = new AtomicBoolean(true);
    // Fail every TRANSLOG_OPS recovery request from node1 to node3 while keepFailing is set,
    // so the primary relocation to node3 keeps aborting during that phase.
    MockTransportService mockTransportService =
        ((MockTransportService) internalCluster().getInstance(TransportService.class, node1));
    mockTransportService.addDelegate(
        internalCluster().getInstance(TransportService.class, node3),
        new MockTransportService.DelegateTransport(mockTransportService.original()) {
            @Override
            public void sendRequest(
                DiscoveryNode node,
                long requestId,
                String action,
                TransportRequest request,
                TransportRequestOptions options)
                throws IOException, TransportException {
                if (keepFailing.get()
                    && action.equals(PeerRecoveryTargetService.Actions.TRANSLOG_OPS)) {
                    logger.info("--> failing translog ops");
                    throw new ElasticsearchException("failing on purpose");
                }
                super.sendRequest(node, requestId, action, request, options);
            }
        });
    // Background indexer: ids are the running counter value; each phase boundary releases the
    // matching latch so the main thread can synchronize with indexing progress.
    Thread thread =
        new Thread() {
            @Override
            public void run() {
                started.countDown();
                while (counter.get() < (numPhase1Docs + numPhase2Docs + numPhase3Docs)) {
                    final IndexResponse indexResponse =
                        client()
                            .prepareIndex(IDX, "doc", Integer.toString(counter.incrementAndGet()))
                            .setSource("foo", "bar")
                            .get();
                    assertEquals(DocWriteResponse.Result.CREATED, indexResponse.getResult());
                    final int docCount = counter.get();
                    if (docCount == numPhase1Docs) {
                        phase1finished.countDown();
                    } else if (docCount == (numPhase1Docs + numPhase2Docs)) {
                        phase2finished.countDown();
                    }
                }
                logger.info("--> stopping indexing thread");
                phase3finished.countDown();
            }
        };
    thread.start();
    started.await();
    phase1finished.await(); // wait for a certain number of documents to be indexed
    logger.info("--> excluding {} from allocation", node1);
    // now prevent primary from being allocated on node 1 move to node_3
    Settings build =
        Settings.builder().put("index.routing.allocation.exclude._name", node1).build();
    client().admin().indices().prepareUpdateSettings(IDX).setSettings(build).execute().actionGet();
    // wait for more documents to be indexed post-recovery, also waits for
    // indexing thread to stop
    phase2finished.await();
    // stop failing
    keepFailing.set(false);
    // wait for more docs to be indexed
    phase3finished.await();
    ensureGreen(IDX);
    thread.join();
    logger.info("--> performing query");
    flushAndRefresh();
    SearchResponse resp = client().prepareSearch(IDX).setQuery(matchAllQuery()).get();
    assertHitCount(resp, counter.get());
}