@Override
public void onFailure(Exception e) {
  // Retry only when the failure was caused by a rejected execution (queue full); any other
  // exception is treated as non-retryable and fails the request immediately.
  if (ExceptionsHelper.unwrap(e, EsRejectedExecutionException.class) != null) {
    if (retries.hasNext()) {
      retryCount += 1;
      TimeValue delay = retries.next();
      logger.trace("retrying rejected search after [{}]", e, delay);
      countSearchRetry.run();
      // Re-run this listener after the backoff delay on the same thread pool.
      threadPool.schedule(delay, ThreadPool.Names.SAME, this);
    } else {
      logger.warn("giving up on search because we retried [{}] times without success", e, retryCount);
      fail.accept(e);
    }
  } else {
    logger.warn("giving up on search because it failed with a non-retryable exception", e);
    fail.accept(e);
  }
}
@Override
public void messageReceived(
    final RecoveryTranslogOperationsRequest request, final TransportChannel channel)
    throws IOException {
  try (RecoveriesCollection.RecoveryRef recoveryRef =
      onGoingRecoveries.getRecoverySafe(request.recoveryId(), request.shardId())) {
    final ClusterStateObserver observer =
        new ClusterStateObserver(clusterService, null, logger, threadPool.getThreadContext());
    final RecoveryTarget recoveryTarget = recoveryRef.status();
    try {
      recoveryTarget.indexTranslogOperations(request.operations(), request.totalTranslogOps());
      channel.sendResponse(TransportResponse.Empty.INSTANCE);
    } catch (TranslogRecoveryPerformer.BatchOperationException exception) {
      MapperException mapperException =
          (MapperException) ExceptionsHelper.unwrap(exception, MapperException.class);
      if (mapperException == null) {
        throw exception;
      }
      // In very rare cases a translog replay from the primary is processed before a mapping
      // update on this node, which causes local mapping changes since the mapping (cluster
      // state) might not have arrived here yet. We want to wait until these mappings are
      // processed, but we also need to roll back the number of completed operations in this
      // batch to keep the accounting correct.
      logger.trace(
          (Supplier<?>)
              () ->
                  new ParameterizedMessage(
                      "delaying recovery due to missing mapping changes (rolling back stats for [{}] ops)",
                      exception.completedOperations()),
          exception);
      final RecoveryState.Translog translog = recoveryTarget.state().getTranslog();
      // Do the maintenance and roll back the completed ops.
      translog.decrementRecoveredOperations(exception.completedOperations());
      // We do not need a timeout here since the entire recovery mechanism has an inactivity
      // protection (the recovery will be canceled).
      observer.waitForNextChange(
          new ClusterStateObserver.Listener() {
            @Override
            public void onNewClusterState(ClusterState state) {
              try {
                messageReceived(request, channel);
              } catch (Exception e) {
                onFailure(e);
              }
            }

            protected void onFailure(Exception e) {
              try {
                channel.sendResponse(e);
              } catch (IOException e1) {
                logger.warn("failed to send error back to recovery source", e1);
              }
            }

            @Override
            public void onClusterServiceClose() {
              onFailure(
                  new ElasticsearchException(
                      "cluster service was closed while waiting for mapping updates"));
            }

            @Override
            public void onTimeout(TimeValue timeout) {
              // Note that we do not use a timeout (see comment above).
              onFailure(
                  new ElasticsearchTimeoutException(
                      "timed out waiting for mapping updates (timeout [" + timeout + "])"));
            }
          });
    }
  }
}
@Override
public void messageReceived(
    final RecoveryTranslogOperationsRequest request, final TransportChannel channel)
    throws Exception {
  try (RecoveriesCollection.StatusRef statusRef =
      onGoingRecoveries.getStatusSafe(request.recoveryId(), request.shardId())) {
    final ClusterStateObserver observer = new ClusterStateObserver(clusterService, null, logger);
    final RecoveryStatus recoveryStatus = statusRef.status();
    final RecoveryState.Translog translog = recoveryStatus.state().getTranslog();
    translog.totalOperations(request.totalTranslogOps());
    assert recoveryStatus.indexShard().recoveryState() == recoveryStatus.state();
    try {
      recoveryStatus.indexShard().performBatchRecovery(request.operations());
      channel.sendResponse(TransportResponse.Empty.INSTANCE);
    } catch (TranslogRecoveryPerformer.BatchOperationException exception) {
      MapperException mapperException =
          (MapperException) ExceptionsHelper.unwrap(exception, MapperException.class);
      if (mapperException == null) {
        throw exception;
      }
      // In very rare cases a translog replay from the primary is processed before a mapping
      // update on this node, which causes local mapping changes. We want to wait until these
      // mappings are processed.
      logger.trace(
          "delaying recovery due to missing mapping changes (rolling back stats for [{}] ops)",
          exception,
          exception.completedOperations());
      translog.decrementRecoveredOperations(exception.completedOperations());
      // We do not need a timeout here since the entire recovery mechanism has an inactivity
      // protection (the recovery will be canceled).
      observer.waitForNextChange(
          new ClusterStateObserver.Listener() {
            @Override
            public void onNewClusterState(ClusterState state) {
              try {
                messageReceived(request, channel);
              } catch (Exception e) {
                onFailure(e);
              }
            }

            protected void onFailure(Exception e) {
              try {
                channel.sendResponse(e);
              } catch (IOException e1) {
                logger.warn("failed to send error back to recovery source", e1);
              }
            }

            @Override
            public void onClusterServiceClose() {
              onFailure(
                  new ElasticsearchException(
                      "cluster service was closed while waiting for mapping updates"));
            }

            @Override
            public void onTimeout(TimeValue timeout) {
              // Note that we do not use a timeout (see comment above).
              onFailure(
                  new ElasticsearchTimeoutException(
                      "timed out waiting for mapping updates (timeout [" + timeout + "])"));
            }
          });
    }
  }
}
/**
 * Test that a breaker correctly redistributes to a different breaker; in this case, the fielddata
 * breaker borrows space from the request breaker.
 */
@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/18325")
public void testParentChecking() throws Exception {
  if (noopBreakerUsed()) {
    logger.info("--> noop breakers used, skipping test");
    return;
  }
  assertAcked(
      prepareCreate(
              "cb-test", 1, Settings.builder().put(SETTING_NUMBER_OF_REPLICAS, between(0, 1)))
          .addMapping("type", "test", "type=text,fielddata=true"));
  Client client = client();

  // Index some different terms so we have some field data for loading
  int docCount = scaledRandomIntBetween(300, 1000);
  List<IndexRequestBuilder> reqs = new ArrayList<>();
  for (long id = 0; id < docCount; id++) {
    reqs.add(
        client
            .prepareIndex("cb-test", "type", Long.toString(id))
            .setSource("test", "value" + id));
  }
  indexRandom(true, reqs);

  Settings resetSettings =
      Settings.builder()
          .put(
              HierarchyCircuitBreakerService.FIELDDATA_CIRCUIT_BREAKER_LIMIT_SETTING.getKey(),
              "10b")
          .put(
              HierarchyCircuitBreakerService.FIELDDATA_CIRCUIT_BREAKER_OVERHEAD_SETTING.getKey(),
              1.0)
          .build();
  assertAcked(
      client.admin().cluster().prepareUpdateSettings().setTransientSettings(resetSettings));

  // Perform a search to load field data for the "test" field
  try {
    client
        .prepareSearch("cb-test")
        .setQuery(matchAllQuery())
        .addSort("test", SortOrder.DESC)
        .get();
    fail("should have thrown an exception");
  } catch (Exception e) {
    String errMsg =
        "[fielddata] Data too large, data for [test] would be larger than limit of [10/10b]";
    assertThat(
        "Exception: [" + e.toString() + "] should contain a CircuitBreakingException",
        e.toString(),
        containsString(errMsg));
  }

  assertFailures(
      client.prepareSearch("cb-test").setQuery(matchAllQuery()).addSort("test", SortOrder.DESC),
      RestStatus.INTERNAL_SERVER_ERROR,
      containsString("Data too large, data for [test] would be larger than limit of [10/10b]"));

  reset();

  // Adjust settings so the parent breaker will fail, but neither the fielddata breaker nor the
  // node request breaker will fail
  resetSettings =
      Settings.builder()
          .put(HierarchyCircuitBreakerService.TOTAL_CIRCUIT_BREAKER_LIMIT_SETTING.getKey(), "500b")
          .put(
              HierarchyCircuitBreakerService.FIELDDATA_CIRCUIT_BREAKER_LIMIT_SETTING.getKey(),
              "90%")
          .put(
              HierarchyCircuitBreakerService.FIELDDATA_CIRCUIT_BREAKER_OVERHEAD_SETTING.getKey(),
              1.0)
          .build();
  client
      .admin()
      .cluster()
      .prepareUpdateSettings()
      .setTransientSettings(resetSettings)
      .execute()
      .actionGet();

  // Perform a search to load field data for the "test" field
  try {
    client
        .prepareSearch("cb-test")
        .setQuery(matchAllQuery())
        .addSort("test", SortOrder.DESC)
        .get();
    fail("should have thrown an exception");
  } catch (Exception e) {
    final Throwable cause = ExceptionsHelper.unwrap(e, CircuitBreakingException.class);
    assertNotNull("CircuitBreakingException is not the cause of " + e, cause);
    String errMsg = "would be larger than limit of [500/500b]]";
    assertThat(
        "Exception: [" + cause.toString() + "] should contain a CircuitBreakingException",
        cause.toString(),
        startsWith("CircuitBreakingException[[parent] Data too large"));
    assertThat(
        "Exception: [" + cause.toString() + "] should contain a CircuitBreakingException",
        cause.toString(),
        endsWith(errMsg));
  } finally {
    // Reset before teardown as it requires properly set up breakers
    reset();
  }
}