@Override
 public void onFailure(Exception e) {
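   // Retry only if the failure is (or wraps) a thread-pool rejection; any other failure is fatal.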
   if (ExceptionsHelper.unwrap(e, EsRejectedExecutionException.class) != null) {
     if (retries.hasNext()) {
       retryCount += 1;
       TimeValue delay = retries.next();
       logger.trace("retrying rejected search after [{}]", e, delay);
       countSearchRetry.run();
       threadPool.schedule(delay, ThreadPool.Names.SAME, this);
     } else {
       logger.warn(
           "giving up on search because we retried [{}] times without success", e, retryCount);
       fail.accept(e);
     }
   } else {
     logger.warn("giving up on search because it failed with a non-retryable exception", e);
     fail.accept(e);
   }
 }
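// The `retries` iterator drained above typically comes from a backoff policy. Below is a minimal
// sketch (not the production wiring) of how such an iterator could be produced, assuming
// Elasticsearch's org.elasticsearch.action.bulk.BackoffPolicy helper; the initial delay and retry
// count are illustrative values, not the ones used by the surrounding class.
import java.util.Iterator;

import org.elasticsearch.action.bulk.BackoffPolicy;
import org.elasticsearch.common.unit.TimeValue;

public class RetryBackoffSketch {
  public static void main(String[] args) {
    // Illustrative parameters: start at 500ms and allow up to 8 exponentially growing retries.
    BackoffPolicy policy = BackoffPolicy.exponentialBackoff(TimeValue.timeValueMillis(500), 8);

    // Each call to iterator() yields a fresh sequence of delays. hasNext() returning false is the
    // "giving up" branch in onFailure() above.
    Iterator<TimeValue> retries = policy.iterator();
    while (retries.hasNext()) {
      System.out.println("next retry delay: " + retries.next());
    }
  }
}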
    @Override
    public void messageReceived(
        final RecoveryTranslogOperationsRequest request, final TransportChannel channel)
        throws IOException {
      try (RecoveriesCollection.RecoveryRef recoveryRef =
          onGoingRecoveries.getRecoverySafe(request.recoveryId(), request.shardId())) {
        final ClusterStateObserver observer =
            new ClusterStateObserver(clusterService, null, logger, threadPool.getThreadContext());
        final RecoveryTarget recoveryTarget = recoveryRef.status();
        try {
          recoveryTarget.indexTranslogOperations(request.operations(), request.totalTranslogOps());
          channel.sendResponse(TransportResponse.Empty.INSTANCE);
        } catch (TranslogRecoveryPerformer.BatchOperationException exception) {
          MapperException mapperException =
              (MapperException) ExceptionsHelper.unwrap(exception, MapperException.class);
          if (mapperException == null) {
            throw exception;
          }
           // In very rare cases a translog replay from the primary is processed before the
           // corresponding mapping update (cluster state) has arrived on this node, which causes
           // local mapping changes. We want to wait until those mappings are processed, but we also
           // need to do some maintenance and roll back the number of processed (completed)
           // operations in this batch to ensure the accounting is correct.
          logger.trace(
              (Supplier<?>)
                  () ->
                      new ParameterizedMessage(
                          "delaying recovery due to missing mapping changes (rolling back stats for [{}] ops)",
                          exception.completedOperations()),
              exception);
          final RecoveryState.Translog translog = recoveryTarget.state().getTranslog();
           translog.decrementRecoveredOperations(
               exception.completedOperations()); // do the maintenance and roll back completed ops
           // we do not need to use a timeout here since the entire recovery mechanism has an
           // inactivity protection (it will be canceled)
          observer.waitForNextChange(
              new ClusterStateObserver.Listener() {
                @Override
                public void onNewClusterState(ClusterState state) {
                  try {
                    messageReceived(request, channel);
                  } catch (Exception e) {
                    onFailure(e);
                  }
                }

                protected void onFailure(Exception e) {
                  try {
                    channel.sendResponse(e);
                  } catch (IOException e1) {
                    logger.warn("failed to send error back to recovery source", e1);
                  }
                }

                @Override
                public void onClusterServiceClose() {
                  onFailure(
                      new ElasticsearchException(
                          "cluster service was closed while waiting for mapping updates"));
                }

                @Override
                public void onTimeout(TimeValue timeout) {
                  // note that we do not use a timeout (see comment above)
                  onFailure(
                      new ElasticsearchTimeoutException(
                          "timed out waiting for mapping updates (timeout [" + timeout + "])"));
                }
              });
        }
      }
    }
Example #3
    @Override
    public void messageReceived(
        final RecoveryTranslogOperationsRequest request, final TransportChannel channel)
        throws Exception {
      try (RecoveriesCollection.StatusRef statusRef =
          onGoingRecoveries.getStatusSafe(request.recoveryId(), request.shardId())) {
        final ClusterStateObserver observer =
            new ClusterStateObserver(clusterService, null, logger);
        final RecoveryStatus recoveryStatus = statusRef.status();
        final RecoveryState.Translog translog = recoveryStatus.state().getTranslog();
        translog.totalOperations(request.totalTranslogOps());
        assert recoveryStatus.indexShard().recoveryState() == recoveryStatus.state();
        try {
          recoveryStatus.indexShard().performBatchRecovery(request.operations());
          channel.sendResponse(TransportResponse.Empty.INSTANCE);
        } catch (TranslogRecoveryPerformer.BatchOperationException exception) {
          MapperException mapperException =
              (MapperException) ExceptionsHelper.unwrap(exception, MapperException.class);
          if (mapperException == null) {
            throw exception;
          }
           // In very rare cases a translog replay from the primary is processed before the
           // corresponding mapping update has arrived on this node, which causes local mapping
           // changes. We want to wait until those mappings are processed.
          logger.trace(
              "delaying recovery due to missing mapping changes (rolling back stats for [{}] ops)",
              exception,
              exception.completedOperations());
          translog.decrementRecoveredOperations(exception.completedOperations());
           // we do not need to use a timeout here since the entire recovery mechanism has an
           // inactivity protection (it will be canceled)
          observer.waitForNextChange(
              new ClusterStateObserver.Listener() {
                @Override
                public void onNewClusterState(ClusterState state) {
                  try {
                    messageReceived(request, channel);
                  } catch (Exception e) {
                    onFailure(e);
                  }
                }

                protected void onFailure(Exception e) {
                  try {
                    channel.sendResponse(e);
                  } catch (IOException e1) {
                    logger.warn("failed to send error back to recovery source", e1);
                  }
                }

                @Override
                public void onClusterServiceClose() {
                  onFailure(
                      new ElasticsearchException(
                          "cluster service was closed while waiting for mapping updates"));
                }

                @Override
                public void onTimeout(TimeValue timeout) {
                  // note that we do not use a timeout (see comment above)
                  onFailure(
                      new ElasticsearchTimeoutException(
                          "timed out waiting for mapping updates (timeout [" + timeout + "])"));
                }
              });
        }
      }
    }
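// Both recovery handlers above (and the retry handler at the top of this listing) rely on
// ExceptionsHelper.unwrap to find a specific exception type anywhere in a failure's cause chain.
// The class below is a small standalone sketch of that idea, not the actual Elasticsearch
// implementation; the class name and the iteration limit are made up for the example.
public final class CauseChainUnwrapSketch {
  private CauseChainUnwrapSketch() {}

  /**
   * Walks the cause chain of {@code t} and returns the first throwable that is an instance of any
   * of the given classes, or {@code null} if none matches. The depth limit guards against
   * pathological cause cycles.
   */
  public static Throwable unwrap(Throwable t, Class<?>... clazzes) {
    int depth = 0;
    while (t != null && depth++ < 10) {
      for (Class<?> clazz : clazzes) {
        if (clazz.isInstance(t)) {
          return t;
        }
      }
      t = t.getCause();
    }
    return null;
  }

  public static void main(String[] args) {
    Exception nested = new RuntimeException("outer", new IllegalStateException("inner"));
    // The matching cause is returned; an absent type yields null (the "non-retryable" branch).
    System.out.println(unwrap(nested, IllegalStateException.class)); // the inner IllegalStateException
    System.out.println(unwrap(nested, NullPointerException.class)); // null
  }
}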
  /**
   * Test that the parent (total) breaker correctly accounts for its child breakers: the fielddata
   * breaker can borrow space from the request breaker, but the parent breaker trips once their
   * combined usage exceeds its limit.
   */
  @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/18325")
  public void testParentChecking() throws Exception {
    if (noopBreakerUsed()) {
      logger.info("--> noop breakers used, skipping test");
      return;
    }
    assertAcked(
        prepareCreate(
                "cb-test", 1, Settings.builder().put(SETTING_NUMBER_OF_REPLICAS, between(0, 1)))
            .addMapping("type", "test", "type=text,fielddata=true"));
    Client client = client();

    // index some different terms so we have some field data for loading
    int docCount = scaledRandomIntBetween(300, 1000);
    List<IndexRequestBuilder> reqs = new ArrayList<>();
    for (long id = 0; id < docCount; id++) {
      reqs.add(
          client
              .prepareIndex("cb-test", "type", Long.toString(id))
              .setSource("test", "value" + id));
    }
    indexRandom(true, reqs);

    Settings resetSettings =
        Settings.builder()
            .put(
                HierarchyCircuitBreakerService.FIELDDATA_CIRCUIT_BREAKER_LIMIT_SETTING.getKey(),
                "10b")
            .put(
                HierarchyCircuitBreakerService.FIELDDATA_CIRCUIT_BREAKER_OVERHEAD_SETTING.getKey(),
                1.0)
            .build();
    assertAcked(
        client.admin().cluster().prepareUpdateSettings().setTransientSettings(resetSettings));

    // Perform a search to load field data for the "test" field
    try {
      client
          .prepareSearch("cb-test")
          .setQuery(matchAllQuery())
          .addSort("test", SortOrder.DESC)
          .get();
      fail("should have thrown an exception");
    } catch (Exception e) {
      String errMsg =
          "[fielddata] Data too large, data for [test] would be larger than limit of [10/10b]";
      assertThat(
          "Exception: [" + e.toString() + "] should contain a CircuitBreakingException",
          e.toString(),
          containsString(errMsg));
    }

    assertFailures(
        client.prepareSearch("cb-test").setQuery(matchAllQuery()).addSort("test", SortOrder.DESC),
        RestStatus.INTERNAL_SERVER_ERROR,
        containsString("Data too large, data for [test] would be larger than limit of [10/10b]"));

    reset();

    // Adjust settings so the parent breaker will fail, but neither the fielddata breaker nor the
    // node request breaker will fail
    resetSettings =
        Settings.builder()
            .put(
                HierarchyCircuitBreakerService.TOTAL_CIRCUIT_BREAKER_LIMIT_SETTING.getKey(), "500b")
            .put(
                HierarchyCircuitBreakerService.FIELDDATA_CIRCUIT_BREAKER_LIMIT_SETTING.getKey(),
                "90%")
            .put(
                HierarchyCircuitBreakerService.FIELDDATA_CIRCUIT_BREAKER_OVERHEAD_SETTING.getKey(),
                1.0)
            .build();
    client
        .admin()
        .cluster()
        .prepareUpdateSettings()
        .setTransientSettings(resetSettings)
        .execute()
        .actionGet();

    // Perform a search to load field data for the "test" field
    try {
      client
          .prepareSearch("cb-test")
          .setQuery(matchAllQuery())
          .addSort("test", SortOrder.DESC)
          .get();
      fail("should have thrown an exception");
    } catch (Exception e) {
      final Throwable cause = ExceptionsHelper.unwrap(e, CircuitBreakingException.class);
      assertNotNull("CircuitBreakingException is not the cause of " + e, cause);
      String errMsg = "would be larger than limit of [500/500b]]";
      assertThat(
          "Exception: [" + cause.toString() + "] should contain a CircuitBreakingException",
          cause.toString(),
          startsWith("CircuitBreakingException[[parent] Data too large"));
      assertThat(
          "Exception: [" + cause.toString() + "] should contain a CircuitBreakingException",
          cause.toString(),
          endsWith(errMsg));
    } finally {
      // reset before teardown, as teardown requires properly set up breakers
      reset();
    }
  }
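// The test above calls a reset() helper that is not shown in this listing. The class below is a
// rough sketch of what such a helper might do, assuming it simply clears the transient breaker
// settings back to their defaults; the class name, method name, and the exact set of cleared keys
// are assumptions for illustration, not the actual helper.
import org.elasticsearch.client.Client;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.indices.breaker.HierarchyCircuitBreakerService;

public class BreakerSettingsResetSketch {
  /** Clears the transient breaker limits used by the test so later tests see default values. */
  static void resetBreakerSettings(Client client) {
    Settings clearSettings =
        Settings.builder()
            .putNull(HierarchyCircuitBreakerService.TOTAL_CIRCUIT_BREAKER_LIMIT_SETTING.getKey())
            .putNull(HierarchyCircuitBreakerService.FIELDDATA_CIRCUIT_BREAKER_LIMIT_SETTING.getKey())
            .putNull(
                HierarchyCircuitBreakerService.FIELDDATA_CIRCUIT_BREAKER_OVERHEAD_SETTING.getKey())
            .build();
    client.admin().cluster().prepareUpdateSettings().setTransientSettings(clearSettings).get();
  }
}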