예제 #1
0
public class GatewayService extends AbstractLifecycleComponent implements ClusterStateListener {

  public static final Setting<Integer> EXPECTED_NODES_SETTING =
      Setting.intSetting("gateway.expected_nodes", -1, -1, Property.NodeScope);
  public static final Setting<Integer> EXPECTED_DATA_NODES_SETTING =
      Setting.intSetting("gateway.expected_data_nodes", -1, -1, Property.NodeScope);
  public static final Setting<Integer> EXPECTED_MASTER_NODES_SETTING =
      Setting.intSetting("gateway.expected_master_nodes", -1, -1, Property.NodeScope);
  public static final Setting<TimeValue> RECOVER_AFTER_TIME_SETTING =
      Setting.positiveTimeSetting(
          "gateway.recover_after_time", TimeValue.timeValueMillis(0), Property.NodeScope);
  public static final Setting<Integer> RECOVER_AFTER_NODES_SETTING =
      Setting.intSetting("gateway.recover_after_nodes", -1, -1, Property.NodeScope);
  public static final Setting<Integer> RECOVER_AFTER_DATA_NODES_SETTING =
      Setting.intSetting("gateway.recover_after_data_nodes", -1, -1, Property.NodeScope);
  public static final Setting<Integer> RECOVER_AFTER_MASTER_NODES_SETTING =
      Setting.intSetting("gateway.recover_after_master_nodes", 0, 0, Property.NodeScope);

  public static final ClusterBlock STATE_NOT_RECOVERED_BLOCK =
      new ClusterBlock(
          1,
          "state not recovered / initialized",
          true,
          true,
          RestStatus.SERVICE_UNAVAILABLE,
          ClusterBlockLevel.ALL);

  public static final TimeValue DEFAULT_RECOVER_AFTER_TIME_IF_EXPECTED_NODES_IS_SET =
      TimeValue.timeValueMinutes(5);

  private final Gateway gateway;

  private final ThreadPool threadPool;

  private final AllocationService allocationService;

  private final ClusterService clusterService;

  private final TimeValue recoverAfterTime;
  private final int recoverAfterNodes;
  private final int expectedNodes;
  private final int recoverAfterDataNodes;
  private final int expectedDataNodes;
  private final int recoverAfterMasterNodes;
  private final int expectedMasterNodes;

  private final AtomicBoolean recovered = new AtomicBoolean();
  private final AtomicBoolean scheduledRecovery = new AtomicBoolean();

  @Inject
  public GatewayService(
      Settings settings,
      AllocationService allocationService,
      ClusterService clusterService,
      ThreadPool threadPool,
      GatewayMetaState metaState,
      TransportNodesListGatewayMetaState listGatewayMetaState,
      Discovery discovery,
      IndicesService indicesService) {
    super(settings);
    this.gateway =
        new Gateway(
            settings, clusterService, metaState, listGatewayMetaState, discovery, indicesService);
    this.allocationService = allocationService;
    this.clusterService = clusterService;
    this.threadPool = threadPool;
    // allow to control a delay of when indices will get created
    this.expectedNodes = EXPECTED_NODES_SETTING.get(this.settings);
    this.expectedDataNodes = EXPECTED_DATA_NODES_SETTING.get(this.settings);
    this.expectedMasterNodes = EXPECTED_MASTER_NODES_SETTING.get(this.settings);

    if (RECOVER_AFTER_TIME_SETTING.exists(this.settings)) {
      recoverAfterTime = RECOVER_AFTER_TIME_SETTING.get(this.settings);
    } else if (expectedNodes >= 0 || expectedDataNodes >= 0 || expectedMasterNodes >= 0) {
      recoverAfterTime = DEFAULT_RECOVER_AFTER_TIME_IF_EXPECTED_NODES_IS_SET;
    } else {
      recoverAfterTime = null;
    }
    this.recoverAfterNodes = RECOVER_AFTER_NODES_SETTING.get(this.settings);
    this.recoverAfterDataNodes = RECOVER_AFTER_DATA_NODES_SETTING.get(this.settings);
    // default the recover after master nodes to the minimum master nodes in the discovery
    if (RECOVER_AFTER_MASTER_NODES_SETTING.exists(this.settings)) {
      recoverAfterMasterNodes = RECOVER_AFTER_MASTER_NODES_SETTING.get(this.settings);
    } else {
      // TODO: change me once the minimum_master_nodes is changed too
      recoverAfterMasterNodes = settings.getAsInt("discovery.zen.minimum_master_nodes", -1);
    }

    // Add the not recovered as initial state block, we don't allow anything until
    this.clusterService.addInitialStateBlock(STATE_NOT_RECOVERED_BLOCK);
  }

  @Override
  protected void doStart() {
    // use post applied so that the state will be visible to the background recovery thread we spawn
    // in performStateRecovery
    clusterService.addListener(this);
  }

  @Override
  protected void doStop() {
    clusterService.removeListener(this);
  }

  @Override
  protected void doClose() {}

  @Override
  public void clusterChanged(final ClusterChangedEvent event) {
    if (lifecycle.stoppedOrClosed()) {
      return;
    }

    final ClusterState state = event.state();

    if (state.nodes().isLocalNodeElectedMaster() == false) {
      // not our job to recover
      return;
    }
    if (state.blocks().hasGlobalBlock(STATE_NOT_RECOVERED_BLOCK) == false) {
      // already recovered
      return;
    }

    DiscoveryNodes nodes = state.nodes();
    if (state.nodes().getMasterNodeId() == null) {
      logger.debug("not recovering from gateway, no master elected yet");
    } else if (recoverAfterNodes != -1
        && (nodes.getMasterAndDataNodes().size()) < recoverAfterNodes) {
      logger.debug(
          "not recovering from gateway, nodes_size (data+master) [{}] < recover_after_nodes [{}]",
          nodes.getMasterAndDataNodes().size(),
          recoverAfterNodes);
    } else if (recoverAfterDataNodes != -1 && nodes.getDataNodes().size() < recoverAfterDataNodes) {
      logger.debug(
          "not recovering from gateway, nodes_size (data) [{}] < recover_after_data_nodes [{}]",
          nodes.getDataNodes().size(),
          recoverAfterDataNodes);
    } else if (recoverAfterMasterNodes != -1
        && nodes.getMasterNodes().size() < recoverAfterMasterNodes) {
      logger.debug(
          "not recovering from gateway, nodes_size (master) [{}] < recover_after_master_nodes [{}]",
          nodes.getMasterNodes().size(),
          recoverAfterMasterNodes);
    } else {
      boolean enforceRecoverAfterTime;
      String reason;
      if (expectedNodes == -1 && expectedMasterNodes == -1 && expectedDataNodes == -1) {
        // no expected is set, honor the setting if they are there
        enforceRecoverAfterTime = true;
        reason = "recover_after_time was set to [" + recoverAfterTime + "]";
      } else {
        // one of the expected is set, see if all of them meet the need, and ignore the timeout in
        // this case
        enforceRecoverAfterTime = false;
        reason = "";
        if (expectedNodes != -1
            && (nodes.getMasterAndDataNodes().size()
                < expectedNodes)) { // does not meet the expected...
          enforceRecoverAfterTime = true;
          reason =
              "expecting ["
                  + expectedNodes
                  + "] nodes, but only have ["
                  + nodes.getMasterAndDataNodes().size()
                  + "]";
        } else if (expectedDataNodes != -1
            && (nodes.getDataNodes().size() < expectedDataNodes)) { // does not meet the expected...
          enforceRecoverAfterTime = true;
          reason =
              "expecting ["
                  + expectedDataNodes
                  + "] data nodes, but only have ["
                  + nodes.getDataNodes().size()
                  + "]";
        } else if (expectedMasterNodes != -1
            && (nodes.getMasterNodes().size()
                < expectedMasterNodes)) { // does not meet the expected...
          enforceRecoverAfterTime = true;
          reason =
              "expecting ["
                  + expectedMasterNodes
                  + "] master nodes, but only have ["
                  + nodes.getMasterNodes().size()
                  + "]";
        }
      }
      performStateRecovery(enforceRecoverAfterTime, reason);
    }
  }

  private void performStateRecovery(boolean enforceRecoverAfterTime, String reason) {
    final Gateway.GatewayStateRecoveredListener recoveryListener = new GatewayRecoveryListener();

    if (enforceRecoverAfterTime && recoverAfterTime != null) {
      if (scheduledRecovery.compareAndSet(false, true)) {
        logger.info("delaying initial state recovery for [{}]. {}", recoverAfterTime, reason);
        threadPool.schedule(
            recoverAfterTime,
            ThreadPool.Names.GENERIC,
            () -> {
              if (recovered.compareAndSet(false, true)) {
                logger.info(
                    "recover_after_time [{}] elapsed. performing state recovery...",
                    recoverAfterTime);
                gateway.performStateRecovery(recoveryListener);
              }
            });
      }
    } else {
      if (recovered.compareAndSet(false, true)) {
        threadPool
            .generic()
            .execute(
                new AbstractRunnable() {
                  @Override
                  public void onFailure(Exception e) {
                    logger.warn("Recovery failed", e);
                    // we reset `recovered` in the listener don't reset it here otherwise there
                    // might be a race
                    // that resets it to false while a new recover is already running?
                    recoveryListener.onFailure("state recovery failed: " + e.getMessage());
                  }

                  @Override
                  protected void doRun() throws Exception {
                    gateway.performStateRecovery(recoveryListener);
                  }
                });
      }
    }
  }

  public Gateway getGateway() {
    return gateway;
  }

  class GatewayRecoveryListener implements Gateway.GatewayStateRecoveredListener {

    @Override
    public void onSuccess(final ClusterState recoveredState) {
      logger.trace("successful state recovery, importing cluster state...");
      clusterService.submitStateUpdateTask(
          "local-gateway-elected-state",
          new ClusterStateUpdateTask() {
            @Override
            public ClusterState execute(ClusterState currentState) {
              assert currentState.metaData().indices().isEmpty();

              // remove the block, since we recovered from gateway
              ClusterBlocks.Builder blocks =
                  ClusterBlocks.builder()
                      .blocks(currentState.blocks())
                      .blocks(recoveredState.blocks())
                      .removeGlobalBlock(STATE_NOT_RECOVERED_BLOCK);

              MetaData.Builder metaDataBuilder = MetaData.builder(recoveredState.metaData());
              // automatically generate a UID for the metadata if we need to
              metaDataBuilder.generateClusterUuidIfNeeded();

              if (MetaData.SETTING_READ_ONLY_SETTING.get(recoveredState.metaData().settings())
                  || MetaData.SETTING_READ_ONLY_SETTING.get(currentState.metaData().settings())) {
                blocks.addGlobalBlock(MetaData.CLUSTER_READ_ONLY_BLOCK);
              }

              for (IndexMetaData indexMetaData : recoveredState.metaData()) {
                metaDataBuilder.put(indexMetaData, false);
                blocks.addBlocks(indexMetaData);
              }

              // update the state to reflect the new metadata and routing
              ClusterState updatedState =
                  ClusterState.builder(currentState)
                      .blocks(blocks)
                      .metaData(metaDataBuilder)
                      .build();

              // initialize all index routing tables as empty
              RoutingTable.Builder routingTableBuilder =
                  RoutingTable.builder(updatedState.routingTable());
              for (ObjectCursor<IndexMetaData> cursor :
                  updatedState.metaData().indices().values()) {
                routingTableBuilder.addAsRecovery(cursor.value);
              }
              // start with 0 based versions for routing table
              routingTableBuilder.version(0);

              // now, reroute
              updatedState =
                  ClusterState.builder(updatedState)
                      .routingTable(routingTableBuilder.build())
                      .build();
              return allocationService.reroute(updatedState, "state recovered");
            }

            @Override
            public void onFailure(String source, Exception e) {
              logger.error(
                  (Supplier<?>)
                      () -> new ParameterizedMessage("unexpected failure during [{}]", source),
                  e);
              GatewayRecoveryListener.this.onFailure("failed to updated cluster state");
            }

            @Override
            public void clusterStateProcessed(
                String source, ClusterState oldState, ClusterState newState) {
              logger.info(
                  "recovered [{}] indices into cluster_state",
                  newState.metaData().indices().size());
            }
          });
    }

    @Override
    public void onFailure(String message) {
      recovered.set(false);
      scheduledRecovery.set(false);
      // don't remove the block here, we don't want to allow anything in such a case
      logger.info("metadata state not restored, reason: {}", message);
    }
  }

  // used for testing
  public TimeValue recoverAfterTime() {
    return recoverAfterTime;
  }
}
/** A node level service that delete expired docs on node primary shards. */
public class IndicesTTLService extends AbstractLifecycleComponent<IndicesTTLService> {

  public static final Setting<TimeValue> INDICES_TTL_INTERVAL_SETTING =
      Setting.positiveTimeSetting(
          "indices.ttl.interval",
          TimeValue.timeValueSeconds(60),
          Property.Dynamic,
          Property.NodeScope);

  private final ClusterService clusterService;
  private final IndicesService indicesService;
  private final TransportBulkAction bulkAction;

  private final int bulkSize;
  private PurgerThread purgerThread;

  @Inject
  public IndicesTTLService(
      Settings settings,
      ClusterService clusterService,
      IndicesService indicesService,
      ClusterSettings clusterSettings,
      TransportBulkAction bulkAction) {
    super(settings);
    this.clusterService = clusterService;
    this.indicesService = indicesService;
    TimeValue interval = INDICES_TTL_INTERVAL_SETTING.get(settings);
    this.bulkAction = bulkAction;
    this.bulkSize = this.settings.getAsInt("indices.ttl.bulk_size", 10000);
    this.purgerThread =
        new PurgerThread(EsExecutors.threadName(settings, "[ttl_expire]"), interval);
    clusterSettings.addSettingsUpdateConsumer(
        INDICES_TTL_INTERVAL_SETTING, this.purgerThread::resetInterval);
  }

  @Override
  protected void doStart() {
    this.purgerThread.start();
  }

  @Override
  protected void doStop() {
    try {
      this.purgerThread.shutdown();
    } catch (InterruptedException e) {
      // we intentionally do not want to restore the interruption flag, we're about to shutdown
      // anyway
    }
  }

  @Override
  protected void doClose() {}

  private class PurgerThread extends Thread {
    private final AtomicBoolean running = new AtomicBoolean(true);
    private final Notifier notifier;
    private final CountDownLatch shutdownLatch = new CountDownLatch(1);

    public PurgerThread(String name, TimeValue interval) {
      super(name);
      setDaemon(true);
      this.notifier = new Notifier(interval);
    }

    public void shutdown() throws InterruptedException {
      if (running.compareAndSet(true, false)) {
        notifier.doNotify();
        shutdownLatch.await();
      }
    }

    public void resetInterval(TimeValue interval) {
      notifier.setTimeout(interval);
    }

    @Override
    public void run() {
      try {
        while (running.get()) {
          try {
            List<IndexShard> shardsToPurge = getShardsToPurge();
            purgeShards(shardsToPurge);
          } catch (Throwable e) {
            if (running.get()) {
              logger.warn("failed to execute ttl purge", e);
            }
          }
          if (running.get()) {
            notifier.await();
          }
        }
      } finally {
        shutdownLatch.countDown();
      }
    }

    /**
     * Returns the shards to purge, i.e. the local started primary shards that have ttl enabled and
     * disable_purge to false
     */
    private List<IndexShard> getShardsToPurge() {
      List<IndexShard> shardsToPurge = new ArrayList<>();
      MetaData metaData = clusterService.state().metaData();
      for (IndexService indexService : indicesService) {
        // check the value of disable_purge for this index
        IndexMetaData indexMetaData = metaData.index(indexService.index());
        if (indexMetaData == null) {
          continue;
        }
        if (indexService.getIndexSettings().isTTLPurgeDisabled()) {
          continue;
        }

        // check if ttl is enabled for at least one type of this index
        boolean hasTTLEnabled = false;
        for (String type : indexService.mapperService().types()) {
          DocumentMapper documentType = indexService.mapperService().documentMapper(type);
          if (documentType.TTLFieldMapper().enabled()) {
            hasTTLEnabled = true;
            break;
          }
        }
        if (hasTTLEnabled) {
          for (IndexShard indexShard : indexService) {
            if (indexShard.state() == IndexShardState.STARTED
                && indexShard.routingEntry().primary()
                && indexShard.routingEntry().started()) {
              shardsToPurge.add(indexShard);
            }
          }
        }
      }
      return shardsToPurge;
    }

    public TimeValue getInterval() {
      return notifier.getTimeout();
    }
  }

  private void purgeShards(List<IndexShard> shardsToPurge) {
    for (IndexShard shardToPurge : shardsToPurge) {
      Query query =
          shardToPurge
              .mapperService()
              .fullName(TTLFieldMapper.NAME)
              .rangeQuery(null, System.currentTimeMillis(), false, true);
      Engine.Searcher searcher = shardToPurge.acquireSearcher("indices_ttl");
      try {
        logger.debug(
            "[{}][{}] purging shard",
            shardToPurge.routingEntry().index(),
            shardToPurge.routingEntry().id());
        ExpiredDocsCollector expiredDocsCollector = new ExpiredDocsCollector();
        searcher.searcher().search(query, expiredDocsCollector);
        List<DocToPurge> docsToPurge = expiredDocsCollector.getDocsToPurge();

        BulkRequest bulkRequest = new BulkRequest();
        for (DocToPurge docToPurge : docsToPurge) {

          bulkRequest.add(
              new DeleteRequest()
                  .index(shardToPurge.routingEntry().getIndexName())
                  .type(docToPurge.type)
                  .id(docToPurge.id)
                  .version(docToPurge.version)
                  .routing(docToPurge.routing));
          bulkRequest = processBulkIfNeeded(bulkRequest, false);
        }
        processBulkIfNeeded(bulkRequest, true);
      } catch (Exception e) {
        logger.warn("failed to purge", e);
      } finally {
        searcher.close();
      }
    }
  }

  private static class DocToPurge {
    public final String type;
    public final String id;
    public final long version;
    public final String routing;

    public DocToPurge(String type, String id, long version, String routing) {
      this.type = type;
      this.id = id;
      this.version = version;
      this.routing = routing;
    }
  }

  private class ExpiredDocsCollector extends SimpleCollector {
    private LeafReaderContext context;
    private List<DocToPurge> docsToPurge = new ArrayList<>();

    public ExpiredDocsCollector() {}

    @Override
    public void setScorer(Scorer scorer) {}

    @Override
    public boolean needsScores() {
      return false;
    }

    @Override
    public void collect(int doc) {
      try {
        FieldsVisitor fieldsVisitor = new FieldsVisitor(false);
        context.reader().document(doc, fieldsVisitor);
        Uid uid = fieldsVisitor.uid();
        final long version =
            Versions.loadVersion(context.reader(), new Term(UidFieldMapper.NAME, uid.toBytesRef()));
        docsToPurge.add(new DocToPurge(uid.type(), uid.id(), version, fieldsVisitor.routing()));
      } catch (Exception e) {
        logger.trace("failed to collect doc", e);
      }
    }

    @Override
    public void doSetNextReader(LeafReaderContext context) throws IOException {
      this.context = context;
    }

    public List<DocToPurge> getDocsToPurge() {
      return this.docsToPurge;
    }
  }

  private BulkRequest processBulkIfNeeded(BulkRequest bulkRequest, boolean force) {
    if ((force && bulkRequest.numberOfActions() > 0) || bulkRequest.numberOfActions() >= bulkSize) {
      try {
        bulkAction.executeBulk(
            bulkRequest,
            new ActionListener<BulkResponse>() {
              @Override
              public void onResponse(BulkResponse bulkResponse) {
                if (bulkResponse.hasFailures()) {
                  int failedItems = 0;
                  for (BulkItemResponse response : bulkResponse) {
                    if (response.isFailed()) failedItems++;
                  }
                  if (logger.isTraceEnabled()) {
                    logger.trace(
                        "bulk deletion failures for [{}]/[{}] items, failure message: [{}]",
                        failedItems,
                        bulkResponse.getItems().length,
                        bulkResponse.buildFailureMessage());
                  } else {
                    logger.error(
                        "bulk deletion failures for [{}]/[{}] items",
                        failedItems,
                        bulkResponse.getItems().length);
                  }
                } else {
                  logger.trace("bulk deletion took {}ms", bulkResponse.getTookInMillis());
                }
              }

              @Override
              public void onFailure(Throwable e) {
                if (logger.isTraceEnabled()) {
                  logger.trace("failed to execute bulk", e);
                } else {
                  logger.warn("failed to execute bulk: ", e);
                }
              }
            });
      } catch (Exception e) {
        logger.warn("failed to process bulk", e);
      }
      bulkRequest = new BulkRequest();
    }
    return bulkRequest;
  }

  private static final class Notifier {

    private final ReentrantLock lock = new ReentrantLock();
    private final Condition condition = lock.newCondition();
    private volatile TimeValue timeout;

    public Notifier(TimeValue timeout) {
      assert timeout != null;
      this.timeout = timeout;
    }

    public void await() {
      lock.lock();
      try {
        condition.await(timeout.millis(), TimeUnit.MILLISECONDS);
      } catch (InterruptedException e) {
        // we intentionally do not want to restore the interruption flag, we're about to shutdown
        // anyway
      } finally {
        lock.unlock();
      }
    }

    public void setTimeout(TimeValue timeout) {
      assert timeout != null;
      this.timeout = timeout;
      doNotify();
    }

    public TimeValue getTimeout() {
      return timeout;
    }

    public void doNotify() {
      lock.lock();
      try {
        condition.signalAll();
      } finally {
        lock.unlock();
      }
    }
  }
}
public class ClusterService extends AbstractLifecycleComponent {

  public static final Setting<TimeValue> CLUSTER_SERVICE_SLOW_TASK_LOGGING_THRESHOLD_SETTING =
      Setting.positiveTimeSetting(
          "cluster.service.slow_task_logging_threshold",
          TimeValue.timeValueSeconds(30),
          Property.Dynamic,
          Property.NodeScope);

  public static final String UPDATE_THREAD_NAME = "clusterService#updateTask";
  private final ThreadPool threadPool;
  private final ClusterName clusterName;

  private BiConsumer<ClusterChangedEvent, Discovery.AckListener> clusterStatePublisher;

  private final OperationRouting operationRouting;

  private final ClusterSettings clusterSettings;

  private TimeValue slowTaskLoggingThreshold;

  private volatile PrioritizedEsThreadPoolExecutor updateTasksExecutor;

  /** Those 3 state listeners are changing infrequently - CopyOnWriteArrayList is just fine */
  private final Collection<ClusterStateListener> priorityClusterStateListeners =
      new CopyOnWriteArrayList<>();

  private final Collection<ClusterStateListener> clusterStateListeners =
      new CopyOnWriteArrayList<>();
  private final Collection<ClusterStateListener> lastClusterStateListeners =
      new CopyOnWriteArrayList<>();
  private final Map<ClusterStateTaskExecutor, List<UpdateTask>> updateTasksPerExecutor =
      new HashMap<>();
  // TODO this is rather frequently changing I guess a Synced Set would be better here and a
  // dedicated remove API
  private final Collection<ClusterStateListener> postAppliedListeners =
      new CopyOnWriteArrayList<>();
  private final Iterable<ClusterStateListener> preAppliedListeners =
      Iterables.concat(
          priorityClusterStateListeners, clusterStateListeners, lastClusterStateListeners);

  private final LocalNodeMasterListeners localNodeMasterListeners;

  private final Queue<NotifyTimeout> onGoingTimeouts = ConcurrentCollections.newQueue();

  private volatile ClusterState clusterState;

  private final ClusterBlocks.Builder initialBlocks;

  private NodeConnectionsService nodeConnectionsService;

  public ClusterService(Settings settings, ClusterSettings clusterSettings, ThreadPool threadPool) {
    super(settings);
    this.operationRouting = new OperationRouting(settings, clusterSettings);
    this.threadPool = threadPool;
    this.clusterSettings = clusterSettings;
    this.clusterName = ClusterName.CLUSTER_NAME_SETTING.get(settings);
    // will be replaced on doStart.
    this.clusterState = ClusterState.builder(clusterName).build();

    this.clusterSettings.addSettingsUpdateConsumer(
        CLUSTER_SERVICE_SLOW_TASK_LOGGING_THRESHOLD_SETTING, this::setSlowTaskLoggingThreshold);

    this.slowTaskLoggingThreshold =
        CLUSTER_SERVICE_SLOW_TASK_LOGGING_THRESHOLD_SETTING.get(settings);

    localNodeMasterListeners = new LocalNodeMasterListeners(threadPool);

    initialBlocks = ClusterBlocks.builder();
  }

  private void setSlowTaskLoggingThreshold(TimeValue slowTaskLoggingThreshold) {
    this.slowTaskLoggingThreshold = slowTaskLoggingThreshold;
  }

  public synchronized void setClusterStatePublisher(
      BiConsumer<ClusterChangedEvent, Discovery.AckListener> publisher) {
    clusterStatePublisher = publisher;
  }

  public synchronized void setLocalNode(DiscoveryNode localNode) {
    assert clusterState.nodes().getLocalNodeId() == null : "local node is already set";
    DiscoveryNodes.Builder nodeBuilder =
        DiscoveryNodes.builder(clusterState.nodes()).add(localNode).localNodeId(localNode.getId());
    this.clusterState = ClusterState.builder(clusterState).nodes(nodeBuilder).build();
  }

  public synchronized void setNodeConnectionsService(
      NodeConnectionsService nodeConnectionsService) {
    assert this.nodeConnectionsService == null : "nodeConnectionsService is already set";
    this.nodeConnectionsService = nodeConnectionsService;
  }

  /** Adds an initial block to be set on the first cluster state created. */
  public synchronized void addInitialStateBlock(ClusterBlock block) throws IllegalStateException {
    if (lifecycle.started()) {
      throw new IllegalStateException("can't set initial block when started");
    }
    initialBlocks.addGlobalBlock(block);
  }

  /** Remove an initial block to be set on the first cluster state created. */
  public synchronized void removeInitialStateBlock(ClusterBlock block)
      throws IllegalStateException {
    removeInitialStateBlock(block.id());
  }

  /** Remove an initial block to be set on the first cluster state created. */
  public synchronized void removeInitialStateBlock(int blockId) throws IllegalStateException {
    if (lifecycle.started()) {
      throw new IllegalStateException("can't set initial block when started");
    }
    initialBlocks.removeGlobalBlock(blockId);
  }

  @Override
  protected synchronized void doStart() {
    Objects.requireNonNull(
        clusterStatePublisher, "please set a cluster state publisher before starting");
    Objects.requireNonNull(
        clusterState.nodes().getLocalNode(), "please set the local node before starting");
    Objects.requireNonNull(
        nodeConnectionsService, "please set the node connection service before starting");
    add(localNodeMasterListeners);
    this.clusterState = ClusterState.builder(clusterState).blocks(initialBlocks).build();
    this.updateTasksExecutor =
        EsExecutors.newSinglePrioritizing(
            UPDATE_THREAD_NAME,
            daemonThreadFactory(settings, UPDATE_THREAD_NAME),
            threadPool.getThreadContext());
    this.clusterState = ClusterState.builder(clusterState).blocks(initialBlocks).build();
  }

  @Override
  protected synchronized void doStop() {
    for (NotifyTimeout onGoingTimeout : onGoingTimeouts) {
      onGoingTimeout.cancel();
      try {
        onGoingTimeout.cancel();
        onGoingTimeout.listener.onClose();
      } catch (Exception ex) {
        logger.debug("failed to notify listeners on shutdown", ex);
      }
    }
    ThreadPool.terminate(updateTasksExecutor, 10, TimeUnit.SECONDS);
    // close timeout listeners that did not have an ongoing timeout
    postAppliedListeners
        .stream()
        .filter(listener -> listener instanceof TimeoutClusterStateListener)
        .map(listener -> (TimeoutClusterStateListener) listener)
        .forEach(TimeoutClusterStateListener::onClose);
    remove(localNodeMasterListeners);
  }

  @Override
  protected synchronized void doClose() {}

  /** The local node. */
  public DiscoveryNode localNode() {
    DiscoveryNode localNode = clusterState.getNodes().getLocalNode();
    if (localNode == null) {
      throw new IllegalStateException("No local node found. Is the node started?");
    }
    return localNode;
  }

  public OperationRouting operationRouting() {
    return operationRouting;
  }

  /** The current state. */
  public ClusterState state() {
    return this.clusterState;
  }

  /** Adds a priority listener for updated cluster states. */
  public void addFirst(ClusterStateListener listener) {
    priorityClusterStateListeners.add(listener);
  }

  /** Adds last listener. */
  public void addLast(ClusterStateListener listener) {
    lastClusterStateListeners.add(listener);
  }

  /** Adds a listener for updated cluster states. */
  public void add(ClusterStateListener listener) {
    clusterStateListeners.add(listener);
  }

  /** Removes a listener for updated cluster states. */
  public void remove(ClusterStateListener listener) {
    clusterStateListeners.remove(listener);
    priorityClusterStateListeners.remove(listener);
    lastClusterStateListeners.remove(listener);
    postAppliedListeners.remove(listener);
    for (Iterator<NotifyTimeout> it = onGoingTimeouts.iterator(); it.hasNext(); ) {
      NotifyTimeout timeout = it.next();
      if (timeout.listener.equals(listener)) {
        timeout.cancel();
        it.remove();
      }
    }
  }

  /** Add a listener for on/off local node master events */
  public void add(LocalNodeMasterListener listener) {
    localNodeMasterListeners.add(listener);
  }

  /** Remove the given listener for on/off local master events */
  public void remove(LocalNodeMasterListener listener) {
    localNodeMasterListeners.remove(listener);
  }

  /**
   * Adds a cluster state listener that will timeout after the provided timeout, and is executed
   * after the clusterstate has been successfully applied ie. is in state {@link
   * org.elasticsearch.cluster.ClusterState.ClusterStateStatus#APPLIED} NOTE: a {@code null} timeout
   * means that the listener will never be removed automatically
   */
  public void add(@Nullable final TimeValue timeout, final TimeoutClusterStateListener listener) {
    if (lifecycle.stoppedOrClosed()) {
      listener.onClose();
      return;
    }
    // call the post added notification on the same event thread
    try {
      updateTasksExecutor.execute(
          new SourcePrioritizedRunnable(Priority.HIGH, "_add_listener_") {
            @Override
            public void run() {
              if (timeout != null) {
                NotifyTimeout notifyTimeout = new NotifyTimeout(listener, timeout);
                notifyTimeout.future =
                    threadPool.schedule(timeout, ThreadPool.Names.GENERIC, notifyTimeout);
                onGoingTimeouts.add(notifyTimeout);
              }
              postAppliedListeners.add(listener);
              listener.postAdded();
            }
          });
    } catch (EsRejectedExecutionException e) {
      if (lifecycle.stoppedOrClosed()) {
        listener.onClose();
      } else {
        throw e;
      }
    }
  }

  /**
   * Submits a cluster state update task; unlike {@link #submitStateUpdateTask(String, Object,
   * ClusterStateTaskConfig, ClusterStateTaskExecutor, ClusterStateTaskListener)}, submitted updates
   * will not be batched.
   *
   * @param source the source of the cluster state update task
   * @param updateTask the full context for the cluster state update task
   */
  public void submitStateUpdateTask(final String source, final ClusterStateUpdateTask updateTask) {
    submitStateUpdateTask(source, updateTask, updateTask, updateTask, updateTask);
  }

  /**
   * Submits a cluster state update task; submitted updates will be batched across the same instance
   * of executor. The exact batching semantics depend on the underlying implementation but a rough
   * guideline is that if the update task is submitted while there are pending update tasks for the
   * same executor, these update tasks will all be executed on the executor in a single batch
   *
   * @param source the source of the cluster state update task
   * @param task the state needed for the cluster state update task
   * @param config the cluster state update task configuration
   * @param executor the cluster state update task executor; tasks that share the same executor will
   *     be executed batches on this executor
   * @param listener callback after the cluster state update task completes
   * @param <T> the type of the cluster state update task state
   */
  public <T> void submitStateUpdateTask(
      final String source,
      final T task,
      final ClusterStateTaskConfig config,
      final ClusterStateTaskExecutor<T> executor,
      final ClusterStateTaskListener listener) {
    submitStateUpdateTasks(source, Collections.singletonMap(task, listener), config, executor);
  }

  /**
   * Submits a batch of cluster state update tasks; submitted updates are guaranteed to be processed
   * together, potentially with more tasks of the same executor.
   *
   * @param source the source of the cluster state update task
   * @param tasks a map of update tasks and their corresponding listeners
   * @param config the cluster state update task configuration
   * @param executor the cluster state update task executor; tasks that share the same executor will
   *     be executed batches on this executor
   * @param <T> the type of the cluster state update task state
   */
  public <T> void submitStateUpdateTasks(
      final String source,
      final Map<T, ClusterStateTaskListener> tasks,
      final ClusterStateTaskConfig config,
      final ClusterStateTaskExecutor<T> executor) {
    if (!lifecycle.started()) {
      return;
    }
    if (tasks.isEmpty()) {
      return;
    }
    try {
      // convert to an identity map to check for dups based on update tasks semantics of using
      // identity instead of equal
      final IdentityHashMap<T, ClusterStateTaskListener> tasksIdentity =
          new IdentityHashMap<>(tasks);
      final List<UpdateTask<T>> updateTasks =
          tasksIdentity
              .entrySet()
              .stream()
              .map(
                  entry ->
                      new UpdateTask<>(
                          source, entry.getKey(), config, executor, safe(entry.getValue(), logger)))
              .collect(Collectors.toList());

      synchronized (updateTasksPerExecutor) {
        List<UpdateTask> existingTasks =
            updateTasksPerExecutor.computeIfAbsent(executor, k -> new ArrayList<>());
        for (@SuppressWarnings("unchecked") UpdateTask<T> existing : existingTasks) {
          if (tasksIdentity.containsKey(existing.task)) {
            throw new IllegalStateException(
                "task ["
                    + executor.describeTasks(Collections.singletonList(existing.task))
                    + "] with source ["
                    + source
                    + "] is already queued");
          }
        }
        existingTasks.addAll(updateTasks);
      }

      final UpdateTask<T> firstTask = updateTasks.get(0);

      if (config.timeout() != null) {
        updateTasksExecutor.execute(
            firstTask,
            threadPool.scheduler(),
            config.timeout(),
            () ->
                threadPool
                    .generic()
                    .execute(
                        () -> {
                          for (UpdateTask<T> task : updateTasks) {
                            if (task.processed.getAndSet(true) == false) {
                              logger.debug(
                                  "cluster state update task [{}] timed out after [{}]",
                                  source,
                                  config.timeout());
                              task.listener.onFailure(
                                  source,
                                  new ProcessClusterEventTimeoutException(
                                      config.timeout(), source));
                            }
                          }
                        }));
      } else {
        updateTasksExecutor.execute(firstTask);
      }
    } catch (EsRejectedExecutionException e) {
      // ignore cases where we are shutting down..., there is really nothing interesting
      // to be done here...
      if (!lifecycle.stoppedOrClosed()) {
        throw e;
      }
    }
  }

  /** Returns the tasks that are pending. */
  public List<PendingClusterTask> pendingTasks() {
    PrioritizedEsThreadPoolExecutor.Pending[] pendings = updateTasksExecutor.getPending();
    List<PendingClusterTask> pendingClusterTasks = new ArrayList<>(pendings.length);
    for (PrioritizedEsThreadPoolExecutor.Pending pending : pendings) {
      final String source;
      final long timeInQueue;
      // we have to capture the task as it will be nulled after execution and we don't want to
      // change while we check things here.
      final Object task = pending.task;
      if (task == null) {
        continue;
      } else if (task instanceof SourcePrioritizedRunnable) {
        SourcePrioritizedRunnable runnable = (SourcePrioritizedRunnable) task;
        source = runnable.source();
        timeInQueue = runnable.getAgeInMillis();
      } else {
        assert false : "expected SourcePrioritizedRunnable got " + task.getClass();
        source = "unknown [" + task.getClass() + "]";
        timeInQueue = 0;
      }

      pendingClusterTasks.add(
          new PendingClusterTask(
              pending.insertionOrder,
              pending.priority,
              new Text(source),
              timeInQueue,
              pending.executing));
    }
    return pendingClusterTasks;
  }

  /** Returns the number of currently pending tasks. */
  public int numberOfPendingTasks() {
    return updateTasksExecutor.getNumberOfPendingTasks();
  }

  /**
   * Returns the maximum wait time for tasks in the queue
   *
   * @return A zero time value if the queue is empty, otherwise the time value oldest task waiting
   *     in the queue
   */
  public TimeValue getMaxTaskWaitTime() {
    return updateTasksExecutor.getMaxTaskWaitTime();
  }

  /** asserts that the current thread is the cluster state update thread */
  public static boolean assertClusterStateThread() {
    assert Thread.currentThread().getName().contains(ClusterService.UPDATE_THREAD_NAME)
        : "not called from the cluster state update thread";
    return true;
  }

  public ClusterName getClusterName() {
    return clusterName;
  }

  abstract static class SourcePrioritizedRunnable extends PrioritizedRunnable {
    protected final String source;

    public SourcePrioritizedRunnable(Priority priority, String source) {
      super(priority);
      this.source = source;
    }

    public String source() {
      return source;
    }
  }

  <T> void runTasksForExecutor(ClusterStateTaskExecutor<T> executor) {
    final ArrayList<UpdateTask<T>> toExecute = new ArrayList<>();
    final Map<String, ArrayList<T>> processTasksBySource = new HashMap<>();
    synchronized (updateTasksPerExecutor) {
      List<UpdateTask> pending = updateTasksPerExecutor.remove(executor);
      if (pending != null) {
        for (UpdateTask<T> task : pending) {
          if (task.processed.getAndSet(true) == false) {
            logger.trace("will process {}", task.toString(executor));
            toExecute.add(task);
            processTasksBySource
                .computeIfAbsent(task.source, s -> new ArrayList<>())
                .add(task.task);
          } else {
            logger.trace("skipping {}, already processed", task.toString(executor));
          }
        }
      }
    }
    if (toExecute.isEmpty()) {
      return;
    }
    final String tasksSummary =
        processTasksBySource
            .entrySet()
            .stream()
            .map(
                entry -> {
                  String tasks = executor.describeTasks(entry.getValue());
                  return tasks.isEmpty() ? entry.getKey() : entry.getKey() + "[" + tasks + "]";
                })
            .reduce((s1, s2) -> s1 + ", " + s2)
            .orElse("");

    if (!lifecycle.started()) {
      logger.debug("processing [{}]: ignoring, cluster_service not started", tasksSummary);
      return;
    }
    logger.debug("processing [{}]: execute", tasksSummary);
    ClusterState previousClusterState = clusterState;
    if (!previousClusterState.nodes().isLocalNodeElectedMaster() && executor.runOnlyOnMaster()) {
      logger.debug("failing [{}]: local node is no longer master", tasksSummary);
      toExecute.stream().forEach(task -> task.listener.onNoLongerMaster(task.source));
      return;
    }
    ClusterStateTaskExecutor.BatchResult<T> batchResult;
    long startTimeNS = currentTimeInNanos();
    try {
      List<T> inputs =
          toExecute.stream().map(tUpdateTask -> tUpdateTask.task).collect(Collectors.toList());
      batchResult = executor.execute(previousClusterState, inputs);
    } catch (Exception e) {
      TimeValue executionTime =
          TimeValue.timeValueMillis(
              Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS)));
      if (logger.isTraceEnabled()) {
        logger.trace(
            (Supplier<?>)
                () ->
                    new ParameterizedMessage(
                        "failed to execute cluster state update in [{}], state:\nversion [{}], source [{}]\n{}{}{}",
                        executionTime,
                        previousClusterState.version(),
                        tasksSummary,
                        previousClusterState.nodes().prettyPrint(),
                        previousClusterState.routingTable().prettyPrint(),
                        previousClusterState.getRoutingNodes().prettyPrint()),
            e);
      }
      warnAboutSlowTaskIfNeeded(executionTime, tasksSummary);
      batchResult =
          ClusterStateTaskExecutor.BatchResult.<T>builder()
              .failures(toExecute.stream().map(updateTask -> updateTask.task)::iterator, e)
              .build(previousClusterState);
    }

    assert batchResult.executionResults != null;
    assert batchResult.executionResults.size() == toExecute.size()
        : String.format(
            Locale.ROOT,
            "expected [%d] task result%s but was [%d]",
            toExecute.size(),
            toExecute.size() == 1 ? "" : "s",
            batchResult.executionResults.size());
    boolean assertsEnabled = false;
    assert (assertsEnabled = true);
    if (assertsEnabled) {
      for (UpdateTask<T> updateTask : toExecute) {
        assert batchResult.executionResults.containsKey(updateTask.task)
            : "missing task result for " + updateTask.toString(executor);
      }
    }

    ClusterState newClusterState = batchResult.resultingState;
    final ArrayList<UpdateTask<T>> proccessedListeners = new ArrayList<>();
    // fail all tasks that have failed and extract those that are waiting for results
    for (UpdateTask<T> updateTask : toExecute) {
      assert batchResult.executionResults.containsKey(updateTask.task)
          : "missing " + updateTask.toString(executor);
      final ClusterStateTaskExecutor.TaskResult executionResult =
          batchResult.executionResults.get(updateTask.task);
      executionResult.handle(
          () -> proccessedListeners.add(updateTask),
          ex -> {
            logger.debug(
                (Supplier<?>)
                    () ->
                        new ParameterizedMessage(
                            "cluster state update task {} failed", updateTask.toString(executor)),
                ex);
            updateTask.listener.onFailure(updateTask.source, ex);
          });
    }

    if (previousClusterState == newClusterState) {
      for (UpdateTask<T> task : proccessedListeners) {
        if (task.listener instanceof AckedClusterStateTaskListener) {
          // no need to wait for ack if nothing changed, the update can be counted as acknowledged
          ((AckedClusterStateTaskListener) task.listener).onAllNodesAcked(null);
        }
        task.listener.clusterStateProcessed(task.source, previousClusterState, newClusterState);
      }
      TimeValue executionTime =
          TimeValue.timeValueMillis(
              Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS)));
      logger.debug(
          "processing [{}]: took [{}] no change in cluster_state", tasksSummary, executionTime);
      warnAboutSlowTaskIfNeeded(executionTime, tasksSummary);
      return;
    }

    try {
      ArrayList<Discovery.AckListener> ackListeners = new ArrayList<>();
      if (newClusterState.nodes().isLocalNodeElectedMaster()) {
        // only the master controls the version numbers
        Builder builder = ClusterState.builder(newClusterState).incrementVersion();
        if (previousClusterState.routingTable() != newClusterState.routingTable()) {
          builder.routingTable(
              RoutingTable.builder(newClusterState.routingTable())
                  .version(newClusterState.routingTable().version() + 1)
                  .build());
        }
        if (previousClusterState.metaData() != newClusterState.metaData()) {
          builder.metaData(
              MetaData.builder(newClusterState.metaData())
                  .version(newClusterState.metaData().version() + 1));
        }
        newClusterState = builder.build();
        for (UpdateTask<T> task : proccessedListeners) {
          if (task.listener instanceof AckedClusterStateTaskListener) {
            final AckedClusterStateTaskListener ackedListener =
                (AckedClusterStateTaskListener) task.listener;
            if (ackedListener.ackTimeout() == null || ackedListener.ackTimeout().millis() == 0) {
              ackedListener.onAckTimeout();
            } else {
              try {
                ackListeners.add(
                    new AckCountDownListener(
                        ackedListener,
                        newClusterState.version(),
                        newClusterState.nodes(),
                        threadPool));
              } catch (EsRejectedExecutionException ex) {
                if (logger.isDebugEnabled()) {
                  logger.debug(
                      "Couldn't schedule timeout thread - node might be shutting down", ex);
                }
                // timeout straightaway, otherwise we could wait forever as the timeout thread has
                // not started
                ackedListener.onAckTimeout();
              }
            }
          }
        }
      }
      final Discovery.AckListener ackListener = new DelegetingAckListener(ackListeners);

      newClusterState.status(ClusterState.ClusterStateStatus.BEING_APPLIED);

      if (logger.isTraceEnabled()) {
        logger.trace(
            "cluster state updated, source [{}]\n{}", tasksSummary, newClusterState.prettyPrint());
      } else if (logger.isDebugEnabled()) {
        logger.debug(
            "cluster state updated, version [{}], source [{}]",
            newClusterState.version(),
            tasksSummary);
      }

      ClusterChangedEvent clusterChangedEvent =
          new ClusterChangedEvent(tasksSummary, newClusterState, previousClusterState);
      // new cluster state, notify all listeners
      final DiscoveryNodes.Delta nodesDelta = clusterChangedEvent.nodesDelta();
      if (nodesDelta.hasChanges() && logger.isInfoEnabled()) {
        String summary = nodesDelta.shortSummary();
        if (summary.length() > 0) {
          logger.info("{}, reason: {}", summary, tasksSummary);
        }
      }

      nodeConnectionsService.connectToAddedNodes(clusterChangedEvent);

      // if we are the master, publish the new state to all nodes
      // we publish here before we send a notification to all the listeners, since if it fails
      // we don't want to notify
      if (newClusterState.nodes().isLocalNodeElectedMaster()) {
        logger.debug("publishing cluster state version [{}]", newClusterState.version());
        try {
          clusterStatePublisher.accept(clusterChangedEvent, ackListener);
        } catch (Discovery.FailedToCommitClusterStateException t) {
          final long version = newClusterState.version();
          logger.warn(
              (Supplier<?>)
                  () ->
                      new ParameterizedMessage(
                          "failing [{}]: failed to commit cluster state version [{}]",
                          tasksSummary,
                          version),
              t);
          proccessedListeners.forEach(task -> task.listener.onFailure(task.source, t));
          return;
        }
      }

      // update the current cluster state
      clusterState = newClusterState;
      logger.debug("set local cluster state to version {}", newClusterState.version());
      try {
        // nothing to do until we actually recover from the gateway or any other block indicates we
        // need to disable persistency
        if (clusterChangedEvent.state().blocks().disableStatePersistence() == false
            && clusterChangedEvent.metaDataChanged()) {
          final Settings incomingSettings = clusterChangedEvent.state().metaData().settings();
          clusterSettings.applySettings(incomingSettings);
        }
      } catch (Exception ex) {
        logger.warn("failed to apply cluster settings", ex);
      }
      for (ClusterStateListener listener : preAppliedListeners) {
        try {
          listener.clusterChanged(clusterChangedEvent);
        } catch (Exception ex) {
          logger.warn("failed to notify ClusterStateListener", ex);
        }
      }

      nodeConnectionsService.disconnectFromRemovedNodes(clusterChangedEvent);

      newClusterState.status(ClusterState.ClusterStateStatus.APPLIED);

      for (ClusterStateListener listener : postAppliedListeners) {
        try {
          listener.clusterChanged(clusterChangedEvent);
        } catch (Exception ex) {
          logger.warn("failed to notify ClusterStateListener", ex);
        }
      }

      // manual ack only from the master at the end of the publish
      if (newClusterState.nodes().isLocalNodeElectedMaster()) {
        try {
          ackListener.onNodeAck(newClusterState.nodes().getLocalNode(), null);
        } catch (Exception e) {
          final DiscoveryNode localNode = newClusterState.nodes().getLocalNode();
          logger.debug(
              (Supplier<?>)
                  () ->
                      new ParameterizedMessage(
                          "error while processing ack for master node [{}]", localNode),
              e);
        }
      }

      for (UpdateTask<T> task : proccessedListeners) {
        task.listener.clusterStateProcessed(task.source, previousClusterState, newClusterState);
      }

      try {
        executor.clusterStatePublished(clusterChangedEvent);
      } catch (Exception e) {
        logger.error(
            (Supplier<?>)
                () ->
                    new ParameterizedMessage(
                        "exception thrown while notifying executor of new cluster state publication [{}]",
                        tasksSummary),
            e);
      }

      TimeValue executionTime =
          TimeValue.timeValueMillis(
              Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS)));
      logger.debug(
          "processing [{}]: took [{}] done applying updated cluster_state (version: {}, uuid: {})",
          tasksSummary,
          executionTime,
          newClusterState.version(),
          newClusterState.stateUUID());
      warnAboutSlowTaskIfNeeded(executionTime, tasksSummary);
    } catch (Exception e) {
      TimeValue executionTime =
          TimeValue.timeValueMillis(
              Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS)));
      final long version = newClusterState.version();
      final String stateUUID = newClusterState.stateUUID();
      final String prettyPrint = newClusterState.prettyPrint();
      logger.warn(
          (Supplier<?>)
              () ->
                  new ParameterizedMessage(
                      "failed to apply updated cluster state in [{}]:\nversion [{}], uuid [{}], source [{}]\n{}",
                      executionTime,
                      version,
                      stateUUID,
                      tasksSummary,
                      prettyPrint),
          e);
      // TODO: do we want to call updateTask.onFailure here?
    }
  }

  // this one is overridden in tests so we can control time
  protected long currentTimeInNanos() {
    return System.nanoTime();
  }

  private static SafeClusterStateTaskListener safe(
      ClusterStateTaskListener listener, Logger logger) {
    if (listener instanceof AckedClusterStateTaskListener) {
      return new SafeAckedClusterStateTaskListener(
          (AckedClusterStateTaskListener) listener, logger);
    } else {
      return new SafeClusterStateTaskListener(listener, logger);
    }
  }

  private static class SafeClusterStateTaskListener implements ClusterStateTaskListener {
    private final ClusterStateTaskListener listener;
    private final Logger logger;

    public SafeClusterStateTaskListener(ClusterStateTaskListener listener, Logger logger) {
      this.listener = listener;
      this.logger = logger;
    }

    @Override
    public void onFailure(String source, Exception e) {
      try {
        listener.onFailure(source, e);
      } catch (Exception inner) {
        inner.addSuppressed(e);
        logger.error(
            (Supplier<?>)
                () ->
                    new ParameterizedMessage(
                        "exception thrown by listener notifying of failure from [{}]", source),
            inner);
      }
    }

    @Override
    public void onNoLongerMaster(String source) {
      try {
        listener.onNoLongerMaster(source);
      } catch (Exception e) {
        logger.error(
            (Supplier<?>)
                () ->
                    new ParameterizedMessage(
                        "exception thrown by listener while notifying no longer master from [{}]",
                        source),
            e);
      }
    }

    @Override
    public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
      try {
        listener.clusterStateProcessed(source, oldState, newState);
      } catch (Exception e) {
        logger.error(
            (Supplier<?>)
                () ->
                    new ParameterizedMessage(
                        "exception thrown by listener while notifying of cluster state processed from [{}], old cluster state:\n"
                            + "{}\nnew cluster state:\n{}",
                        source,
                        oldState.prettyPrint(),
                        newState.prettyPrint()),
            e);
      }
    }
  }

  private static class SafeAckedClusterStateTaskListener extends SafeClusterStateTaskListener
      implements AckedClusterStateTaskListener {
    private final AckedClusterStateTaskListener listener;
    private final Logger logger;

    public SafeAckedClusterStateTaskListener(
        AckedClusterStateTaskListener listener, Logger logger) {
      super(listener, logger);
      this.listener = listener;
      this.logger = logger;
    }

    @Override
    public boolean mustAck(DiscoveryNode discoveryNode) {
      return listener.mustAck(discoveryNode);
    }

    @Override
    public void onAllNodesAcked(@Nullable Exception e) {
      try {
        listener.onAllNodesAcked(e);
      } catch (Exception inner) {
        inner.addSuppressed(e);
        logger.error("exception thrown by listener while notifying on all nodes acked", inner);
      }
    }

    @Override
    public void onAckTimeout() {
      try {
        listener.onAckTimeout();
      } catch (Exception e) {
        logger.error("exception thrown by listener while notifying on ack timeout", e);
      }
    }

    @Override
    public TimeValue ackTimeout() {
      return listener.ackTimeout();
    }
  }

  class UpdateTask<T> extends SourcePrioritizedRunnable {

    public final T task;
    public final ClusterStateTaskConfig config;
    public final ClusterStateTaskExecutor<T> executor;
    public final ClusterStateTaskListener listener;
    public final AtomicBoolean processed = new AtomicBoolean();

    UpdateTask(
        String source,
        T task,
        ClusterStateTaskConfig config,
        ClusterStateTaskExecutor<T> executor,
        ClusterStateTaskListener listener) {
      super(config.priority(), source);
      this.task = task;
      this.config = config;
      this.executor = executor;
      this.listener = listener;
    }

    @Override
    public void run() {
      // if this task is already processed, the executor shouldn't execute other tasks (that arrived
      // later),
      // to give other executors a chance to execute their tasks.
      if (processed.get() == false) {
        runTasksForExecutor(executor);
      }
    }

    public String toString(ClusterStateTaskExecutor<T> executor) {
      String taskDescription = executor.describeTasks(Collections.singletonList(task));
      if (taskDescription.isEmpty()) {
        return "[" + source + "]";
      } else {
        return "[" + source + "[" + taskDescription + "]]";
      }
    }
  }

  private void warnAboutSlowTaskIfNeeded(TimeValue executionTime, String source) {
    if (executionTime.getMillis() > slowTaskLoggingThreshold.getMillis()) {
      logger.warn(
          "cluster state update task [{}] took [{}] above the warn threshold of {}",
          source,
          executionTime,
          slowTaskLoggingThreshold);
    }
  }

  class NotifyTimeout implements Runnable {
    final TimeoutClusterStateListener listener;
    final TimeValue timeout;
    volatile ScheduledFuture future;

    NotifyTimeout(TimeoutClusterStateListener listener, TimeValue timeout) {
      this.listener = listener;
      this.timeout = timeout;
    }

    public void cancel() {
      FutureUtils.cancel(future);
    }

    @Override
    public void run() {
      if (future != null && future.isCancelled()) {
        return;
      }
      if (lifecycle.stoppedOrClosed()) {
        listener.onClose();
      } else {
        listener.onTimeout(this.timeout);
      }
      // note, we rely on the listener to remove itself in case of timeout if needed
    }
  }

  private static class LocalNodeMasterListeners implements ClusterStateListener {

    private final List<LocalNodeMasterListener> listeners = new CopyOnWriteArrayList<>();
    private final ThreadPool threadPool;
    private volatile boolean master = false;

    private LocalNodeMasterListeners(ThreadPool threadPool) {
      this.threadPool = threadPool;
    }

    @Override
    public void clusterChanged(ClusterChangedEvent event) {
      if (!master && event.localNodeMaster()) {
        master = true;
        for (LocalNodeMasterListener listener : listeners) {
          Executor executor = threadPool.executor(listener.executorName());
          executor.execute(new OnMasterRunnable(listener));
        }
        return;
      }

      if (master && !event.localNodeMaster()) {
        master = false;
        for (LocalNodeMasterListener listener : listeners) {
          Executor executor = threadPool.executor(listener.executorName());
          executor.execute(new OffMasterRunnable(listener));
        }
      }
    }

    private void add(LocalNodeMasterListener listener) {
      listeners.add(listener);
    }

    private void remove(LocalNodeMasterListener listener) {
      listeners.remove(listener);
    }

    private void clear() {
      listeners.clear();
    }
  }

  private static class OnMasterRunnable implements Runnable {

    private final LocalNodeMasterListener listener;

    private OnMasterRunnable(LocalNodeMasterListener listener) {
      this.listener = listener;
    }

    @Override
    public void run() {
      listener.onMaster();
    }
  }

  private static class OffMasterRunnable implements Runnable {

    private final LocalNodeMasterListener listener;

    private OffMasterRunnable(LocalNodeMasterListener listener) {
      this.listener = listener;
    }

    @Override
    public void run() {
      listener.offMaster();
    }
  }

  private static class DelegetingAckListener implements Discovery.AckListener {

    private final List<Discovery.AckListener> listeners;

    private DelegetingAckListener(List<Discovery.AckListener> listeners) {
      this.listeners = listeners;
    }

    @Override
    public void onNodeAck(DiscoveryNode node, @Nullable Exception e) {
      for (Discovery.AckListener listener : listeners) {
        listener.onNodeAck(node, e);
      }
    }

    @Override
    public void onTimeout() {
      throw new UnsupportedOperationException("no timeout delegation");
    }
  }

  private static class AckCountDownListener implements Discovery.AckListener {

    private static final Logger logger = Loggers.getLogger(AckCountDownListener.class);

    private final AckedClusterStateTaskListener ackedTaskListener;
    private final CountDown countDown;
    private final DiscoveryNodes nodes;
    private final long clusterStateVersion;
    private final Future<?> ackTimeoutCallback;
    private Exception lastFailure;

    AckCountDownListener(
        AckedClusterStateTaskListener ackedTaskListener,
        long clusterStateVersion,
        DiscoveryNodes nodes,
        ThreadPool threadPool) {
      this.ackedTaskListener = ackedTaskListener;
      this.clusterStateVersion = clusterStateVersion;
      this.nodes = nodes;
      int countDown = 0;
      for (DiscoveryNode node : nodes) {
        if (ackedTaskListener.mustAck(node)) {
          countDown++;
        }
      }
      // we always wait for at least 1 node (the master)
      countDown = Math.max(1, countDown);
      logger.trace(
          "expecting {} acknowledgements for cluster_state update (version: {})",
          countDown,
          clusterStateVersion);
      this.countDown = new CountDown(countDown);
      this.ackTimeoutCallback =
          threadPool.schedule(
              ackedTaskListener.ackTimeout(),
              ThreadPool.Names.GENERIC,
              new Runnable() {
                @Override
                public void run() {
                  onTimeout();
                }
              });
    }

    @Override
    public void onNodeAck(DiscoveryNode node, @Nullable Exception e) {
      if (!ackedTaskListener.mustAck(node)) {
        // we always wait for the master ack anyway
        if (!node.equals(nodes.getMasterNode())) {
          return;
        }
      }
      if (e == null) {
        logger.trace(
            "ack received from node [{}], cluster_state update (version: {})",
            node,
            clusterStateVersion);
      } else {
        this.lastFailure = e;
        logger.debug(
            (Supplier<?>)
                () ->
                    new ParameterizedMessage(
                        "ack received from node [{}], cluster_state update (version: {})",
                        node,
                        clusterStateVersion),
            e);
      }

      if (countDown.countDown()) {
        logger.trace(
            "all expected nodes acknowledged cluster_state update (version: {})",
            clusterStateVersion);
        FutureUtils.cancel(ackTimeoutCallback);
        ackedTaskListener.onAllNodesAcked(lastFailure);
      }
    }

    @Override
    public void onTimeout() {
      if (countDown.fastForward()) {
        logger.trace(
            "timeout waiting for acknowledgement for cluster_state update (version: {})",
            clusterStateVersion);
        ackedTaskListener.onAckTimeout();
      }
    }
  }

  public ClusterSettings getClusterSettings() {
    return clusterSettings;
  }

  public Settings getSettings() {
    return settings;
  }
}
public class RecoverySettings extends AbstractComponent {

  public static final Setting<ByteSizeValue> INDICES_RECOVERY_MAX_BYTES_PER_SEC_SETTING =
      Setting.byteSizeSetting(
          "indices.recovery.max_bytes_per_sec",
          new ByteSizeValue(40, ByteSizeUnit.MB),
          Property.Dynamic,
          Property.NodeScope);

  /**
   * how long to wait before retrying after issues cause by cluster state syncing between nodes
   * i.e., local node is not yet known on remote node, remote shard not yet started etc.
   */
  public static final Setting<TimeValue> INDICES_RECOVERY_RETRY_DELAY_STATE_SYNC_SETTING =
      Setting.positiveTimeSetting(
          "indices.recovery.retry_delay_state_sync",
          TimeValue.timeValueMillis(500),
          Property.Dynamic,
          Property.NodeScope);

  /** how long to wait before retrying after network related issues */
  public static final Setting<TimeValue> INDICES_RECOVERY_RETRY_DELAY_NETWORK_SETTING =
      Setting.positiveTimeSetting(
          "indices.recovery.retry_delay_network",
          TimeValue.timeValueSeconds(5),
          Property.Dynamic,
          Property.NodeScope);

  /** timeout value to use for requests made as part of the recovery process */
  public static final Setting<TimeValue> INDICES_RECOVERY_INTERNAL_ACTION_TIMEOUT_SETTING =
      Setting.positiveTimeSetting(
          "indices.recovery.internal_action_timeout",
          TimeValue.timeValueMinutes(15),
          Property.Dynamic,
          Property.NodeScope);

  /**
   * timeout value to use for requests made as part of the recovery process that are expected to
   * take long time. defaults to twice `indices.recovery.internal_action_timeout`.
   */
  public static final Setting<TimeValue> INDICES_RECOVERY_INTERNAL_LONG_ACTION_TIMEOUT_SETTING =
      Setting.timeSetting(
          "indices.recovery.internal_action_long_timeout",
          (s) ->
              TimeValue.timeValueMillis(
                  INDICES_RECOVERY_INTERNAL_ACTION_TIMEOUT_SETTING.get(s).millis() * 2),
          TimeValue.timeValueSeconds(0),
          Property.Dynamic,
          Property.NodeScope);

  /**
   * recoveries that don't show any activity for more then this interval will be failed. defaults to
   * `indices.recovery.internal_action_long_timeout`
   */
  public static final Setting<TimeValue> INDICES_RECOVERY_ACTIVITY_TIMEOUT_SETTING =
      Setting.timeSetting(
          "indices.recovery.recovery_activity_timeout",
          INDICES_RECOVERY_INTERNAL_LONG_ACTION_TIMEOUT_SETTING::get,
          TimeValue.timeValueSeconds(0),
          Property.Dynamic,
          Property.NodeScope);

  public static final ByteSizeValue DEFAULT_CHUNK_SIZE = new ByteSizeValue(512, ByteSizeUnit.KB);

  private volatile ByteSizeValue maxBytesPerSec;
  private volatile SimpleRateLimiter rateLimiter;
  private volatile TimeValue retryDelayStateSync;
  private volatile TimeValue retryDelayNetwork;
  private volatile TimeValue activityTimeout;
  private volatile TimeValue internalActionTimeout;
  private volatile TimeValue internalActionLongTimeout;

  private volatile ByteSizeValue chunkSize = DEFAULT_CHUNK_SIZE;

  @Inject
  public RecoverySettings(Settings settings, ClusterSettings clusterSettings) {
    super(settings);

    this.retryDelayStateSync = INDICES_RECOVERY_RETRY_DELAY_STATE_SYNC_SETTING.get(settings);
    // doesn't have to be fast as nodes are reconnected every 10s by default (see
    // InternalClusterService.ReconnectToNodes)
    // and we want to give the master time to remove a faulty node
    this.retryDelayNetwork = INDICES_RECOVERY_RETRY_DELAY_NETWORK_SETTING.get(settings);

    this.internalActionTimeout = INDICES_RECOVERY_INTERNAL_ACTION_TIMEOUT_SETTING.get(settings);
    this.internalActionLongTimeout =
        INDICES_RECOVERY_INTERNAL_LONG_ACTION_TIMEOUT_SETTING.get(settings);

    this.activityTimeout = INDICES_RECOVERY_ACTIVITY_TIMEOUT_SETTING.get(settings);
    this.maxBytesPerSec = INDICES_RECOVERY_MAX_BYTES_PER_SEC_SETTING.get(settings);
    if (maxBytesPerSec.getBytes() <= 0) {
      rateLimiter = null;
    } else {
      rateLimiter = new SimpleRateLimiter(maxBytesPerSec.getMbFrac());
    }

    logger.debug("using max_bytes_per_sec[{}]", maxBytesPerSec);

    clusterSettings.addSettingsUpdateConsumer(
        INDICES_RECOVERY_MAX_BYTES_PER_SEC_SETTING, this::setMaxBytesPerSec);
    clusterSettings.addSettingsUpdateConsumer(
        INDICES_RECOVERY_RETRY_DELAY_STATE_SYNC_SETTING, this::setRetryDelayStateSync);
    clusterSettings.addSettingsUpdateConsumer(
        INDICES_RECOVERY_RETRY_DELAY_NETWORK_SETTING, this::setRetryDelayNetwork);
    clusterSettings.addSettingsUpdateConsumer(
        INDICES_RECOVERY_INTERNAL_ACTION_TIMEOUT_SETTING, this::setInternalActionTimeout);
    clusterSettings.addSettingsUpdateConsumer(
        INDICES_RECOVERY_INTERNAL_LONG_ACTION_TIMEOUT_SETTING, this::setInternalActionLongTimeout);
    clusterSettings.addSettingsUpdateConsumer(
        INDICES_RECOVERY_ACTIVITY_TIMEOUT_SETTING, this::setActivityTimeout);
  }

  public RateLimiter rateLimiter() {
    return rateLimiter;
  }

  public TimeValue retryDelayNetwork() {
    return retryDelayNetwork;
  }

  public TimeValue retryDelayStateSync() {
    return retryDelayStateSync;
  }

  public TimeValue activityTimeout() {
    return activityTimeout;
  }

  public TimeValue internalActionTimeout() {
    return internalActionTimeout;
  }

  public TimeValue internalActionLongTimeout() {
    return internalActionLongTimeout;
  }

  public ByteSizeValue getChunkSize() {
    return chunkSize;
  }

  void setChunkSize(ByteSizeValue chunkSize) { // only settable for tests
    if (chunkSize.bytesAsInt() <= 0) {
      throw new IllegalArgumentException("chunkSize must be > 0");
    }
    this.chunkSize = chunkSize;
  }

  public void setRetryDelayStateSync(TimeValue retryDelayStateSync) {
    this.retryDelayStateSync = retryDelayStateSync;
  }

  public void setRetryDelayNetwork(TimeValue retryDelayNetwork) {
    this.retryDelayNetwork = retryDelayNetwork;
  }

  public void setActivityTimeout(TimeValue activityTimeout) {
    this.activityTimeout = activityTimeout;
  }

  public void setInternalActionTimeout(TimeValue internalActionTimeout) {
    this.internalActionTimeout = internalActionTimeout;
  }

  public void setInternalActionLongTimeout(TimeValue internalActionLongTimeout) {
    this.internalActionLongTimeout = internalActionLongTimeout;
  }

  private void setMaxBytesPerSec(ByteSizeValue maxBytesPerSec) {
    this.maxBytesPerSec = maxBytesPerSec;
    if (maxBytesPerSec.getBytes() <= 0) {
      rateLimiter = null;
    } else if (rateLimiter != null) {
      rateLimiter.setMBPerSec(maxBytesPerSec.getMbFrac());
    } else {
      rateLimiter = new SimpleRateLimiter(maxBytesPerSec.getMbFrac());
    }
  }
}
예제 #5
0
/**
 * A base class for {@link org.elasticsearch.discovery.zen.fd.MasterFaultDetection} &amp; {@link
 * org.elasticsearch.discovery.zen.fd.NodesFaultDetection}, making sure both use the same setting.
 */
public abstract class FaultDetection extends AbstractComponent {

  public static final Setting<Boolean> CONNECT_ON_NETWORK_DISCONNECT_SETTING =
      Setting.boolSetting(
          "discovery.zen.fd.connect_on_network_disconnect", false, Property.NodeScope);
  public static final Setting<TimeValue> PING_INTERVAL_SETTING =
      Setting.positiveTimeSetting(
          "discovery.zen.fd.ping_interval", timeValueSeconds(1), Property.NodeScope);
  public static final Setting<TimeValue> PING_TIMEOUT_SETTING =
      Setting.timeSetting(
          "discovery.zen.fd.ping_timeout", timeValueSeconds(30), Property.NodeScope);
  public static final Setting<Integer> PING_RETRIES_SETTING =
      Setting.intSetting("discovery.zen.fd.ping_retries", 3, Property.NodeScope);
  public static final Setting<Boolean> REGISTER_CONNECTION_LISTENER_SETTING =
      Setting.boolSetting(
          "discovery.zen.fd.register_connection_listener", true, Property.NodeScope);

  protected final ThreadPool threadPool;
  protected final ClusterName clusterName;
  protected final TransportService transportService;

  // used mainly for testing, should always be true
  protected final boolean registerConnectionListener;
  protected final FDConnectionListener connectionListener;
  protected final boolean connectOnNetworkDisconnect;

  protected final TimeValue pingInterval;
  protected final TimeValue pingRetryTimeout;
  protected final int pingRetryCount;

  public FaultDetection(
      Settings settings,
      ThreadPool threadPool,
      TransportService transportService,
      ClusterName clusterName) {
    super(settings);
    this.threadPool = threadPool;
    this.transportService = transportService;
    this.clusterName = clusterName;

    this.connectOnNetworkDisconnect = CONNECT_ON_NETWORK_DISCONNECT_SETTING.get(settings);
    this.pingInterval = PING_INTERVAL_SETTING.get(settings);
    this.pingRetryTimeout = PING_TIMEOUT_SETTING.get(settings);
    this.pingRetryCount = PING_RETRIES_SETTING.get(settings);
    this.registerConnectionListener = REGISTER_CONNECTION_LISTENER_SETTING.get(settings);

    this.connectionListener = new FDConnectionListener();
    if (registerConnectionListener) {
      transportService.addConnectionListener(connectionListener);
    }
  }

  public void close() {
    transportService.removeConnectionListener(connectionListener);
  }

  /**
   * This method will be called when the {@link org.elasticsearch.transport.TransportService} raised
   * a node disconnected event
   */
  abstract void handleTransportDisconnect(DiscoveryNode node);

  private class FDConnectionListener implements TransportConnectionListener {
    @Override
    public void onNodeConnected(DiscoveryNode node) {}

    @Override
    public void onNodeDisconnected(DiscoveryNode node) {
      handleTransportDisconnect(node);
    }
  }
}